In [7]:
# Mount Google Drive into the Colab runtime at /content/drive so the project
# folder stored there can be reached by path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
# Work from the project folder on the mounted Drive so the relative paths used
# in later cells ('scripts/', 'data/...') resolve.
%cd /content/drive/MyDrive/Causal_Inference_Model/

/content/drive/MyDrive/Causal_Inference_Model


In [13]:
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including any pandas warnings raised by the DataFrame assignments made later
# in the notebook. Consider narrowing the filter to the categories that are
# actually noisy.
warnings.filterwarnings('ignore')

In [14]:
import pandas as pd
import sys, os

In [44]:
# Make the helper modules under scripts/ importable. The membership check keeps
# sys.path free of duplicate entries when this cell is executed more than once.
SCRIPTS_DIR = os.path.abspath('scripts/')
if SCRIPTS_DIR not in sys.path:
    sys.path.append(SCRIPTS_DIR)

In [62]:
from causal_pipeline import EDAPipeline
from filehundle import LoadData

In [47]:
# Pipeline helper imported from the causal_pipeline module; its isWeekend,
# isHoliday, calculate_distances and calculate_speeds methods are used in the
# cells below.
causal_pipeline = EDAPipeline()


Initialize Loaded Data

In [29]:
# Load the raw trip records from CSV.
trip_df = pd.read_csv("data/nb.csv")

In [31]:
# Summarise the columns, dtypes and non-null counts of the raw trip data.
trip_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 536020 entries, 0 to 536019
Data columns (total 5 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   Trip ID           536020 non-null  int64 
 1   Trip Origin       536020 non-null  object
 2   Trip Destination  536020 non-null  object
 3   Trip Start Time   534369 non-null  object
 4   Trip End Time     536019 non-null  object
dtypes: int64(1), object(4)
memory usage: 20.4+ MB


In [33]:
# Load the driver records from data/driver_locations_during_request.csv.
driver_df = pd.read_csv("data/driver_locations_during_request.csv")

In [34]:
# Summarise the driver data. In the output recorded below, created_at and
# updated_at contain no non-null values.
driver_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1557740 entries, 0 to 1557739
Data columns (total 8 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   id             1557740 non-null  int64  
 1   order_id       1557740 non-null  int64  
 2   driver_id      1557740 non-null  int64  
 3   driver_action  1557740 non-null  object 
 4   lat            1557740 non-null  float64
 5   lng            1557740 non-null  float64
 6   created_at     0 non-null        float64
 7   updated_at     0 non-null        float64
dtypes: float64(4), int64(3), object(1)
memory usage: 95.1+ MB


Drop rows with null value for column: Trip Start Time

In [35]:
# Count the trips that have no start time.
trip_df['Trip Start Time'].isna().sum()

1651

In [36]:
# Keep only the trips that have a start time. The explicit .copy() makes
# trip_df an independent frame, so the new columns assigned in later cells are
# written to it directly rather than to a slice of the originally loaded data
# (the pattern pandas flags with SettingWithCopyWarning).
trip_df = trip_df[trip_df['Trip Start Time'].notna()].copy()

In [37]:
# Confirm the row count after removing the trips without a start time.
trip_df.shape

(534369, 5)

Check if Trip Start Time is weekend or not

In [40]:
# Flag every trip by handing its start time to the pipeline's isWeekend check.
trip_df['is_weekend'] = trip_df['Trip Start Time'].apply(causal_pipeline.isWeekend)

In [41]:
# Count how many trips fall into each is_weekend category.
trip_df['is_weekend'].value_counts()

0    427182
1    107187
Name: is_weekend, dtype: int64

Weekends distribution

In [63]:
import matplotlib.pyplot as plt
import seaborn as sns

In [67]:
def plot_count(df:pd.DataFrame, column:str) -> None:
    """Draw a bar chart of how often each value of ``column`` occurs in ``df``.

    Args:
        df: Frame holding the data to plot.
        column: Name of the column whose values are counted along the x-axis.

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    _, axis = plt.subplots(figsize=(12, 7))
    sns.countplot(data=df, x=column, ax=axis)
    axis.set_title(f'Distribution of {column}', size=20, fontweight='bold')
    plt.show()
# Visualise how the trips split across the is_weekend values.
plot_count(trip_df, 'is_weekend')

Check for holidays

In [68]:
# Flag every trip by handing its start time to the pipeline's isHoliday check.
trip_df['is_holiday'] = trip_df['Trip Start Time'].apply(causal_pipeline.isHoliday)


In [69]:
# Count how many trips fall into each is_holiday category.
trip_df['is_holiday'].value_counts()

0    525289
1      9080
Name: is_holiday, dtype: int64

Holiday distribution

In [70]:
# Visualise how the trips split across the is_holiday values.
plot_count(trip_df, 'is_holiday')

Check for weather status

In [71]:
from Weather_EDA import WeatherAPIConnector 

In [72]:
# Instantiate the weather API connector.
# NOTE(review): weather_checker is not referenced again in the visible part of
# this notebook — confirm whether the weather step is still pending or whether
# this cell can be removed.
weather_checker = WeatherAPIConnector()

Calculate Distance

In [73]:
# Extract the origin and destination values as plain Python lists, which is the
# form handed to causal_pipeline.calculate_distances in the next cell.
start_coordinates, end_coordinates = (
    trip_df[column].tolist() for column in ('Trip Origin', 'Trip Destination')
)

In [74]:
# Compute a distance for every trip from the origin/destination lists and store
# it as a new column (units presumably km, going by the narrative further down
# — TODO confirm against calculate_distances).
trip_df['distance'] = causal_pipeline.calculate_distances(start_coordinates, end_coordinates)


In [75]:
# Summary statistics of the computed distances.
trip_df['distance'].describe()

count    534369.000000
mean         12.099933
std          11.025363
min           0.000000
25%           5.560139
50%          10.510693
75%          16.993854
max         680.080847
Name: distance, dtype: float64

In [76]:
# Frequency of each exact distance value.
# NOTE(review): distance is continuous (most values occur only once in the
# recorded output), so a histogram may be more informative than value counts.
trip_df['distance'].value_counts()

2.467037     132
0.000000     121
8.040819     102
17.456076     95
22.388326     73
            ... 
8.679657       1
8.800412       1
15.795467      1
18.150252      1
22.657280      1
Name: distance, Length: 466412, dtype: int64

#### Get Speed

In [77]:
# Extract the three inputs of the speed calculation as plain Python lists.
start_date_times, end_date_times, distance_list = (
    trip_df[column].tolist()
    for column in ('Trip Start Time', 'Trip End Time', 'distance')
)


In [78]:
# Derive a speed for every trip from its start time, end time and distance, and
# store it as a new column (units depend on calculate_speeds — TODO confirm).
trip_df['speed'] = causal_pipeline.calculate_speeds(start_date_times, end_date_times, distance_list)

In [79]:
# Summary statistics of the computed speeds; the maximum in the recorded output
# is far above the 75th percentile, which motivates the outlier check below.
trip_df['speed'].describe()

count    534369.000000
mean         26.570059
std         244.737329
min           0.000000
25%           6.876641
50%          11.137486
75%          16.815429
max       66498.868759
Name: speed, dtype: float64


Handle speed and distance outliers

I will use the speed column to understand the outliers in the distance column. An implausibly high speed means the trip supposedly covered a long distance within a few minutes; for example, a motorbike cannot travel 672 km in 30 minutes. In such cases either the recorded location is wrong or the distance is an outlier.


In [80]:
# Trips with a distance above this threshold are pulled out for inspection as
# potential outliers. The unit is whatever calculate_distances returns
# (presumably km — TODO confirm).
DISTANCE_OUTLIER_THRESHOLD = 50
df_distance_outliers = trip_df[trip_df['distance'] > DISTANCE_OUTLIER_THRESHOLD]

In [81]:
# Show the identifying, timing, distance and speed columns of the flagged trips
# so implausible distance/duration combinations can be spotted.
df_distance_outliers[['Trip ID', 'Trip Start Time', 'Trip End Time', 'distance', 'speed']]

Unnamed: 0,Trip ID,Trip Start Time,Trip End Time,distance,speed
891,393572,2021-07-01 11:48:09,2021-07-01 11:52:32,52.840859,723.296929
1122,394008,2021-07-01 12:18:42,2021-07-01 14:41:39,56.343718,23.648990
2422,396558,2021-07-01 15:58:05,2021-07-01 18:08:09,56.292206,25.967701
3428,398525,2021-07-02 07:48:44,2021-07-02 08:14:49,672.026066,1545.874656
4474,400634,2021-07-02 11:44:18,2021-07-02 11:48:56,667.559260,8644.652285
...,...,...,...,...,...
519764,1578624,2021-12-22 17:04:16,2021-12-22 22:17:41,52.399714,10.031320
521502,1586390,2021-12-23 12:32:35,2021-12-23 15:27:38,50.999013,17.480382
523354,1596600,2021-12-24 09:35:47,2021-12-24 15:13:53,71.334946,12.659263
524740,1606367,2021-12-25 09:11:38,2021-12-25 12:10:19,52.364448,17.583436


In [82]:
# Restrict the driver records to the rows whose action is 'accepted'.
accepted_mask = driver_df['driver_action'].eq('accepted')
df = driver_df[accepted_mask]

In [84]:
# The ten driver_ids with the most 'accepted' rows (value_counts sorts in
# descending order, so head(10) takes the largest counts).
top10 = df['driver_id'].value_counts().head(10)
top10

243296    69
245611    63
227856    60
244107    60
245649    59
243648    58
171149    58
243500    58
245587    57
243892    57
Name: driver_id, dtype: int64

In [86]:
# Bar chart of the ten drivers with the most 'accepted' rows. Capturing the
# Axes lets the figure carry a title and axis labels, and plt.show() keeps the
# Axes repr out of the cell output.
ax = top10.plot(kind='bar')
ax.set_title("Top 10 drivers by number of 'accepted' actions")
ax.set_xlabel('driver_id')
ax.set_ylabel('count')
plt.show()

<matplotlib.axes._subplots.AxesSubplot at 0x7fea1f4e83d0>

In [87]:
# Collect the Trip IDs of trips that have no start time.
# NOTE(review): trip_df was already reduced to rows with a non-null
# 'Trip Start Time' earlier in the notebook, so when the cells run in order this
# selection is empty and vals is []. vals is also not used in the visible part
# of the notebook — confirm whether this should run against the raw data or be
# removed.
df_trip_na_trip_start = trip_df[trip_df['Trip Start Time'].isna()]
vals = df_trip_na_trip_start['Trip ID'].unique().tolist()


In [91]:
# Persist the enriched trip data (now including the is_weekend, is_holiday,
# distance and speed columns) for later use.
# NOTE(review): to_csv writes the DataFrame index as an unnamed first column by
# default; pass index=False if downstream readers should not see it.
trip_df.to_csv("data/processed_trip.csv")