### EDA

import necessary packages

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno
import sklearn
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.preprocessing import StandardScaler

ModuleNotFoundError: No module named 'missingno'

load the data

In [None]:
# Read the three CSV tables; the paths are relative to the working directory.
train, test, riders = map(
    pd.read_csv,
    ('data/Train.csv', 'data/Test.csv', 'data/Riders.csv'),
)

get a feel for the dataset, take a look at columns and data values

In [None]:
# Show the first five rows transposed, so each column is listed as a row.
train.head().T

In [None]:
# (rows, columns) of the training frame
train.shape

In [None]:
# Remove fully duplicated rows in place, then show the shape again so it can
# be compared with the shape displayed before de-duplication.
train.drop_duplicates(inplace=True)
train.shape


No duplicate rows

In [None]:
# Print the test frame's (rows, columns), then show its first five rows transposed.
print(test.shape)
test.head().T

In [None]:
# Data type of every column in the training frame
train.dtypes

In [None]:
# First five rows of the riders table, transposed
riders.head().T

In [None]:
# Distinct values present in the 'Vehicle Type' column
train['Vehicle Type'].unique()

Vehicle type is bike for all rows, drop this column. Also drop columns not in
test data

In [None]:
# Vehicle type plus the three arrival-at-destination columns
vehicle_and_arrival_cols = [
    'Vehicle Type',
    'Arrival at Destination - Day of Month',
    'Arrival at Destination - Weekday (Mo = 1)',
    'Arrival at Destination - Time',
]
train.drop(labels=vehicle_and_arrival_cols, axis='columns', inplace=True)

checking correlation between the different day of month, weekday and time 
features with target variable

In [None]:
fig, axes = plt.subplots(1, 4, figsize=(25, 5), sharey=True)
fig.suptitle('Day of month variable comparison')

# (x column, panel title) for each panel, left to right
dom_panels = [
    ('Placement - Day of Month', 'placement DOM'),
    ('Confirmation - Day of Month', 'confirmation DOM'),
    ('Arrival at Pickup - Day of Month', 'arrival at pickup DOM'),
    ('Pickup - Day of Month', 'pickup DOM'),
]

# One barplot per day-of-month column, all against the same target
for ax, (x_col, panel_title) in zip(axes, dom_panels):
    sns.barplot(ax=ax, data=train, x=x_col, y='Time from Pickup to Arrival')
    ax.set_title(panel_title)

    


In [None]:
fig, axes = plt.subplots(1, 4, figsize=(25, 5), sharey=True)
fig.suptitle('weekday variable comparison')

# (x column, panel title) for each panel, left to right
weekday_panels = [
    ('Placement - Weekday (Mo = 1)', 'placement weekday'),
    ('Confirmation - Weekday (Mo = 1)', 'confirmation weekday'),
    ('Arrival at Pickup - Weekday (Mo = 1)', 'arrival at pickup weekday'),
    ('Pickup - Weekday (Mo = 1)', 'pickup weekday'),
]

# One barplot per weekday column, all against the same target
for ax, (x_col, panel_title) in zip(axes, weekday_panels):
    sns.barplot(ax=ax, data=train, x=x_col, y='Time from Pickup to Arrival')
    ax.set_title(panel_title)

After seeing these graphs it seems that the placement, confirmation,
arrival-at-pickup and pickup versions of each variable are very highly
correlated with each other, so the extra copies would not add new information
to aid in predicting the duration of the ride. We can therefore keep just the
data for 'pickup' and drop the columns for placement, confirmation and arrival
at pickup. Order number and user ID can also be dropped since they don't
correlate with the time an order would take to deliver.

In [None]:
# Non-pickup day-of-month / weekday columns, plus the order and user identifiers
redundant_cols = [
    'Placement - Day of Month',
    'Confirmation - Day of Month',
    'Arrival at Pickup - Day of Month',
    'Placement - Weekday (Mo = 1)',
    'Confirmation - Weekday (Mo = 1)',
    'Arrival at Pickup - Weekday (Mo = 1)',
    'Order No',
    'User Id',
]
train.drop(labels=redundant_cols, axis='columns', inplace=True)

next we can check for missing values across columns and see if any columns
need to be dropped or have their values imputed

In [None]:
# missingno bar chart of the training frame (used here to inspect missing values per column)
msno.bar(train)

We can see that precipitation has very few values and should be dropped, but
temperature only has approximately 20% of its values missing, so we can impute
the missing values.

In [None]:
# Remove the precipitation column from train in place
train.drop(labels=['Precipitation in millimeters'], axis='columns', inplace=True)

def imputeMissingVals(df: pd.DataFrame, colName: str):
    """Fill the missing values of a single column in place.

    The column is handed to ``IterativeImputer`` on its own, so no other
    column of ``df`` is used when estimating the missing entries. The column
    is overwritten with the imputed values; nothing is dropped.

    Args:
        df (pd.DataFrame): dataFrame, modified in place
        colName (str): name of the column whose missing values are imputed

    Returns:
        None
    """
    imp = IterativeImputer(random_state=0)
    # fit_transform is given a one-column frame and returns a 2-D array;
    # flatten it so a plain 1-D vector is assigned back to the column.
    imputed = imp.fit_transform(df[[colName]])
    df[colName] = imputed.ravel()
    return

# Impute the missing 'Temperature' values of train in place
imputeMissingVals(train, 'Temperature')

Now we can observe that our dataset has no missing data

In [None]:
# Same missingno bar chart, re-drawn after the drop and imputation steps
msno.bar(train)

Next we convert the 'time' columns to pandas datetime values so we can work
with and manipulate the data more easily. We also round to the nearest hour:
the time is treated as a categorical variable, and rounding keeps the number
of categories reasonable.

In [None]:
def conv_to_time(time_cols, df):
    """Parse the given columns to datetimes and round them to the nearest hour.

    Args:
        time_cols: iterable of column names in ``df`` to convert
        df (pd.DataFrame): dataframe; the listed columns are overwritten
            in place

    Returns:
        None
    """
    for time_col in time_cols:
        # NOTE: for strings with no date part, pd.to_datetime fills in a date
        # itself (the current date when it falls back to dateutil parsing).
        parsed = pd.to_datetime(df[time_col])
        # Lowercase 'h' is the hour alias; uppercase 'H' is deprecated in
        # recent pandas releases.
        df[time_col] = parsed.dt.round('1h')

# Columns to parse and round to the hour
time_cols = [
    'Placement - Time',
    'Confirmation - Time',
    'Arrival at Pickup - Time',
    'Pickup - Time',
]
conv_to_time(time_cols, train)


Next we can plot the data to see any correlations

In [None]:
fig, axes = plt.subplots(1, 4, figsize=(25, 5), sharey=True)
fig.suptitle('time variable comparison')

# (x column, panel title) for each panel, left to right
time_panels = [
    ('Placement - Time', 'placement time'),
    ('Confirmation - Time', 'confirmation time'),
    ('Arrival at Pickup - Time', 'arrival at pickup time'),
    ('Pickup - Time', 'pickup time'),
]

# One barplot per time column, all against the same target
for ax, (x_col, panel_title) in zip(axes, time_panels):
    sns.barplot(ax=ax, data=train, x=x_col, y='Time from Pickup to Arrival')
    ax.set_title(panel_title)

since the target variable we're trying to predict is the time from pickup to 
arrival it makes the most sense to keep just the time data for pickup, also
since these are categorical variables, each of these columns would add around
18 features to our model, and since the training dataset isn't that big, 
overfitting is a concern.

In [None]:
# Time columns other than 'Pickup - Time'
non_pickup_time_cols = [
    'Placement - Time',
    'Confirmation - Time',
    'Arrival at Pickup - Time',
]
train.drop(labels=non_pickup_time_cols, axis='columns', inplace=True)

The features we are left with now are:

In [None]:
# Column labels currently present in train
train.columns

Except for the time column, we can see the correlations of the different 
features on the heatmap below

In [None]:
# Columns included in the correlation matrix; the last one is the target
heatmap_cols = [
    'Pickup - Day of Month',
    'Pickup - Weekday (Mo = 1)',
    'Distance (KM)',
    'Temperature',
    'Pickup Lat',
    'Pickup Long',
    'Destination Lat',
    'Destination Long',
    'Time from Pickup to Arrival',
]
feature_corr = train[heatmap_cols].corr()
sns.heatmap(feature_corr, square=True)

It seems that the only feature with a high correlation to the output is
distance. So engineering some features which would make it easier for the
model to learn correlations is important here.

The first feature we chose to add is a time of day feature which is a 
categorical variable that places an order into one of four categories:
morning, afternoon, evening, night based on the pickup time

The second feature just categorizes an order as a weekend or weekday delivery

In [None]:
def FENG_weekend(df: pd.DataFrame):
    """Flag weekend pickups.

    Adds a boolean 'weekend' column that is True wherever
    'Pickup - Weekday (Mo = 1)' is 6 or greater.

    Args:
        df (pd.DataFrame): dataframe, modified in place

    Returns:
        df: the same dataframe object that was passed in
    """
    pickup_weekday = df['Pickup - Weekday (Mo = 1)']
    is_weekend = pickup_weekday.ge(6)
    df['weekend'] = is_weekend
    return df

def FENG_TODcol(df: pd.DataFrame):
    """Add a 'TOD' (time of day) column derived from the pickup hour.

    Buckets, by hour of 'Pickup - Time':
        morning    05:00 - 11:59
        afternoon  12:00 - 17:59
        evening    18:00 - 19:59
        night      20:00 - 04:59

    The bucket is chosen from the hour of day only, so the calendar date
    carried by the timestamps does not matter. (Comparing against
    ``pd.to_datetime('05:00:00')``, which resolves to today's date, only
    works when the column's timestamps happen to carry that same date.)
    Missing timestamps are labelled 'unknown'.

    Args:
        df (pd.DataFrame): dataframe with a datetime-typed 'Pickup - Time'
            column; modified in place

    Returns:
        df: the same dataframe, with the 'TOD' column added
    """
    hour = df['Pickup - Time'].dt.hour

    # between() is inclusive on both ends, so every hour 0-23 falls in
    # exactly one bucket and there are no gaps inside an hour.
    conditions = [
        hour.between(5, 11),
        hour.between(12, 17),
        hour.between(18, 19),
        (hour >= 20) | (hour <= 4),
    ]

    values = ['morning', 'afternoon', 'evening', 'night']

    # Explicit string default: np.select's implicit default is the integer 0,
    # which does not share a dtype with the string choices.
    df['TOD'] = np.select(conditions, values, default='unknown')
    return df

# Add the engineered columns (TOD first, then weekend), then list the columns.
train = FENG_weekend(FENG_TODcol(train))

train.columns


In [None]:
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(25, 5), sharey=True)
fig.suptitle('raw vs engineered feature comparison: TOD')

# Left panel: raw pickup time; right panel: engineered TOD category
raw_ax, tod_ax = axes

sns.stripplot(x='Pickup - Time', y='Time from Pickup to Arrival', data=train, ax=raw_ax)
raw_ax.set_title('pickup time')

sns.stripplot(x='TOD', y='Time from Pickup to Arrival', data=train, ax=tod_ax)
tod_ax.set_title('time of day')

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(25, 5), sharey=True)
fig.suptitle('raw vs engineered feature comparison: weekday')

# Left panel: raw pickup weekday; right panel: engineered weekend flag.
# Each panel title names the variable on that panel's x axis.
sns.stripplot(ax=axes[0], data=train, x='Pickup - Weekday (Mo = 1)', y='Time from Pickup to Arrival')
axes[0].set_title('pickup weekday')

sns.stripplot(ax=axes[1], data=train, x='weekend', y='Time from Pickup to Arrival')
axes[1].set_title('weekend')

now we can merge in our rider data and see how it correlates with our target 
variable

In [None]:
# Join the rider table onto the orders via the shared "Rider Id" key
# (pandas' default join type), then list the combined columns.
merged_train = train.merge(riders, on="Rider Id")
merged_train.columns


In [None]:
# Rider columns plus the target, in heatmap order
rider_heatmap_cols = [
    'No_Of_Orders',
    'Age',
    'Average_Rating',
    'No_of_Ratings',
    'Time from Pickup to Arrival',
]
rider_corr = merged_train[rider_heatmap_cols].corr()
sns.heatmap(rider_corr, square=True)