### Import packages/libraries and read data

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
from haversine import haversine
%matplotlib inline 

In [143]:
train = pd.read_csv("sendylogisticsproject/Train.csv")
test = pd.read_csv("sendylogisticsproject/Test.csv")
riders = pd.read_csv("sendylogisticsproject/Riders.csv")

In [3]:
train.shape, test.shape, riders.shape

((21201, 29), (7068, 25), (960, 5))

### Preprocessing

In [4]:
def calc_coordDist(df):
    df['Coordinate Distance (KM)'] = df.apply(lambda row: round(haversine((row['Destination Lat'], row['Destination Long']), (row['Pickup Lat'], row['Pickup Long'])),2), axis=1)
    return df

In [5]:
def get_hours(df):
    df['Pickup - Time'] = pd.to_datetime(df['Pickup - Time'])
    df['Pickup_Hour'] = df['Pickup - Time'].dt.hour
    return df

Calculate average speed in m/s and drop entries with outlying speeds

In [6]:
def calc_avgSpeed(df):
    df['Average Speed']=(df['Distance (KM)']*1000)/df['Time from Pickup to Arrival']
    df=df[(df['Average Speed']>3) & (df['Average Speed']<19)]
    return df

In [7]:
def join_riderData(df):
    combined=df.copy()
    combined=combined.set_index('Rider Id').join(riders.set_index('Rider Id'))
    return combined

In [8]:
def calc_riderSpeed(df):
    df['Rider Median Speed']=df.groupby(by='Rider Id')['Average Speed'].median()
    df=df.reset_index()
    return df

In [9]:
def calc_weeklyOrders(df):
    df['Rider Average Weekly Orders'] = round(df['No_Of_Orders']*7/df['Age'])
    return df

Replace average rating for new riders (few orders or few reviews) to balance column

In [10]:
def balance_newriderRating(df):
    newrider_reviews=df['No_of_Ratings'].quantile(0.25)
    newrider_orders=df['No_Of_Orders'].quantile(0.25)
    newrider_rating=df[(df['No_of_Ratings']<newrider_reviews)&(df['No_Of_Orders']<newrider_orders)]['Average_Rating'].quantile(0.25)
    
    newrider_rating=df[(df['Average_Rating']<newrider_rating)&(df['No_of_Ratings']<newrider_reviews)&(df['No_Of_Orders']<newrider_orders)]['Average_Rating'].quantile(0.25)
    
    df.loc[(df['Average_Rating']<newrider_rating)&(df['No_of_Ratings']<newrider_reviews)&(df['No_Of_Orders']<newrider_orders) , 'Average_Rating'] = newrider_rating 
    return df

### Model input 

In [11]:
from sklearn.linear_model import LinearRegression, ElasticNet, Lasso, Ridge 
from sklearn.ensemble import AdaBoostRegressor, BaggingRegressor
from mlxtend.regressor import StackingRegressor
from sklearn.tree import ExtraTreeRegressor, DecisionTreeRegressor

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder 
toOHE=OneHotEncoder

from sklearn import metrics
from sklearn.model_selection import cross_validate

In [12]:
TRAIN=calc_coordDist(train)
TRAIN=get_hours(TRAIN)
TRAIN=calc_avgSpeed(TRAIN)
TRAIN=join_riderData(TRAIN)
TRAIN=calc_riderSpeed(TRAIN)
TRAIN=calc_weeklyOrders(TRAIN)
TRAIN=balance_newriderRating(TRAIN)

In [13]:
def get_features(df):
    X=df.select_dtypes(np.number).fillna(0)
    X=X.drop(columns=['Precipitation in millimeters', 'No_Of_Orders', 'Age', 'No_of_Ratings'])
    X.loc[X['Temperature']==0, 'Temperature']=X['Temperature'].mean()

    num_data=X[['Pickup - Day of Month', 'Temperature', 'Pickup_Hour',  
            'Distance (KM)', 'Coordinate Distance (KM)',
            'Pickup Lat', 'Pickup Long', 'Destination Lat', 'Destination Long', 
            'Rider Median Speed', 'Rider Average Weekly Orders', 'Average_Rating']]
    
    cat_data=X['Pickup - Weekday (Mo = 1)']
    
    X=pd.concat([num_data, cat_data], axis=1)
    
    return X

### Ensemble

In [14]:
lrmodel = LinearRegression()

In [98]:
treemodel = DecisionTreeRegressor(min_impurity_split=0.15, max_depth=6, random_state=42)

In [102]:
xrtmodel = BaggingRegressor(n_estimators=100, base_estimator=ExtraTreeRegressor(min_impurity_split= 0.15, max_depth= 7), random_state=42)

In [17]:
abmodel = AdaBoostRegressor(n_estimators=100, random_state=42)

In [18]:
lassomodel = Lasso(random_state=42)

In [19]:
ridgemodel = Ridge(random_state=42)

In [20]:
enmodel = ElasticNet(random_state=42)

### Cross-validation and model selection

In [23]:
TRAIN.columns

Index(['Rider Id', 'Order No', 'User Id', 'Vehicle Type', 'Platform Type',
       'Personal or Business', 'Placement - Day of Month',
       'Placement - Weekday (Mo = 1)', 'Placement - Time',
       'Confirmation - Day of Month', 'Confirmation - Weekday (Mo = 1)',
       'Confirmation - Time', 'Arrival at Pickup - Day of Month',
       'Arrival at Pickup - Weekday (Mo = 1)', 'Arrival at Pickup - Time',
       'Pickup - Day of Month', 'Pickup - Weekday (Mo = 1)', 'Pickup - Time',
       'Arrival at Destination - Day of Month',
       'Arrival at Destination - Weekday (Mo = 1)',
       'Arrival at Destination - Time', 'Distance (KM)', 'Temperature',
       'Precipitation in millimeters', 'Pickup Lat', 'Pickup Long',
       'Destination Lat', 'Destination Long', 'Time from Pickup to Arrival',
       'Coordinate Distance (KM)', 'Pickup_Hour', 'Average Speed',
       'No_Of_Orders', 'Age', 'Average_Rating', 'No_of_Ratings',
       'Rider Median Speed', 'Rider Average Weekly Orders'],
     

In [119]:
X=get_features(TRAIN)
y=TRAIN['Time from Pickup to Arrival']
    
RESULTS = {}
scoring_functions = {"mse": "neg_mean_squared_error", "mae": "neg_mean_absolute_error", "r2": "r2"}
def evaluate_model(estimator):
    cv_results = cross_validate(estimator, X, y, scoring=scoring_functions, n_jobs=-1, cv=3, return_train_score=True)
    return pd.DataFrame(cv_results).abs().mean().to_dict()

In [132]:
def disp_evaluation(RESULTS):
    display=pd.DataFrame.from_dict(RESULTS).T
    display['test_mse']=display['test_mse']**0.5
    display['train_mse']=display['train_mse']**0.5
    display=display.rename(columns={"test_mse":"test_rmse", "train_mse":"train_rmse"})
    return display.sort_values(by="test_rmse")

In [120]:
RESULTS["linear_reg"] = evaluate_model(lrmodel)
RESULTS["tree"]=evaluate_model(treemodel)
RESULTS["bagging_random_tree"] = evaluate_model(xrtmodel)
RESULTS["adaboost"] = evaluate_model(abmodel)
RESULTS["lasso"] = evaluate_model(lassomodel)
RESULTS["ridge"] = evaluate_model(ridgemodel)
RESULTS["elastic net"] = evaluate_model(enmodel)

In [133]:
disp_evaluation(RESULTS)

Unnamed: 0,fit_time,score_time,test_rmse,train_rmse,test_mae,train_mae,test_r2,train_r2
bagging_random_tree,0.887552,0.279768,526.60408,509.88456,377.137402,367.569992,0.618298,0.642455
ridge,0.008989,0.00464,527.927816,526.792615,374.427394,373.605721,0.616501,0.618406
linear_reg,0.014296,0.00698,527.959408,526.791265,374.470937,373.627143,0.616455,0.618408
lasso,0.018285,0.006648,527.980768,527.376771,374.091992,373.61736,0.616427,0.617558
elastic net,0.014961,0.003989,531.386564,531.000957,376.946883,376.580973,0.611395,0.612274
tree,0.053854,0.005319,535.240358,509.265461,378.823796,362.979688,0.60559,0.643314
adaboost,1.152313,0.100231,658.214301,653.342897,535.847462,534.132117,0.401311,0.413939


In [139]:
estimator_stacking = StackingRegressor(
    regressors=[
        treemodel,
        ridgemodel
    ], 
    meta_regressor=lrmodel)
RESULTS["stacking"]  = evaluate_model(estimator_stacking)

In [140]:
disp_evaluation(RESULTS)

Unnamed: 0,fit_time,score_time,test_rmse,train_rmse,test_mae,train_mae,test_r2,train_r2
stacking,0.139848,0.010638,526.067179,506.486494,371.655428,359.52436,0.618991,0.647214
bagging_random_tree,0.887552,0.279768,526.60408,509.88456,377.137402,367.569992,0.618298,0.642455
ridge,0.008989,0.00464,527.927816,526.792615,374.427394,373.605721,0.616501,0.618406
linear_reg,0.014296,0.00698,527.959408,526.791265,374.470937,373.627143,0.616455,0.618408
lasso,0.018285,0.006648,527.980768,527.376771,374.091992,373.61736,0.616427,0.617558
elastic net,0.014961,0.003989,531.386564,531.000957,376.946883,376.580973,0.611395,0.612274
tree,0.053854,0.005319,535.240358,509.265461,378.823796,362.979688,0.60559,0.643314
adaboost,1.152313,0.100231,658.214301,653.342897,535.847462,534.132117,0.401311,0.413939


### Tuning

In [29]:
search_parameters_space = {'max_depth':range(1, 10), 'min_impurity_split':list(np.arange(0, 0.5, 0.01))}

In [30]:
random_search = RandomizedSearchCV(estimator=treemodel, param_distributions=search_parameters_space, scoring="neg_mean_absolute_error", n_jobs=-1, n_iter=50)

In [31]:
random_search.fit(X, y)
random_search.best_score_, random_search.best_params_



(-378.8236309857109, {'min_impurity_split': 0.15, 'max_depth': 6})

In [32]:
random_search = RandomizedSearchCV(estimator=ExtraTreeRegressor(), param_distributions=search_parameters_space, scoring="neg_mean_absolute_error", n_jobs=-1, n_iter=50)

In [33]:
random_search.fit(X, y)
random_search.best_score_, random_search.best_params_



(-387.2110754703425, {'min_impurity_split': 0.15, 'max_depth': 7})

### Training on Train.csv

In [141]:
X=get_features(TRAIN)
y=TRAIN['Time from Pickup to Arrival']

In [142]:
estimator_stacking.fit(X, y)



StackingRegressor(meta_regressor=LinearRegression(copy_X=True,
                                                  fit_intercept=True,
                                                  n_jobs=None,
                                                  normalize=False),
                  refit=True,
                  regressors=[DecisionTreeRegressor(criterion='mse',
                                                    max_depth=6,
                                                    max_features=None,
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=0.15,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                             

### Predicting on Test.csv

In [144]:
riders=riders.sort_values(by='Rider Id').reset_index()
riders=riders.drop(columns='index')

In [145]:
riders['Rider Median Speed']=TRAIN['Rider Median Speed']
riders['Rider Average Weekly Orders']=TRAIN['Rider Average Weekly Orders']

In [146]:
TEST=calc_coordDist(test)
TEST=get_hours(TEST)
TEST['index']=TEST.index
TEST=join_riderData(TEST)
TEST=balance_newriderRating(TEST)
TEST=TEST.sort_values(by='index').reset_index()

In [147]:
X_test=get_features(TEST)
results=pd.DataFrame(TEST['Order No'])
results['Time from Pickup to Arrival']=estimator_stacking.predict(X_test)
results['Time from Pickup to Arrival']=results['Time from Pickup to Arrival'].apply(lambda x: int(x))
result=results.rename(columns={'Order No':'Order_No'})

In [149]:
results.to_csv("sendylogisticsproject/Results.csv", index=False)