In [1]:
import pandas as pd
from sklearn.model_selection import cross_val_score

In [2]:
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedStratifiedKFold
import pickle
from sklearn.model_selection import RandomizedSearchCV
import os

In [3]:
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor, XGBRFRegressor
from sklearn.ensemble import AdaBoostRegressor

In [4]:
from utils import join_data, hervy_dist, minutes_convert, train_test

In [5]:
dst_folder = os.path.join(os.getcwd(),"Processed_Data")

In [6]:
df = join_data(dst_folder)

In [7]:
df.columns

Index(['Unnamed: 0', 'Latitude', 'Longitude', 'Occupancy', 'Date', 'Hour',
       'Day', 'Latitude_1', 'Longitude_1', 'Occupancy_1', 'Date_1', 'Hour_1',
       'Day_1', 'Cab_Name'],
      dtype='object')

In [8]:
X_train_o, X_test_o, y_train_o, y_test_o, X_train, X_test, y_train, y_test, scaler = train_test(df, 0.1)

In [9]:
y_test_o.to_csv("Y_Test.csv")

In [10]:
X_test_o.to_csv("Test_Data.csv")

In [9]:
import pickle
pickle.dump(scaler, open('scaler.pkl','wb'))

In [14]:
model_dict=dict()

In [9]:
model_linear = LinearRegression()
model_linear.fit(X_train, y_train)
y_pred = model_linear.predict(X_test)
filename = 'finalized_linear.sav'
pickle.dump(model_linear, open(filename, 'wb'))

model_linear = pickle.load(open(filename, 'rb'))

model_dict["Linear_Regression"]=hervy_dist(y_pred, y_test)["Distance"].mean()

In [10]:
model_knn = KNeighborsRegressor(algorithm='ball_tree',n_neighbors=100)
scores = cross_val_score(model_knn, X_train, y_train, cv=5)
model_knn.fit(X_train, y_train)

filename = 'finalized_model_knn.sav'
pickle.dump(model_knn, open(filename, 'wb'))

model_knn = pickle.load(open(filename, 'rb'))



y_pred = model_knn.predict(X_test)
model_dict["KNN"]=hervy_dist(y_pred, y_test)["Distance"].mean()

In [11]:
model_dict

{'Linear_Regression': 2240.509840687619, 'KNN': 2630.1952848200426}

In [12]:
parameters={"splitter":["best","random"],
            "max_depth" : [1,3,],
           "min_samples_leaf":[1,2,9,10],
           "min_weight_fraction_leaf":[0.1,0.2,],
           "max_leaf_nodes":[None,10,20,30] }
model_desc_tree=GridSearchCV(DecisionTreeRegressor(),param_grid=parameters,scoring='neg_mean_absolute_error',cv=3,verbose=3)
model_desc_tree.fit(X_train, y_train)

filename = 'finalized_model_decision_tree.sav'
pickle.dump(model_desc_tree, open(filename, 'wb'))

model_desc_tree = pickle.load(open(filename, 'rb'))

y_pred = model_desc_tree.predict(X_test)

model_dict["Decision_tree"]=hervy_dist(y_pred, y_test)["Distance"].mean()

Fitting 3 folds for each of 128 candidates, totalling 384 fits
[CV 1/3] END max_depth=1, max_leaf_nodes=None, min_samples_leaf=1, min_weight_fraction_leaf=0.1, splitter=best;, score=-0.016 total time=   0.0s
[CV 2/3] END max_depth=1, max_leaf_nodes=None, min_samples_leaf=1, min_weight_fraction_leaf=0.1, splitter=best;, score=-0.016 total time=   0.0s
[CV 3/3] END max_depth=1, max_leaf_nodes=None, min_samples_leaf=1, min_weight_fraction_leaf=0.1, splitter=best;, score=-0.016 total time=   0.1s
[CV 1/3] END max_depth=1, max_leaf_nodes=None, min_samples_leaf=1, min_weight_fraction_leaf=0.1, splitter=random;, score=-0.018 total time=   0.0s
[CV 2/3] END max_depth=1, max_leaf_nodes=None, min_samples_leaf=1, min_weight_fraction_leaf=0.1, splitter=random;, score=-0.018 total time=   0.0s
[CV 3/3] END max_depth=1, max_leaf_nodes=None, min_samples_leaf=1, min_weight_fraction_leaf=0.1, splitter=random;, score=-0.016 total time=   0.0s
[CV 1/3] END max_depth=1, max_leaf_nodes=None, min_samples_le

[CV 2/3] END max_depth=1, max_leaf_nodes=10, min_samples_leaf=1, min_weight_fraction_leaf=0.2, splitter=random;, score=-0.018 total time=   0.0s
[CV 3/3] END max_depth=1, max_leaf_nodes=10, min_samples_leaf=1, min_weight_fraction_leaf=0.2, splitter=random;, score=-0.018 total time=   0.0s
[CV 1/3] END max_depth=1, max_leaf_nodes=10, min_samples_leaf=2, min_weight_fraction_leaf=0.1, splitter=best;, score=-0.016 total time=   0.0s
[CV 2/3] END max_depth=1, max_leaf_nodes=10, min_samples_leaf=2, min_weight_fraction_leaf=0.1, splitter=best;, score=-0.016 total time=   0.0s
[CV 3/3] END max_depth=1, max_leaf_nodes=10, min_samples_leaf=2, min_weight_fraction_leaf=0.1, splitter=best;, score=-0.016 total time=   0.0s
[CV 1/3] END max_depth=1, max_leaf_nodes=10, min_samples_leaf=2, min_weight_fraction_leaf=0.1, splitter=random;, score=-0.018 total time=   0.0s
[CV 2/3] END max_depth=1, max_leaf_nodes=10, min_samples_leaf=2, min_weight_fraction_leaf=0.1, splitter=random;, score=-0.018 total time

[CV 3/3] END max_depth=1, max_leaf_nodes=20, min_samples_leaf=2, min_weight_fraction_leaf=0.2, splitter=best;, score=-0.016 total time=   0.0s
[CV 1/3] END max_depth=1, max_leaf_nodes=20, min_samples_leaf=2, min_weight_fraction_leaf=0.2, splitter=random;, score=-0.018 total time=   0.0s
[CV 2/3] END max_depth=1, max_leaf_nodes=20, min_samples_leaf=2, min_weight_fraction_leaf=0.2, splitter=random;, score=-0.018 total time=   0.0s
[CV 3/3] END max_depth=1, max_leaf_nodes=20, min_samples_leaf=2, min_weight_fraction_leaf=0.2, splitter=random;, score=-0.018 total time=   0.0s
[CV 1/3] END max_depth=1, max_leaf_nodes=20, min_samples_leaf=9, min_weight_fraction_leaf=0.1, splitter=best;, score=-0.016 total time=   0.0s
[CV 2/3] END max_depth=1, max_leaf_nodes=20, min_samples_leaf=9, min_weight_fraction_leaf=0.1, splitter=best;, score=-0.016 total time=   0.0s
[CV 3/3] END max_depth=1, max_leaf_nodes=20, min_samples_leaf=9, min_weight_fraction_leaf=0.1, splitter=best;, score=-0.016 total time= 

[CV 1/3] END max_depth=1, max_leaf_nodes=30, min_samples_leaf=9, min_weight_fraction_leaf=0.2, splitter=best;, score=-0.016 total time=   0.0s
[CV 2/3] END max_depth=1, max_leaf_nodes=30, min_samples_leaf=9, min_weight_fraction_leaf=0.2, splitter=best;, score=-0.016 total time=   0.0s
[CV 3/3] END max_depth=1, max_leaf_nodes=30, min_samples_leaf=9, min_weight_fraction_leaf=0.2, splitter=best;, score=-0.016 total time=   0.0s
[CV 1/3] END max_depth=1, max_leaf_nodes=30, min_samples_leaf=9, min_weight_fraction_leaf=0.2, splitter=random;, score=-0.018 total time=   0.0s
[CV 2/3] END max_depth=1, max_leaf_nodes=30, min_samples_leaf=9, min_weight_fraction_leaf=0.2, splitter=random;, score=-0.018 total time=   0.0s
[CV 3/3] END max_depth=1, max_leaf_nodes=30, min_samples_leaf=9, min_weight_fraction_leaf=0.2, splitter=random;, score=-0.016 total time=   0.0s
[CV 1/3] END max_depth=1, max_leaf_nodes=30, min_samples_leaf=10, min_weight_fraction_leaf=0.1, splitter=best;, score=-0.016 total time=

[CV 3/3] END max_depth=3, max_leaf_nodes=None, min_samples_leaf=10, min_weight_fraction_leaf=0.1, splitter=random;, score=-0.018 total time=   0.0s
[CV 1/3] END max_depth=3, max_leaf_nodes=None, min_samples_leaf=10, min_weight_fraction_leaf=0.2, splitter=best;, score=-0.015 total time=   0.1s
[CV 2/3] END max_depth=3, max_leaf_nodes=None, min_samples_leaf=10, min_weight_fraction_leaf=0.2, splitter=best;, score=-0.015 total time=   0.1s
[CV 3/3] END max_depth=3, max_leaf_nodes=None, min_samples_leaf=10, min_weight_fraction_leaf=0.2, splitter=best;, score=-0.015 total time=   0.1s
[CV 1/3] END max_depth=3, max_leaf_nodes=None, min_samples_leaf=10, min_weight_fraction_leaf=0.2, splitter=random;, score=-0.018 total time=   0.0s
[CV 2/3] END max_depth=3, max_leaf_nodes=None, min_samples_leaf=10, min_weight_fraction_leaf=0.2, splitter=random;, score=-0.018 total time=   0.0s
[CV 3/3] END max_depth=3, max_leaf_nodes=None, min_samples_leaf=10, min_weight_fraction_leaf=0.2, splitter=random;, sc

[CV 3/3] END max_depth=3, max_leaf_nodes=20, min_samples_leaf=1, min_weight_fraction_leaf=0.1, splitter=best;, score=-0.015 total time=   0.2s
[CV 1/3] END max_depth=3, max_leaf_nodes=20, min_samples_leaf=1, min_weight_fraction_leaf=0.1, splitter=random;, score=-0.016 total time=   0.0s
[CV 2/3] END max_depth=3, max_leaf_nodes=20, min_samples_leaf=1, min_weight_fraction_leaf=0.1, splitter=random;, score=-0.018 total time=   0.0s
[CV 3/3] END max_depth=3, max_leaf_nodes=20, min_samples_leaf=1, min_weight_fraction_leaf=0.1, splitter=random;, score=-0.018 total time=   0.0s
[CV 1/3] END max_depth=3, max_leaf_nodes=20, min_samples_leaf=1, min_weight_fraction_leaf=0.2, splitter=best;, score=-0.015 total time=   0.1s
[CV 2/3] END max_depth=3, max_leaf_nodes=20, min_samples_leaf=1, min_weight_fraction_leaf=0.2, splitter=best;, score=-0.015 total time=   0.1s
[CV 3/3] END max_depth=3, max_leaf_nodes=20, min_samples_leaf=1, min_weight_fraction_leaf=0.2, splitter=best;, score=-0.015 total time= 

[CV 3/3] END max_depth=3, max_leaf_nodes=30, min_samples_leaf=1, min_weight_fraction_leaf=0.2, splitter=random;, score=-0.018 total time=   0.0s
[CV 1/3] END max_depth=3, max_leaf_nodes=30, min_samples_leaf=2, min_weight_fraction_leaf=0.1, splitter=best;, score=-0.015 total time=   0.1s
[CV 2/3] END max_depth=3, max_leaf_nodes=30, min_samples_leaf=2, min_weight_fraction_leaf=0.1, splitter=best;, score=-0.015 total time=   0.2s
[CV 3/3] END max_depth=3, max_leaf_nodes=30, min_samples_leaf=2, min_weight_fraction_leaf=0.1, splitter=best;, score=-0.015 total time=   0.2s
[CV 1/3] END max_depth=3, max_leaf_nodes=30, min_samples_leaf=2, min_weight_fraction_leaf=0.1, splitter=random;, score=-0.018 total time=   0.0s
[CV 2/3] END max_depth=3, max_leaf_nodes=30, min_samples_leaf=2, min_weight_fraction_leaf=0.1, splitter=random;, score=-0.017 total time=   0.0s
[CV 3/3] END max_depth=3, max_leaf_nodes=30, min_samples_leaf=2, min_weight_fraction_leaf=0.1, splitter=random;, score=-0.018 total time

In [13]:
import numpy as np
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 20, stop = 100, num = 2)]
# Number of features to consider at every split
max_features = ['auto']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 30, num = 2)]
# Minimum number of samples required to split a node
min_samples_split = [2, 5]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
              }


rf = RandomForestRegressor()

model_rf = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 10, cv = 2, verbose=3, random_state=42, n_jobs = 1)

model_rf.fit(X_train, y_train)


filename = 'finalized_model_random_forest.sav'
pickle.dump(model_rf, open(filename, 'wb'))


model_rf = pickle.load(open(filename, 'rb'))


y_pred = model_rf.predict(X_test)


model_dict["Random_Forest"] = hervy_dist(y_pred, y_test)["Distance"].mean()



Fitting 2 folds for each of 8 candidates, totalling 16 fits
[CV 1/2] END max_depth=10, max_features=auto, min_samples_split=2, n_estimators=20;, score=0.349 total time=   8.4s
[CV 2/2] END max_depth=10, max_features=auto, min_samples_split=2, n_estimators=20;, score=0.353 total time=   8.3s
[CV 1/2] END max_depth=10, max_features=auto, min_samples_split=2, n_estimators=100;, score=0.352 total time=  42.4s
[CV 2/2] END max_depth=10, max_features=auto, min_samples_split=2, n_estimators=100;, score=0.354 total time=  41.4s
[CV 1/2] END max_depth=10, max_features=auto, min_samples_split=5, n_estimators=20;, score=0.349 total time=   8.5s
[CV 2/2] END max_depth=10, max_features=auto, min_samples_split=5, n_estimators=20;, score=0.352 total time=   8.5s
[CV 1/2] END max_depth=10, max_features=auto, min_samples_split=5, n_estimators=100;, score=0.351 total time=  42.6s
[CV 2/2] END max_depth=10, max_features=auto, min_samples_split=5, n_estimators=100;, score=0.354 total time=  38.8s
[CV 1/2]

In [18]:
model_xgb = model = XGBRegressor(random_state=0, n_estimators=100)
model_xgb.fit(X_train, y_train)

filename = 'finalized_model_XGB.sav'
pickle.dump(model_xgb, open(filename, 'wb'))


model_xgb = pickle.load(open(filename, 'rb'))

y_pred = model_xgb.predict(X_test)



model_dict["XGB"]=hervy_dist(y_pred, y_test)["Distance"].mean()

In [22]:
model_dict

{'Linear_Regression': 2240.509840687619,
 'KNN': 2630.1952848200426,
 'Decision_tree': 2336.103194457695,
 'Random_Forest': 1897.9040233303715,
 'XGB': 1890.3004602743213,
 'XGBF': 2355.207637596242}

In [23]:
model_xgbf = model = XGBRFRegressor(random_state=0, n_estimators=100)
model_xgbf.fit(X_train, y_train)

filename = 'finalized_model_XGBF.sav'
pickle.dump(model_xgbf, open(filename, 'wb'))


model_xgbf = pickle.load(open(filename, 'rb'))

y_pred = model_xgbf.predict(X_test)



model_dict["XGBF"]=hervy_dist(y_pred, y_test)["Distance"].mean()

In [24]:
df_models =pd.DataFrame.from_dict(model_dict,orient='index').reset_index()
df_models.columns = ["Model", "Score"]
df_models.sort_values(by= "Score")

Unnamed: 0,Model,Score
4,XGB,1890.30046
3,Random_Forest,1897.904023
0,Linear_Regression,2240.509841
2,Decision_tree,2336.103194
5,XGBF,2355.207638
1,KNN,2630.195285


In [25]:
df_models["File_name"]=['finalized_linear.sav',
                        'finalized_model_knn.sav',
                        'finalized_model_decision_tree.sav',
                        'finalized_model_random_forest.sav',
                        'finalized_model_XGB.sav',
                       'finalized_model_random_forest.sav']

In [27]:
df_models.to_csv("Classical_ML_Models_test.csv")

In [28]:
df_models

Unnamed: 0,Model,Score,File_name
0,Linear_Regression,2240.509841,finalized_linear.sav
1,KNN,2630.195285,finalized_model_knn.sav
2,Decision_tree,2336.103194,finalized_model_decision_tree.sav
3,Random_Forest,1897.904023,finalized_model_random_forest.sav
4,XGB,1890.30046,finalized_model_XGB.sav
5,XGBF,2355.207638,finalized_model_random_forest.sav
