In [1]:
import pandas as pd
import numpy as np
from os import listdir
from math import radians, cos, sin, asin, sqrt
from datetime import datetime
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report 
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import warnings
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, KFold
import lightgbm as lgb
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.set_option('display.float_format', lambda x: '%.2f' % x)

### READ DATA

In [2]:
all_df = pd.read_pickle('all_df.pkl')

## Question2

We have data of San Francisco taxi cabs with 11 million rows. 538 taxi cab data is collected over a month period.


Question 2 : 

To build a predictor for taxi drivers, predicting the next place a passenger will hail a cab.

First I restricted my data to get out of outliers. I restricted dataset with San Francisco's Latitude and longitude information

In [3]:
all_df = all_df[(all_df['latitude']>37.1897)&(all_df['latitude']<38.2033)&(all_df['longitude']>-122.6445)&(all_df['longitude']<-121.5871)]

In [4]:
all_df.target_column.nunique()

580

In [5]:
#filter all the rows which don't have the target column (occupation status=1)
all_df['target_column'].replace('', np.nan, inplace=True)
X = all_df.dropna(subset=['target_column'])

In [6]:
X.shape

(6638519, 15)

In [7]:
X.occupation.value_counts()

0    6175163
1     463356
Name: occupation, dtype: int64

In [8]:
#Because of performance problems I selected only the first line when a taxi cab returns to 0
X_with_0_values = X[(X['occupation']==0)&(X['previous_occupation_status']==1)]
y = X_with_0_values['target_column']

Final Data Frame with shape :

In [9]:
X_with_0_values.shape

(463357, 15)

### Baseline1 for MultiClass Classification

Baseline 1 is calculated with the logic of a taxi cab can find a customer at the same location when its occupation status changed to 0.

In [10]:
baseline_df = X_with_0_values

In [11]:
#writing last_location as prediction
baseline_df['last_location'] = baseline_df.latitude.round(2).astype('str') + baseline_df.longitude.round(2).astype('str')

Accuracy refers to the proportion of the total number of predictions that were correct.

In [12]:
accuracy_score(baseline_df['target_column'],baseline_df['last_location'])

0.3236316706125083

### Baseline2  for MultiClass Classification

Baseline 2 is calculated with the logic of a taxi cab find a customer at the most frequent place.

In [13]:
baseline_df['target_column'].value_counts().max()/len(baseline_df['target_column'])

0.11561279963397553

# 1. Multiclass Classification Modelling

### LightGBM baseline model for multiclass classification

In [14]:
labelencoder = LabelEncoder()
y = labelencoder.fit_transform(y)

Don't want to use unneccessary columns, therefore I restricted with following features

In [15]:
X_with_0_values = X_with_0_values[['latitude', 'longitude','taxi_id','day_of_week', 'hour','holiday']]

In [16]:
#we need to convert categorical features to category for lightgbm algorithm
cat_col = X_with_0_values.select_dtypes('object').columns.tolist()

In [17]:
for feature in cat_col:
    X_with_0_values[feature] = pd.Series(X_with_0_values[feature], dtype="category")

In [18]:
# Reserved %80 of data to train and %20 to test
X_train, X_test, y_train, y_test = train_test_split(X_with_0_values, y, test_size=0.20, random_state=42)

In [22]:
d_train = lgb.Dataset(X_train, label=y_train)
params = {}
params['learning_rate'] = 0.01
params['boosting_type'] = 'gbdt'
params['objective'] = 'multiclass'
params['metric'] = 'multi_logloss'
params['max_depth'] = 10
params['num_class']=579
params['num_leaves'] = 50
params['bagging_fraction']= 0.3
params['num_threads']=4
params['max_bin']=50
params['random_state'] = 42


In [23]:
d_valid = lgb.Dataset(X_test, label=y_test)

evals_results = {}


In [24]:
#put early stopping if there will be no improvement for the algorithm
clf = lgb.train(params,d_train,100,valid_sets=[d_train, d_valid],valid_names=['train','valid'], evals_result=evals_results,
               early_stopping_rounds=10)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 639
[LightGBM] [Info] Number of data points in the train set: 370685, number of used features: 6
[LightGBM] [Info] Start training from score -12.823108
[LightGBM] [Info] Start training from score -12.823108
[LightGBM] [Info] Start training from score -12.823108
[LightGBM] [Info] Start training from score -12.823108
[LightGBM] [Info] Start training from score -12.823108
[LightGBM] [Info] Start training from score -12.823108
[LightGBM] [Info] Start training from score -12.823108
[LightGBM] [Info] Start training from score -12.823108
[LightGBM] [Info] Start training from score -12.823108
[LightGBM] [Info] Start training from score -12.823108
[LightGBM] [Info] Start training from score -34.538776
[LightGBM] [Info] Start training from score -12.823108
[LightGBM] [Info] Start training from score -12.823108
[LightGBM] [Info] Start training from

[LightGBM] [Info] Start training from score -6.391777
[LightGBM] [Info] Start training from score -3.722694
[LightGBM] [Info] Start training from score -3.211913
[LightGBM] [Info] Start training from score -3.576821
[LightGBM] [Info] Start training from score -3.721801
[LightGBM] [Info] Start training from score -3.458588
[LightGBM] [Info] Start training from score -6.249428
[LightGBM] [Info] Start training from score -8.380457
[LightGBM] [Info] Start training from score -9.565011
[LightGBM] [Info] Start training from score -11.436814
[LightGBM] [Info] Start training from score -12.129961
[LightGBM] [Info] Start training from score -34.538776
[LightGBM] [Info] Start training from score -12.129961
[LightGBM] [Info] Start training from score -12.823108
[LightGBM] [Info] Start training from score -11.436814
[LightGBM] [Info] Start training from score -11.724496
[LightGBM] [Info] Start training from score -12.129961
[LightGBM] [Info] Start training from score -12.823108
[LightGBM] [Info] S







[1]	train's multi_logloss: 3.66133	valid's multi_logloss: 3.73385
Training until validation scores don't improve for 10 rounds
[2]	train's multi_logloss: 3.5666	valid's multi_logloss: 3.6538
[3]	train's multi_logloss: 3.5101	valid's multi_logloss: 3.61661
[4]	train's multi_logloss: 3.45769	valid's multi_logloss: 3.57546
[5]	train's multi_logloss: 3.40697	valid's multi_logloss: 3.53355
[6]	train's multi_logloss: 3.38188	valid's multi_logloss: 3.5165
[7]	train's multi_logloss: 3.32749	valid's multi_logloss: 3.47084
[8]	train's multi_logloss: 3.30389	valid's multi_logloss: 3.4539
[9]	train's multi_logloss: 3.26242	valid's multi_logloss: 3.41859
[10]	train's multi_logloss: 3.23467	valid's multi_logloss: 3.39797
[11]	train's multi_logloss: 3.20358	valid's multi_logloss: 3.37285
[12]	train's multi_logloss: 3.1812	valid's multi_logloss: 3.35658
[13]	train's multi_logloss: 3.15043	valid's multi_logloss: 3.33306
[14]	train's multi_logloss: 3.12809	valid's multi_logloss: 3.31763
[15]	train's mul

In [25]:
y_pred_1 = clf.predict(X_test)
y_pred_1 = [np.argmax(line) for line in y_pred_1]

In [26]:
print ('Accuracy Score :',accuracy_score(y_test, y_pred_1) )
print ('Report : ')
print (classification_report(y_test, y_pred_1) )

Accuracy Score : 0.33641229281767954
Report : 
              precision    recall  f1-score   support

          10       0.00      0.00      0.00         1
          13       0.00      0.00      0.00         1
          15       0.00      0.00      0.00         1
          22       0.00      0.00      0.00         1
          25       0.00      0.00      0.00         1
          33       0.00      0.00      0.00         1
          34       0.00      0.00      0.00         1
          37       0.00      0.00      0.00         0
          38       0.00      0.00      0.00         0
          40       0.00      0.00      0.00         1
          43       0.00      0.00      0.00         0
          46       0.00      0.00      0.00         1
          48       0.00      0.00      0.00         0
          49       0.00      0.00      0.00         1
          52       0.00      0.00      0.00         2
          55       0.00      0.00      0.00         1
          56       0.00      0.00 

# 2. Multi Target Regression Model

Second approach can be regression models. For that approach we need to separate target column as 2 different columns. target_latitude and target_longitude are created.

In [27]:
X_with_0_values = X[(X['occupation']==0)&(X['previous_occupation_status']==1)]

In [28]:
X_with_0_values['target_latitude'] = [float(x[:x.find("-")]) for x in X_with_0_values.target_column]
X_with_0_values['target_longitude'] = [float(x[x.find("-"):]) for x in X_with_0_values.target_column]

In [29]:
X_with_0_values.head()

Unnamed: 0,latitude,longitude,occupation,time,taxi_id,next_latitude,next_longitude,next_occupation_status,previous_occupation_status,next_time,distance_miles,target_column,day_of_week,hour,holiday,target_latitude,target_longitude
109,37.61,-122.39,0,2008-05-17 19:12:19,new_adkavy,37.61,-122.39,0.0,1.0,1211040785.0,0.27,37.61-122.39,5,19,0,37.61,-122.39
178,37.79,-122.43,0,2008-05-17 21:05:42,new_adkavy,37.79,-122.43,0.0,1.0,1211047602.0,0.15,37.79-122.44,5,21,0,37.79,-122.44
198,37.79,-122.4,0,2008-05-17 21:22:58,new_adkavy,37.79,-122.4,0.0,1.0,1211048639.0,0.07,37.79-122.4,5,21,0,37.79,-122.4
214,37.8,-122.43,0,2008-05-17 21:48:19,new_adkavy,37.8,-122.44,0.0,1.0,1211050164.0,0.12,37.8-122.44,5,21,0,37.8,-122.44
233,37.79,-122.41,0,2008-05-17 22:04:31,new_adkavy,37.79,-122.41,0.0,1.0,1211051095.0,0.0,37.79-122.4,5,22,0,37.79,-122.4


## Baseline calculation for latitude and longitude columns

In [30]:
print('baseline_rmse_latitude :', mean_squared_error(X_with_0_values['target_latitude'],X_with_0_values['latitude'],squared=False)) 

baseline_rmse_latitude : 0.04644259288677541


In [31]:
print('baseline_rmse_longitude :', mean_squared_error(X_with_0_values['target_longitude'],X_with_0_values['longitude'],squared=False)) 

baseline_rmse_longitude : 0.028691296837154136


## Baseline Calculation for distance point of view

In [32]:
#this time this function will be used to calculate distances between actual coordinates and prediction coordinates
def distance(lat1, lat2, lon1, lon2):
     
    lon1 = radians(lon1)
    lon2 = radians(lon2)
    lat1 = radians(lat1)
    lat2 = radians(lat2)
      
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    c = 2 * asin(sqrt(a))    
    r = 3959

    return(c * r)

In [33]:
#this time this function will be used to calculate distances between actual coordinates and prediction coordinates
def miles_calculate(df1,df2):
    miles_df = pd.concat([df1[['target_latitude','target_longitude']].reset_index(),
                         df2[['latitude','longitude']].reset_index()],axis=1).drop('index',axis=1)
    miles_df['distance_miles'] = [distance(x[0],x[2],x[1],x[3]) for x in miles_df.values]
    return miles_df.distance_miles.mean()

In [34]:
print('baseline in miles : ',miles_calculate(X_with_0_values,X_with_0_values))

baseline in miles :  1.5819636484579473


## 2.1 Modelling with ANN for multi target Regression Model

In [35]:
y = X_with_0_values[['target_latitude','target_longitude']]
X_with_0_values = X_with_0_values[['latitude', 'longitude','taxi_id','day_of_week', 'hour','holiday']]

In [36]:
X_train, X_test, y_train, y_test = train_test_split(X_with_0_values, y, test_size=0.20, random_state=42)

In [37]:
labelencoder2 = LabelEncoder()
X_train['taxi_id'] = labelencoder2.fit_transform(X_train['taxi_id'])

In [38]:
sc_1=StandardScaler()
sc_1.fit(X_train)
X_train_scaled=pd.DataFrame(sc_1.fit_transform(X_train))

In [39]:
from keras.models import Sequential
from keras.layers import Dense
from numpy.random import seed
seed(42)
from tensorflow import set_random_seed
set_random_seed(2)

model = Sequential()
model.add(Dense(1000, input_dim=6, kernel_initializer='he_uniform', activation='relu'))
model.add(Dense(2))
model.compile(loss='mse', optimizer='adam')
model.fit(X_train_scaled, y_train, verbose=1, epochs=100,)

Using TensorFlow backend.




Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 

Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x7fccaf31d710>

In [40]:
X_test['taxi_id'] = labelencoder2.transform(X_test['taxi_id'])
X_test_scaled=pd.DataFrame(sc_1.transform(X_test))
predictions= pd.DataFrame(model.predict(X_test_scaled))
predictions.columns= ['latitude','longitude']

In [41]:
mean_squared_error(y_test['target_latitude'],predictions['latitude'],squared=False)

0.040113221773122494

In [42]:
mean_squared_error(y_test['target_longitude'],predictions['longitude'],squared=False)

0.057315856732215856

In [43]:
miles_calculate(y_test,predictions)

3.096218269838457

## 2.2 Modelling with KNeigborsRegressor for multi target Regression Model

In [44]:
from sklearn.neighbors import KNeighborsRegressor
model = KNeighborsRegressor()
model.fit(X_train_scaled, y_train)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                    weights='uniform')

In [45]:
predictions = pd.DataFrame(model.predict(X_test_scaled))
predictions.columns= ['latitude','longitude']


In [46]:
mean_squared_error(y_test['target_latitude'],predictions['latitude'],squared=False)

0.03746485807352835

In [47]:
mean_squared_error(y_test['target_longitude'],predictions['longitude'],squared=False)

0.01693711965180167

In [48]:
miles_calculate(y_test,predictions)

1.530106695193259

## 2.3 Modelling with Decision Tree Regressor for multi target Regression Model

In [49]:
from sklearn.tree import DecisionTreeRegressor
model = DecisionTreeRegressor(random_state=42)
model.fit(X_train_scaled, y_train)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=42, splitter='best')

In [50]:
predictions = pd.DataFrame(model.predict(X_test_scaled))
predictions.columns= ['latitude','longitude']



In [51]:
mean_squared_error(y_test['target_latitude'],predictions['latitude'],squared=False)

0.046805921258303476

In [52]:
mean_squared_error(y_test['target_longitude'],predictions['longitude'],squared=False)

0.022206855458109877

In [53]:
miles_calculate(y_test,predictions)

1.8783097213967614

# 3. Single Target Regression Model

### 3.1 Baseline LGBM Model for Single Output

In [54]:
hyperparam_dict = {
    'max_depth' : 10,
    'n_estimators' : 100,
    'num_leaves' : 150,
    'learning_rate' : 0.01,
    'objective' : 'regression',
    'n_jobs' : -1,
    'subsample' : 0.7,
    'random_state' : 42
    
}

y_train_lat = y_train['target_latitude']
y_test_lat = y_test['target_latitude']

model = lgb.LGBMRegressor(**hyperparam_dict)
model.fit(X_train, y_train_lat, eval_set=[(X_test, y_test_lat)], early_stopping_rounds=10, verbose=0)

y_preds_lat = model.predict(X_test)
print('Lightgbm Baseline RMSE for Latitude :', mean_squared_error(y_test_lat, y_preds_lat, squared=False))

Lightgbm Baseline RMSE for Latitude : 0.03398591001725549


In [55]:
hyperparam_dict = {
    'max_depth' : 10,
    'n_estimators' : 100,
    'num_leaves' : 150,
    'learning_rate' : 0.01,
    'objective' : 'regression',
    'n_jobs' : -1,
    'subsample' : 0.7,
    'random_state' : 42
    
}

y_train_lon = y_train['target_longitude']
y_test_lon = y_test['target_longitude']

model = lgb.LGBMRegressor(**hyperparam_dict)
model.fit(X_train, y_train_lon, eval_set=[(X_test, y_test_lon)], early_stopping_rounds=10, verbose=0)

y_preds_lon = model.predict(X_test)
print('Lightgbm Baseline RMSE for Longitude:', mean_squared_error(y_test_lon, y_preds_lon, squared=False))

Lightgbm Baseline RMSE for Longitude: 0.016408281448564724


### 3.2 Parameter seach for LGBM Latitude model

In [56]:
params_dict = {'max_depth' : [2,5,10],
              'n_estimators' : [100,200,500],
              'learning_rate' : [0.001, 0.005, 0.01, 0.1],
              'num_leaves' : [50,100],
              'subsample' : [0.7]
              }

model = lgb.LGBMRegressor(n_jobs=-1, objective='regression',random_state=42)

grid_search = GridSearchCV(model, param_grid=params_dict, cv=5, verbose=0, scoring="neg_root_mean_squared_error")
grid_search.fit(X_train,y_train_lat)

GridSearchCV(cv=5, error_score=nan,
             estimator=LGBMRegressor(boosting_type='gbdt', class_weight=None,
                                     colsample_bytree=1.0,
                                     importance_type='split', learning_rate=0.1,
                                     max_depth=-1, min_child_samples=20,
                                     min_child_weight=0.001, min_split_gain=0.0,
                                     n_estimators=100, n_jobs=-1, num_leaves=31,
                                     objective='regression', random_state=42,
                                     reg_alpha=0.0, reg_lambda=0.0, silent=True,
                                     subsample=1.0, subsample_for_bin=200000,
                                     subsample_freq=0),
             iid='deprecated', n_jobs=None,
             param_grid={'learning_rate': [0.001, 0.005, 0.01, 0.1],
                         'max_depth': [2, 5, 10],
                         'n_estimators': [100, 200, 500

In [57]:
grid_search.best_params_

{'learning_rate': 0.1,
 'max_depth': 5,
 'n_estimators': 200,
 'num_leaves': 50,
 'subsample': 0.7}

### Final Score on test set for latitude

In [58]:
hyperparam_dict = {
    'max_depth' : 5,
    'n_estimators' : 200,
    'num_leaves' : 50,
    'learning_rate' : 0.1,
    'objective' : 'regression',
    'n_jobs' : -1,
    'subsample' : 0.7,
    'random_state' : 42
    
}

model = lgb.LGBMRegressor(**hyperparam_dict)

model.fit(X_train, y_train_lat, eval_set=[(X_test, y_test_lat)], early_stopping_rounds=10, verbose=0)

y_preds_lat = model.predict(X_test)

print('Lightgbm Final Rmse for Latitude:', mean_squared_error(y_test_lat, y_preds_lat, squared=False))

Lightgbm Final Rmse for Latitude: 0.03220917938297847


### 3.3 Parameter seach for LGBM Longitude model

In [59]:
params_dict = {'max_depth' : [2,5,10],
              'n_estimators' : [100,200,500],
              'learning_rate' : [0.001, 0.005, 0.01, 0.1],
              'num_leaves' : [50,100],
              'subsample' : [0.7]
              }

model = lgb.LGBMRegressor(n_jobs=-1, objective='regression',random_state=42)

grid_search = GridSearchCV(model, param_grid=params_dict, cv=5, verbose=0, scoring="neg_root_mean_squared_error")
grid_search.fit(X_train,y_train_lon)

GridSearchCV(cv=5, error_score=nan,
             estimator=LGBMRegressor(boosting_type='gbdt', class_weight=None,
                                     colsample_bytree=1.0,
                                     importance_type='split', learning_rate=0.1,
                                     max_depth=-1, min_child_samples=20,
                                     min_child_weight=0.001, min_split_gain=0.0,
                                     n_estimators=100, n_jobs=-1, num_leaves=31,
                                     objective='regression', random_state=42,
                                     reg_alpha=0.0, reg_lambda=0.0, silent=True,
                                     subsample=1.0, subsample_for_bin=200000,
                                     subsample_freq=0),
             iid='deprecated', n_jobs=None,
             param_grid={'learning_rate': [0.001, 0.005, 0.01, 0.1],
                         'max_depth': [2, 5, 10],
                         'n_estimators': [100, 200, 500

In [60]:
grid_search.best_params_

{'learning_rate': 0.1,
 'max_depth': 10,
 'n_estimators': 100,
 'num_leaves': 100,
 'subsample': 0.7}

###  LIGHT GBM Final Score on test set for longitude

In [62]:
hyperparam_dict = {
    'max_depth' : 10,
    'n_estimators' : 200,
    'num_leaves' : 100,
    'learning_rate' : 0.1,
    'objective' : 'regression',
    'n_jobs' : -1,
    'subsample' : 0.7,
    'random_state' : 42
    
}

model = lgb.LGBMRegressor(**hyperparam_dict)

model.fit(X_train, y_train_lon, eval_set=[(X_test, y_test_lon)], early_stopping_rounds=10, verbose=0)

y_preds_lon = model.predict(X_test)

print('Lightgbm Final Rmse for Longitude:', mean_squared_error(y_test_lon, y_preds_lon, squared=False))

Lightgbm Final Rmse for Longitude: 0.015375707149568244


In [63]:
predictions = pd.concat([pd.DataFrame(y_preds_lat),pd.DataFrame(y_preds_lon)],axis=1)
predictions.columns= ['latitude','longitude']

In [64]:
miles_calculate(y_test,predictions)

1.3290029958933223

## Hyperparamater tuning for KNN Regressor

As I found out most premising algorithm as KNN, I want to tune and see if it beats the score of LightGBM model

In [65]:
params_dict = {'n_neighbors' : [20,50,100,150,200],
              'p' : [1,2],
              }

model = KNeighborsRegressor(weights='uniform',n_jobs=-1)

grid_search = GridSearchCV(model,param_grid=params_dict, cv=5, verbose=0, scoring="neg_root_mean_squared_error")
grid_search.fit(X_train_scaled, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=KNeighborsRegressor(algorithm='auto', leaf_size=30,
                                           metric='minkowski',
                                           metric_params=None, n_jobs=-1,
                                           n_neighbors=5, p=2,
                                           weights='uniform'),
             iid='deprecated', n_jobs=None,
             param_grid={'n_neighbors': [20, 50, 100, 150, 200], 'p': [1, 2]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='neg_root_mean_squared_error', verbose=0)

In [66]:
grid_search.best_params_

{'n_neighbors': 100, 'p': 1}

In [67]:
model = KNeighborsRegressor(n_neighbors=100,p=1)
model.fit(X_train_scaled, y_train)
predictions = pd.DataFrame(model.predict(X_test_scaled))
predictions.columns= ['latitude','longitude']

print('Rmse for Latitude:',mean_squared_error(y_test['target_latitude'],predictions['latitude'],squared=False))
print('Rmse for Longitude:',mean_squared_error(y_test['target_longitude'],predictions['longitude'],squared=False))
print('distance: ',miles_calculate(y_test,predictions))

Rmse for Latitude: 0.03297677924536423
Rmse for Longitude: 0.015548029682749025
distance:  1.3638516618731569


# 5. FINAL MODEL

In [68]:
hyperparam_dict = {
    'max_depth' : 5,
    'n_estimators' : 200,
    'num_leaves' : 50,
    'learning_rate' : 0.1,
    'objective' : 'regression',
    'n_jobs' : -1,
    'subsample' : 0.7,
    'random_state' : 42
    
}

model = lgb.LGBMRegressor(**hyperparam_dict)

model.fit(X_train, y_train_lat, eval_set=[(X_test, y_test_lat)], early_stopping_rounds=10, verbose=0)

y_preds_lat = model.predict(X_test)

print('Lightgbm Final Rmse for Latitude:', mean_squared_error(y_test_lat, y_preds_lat, squared=False))

Lightgbm Final Rmse for Latitude: 0.03220917938297847


In [69]:
hyperparam_dict = {
    'max_depth' : 10,
    'n_estimators' : 200,
    'num_leaves' : 100,
    'learning_rate' : 0.1,
    'objective' : 'regression',
    'n_jobs' : -1,
    'subsample' : 0.7,
    'random_state' : 42
    
}

model = lgb.LGBMRegressor(**hyperparam_dict)

model.fit(X_train, y_train_lon, eval_set=[(X_test, y_test_lon)], early_stopping_rounds=10, verbose=0)

y_preds_lon = model.predict(X_test)

print('Lightgbm Final Rmse for Longitude:', mean_squared_error(y_test_lon, y_preds_lon, squared=False))

Lightgbm Final Rmse for Longitude: 0.015375707149568244


In [70]:
predictions = pd.concat([pd.DataFrame(y_preds_lat),pd.DataFrame(y_preds_lon)],axis=1)
predictions.columns= ['latitude','longitude']
miles_calculate(y_test,predictions)

1.3290029958933223