In [1]:
import pandas as pd
import random

In [2]:
sample_frac = 0.01

selected_cols = 'fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count'.split(',')
dtypes = {
    'fare_amount': 'float32',
    'pickup_longitude': 'float32',
    'pickup_latitude': 'float32',
    'dropoff_longitude': 'float32',
    'passenger_count': 'float32'
}

def skip_row(row_idx):
    if row_idx == 0:
        return False
    return random.random() > sample_frac

random.seed(42)
df = pd.read_csv(r"C:\Users\naman\Downloads\train_data.csv", 
                 usecols=selected_cols, 
                 dtype=dtypes, 
                 parse_dates=['pickup_datetime'], 
                 skiprows=skip_row)

In [3]:
Train=df
Train

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,4.0,2014-12-06 20:36:22+00:00,-73.979813,40.751904,-73.979446,40.755481,1.0
1,8.0,2013-01-17 17:22:00+00:00,0.000000,0.000000,0.000000,0.000000,2.0
2,8.9,2011-06-15 18:07:00+00:00,-73.996330,40.753223,-73.978897,40.766963,3.0
3,6.9,2009-12-14 12:33:00+00:00,-73.982430,40.745747,-73.982430,40.745747,1.0
4,7.0,2013-11-06 11:26:54+00:00,-73.959061,40.781059,-73.962059,40.768604,1.0
...,...,...,...,...,...,...,...
552445,45.0,2014-02-06 23:59:45+00:00,-73.973587,40.747669,-73.999916,40.602893,1.0
552446,22.5,2015-01-05 15:29:08+00:00,-73.935928,40.799656,-73.985710,40.726952,2.0
552447,4.5,2013-02-17 22:27:00+00:00,-73.992531,40.748619,-73.998436,40.740142,1.0
552448,14.5,2013-01-27 12:41:00+00:00,-74.012115,40.706635,-73.988724,40.756217,1.0


In [4]:
Train.dtypes

fare_amount                      float32
pickup_datetime      datetime64[ns, UTC]
pickup_longitude                 float32
pickup_latitude                  float32
dropoff_longitude                float32
dropoff_latitude                 float64
passenger_count                  float32
dtype: object

In [5]:
Train.isnull().sum()

fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    0
dropoff_latitude     0
passenger_count      0
dtype: int64

In [6]:
Test=pd.read_csv(r"C:\Users\naman\Downloads\test.csv", dtype=dtypes, parse_dates=['pickup_datetime'])
Test

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2015-01-27 13:08:24.0000002,2015-01-27 13:08:24+00:00,-73.973320,40.763805,-73.981430,40.743835,1.0
1,2015-01-27 13:08:24.0000003,2015-01-27 13:08:24+00:00,-73.986862,40.719383,-73.998886,40.739201,1.0
2,2011-10-08 11:53:44.0000002,2011-10-08 11:53:44+00:00,-73.982521,40.751259,-73.979652,40.746139,1.0
3,2012-12-01 21:12:12.0000002,2012-12-01 21:12:12+00:00,-73.981163,40.767807,-73.990448,40.751635,1.0
4,2012-12-01 21:12:12.0000003,2012-12-01 21:12:12+00:00,-73.966049,40.789776,-73.988564,40.744427,1.0
...,...,...,...,...,...,...,...
9909,2015-05-10 12:37:51.0000002,2015-05-10 12:37:51+00:00,-73.968124,40.796997,-73.955643,40.780388,6.0
9910,2015-01-12 17:05:51.0000001,2015-01-12 17:05:51+00:00,-73.945511,40.803600,-73.960213,40.776371,6.0
9911,2015-04-19 20:44:15.0000001,2015-04-19 20:44:15+00:00,-73.991600,40.726608,-73.789742,40.647011,6.0
9912,2015-01-31 01:05:19.0000005,2015-01-31 01:05:19+00:00,-73.985573,40.735432,-73.939178,40.801731,6.0


In [7]:
def Date_Parts(df, col):
    df[col + '_year'] = df[col].dt.year
    df[col + '_month'] = df[col].dt.month
    df[col + '_day'] = df[col].dt.day
    df[col + '_weekday'] = df[col].dt.weekday
    df[col + '_hour'] = df[col].dt.hour

In [8]:
Date_Parts(Train,"pickup_datetime")
Date_Parts(Test,"pickup_datetime")


In [9]:
import numpy as np

def np_dist(lon1, lat1, lon2, lat2):
    
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = np.sin(dlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2.0)**2

    c = 2 * np.arcsin(np.sqrt(a))
    km = 6367 * c
    return km

In [10]:
def trip_dist(df):
    df['trip_distance'] = np_dist(df['pickup_longitude'], df['pickup_latitude'], df['dropoff_longitude'], df['dropoff_latitude'])

In [11]:
trip_dist(Train)
trip_dist(Test)

In [12]:
jfk_lonlat = -73.7781, 40.6413
lga_lonlat = -73.8740, 40.7769
ewr_lonlat = -74.1745, 40.6895
met_lonlat = -73.9632, 40.7794
wtc_lonlat = -74.0099, 40.7126
def add_landmark_dropoff_distance(df, landmark_name, landmark_lonlat):
    lon, lat = landmark_lonlat
    df[landmark_name + '_drop_distance'] = np_dist(lon, lat, df['dropoff_longitude'], df['dropoff_latitude'])


In [13]:
for a_df in [Train,  Test]:
    for name, lonlat in [('jfk', jfk_lonlat), ('lga', lga_lonlat), ('ewr', ewr_lonlat), ('met', met_lonlat), ('wtc', wtc_lonlat)]:
        add_landmark_dropoff_distance(a_df, name, lonlat)

In [14]:
def OutlierRemoval(df):
    return df[(df['fare_amount'] >= 1.) & 
              (df['fare_amount'] <= 500.) &
              (df['pickup_longitude'] >= -75) & 
              (df['pickup_longitude'] <= -72) & 
              (df['dropoff_longitude'] >= -75) & 
              (df['dropoff_longitude'] <= -72) & 
              (df['pickup_latitude'] >= 40) & 
              (df['pickup_latitude'] <= 42) & 
              (df['dropoff_latitude'] >=40) & 
              (df['dropoff_latitude'] <= 42) & 
              (df['passenger_count'] >= 1) & 
              (df['passenger_count'] <= 6)]
Train=OutlierRemoval(Train)


In [15]:
Train

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_datetime_year,pickup_datetime_month,pickup_datetime_day,pickup_datetime_weekday,pickup_datetime_hour,trip_distance,jfk_drop_distance,lga_drop_distance,ewr_drop_distance,met_drop_distance,wtc_drop_distance
0,4.0,2014-12-06 20:36:22+00:00,-73.979813,40.751904,-73.979446,40.755481,1.0,2014,12,6,5,20,0.398747,21.183686,9.188351,17.989393,2.989041,5.411371
2,8.9,2011-06-15 18:07:00+00:00,-73.996330,40.753223,-73.978897,40.766963,3.0,2011,6,15,2,18,2.117609,21.935055,8.896755,18.585592,1.911950,6.580879
3,6.9,2009-12-14 12:33:00+00:00,-73.982430,40.745747,-73.982430,40.745747,1.0,2009,12,14,0,12,0.000088,20.763112,9.761111,17.342683,4.074938,4.349602
4,7.0,2013-11-06 11:26:54+00:00,-73.959061,40.781059,-73.962059,40.768604,1.0,2013,11,6,2,11,1.406799,20.982755,7.467520,19.933104,1.203565,7.413407
5,15.5,2014-12-08 01:00:16+00:00,-73.957672,40.717888,-73.942581,40.686397,1.0,2014,12,8,0,1,3.722912,14.742722,11.597390,19.545065,10.479769,6.375179
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
552445,45.0,2014-02-06 23:59:45+00:00,-73.973587,40.747669,-73.999916,40.602893,1.0,2014,2,6,3,23,16.240561,19.189930,22.056014,17.587218,19.856826,12.220225
552446,22.5,2015-01-05 15:29:08+00:00,-73.935928,40.799656,-73.985710,40.726952,2.0,2015,1,5,0,15,9.101115,19.916294,10.919756,16.438819,6.128600,2.587407
552447,4.5,2013-02-17 22:27:00+00:00,-73.992531,40.748619,-73.998436,40.740142,1.0,2013,2,17,6,22,1.064933,21.571045,11.242337,15.861784,5.275134,3.209331
552448,14.5,2013-01-27 12:41:00+00:00,-74.012115,40.706635,-73.988724,40.756217,1.0,2013,1,27,6,12,5.851563,21.862064,9.925104,17.313776,3.354145,5.164619


In [16]:
Test

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_datetime_year,pickup_datetime_month,pickup_datetime_day,pickup_datetime_weekday,pickup_datetime_hour,trip_distance,jfk_drop_distance,lga_drop_distance,ewr_drop_distance,met_drop_distance,wtc_drop_distance
0,2015-01-27 13:08:24.0000002,2015-01-27 13:08:24+00:00,-73.973320,40.763805,-73.981430,40.743835,1.0,2015,1,27,1,13,2.321720,20.574949,9.760151,17.346891,4.239318,4.218800
1,2015-01-27 13:08:24.0000003,2015-01-27 13:08:24+00:00,-73.986862,40.719383,-73.998886,40.739201,1.0,2015,1,27,1,13,2.423889,21.550981,11.315998,15.789650,5.382910,3.098180
2,2011-10-08 11:53:44.0000002,2011-10-08 11:53:44+00:00,-73.982521,40.751259,-73.979652,40.746139,1.0,2011,10,8,5,11,0.618009,20.594007,9.526878,17.576952,3.946866,4.514444
3,2012-12-01 21:12:12.0000002,2012-12-01 21:12:12+00:00,-73.981163,40.767807,-73.990448,40.751635,1.0,2012,12,1,5,21,1.959910,21.689160,10.195201,16.969532,3.844222,4.636742
4,2012-12-01 21:12:12.0000003,2012-12-01 21:12:12+00:00,-73.966049,40.789776,-73.988564,40.744427,1.0,2012,12,1,5,21,5.383931,21.113870,10.295947,16.808312,4.434000,3.967057
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9909,2015-05-10 12:37:51.0000002,2015-05-10 12:37:51+00:00,-73.968124,40.796997,-73.955643,40.780388,6.0,2015,5,10,6,12,2.123367,21.507015,6.880891,21.014938,0.645636,8.809757
9910,2015-01-12 17:05:51.0000001,2015-01-12 17:05:51+00:00,-73.945511,40.803600,-73.960213,40.776371,6.0,2015,1,12,0,17,3.268916,21.462229,7.254930,20.464521,0.420324,8.229248
9911,2015-04-19 20:44:15.0000001,2015-04-19 20:44:15+00:00,-73.991600,40.726608,-73.789742,40.647011,6.0,2015,4,19,6,20,19.171439,1.169152,16.084441,32.772347,20.734211,19.933692
9912,2015-01-31 01:05:19.0000005,2015-01-31 01:05:19+00:00,-73.985573,40.735432,-73.939178,40.801731,6.0,2015,1,31,5,1,8.338278,22.402436,6.138515,23.410822,3.200773,11.556237


In [17]:
x_Train=Train.drop(["fare_amount","pickup_datetime"],axis=1)
y_Train=Train["fare_amount"]
x_Train.shape,y_Train.shape

((538840, 16), (538840,))

In [18]:
df_Test = Test.drop(['pickup_datetime','key'] , axis=1)

# RandomForestRegressor

In [19]:
from sklearn.ensemble import RandomForestRegressor
model2 = RandomForestRegressor(max_depth=10, n_jobs=-1, random_state=42, n_estimators=50)
model2.fit(x_Train,y_Train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=10, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=50, n_jobs=-1, oob_score=False,
                      random_state=42, verbose=0, warm_start=False)

In [21]:
from sklearn.metrics import mean_squared_error
Train_predictions = model2.predict(x_Train)
Train_rmse = mean_squared_error(Train_predictions,y_Train, squared=False)

RMSE using RandomForestRegressor

In [22]:
Train_rmse

3.667831633509176

# XGBRegressor

In [23]:
from xgboost import XGBRegressor
model3 = XGBRegressor(random_state=42, n_jobs=-1, objective='reg:squarederror')
model3.fit(x_Train,y_Train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=-1, nthread=None, objective='reg:squarederror',
             random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=None, silent=None, subsample=1, verbosity=1)

In [24]:
Train_predictions = model3.predict(x_Train)
Train_rmse = mean_squared_error(Train_predictions,y_Train, squared=False)

RMSE using XGBRegressor

In [25]:
Train_rmse

3.9425333

In [26]:
prediction_xg = model3.predict(df_Test)
prediction_rf = model2.predict(df_Test)

submission_xg =pd.DataFrame( { 'key' : list(Test['key']), 'fare_amount':list(prediction_xg) })
submission_rf =pd.DataFrame( { 'key' : list(Test['key']), 'fare_amount':list(prediction_rf) })

submission_xg.to_csvr"C:\Users\naman\OneDrive\Desktop\varshu\IIT Hyderabad\SEM 1\FOML\ASSIGNMENT4\submission_xg.csv")
submission_rf.to_csv(r"C:\Users\naman\OneDrive\Desktop\varshu\IIT Hyderabad\SEM 1\FOML\ASSIGNMENT4\submission_rf.csv")

In [27]:
def remove_col(f):
    df = pd.read_csv(f)
    # If you know the name of the column skip this
    first_column = df.columns[0]
    # Delete first
    df = df.drop([first_column], axis=1)
    df.to_csv(f, index=False)

In [28]:
remove_col(r"C:\Users\naman\OneDrive\Desktop\varshu\IIT Hyderabad\SEM 1\FOML\ASSIGNMENT4\submission_xg.csv")
remove_col(r"C:\Users\naman\OneDrive\Desktop\varshu\IIT Hyderabad\SEM 1\FOML\ASSIGNMENT4\submission_rf.csv")

# Ridge Regressor

In [34]:
from sklearn.linear_model import Ridge
model1 = Ridge(random_state=42)
model1.fit(x_Train,y_Train)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=42, solver='auto', tol=0.001)

In [35]:
Train_predictions = model1.predict(x_Train)
Train_rmse = mean_squared_error(Train_predictions,y_Train, squared=False)

RMSE using RidgeRegressor

In [36]:
Train_rmse

5.08322990748438

In [37]:
prediction_rr = model1.predict(df_Test)
submission_rr =pd.DataFrame( { 'key' : list(Test['key']), 'fare_amount':list(prediction_rr) })
submission_rr.to_csv(r"C:\Users\naman\OneDrive\Desktop\varshu\IIT Hyderabad\SEM 1\FOML\ASSIGNMENT4\submission_rr.csv")
remove_col(r"C:\Users\naman\OneDrive\Desktop\varshu\IIT Hyderabad\SEM 1\FOML\ASSIGNMENT4\submission_rr.csv")

# Top 2 models are RandomForestRegressor & XGBRegressor and we obtain RMSE<4