In [1]:
from pandas import read_feather, read_csv
from pandas import DataFrame
import numpy as np
#from math import radians, cos, sin, asin, sqrt

In [2]:
def distance(lat1, lng1, lat2, lng2):
    #return distance as meter if you want km distance, remove "* 1000"
    radius = 6371

    dLat = (lat2-lat1) * np.pi / 180
    dLng = (lng2-lng1) * np.pi / 180

    lat1 = lat1 * np.pi / 180
    lat2 = lat2 * np.pi / 180

    val = np.sin(dLat/2) * np.sin(dLat/2) + np.sin(dLng/2)\
    * np.sin(dLng/2) * np.cos(lat1) * np.cos(lat2)    
    ang = 2 * np.arctan2(np.sqrt(val), np.sqrt(1-val))
    return radius * ang

# Test Sample

In [3]:
# No missing values
test_df = read_csv('test.csv', parse_dates=["pickup_datetime"],
                  infer_datetime_format=True)
print(test_df.shape)
test_df.head()

(9914, 7)


Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2015-01-27 13:08:24.0000002,2015-01-27 13:08:24,-73.97332,40.763805,-73.98143,40.743835,1
1,2015-01-27 13:08:24.0000003,2015-01-27 13:08:24,-73.986862,40.719383,-73.998886,40.739201,1
2,2011-10-08 11:53:44.0000002,2011-10-08 11:53:44,-73.982524,40.75126,-73.979654,40.746139,1
3,2012-12-01 21:12:12.0000002,2012-12-01 21:12:12,-73.98116,40.767807,-73.990448,40.751635,1
4,2012-12-01 21:12:12.0000003,2012-12-01 21:12:12,-73.966046,40.789775,-73.988565,40.744427,1


In [4]:
test_df.describe()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,9914.0,9914.0,9914.0,9914.0,9914.0
mean,-73.974722,40.751041,-73.973657,40.751743,1.671273
std,0.042774,0.033541,0.039072,0.035435,1.278747
min,-74.252193,40.573143,-74.263242,40.568973,1.0
25%,-73.992501,40.736125,-73.991247,40.735254,1.0
50%,-73.982326,40.753051,-73.980015,40.754065,1.0
75%,-73.968013,40.767113,-73.964059,40.768757,2.0
max,-72.986532,41.709555,-72.990963,41.696683,6.0


In [5]:
lng1min = test_df.pickup_longitude.min()
lng2min = test_df.dropoff_longitude.min()
lat1min = test_df.pickup_latitude.min()
lat2min = test_df.dropoff_latitude.min()
#
lng1max = test_df.pickup_longitude.max()
lng2max = test_df.dropoff_longitude.max()
lat1max = test_df.pickup_latitude.max()
lat2max = test_df.dropoff_latitude.max()

# Train Sample 1

In [33]:
%%time
# Read a subsample (train1) of the original dataset with 55M rows
# It was obtained ramdomly using the command line subsample task:
# >subsample --reservoir -n 2000000 train.csv -r > train1.csv
train_df = read_feather('tmp/train1.feather')
train_df.head()

CPU times: user 78.8 ms, sys: 102 ms, total: 181 ms
Wall time: 683 ms


In [7]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000000 entries, 0 to 1999999
Data columns (total 7 columns):
fare_amount          float32
pickup_datetime      datetime64[ns]
pickup_longitude     float32
pickup_latitude      float32
dropoff_longitude    float32
dropoff_latitude     float32
passenger_count      uint8
dtypes: datetime64[ns](1), float32(5), uint8(1)
memory usage: 55.3 MB


In [22]:
def transform(train_df):
    
    # Remove missing values
    train_df = train_df.dropna(how = 'any', axis = 'rows')

    # Remove absurd passenger_count
    train_df = train_df[(train_df['passenger_count'] >= 1) &
                    (train_df['passenger_count'] <= 6)]

    # Remove negative and extreme fare_amount values
    train_df = train_df[(train_df['fare_amount'] >= 2.5) & (train_df['fare_amount'] <= 200)]
    print('Maximum fare_amount: %.1f' % train_df['fare_amount'].max())

    # Remove no displacements
    train_df = train_df[(train_df['pickup_latitude'] != train_df['dropoff_latitude'])]
    train_df = train_df[(train_df['pickup_longitude'] != train_df['dropoff_longitude'])]

    # Remove absurd displacements
    train_df = train_df[(train_df['pickup_longitude'] >= lng1min) & (train_df['pickup_longitude'] <= lng1max)]
    train_df = train_df[(train_df['dropoff_longitude'] >= lng2min) & (train_df['dropoff_longitude'] <= lng2max)]
    train_df = train_df[(train_df['pickup_latitude'] >= lat1min) & (train_df['pickup_latitude'] <= lat1max)]
    train_df = train_df[(train_df['dropoff_latitude'] >= lat2min) & (train_df['dropoff_latitude'] <= lat2max)]

    # Create new features - distance
    train_df['dist'] = distance(train_df['pickup_latitude'], train_df['pickup_longitude'],
                                train_df['dropoff_latitude'], train_df['dropoff_longitude'])
    train_df = train_df[train_df['dist'] <= 100]

    # Create new features - dayofweek,hour,month,year
    train_df['dayofweek'] = train_df['pickup_datetime'].dt.dayofweek.astype('uint8')
    train_df['hour'] = train_df['pickup_datetime'].dt.hour.astype('uint8')
    train_df['month'] = train_df['pickup_datetime'].dt.month.astype('uint8')
    train_df['year'] = train_df['pickup_datetime'].dt.year.astype('uint16')

    # Create dataframes for the two periods
    P1 = train_df[(train_df['pickup_datetime'] < '2012-09-01')]
    P1 = P1.drop(['pickup_datetime'], axis=1)
    print(P1.shape)
    P2 = train_df.loc[(train_df['pickup_datetime'] >= '2012-09-01')]
    P2 = P2.drop(['pickup_datetime'], axis=1)
    print(P2.shape)

    # Save memory
    print(train_df.shape)
    print('Maximum ride distance: %.1f' % train_df['dist'].max())
    del train_df
    return P1,P2

In [34]:
P1, P2 = transform(train_df)

Maximum fare_amount: 200.0
(1099583, 11)
(827716, 11)
(1927299, 12)
Maximum ride distance: 99.9


In [10]:
P1.head()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,dist,dayofweek,hour,month,year
0,34.669998,-73.870819,40.773991,-73.999054,40.760658,1,10.900634,4,18,10,2009
2,3.3,-73.980865,40.7505,-73.981079,40.755962,1,0.607685,5,1,2,2009
3,7.3,-73.979965,40.74334,-73.988792,40.759567,1,1.951639,0,20,10,2011
8,4.1,-74.00222,40.738979,-74.003189,40.7323,1,0.747204,4,11,8,2010
10,12.1,-73.982155,40.772606,-73.968719,40.751492,1,2.606266,3,8,7,2012


In [11]:
P1.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,dist,dayofweek,hour,month,year
count,1099583.0,1099583.0,1099583.0,1099583.0,1099583.0,1099583.0,1099583.0,1099583.0,1099583.0,1099583.0,1099583.0
mean,10.24572,-73.97575,40.75126,-73.97484,40.75154,1.683421,3.310216,3.0238,13.51862,6.141853,2010.38
std,8.275364,0.03534235,0.02762408,0.03443825,0.03080789,1.265207,3.677031,1.94243,6.499488,3.34892,1.075736
min,2.5,-74.25201,40.57338,-74.26089,40.56955,1.0,0.0007677801,0.0,0.0,1.0,2009.0
25%,5.7,-73.99228,40.73673,-73.99161,40.73595,1.0,1.273078,1.0,9.0,3.0,2009.0
50%,7.7,-73.98211,40.75352,-73.98071,40.75399,1.0,2.168129,3.0,14.0,6.0,2010.0
75%,11.3,-73.96867,40.76762,-73.96612,40.7684,2.0,3.908413,5.0,19.0,9.0,2011.0
max,191.8,-72.992,41.62006,-73.0232,41.58635,6.0,99.933,6.0,23.0,12.0,2012.0


In [12]:
P2.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,dist,dayofweek,hour,month,year
count,827716.0,827716.0,827716.0,827716.0,827716.0,827716.0,827716.0,827716.0,827716.0,827716.0,827716.0
mean,12.710736,-73.975067,40.750645,-73.974289,40.751186,1.703338,3.42526,3.061223,13.514988,6.441434,2013.554437
std,10.687953,0.035355,0.027186,0.034462,0.031143,1.362176,3.717142,1.961957,6.528371,3.544804,0.903094
min,2.5,-74.240608,40.573578,-74.262383,40.569691,1.0,0.000769,0.0,0.0,1.0,2012.0
25%,6.5,-73.992294,40.736401,-73.99157,40.735371,1.0,1.288051,1.0,9.0,3.0,2013.0
50%,9.5,-73.982101,40.7533,-73.980469,40.753792,1.0,2.192942,3.0,14.0,6.0,2014.0
75%,14.5,-73.968132,40.76757,-73.964691,40.768459,2.0,3.993166,5.0,19.0,10.0,2014.0
max,200.0,-73.064941,41.383331,-72.993538,41.383331,6.0,88.192757,6.0,23.0,12.0,2015.0


# Model P1

In [40]:
from sklearn.model_selection import train_test_split

X = P1.iloc[:,1:].values
y = np.log10(P1.iloc[:,0].values)

seed = 101

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                   test_size=0.3, random_state=seed)

In [49]:
%%time
from sklearn.ensemble import ExtraTreesRegressor
#from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.model_selection import cross_val_score

modelP1 = ExtraTreesRegressor(random_state=seed, n_estimators=15,
                              max_depth = 15, n_jobs = -1, warm_start=True)

modelP1.fit(X_train,y_train)

scores = cross_val_score(modelP1, X_train, y_train, cv=3, scoring=make_scorer(mean_squared_error))
print('RMSE CV: %.3f +/- %.3f' % (np.sqrt(np.mean(scores)), np.sqrt(np.std(scores))))

y_pred=modelP1.predict(X_test)
print('RMSE before gridsearch: %.3f' % np.sqrt(mean_squared_error(y_pred,y_test)).round(3))

RMSE CV: 0.107 +/- 0.008
RMSE before gridsearch: 0.106
CPU times: user 4min 23s, sys: 3.98 s, total: 4min 27s
Wall time: 37.1 s


## GridSearch

In [45]:
%%time
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

param_dist = {"max_features": [9,10],
              "min_samples_split": [5,6,7],
              "min_samples_leaf": [1,2,3]}

modelP1 = ExtraTreesRegressor(random_state=seed, n_estimators=10,
                              max_depth = 15, n_jobs=-1)

rsearch = RandomizedSearchCV(modelP1, param_distributions=param_dist,
                             n_jobs=-1, n_iter=18, verbose=1)
rsearch.fit(X_train,y_train)

Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done  54 out of  54 | elapsed:  5.6min finished


CPU times: user 1min 5s, sys: 615 ms, total: 1min 5s
Wall time: 5min 43s


In [46]:
modelP1=rsearch.best_estimator_
print(modelP1)

ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=15,
          max_features=10, max_leaf_nodes=None, min_impurity_decrease=0.0,
          min_impurity_split=None, min_samples_leaf=2, min_samples_split=6,
          min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
          oob_score=False, random_state=101, verbose=0, warm_start=False)


In [50]:
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.model_selection import cross_val_score

modelP1 = ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=15,
          max_features=10, max_leaf_nodes=None, min_impurity_decrease=0.0,
          min_impurity_split=None, min_samples_leaf=2, min_samples_split=6,
          min_weight_fraction_leaf=0.0, n_estimators=15, n_jobs=-1,
          oob_score=False, random_state=101, verbose=0, warm_start=False)

modelP1.fit(X_train,y_train)
scores = cross_val_score(modelP1, X_train, y_train, cv=3, scoring=make_scorer(mean_squared_error))
print('RMSE CV: %.3f +/- %.3f' % (np.sqrt(np.mean(scores)), np.sqrt(np.std(scores))))
# Best model applyied on the test set
y_pred=modelP1.predict(X_test)
print('RMSE after gridsearch: %.3f' % np.sqrt(mean_squared_error(y_pred,y_test)).round(3))

RMSE CV: 0.107 +/- 0.005
RMSE after gridsearch: 0.106


# Model P2

In [15]:
from sklearn.model_selection import train_test_split

X = P2.iloc[:,1:].values
y = np.log10(P2.iloc[:,0].values)

seed = 101

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                   test_size=0.3, random_state=seed)

In [16]:
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.model_selection import cross_val_score

modelP2 = ExtraTreesRegressor(random_state=seed, n_estimators=15,
                              max_depth = 20, n_jobs = -1, warm_start=False)
modelP2.fit(X_train,y_train)

scores = cross_val_score(modelP2, X_train, y_train, cv=3, scoring=make_scorer(mean_squared_error))
print('RMSE CV: %.3f +/- %.3f' % (np.sqrt(np.mean(scores)), np.sqrt(np.std(scores))))

y_pred=modelP2.predict(X_test)
print('RMSE: %.3f' % np.sqrt(mean_squared_error(y_pred,y_test)).round(3))

RMSE CV: 0.103 +/- 0.011
RMSE: 0.103


## GridSearch

In [None]:
%%time
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

param_dist = {"max_features": [9,10],
              "min_samples_split": [5,6,7],
              "min_samples_leaf": [1,2,3]}

rsearch = RandomizedSearchCV(modelP2, param_distributions=param_dist, n_jobs=-1, n_iter=18)
rsearch.fit(X_train,y_train)

In [None]:
modelP2=rsearch.best_estimator_
print(modelP2)

In [None]:
modelP2 = ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=25,
          max_features=10, max_leaf_nodes=None, min_impurity_decrease=0.0,
          min_impurity_split=None, min_samples_leaf=1, min_samples_split=6,
          min_weight_fraction_leaf=0.0, n_estimators=25, n_jobs=-1,
          oob_score=False, random_state=101, verbose=0, warm_start=False)

modelP2.fit(X_train,y_train)
#scores = cross_val_score(modelP2, X_train, y_train, cv=3, scoring=make_scorer(mean_squared_error))
#print('RMSE CV: %.3f +/- %.3f' % (np.sqrt(np.mean(scores)), np.sqrt(np.std(scores))))
# Best model applyied on the test set
y_pred=modelP2.predict(X_test)
print(np.sqrt(mean_squared_error(y_pred,y_test)).round(3))

# Train Sample 2

In [27]:
# Read and transform
train_df = read_feather('tmp/train2.feather')
P1, P2 = transform(train_df)

Maximum fare_amount: 200.0
(1099464, 11)
(828094, 11)
(1927558, 12)
Maximum ride distance: 100.0


## P1

In [28]:
X = P1.iloc[:,1:].values
y = np.log10(P1.iloc[:,0].values)

modelP1.set_params(n_estimators=10, random_state=101, n_jobs = -1, warm_start=True)
modelP1.fit(X, y)
y_pred=modelP1.predict(X_test)
print(np.sqrt(mean_squared_error(y_pred,y_test)).round(3))

0.103

## P2

In [None]:
X = P2.iloc[:,1:].values
y = P2.iloc[:,0].values

modelP2.set_params(n_estimators=30, random_state=101, n_jobs = -1, warm_start=True)
modelP2.fit(X, y)
y_pred=modelP2.predict(X_test)
np.sqrt(mean_squared_error(y_pred,y_test)).round(3)

# Train Sample 3

In [29]:
# Read and transform
train_df = read_feather('tmp/train3.feather')
P1, P2 = transform(train_df)

Maximum fare_amount: 200.0
(1099481, 11)
(827689, 11)
(1927170, 12)
Maximum ride distance: 100.0


## P1

In [30]:
X = P1.iloc[:,1:].values
y = np.log10(P1.iloc[:,0].values)

modelP1.set_params(n_estimators=15, random_state=101, n_jobs = -1, warm_start=True)
modelP1.fit(X, y)
y_pred=modelP1.predict(X_test)
print(np.sqrt(mean_squared_error(y_pred,y_test)).round(3))

0.103


## P2

In [None]:
X = P2.iloc[:,1:].values
y = P2.iloc[:,0].values

modelP2.set_params(n_estimators=35, random_state=101, n_jobs = -1, warm_start=True)
modelP2.fit(X, y)
y_pred=modelP2.predict(X_test)
np.sqrt(mean_squared_error(y_pred,y_test)).round(3)

# Train Sample 4

In [31]:
# Read and transform
train_df = read_feather('tmp/train4.feather')
P1, P2 = transform(train_df)

Maximum fare_amount: 200.0
(1097163, 11)
(830322, 11)
(1927485, 12)
Maximum ride distance: 100.0


## P1

In [32]:
X = P1.iloc[:,1:].values
y = np.log10(P1.iloc[:,0].values)

modelP1.set_params(n_estimators=20, random_state=101, n_jobs = -1, warm_start=True)
modelP1.fit(X, y)
y_pred=modelP1.predict(X_test)
print(np.sqrt(mean_squared_error(y_pred,y_test)).round(3))

0.782


## P2

In [None]:
X = P2.iloc[:,1:].values
y = P2.iloc[:,0].values

modelP2.set_params(n_estimators=40, random_state=101, n_jobs = -1, warm_start=True)
modelP2.fit(X, y)
y_pred=modelP2.predict(X_test)
np.sqrt(mean_squared_error(y_pred,y_test)).round(3)

# Train Sample 5

In [None]:
# Read and transform
train_df = read_feather('tmp/train5.feather')
P1, P2 = transform(train_df)

## P1

In [None]:
X = P1.iloc[:,1:].values
y = P1.iloc[:,0].values

modelP1.set_params(n_estimators=45, random_state=101, n_jobs = -1, warm_start=True)
modelP1.fit(X, y)
y_pred=modelP1.predict(X_test)
np.sqrt(mean_squared_error(y_pred,y_test)).round(3)

## P2

In [None]:
X = P2.iloc[:,1:].values
y = P2.iloc[:,0].values

modelP2.set_params(n_estimators=45, random_state=101, n_jobs = -1, warm_start=True)
modelP2.fit(X, y)
y_pred=modelP2.predict(X_test)
np.sqrt(mean_squared_error(y_pred,y_test)).round(3)

## Make predictions on the test set

In [None]:
# No missing values
test_df = read_csv('test.csv', parse_dates=["pickup_datetime"],
                  infer_datetime_format=True)

test_df['dist'] = distance(test_df['pickup_latitude'], test_df['pickup_longitude'],
                           test_df['dropoff_latitude'], test_df['dropoff_longitude'])

print('Maximum ride distance: %.1f' % test_df['dist'].max())

test_df['dayofweek'] = test_df['pickup_datetime'].dt.dayofweek #.astype('uint8')
test_df['hour'] = test_df['pickup_datetime'].dt.hour #.astype('uint8')
test_df['month'] = test_df['pickup_datetime'].dt.month #.astype('uint8')
test_df['year'] = test_df['pickup_datetime'].dt.year #.astype('uint8')
#test_df = test_df.drop(['passenger_count'], axis=1)
test_df.head()

In [None]:
X_testF = test_df.iloc[:,2:].values
y_predFP1 = modelP1.predict(X_testF).round(3)

In [None]:
#X_testF = test_df.iloc[:,[2,3,4,5,6,7,8,9,10,11]].values
#y_predFP1 = modelP1.predict(X_testF).round(3)
y_predFP2 = modelP2.predict(X_testF).round(3)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.scatter(y_predFP1, y_predFP2)
plt.xlabel('y_pred P1')
plt.ylabel('y_pred P2')
plt.xlim(0,150)
plt.ylim(0,150)
plt.show()

In [None]:
#y_predFP2 = np.where(y_predFP2 > 150, 100, y_predFP2)
#y_predFP2 = np.where((y_predFP2 > 90) & (y_predFP1 < 40), 35, y_predFP2)
#y_predFP2 = np.where((y_predFP2 > 40) & (y_predFP1 < 24), 11, y_predFP2)

In [None]:
#import matplotlib.pyplot as plt
#%matplotlib inline
#plt.scatter(y_predFP1, y_predFP2)
#plt.xlabel('y_pred P1')
#plt.ylabel('y_pred P2')
#plt.xlim(0,250)
#plt.ylim(0,250)
#plt.show()

In [None]:
submission = DataFrame({'key': test_df.key, 'fare_amountP1': y_predFP1,
                        'fare_amountP2': y_predFP2},
                       columns = ['key', 'fare_amountP1', 'fare_amountP2'])

submission['fare_amount'] = np.where(test_df['pickup_datetime'] < '2012-09-01',
                                     submission['fare_amountP1'],
                                     submission['fare_amountP2'])

submission = submission.drop(['fare_amountP1','fare_amountP2'], axis=1)
submission.to_csv('submission.csv', index = False)
submission.head()

In [None]:
testP1 = test_df[(test_df['pickup_datetime'] < '2012-09-01')]
print(testP1.shape)
testP2 = test_df[(test_df['pickup_datetime'] >= '2012-09-01')]
print(testP2.shape)