# [New York City Taxi Fare Prediction](https://www.kaggle.com/c/new-york-city-taxi-fare-prediction)

## Import packages

In [7]:
import numpy as np 
import pandas as pd

## Import data

In [8]:
%%time
train = pd.read_csv("data/train5.csv", nrows=1_000)
test = pd.read_csv("data/test5.csv")

CPU times: user 78.6 ms, sys: 78.1 ms, total: 157 ms
Wall time: 224 ms


In [9]:
train.head()

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,abs_long_diff,abs_lat_diff,...,is_year_start,is_year_end,is_leap_year,is_holiday,pickup_place,pickup_admin1,pickup_admin2,dropoff_place,dropoff_admin1,dropoff_admin2
0,2009-06-15 17:26:21.000000100,4.5,2009-06-15 17:26:21+00:00,-73.844311,40.721319,-73.84161,40.712278,1,0.002701,0.009041,...,0,0,0,0,Borough of Queens,New York,Queens County,Borough of Queens,New York,Queens County
1,2010-01-05 16:52:16.000000200,16.9,2010-01-05 16:52:16+00:00,-74.016048,40.711303,-73.979268,40.782004,1,0.03678,0.070701,...,0,0,0,0,New York City,New York,,Manhattan,New York,New York County
2,2011-08-18 00:35:00.000000490,5.7,2011-08-18 00:35:00+00:00,-73.982738,40.76127,-73.991242,40.750562,2,0.008504,0.010708,...,0,0,0,0,Manhattan,New York,New York County,Weehawken,New Jersey,Hudson County
3,2012-04-21 04:30:42.000000100,7.7,2012-04-21 04:30:42+00:00,-73.98713,40.733143,-73.991567,40.758092,1,0.004437,0.024949,...,0,0,1,0,New York City,New York,,Weehawken,New Jersey,Hudson County
4,2010-03-09 07:51:00.000000135,5.3,2010-03-09 07:51:00+00:00,-73.968095,40.768008,-73.956655,40.783762,1,0.01144,0.015754,...,0,0,0,0,Manhattan,New York,New York County,Manhattan,New York,New York County


## Convert data type

In [10]:
train['fare_amount'] = train['fare_amount'].astype('float32')

In [11]:
%%time
def convert_dtype(df):
    df['pickup_datetime'] = df['pickup_datetime'].astype('str')
    
    float32_list = ['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'abs_long_diff', 'abs_lat_diff', 'manhattan_dist', 'sqrt_long_diff', 'sqrt_lat_diff', 'euclidean_dist', 'geodesic_km_dist', 'great_circle_km_dist', 'haversine_dist', 'center_lat', 'center_long']
    for col in float32_list:
        df[col] = df[col].astype('float32')
    
    uint16_list = ['year', 'day_of_year']
    for col in uint16_list:
        df[col] = df[col].astype('uint16')
    
    uint8_list = ['passenger_count', 'month', 'week', 'day', 'hour', 'minute', 'day_of_week', 'quarter', 'part_of_day', 'is_weekday', 'is_weekend', 'is_month_start', 'is_month_end', 'is_quarter_start', 'is_quarter_end', 'is_year_start', 'is_year_end', 'is_leap_year', 'is_holiday']
    for col in uint8_list:
        df[col] = df[col].astype('uint8')
    
    category_list = ['pickup_place', 'pickup_admin1', 'pickup_admin2', 'dropoff_place', 'dropoff_admin1', 'dropoff_admin2']
    for col in category_list:
        df[col] = df[col].astype('category')
    
    return df.dtypes

convert_dtype(train)
convert_dtype(test)

CPU times: user 58.3 ms, sys: 6.61 ms, total: 64.9 ms
Wall time: 64.3 ms


key                       object
pickup_datetime           object
pickup_longitude         float32
pickup_latitude          float32
dropoff_longitude        float32
dropoff_latitude         float32
passenger_count            uint8
abs_long_diff            float32
abs_lat_diff             float32
manhattan_dist           float32
sqrt_long_diff           float32
sqrt_lat_diff            float32
euclidean_dist           float32
geodesic_km_dist         float32
great_circle_km_dist     float32
haversine_dist           float32
center_lat               float32
center_long              float32
year                      uint16
month                      uint8
week                       uint8
day                        uint8
hour                       uint8
minute                     uint8
day_of_week                uint8
day_of_year               uint16
quarter                    uint8
part_of_day                uint8
is_weekday                 uint8
is_weekend                 uint8
is_month_s

In [12]:
data = [train, test]
for df in data:
    print(df.shape)

(1000, 45)
(9914, 44)


In [13]:
train.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,abs_long_diff,abs_lat_diff,manhattan_dist,sqrt_long_diff,...,is_weekday,is_weekend,is_month_start,is_month_end,is_quarter_start,is_quarter_end,is_year_start,is_year_end,is_leap_year,is_holiday
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,...,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,11.469941,-72.126953,39.732525,-72.198685,39.773224,1.592,0.096561,0.061415,0.157977,5.47846,...,0.725,0.275,0.028,0.046,0.01,0.012,0.002,0.001,0.161,0.029
std,9.499794,11.555398,6.365537,11.327382,6.240139,1.222715,2.339792,1.287459,3.627159,173.191711,...,0.446738,0.446738,0.165055,0.20959,0.099549,0.10894,0.044699,0.031623,0.367715,0.16789
min,2.5,-74.438232,0.0,-74.185997,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,6.0,-73.993042,40.73531,-73.991827,40.732977,1.0,0.005713,0.006682,0.01552,3.3e-05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,8.5,-73.982136,40.752874,-73.980247,40.753925,1.0,0.012533,0.013136,0.026915,0.000157,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,13.0,-73.966591,40.76669,-73.962532,40.767706,2.0,0.025455,0.026164,0.051129,0.000648,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,66.300003,0.0,40.993259,0.0,40.881878,6.0,74.005432,40.726685,114.732117,5476.804199,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# Categorical Data

https://www.kaggle.com/c/home-credit-default-risk/discussion/58950
<br>https://www.kaggle.com/mlisovyi/beware-of-categorical-features-in-lgbm
<br>https://lightgbm.readthedocs.io/en/latest/Features.html#optimal-split-for-categorical-features

In [15]:
traincv = train.iloc[:100, :]
traincv.head()

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,abs_long_diff,abs_lat_diff,...,is_year_start,is_year_end,is_leap_year,is_holiday,pickup_place,pickup_admin1,pickup_admin2,dropoff_place,dropoff_admin1,dropoff_admin2
0,2009-06-15 17:26:21.000000100,4.5,2009-06-15 17:26:21+00:00,-73.844315,40.721317,-73.841614,40.712276,1,0.002701,0.009041,...,0,0,0,0,Borough of Queens,New York,Queens County,Borough of Queens,New York,Queens County
1,2010-01-05 16:52:16.000000200,16.9,2010-01-05 16:52:16+00:00,-74.016045,40.711304,-73.979271,40.782005,1,0.03678,0.070701,...,0,0,0,0,New York City,New York,,Manhattan,New York,New York County
2,2011-08-18 00:35:00.000000490,5.7,2011-08-18 00:35:00+00:00,-73.982735,40.761269,-73.991241,40.750561,2,0.008504,0.010708,...,0,0,0,0,Manhattan,New York,New York County,Weehawken,New Jersey,Hudson County
3,2012-04-21 04:30:42.000000100,7.7,2012-04-21 04:30:42+00:00,-73.987129,40.733143,-73.99157,40.758091,1,0.004437,0.024949,...,0,0,1,0,New York City,New York,,Weehawken,New Jersey,Hudson County
4,2010-03-09 07:51:00.000000135,5.3,2010-03-09 07:51:00+00:00,-73.968094,40.768009,-73.956657,40.783764,1,0.01144,0.015754,...,0,0,0,0,Manhattan,New York,New York County,Manhattan,New York,New York County


In [16]:
features = ['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
       'passenger_count', 'abs_long_diff', 'abs_lat_diff', 'manhattan_dist',
       'sqrt_long_diff', 'sqrt_lat_diff', 'euclidean_dist', 'geodesic_km_dist',
       'great_circle_km_dist', 'haversine_dist', 'center_lat', 'center_long',
       'year', 'month', 'week', 'day', 'hour', 'minute', 'day_of_week',
       'day_of_year', 'quarter', 'part_of_day', 'is_weekday', 'is_weekend',
       'is_month_start', 'is_month_end', 'is_quarter_start', 'is_quarter_end',
       'is_year_start', 'is_year_end', 'is_leap_year', 'is_holiday',
       'pickup_place', 'pickup_admin1', 'pickup_admin2', 'dropoff_place',
       'dropoff_admin1', 'dropoff_admin2']
target = ['fare_amount']

In [17]:
X_features = train.iloc[:, :][features]
X_target = train.iloc[:, :][target]

X_featurescv = traincv.iloc[:, :][features]
X_targetcv = traincv.iloc[:, :][target]

In [18]:
y_features = test.iloc[:, :][features]

In [19]:
X_features.head()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,abs_long_diff,abs_lat_diff,manhattan_dist,sqrt_long_diff,sqrt_lat_diff,...,is_year_start,is_year_end,is_leap_year,is_holiday,pickup_place,pickup_admin1,pickup_admin2,dropoff_place,dropoff_admin1,dropoff_admin2
0,-73.844315,40.721317,-73.841614,40.712276,1,0.002701,0.009041,0.011742,7e-06,8.2e-05,...,0,0,0,0,Borough of Queens,New York,Queens County,Borough of Queens,New York,Queens County
1,-74.016045,40.711304,-73.979271,40.782005,1,0.03678,0.070701,0.107481,0.001353,0.004999,...,0,0,0,0,New York City,New York,,Manhattan,New York,New York County
2,-73.982735,40.761269,-73.991241,40.750561,2,0.008504,0.010708,0.019212,7.2e-05,0.000115,...,0,0,0,0,Manhattan,New York,New York County,Weehawken,New Jersey,Hudson County
3,-73.987129,40.733143,-73.99157,40.758091,1,0.004437,0.024949,0.029386,2e-05,0.000622,...,0,0,1,0,New York City,New York,,Weehawken,New Jersey,Hudson County
4,-73.968094,40.768009,-73.956657,40.783764,1,0.01144,0.015754,0.027194,0.000131,0.000248,...,0,0,0,0,Manhattan,New York,New York County,Manhattan,New York,New York County


In [20]:
X_featurescv.head()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,abs_long_diff,abs_lat_diff,manhattan_dist,sqrt_long_diff,sqrt_lat_diff,...,is_year_start,is_year_end,is_leap_year,is_holiday,pickup_place,pickup_admin1,pickup_admin2,dropoff_place,dropoff_admin1,dropoff_admin2
0,-73.844315,40.721317,-73.841614,40.712276,1,0.002701,0.009041,0.011742,7e-06,8.2e-05,...,0,0,0,0,Borough of Queens,New York,Queens County,Borough of Queens,New York,Queens County
1,-74.016045,40.711304,-73.979271,40.782005,1,0.03678,0.070701,0.107481,0.001353,0.004999,...,0,0,0,0,New York City,New York,,Manhattan,New York,New York County
2,-73.982735,40.761269,-73.991241,40.750561,2,0.008504,0.010708,0.019212,7.2e-05,0.000115,...,0,0,0,0,Manhattan,New York,New York County,Weehawken,New Jersey,Hudson County
3,-73.987129,40.733143,-73.99157,40.758091,1,0.004437,0.024949,0.029386,2e-05,0.000622,...,0,0,1,0,New York City,New York,,Weehawken,New Jersey,Hudson County
4,-73.968094,40.768009,-73.956657,40.783764,1,0.01144,0.015754,0.027194,0.000131,0.000248,...,0,0,0,0,Manhattan,New York,New York County,Manhattan,New York,New York County


In [21]:
y_features.head()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,abs_long_diff,abs_lat_diff,manhattan_dist,sqrt_long_diff,sqrt_lat_diff,...,is_year_start,is_year_end,is_leap_year,is_holiday,pickup_place,pickup_admin1,pickup_admin2,dropoff_place,dropoff_admin1,dropoff_admin2
0,-73.97332,40.763805,-73.98143,40.743835,1,0.00811,0.01997,0.02808,6.6e-05,0.000399,...,0,0,0,0,Manhattan,New York,New York County,Long Island City,New York,Queens County
1,-73.986862,40.719383,-73.998886,40.739201,1,0.012024,0.019817,0.031841,0.000145,0.000393,...,0,0,0,0,New York City,New York,,New York City,New York,
2,-73.982521,40.751259,-73.979652,40.74614,1,0.00287,0.005121,0.007991,8e-06,2.6e-05,...,0,0,0,0,Long Island City,New York,Queens County,Long Island City,New York,Queens County
3,-73.981163,40.767807,-73.990448,40.751637,1,0.009288,0.016172,0.02546,8.6e-05,0.000262,...,0,0,1,0,Manhattan,New York,New York County,Weehawken,New Jersey,Hudson County
4,-73.966049,40.789776,-73.988564,40.744427,1,0.022519,0.045348,0.067867,0.000507,0.002056,...,0,0,1,0,Manhattan,New York,New York County,New York City,New York,


# One-hot encoding

In [None]:
#X_features_encoded = pd.get_dummies(X_features, columns=['pickup_place', 'dropoff_place'])
#y_features_encoded = pd.get_dummies(y_features, columns=['pickup_place', 'dropoff_place'])

In [None]:
#X_set_features = set(X_features_encoded.columns)
#y_set_features = set(y_features_encoded.columns)
#print(len(X_set_features))
#print(len(y_set_features))
#X_set_features == y_set_features

In [None]:
#y_reindexed = y_features_encoded.reindex(columns = X_features_encoded.columns, fill_value = 0)

In [None]:
#X_features_encoded.columns == y_reindexed.columns

In [None]:
#X_features_encoded.head()

In [None]:
#y_reindexed.head()

# lgb Model
## GridSearchCV

https://www.kaggle.com/garethjns/microsoft-lightgbm-with-parameter-tuning-0-823
<br>https://lightgbm.readthedocs.io/en/latest/Features.html#optimal-split-for-categorical-features

In [22]:
from sklearn.model_selection import train_test_split

train_X, test_X, train_y, test_y = train_test_split(X_features, X_target, test_size=0.3, random_state=0)
train_Xcv, test_Xcv, train_ycv, test_ycv = train_test_split(X_featurescv, X_targetcv, test_size=0.3, random_state=0)

In [24]:
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV

In [25]:
#dataset
lgb_train = lgb.Dataset(train_X, train_y,
#                        categorical_feature=features, 
                        silent=True, 
                        free_raw_data=False)
lgb_eval = lgb.Dataset(test_X, test_y, reference=lgb_train)

#dataset
lgb_traincv = lgb.Dataset(train_Xcv, train_ycv,
#                        categorical_feature=features, 
                        silent=True, 
                        free_raw_data=False)
lgb_evalcv = lgb.Dataset(test_Xcv, test_ycv, reference=lgb_train)

In [26]:
def lgb_gridmodel(train_X, test_X, train_y, test_y):
    import lightgbm as lgb
    from sklearn.model_selection import GridSearchCV
    #dataset
    lgb_train = lgb.Dataset(train_X, train_y)
    lgb_eval = lgb.Dataset(test_X, test_y, reference=lgb_train)
    #基本配備，初始參數boosting_type, objective, metric
    global params
    params = {
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': 'rmse',
    }
    #cv
    min_merror = float('Inf')
    global best_params
    best_params = {}
    #num_leaves, max_depth
    for num_leaves in range(20,200,5):
        for max_depth in range(3,8,1):
            params['num_leaves'] = num_leaves
            params['max_depth'] = max_depth
            cv_results = lgb.cv(
                params,
                lgb_train,
                seed=2019,
                stratified=False, 
                nfold=5,
                metrics=['rmse'],
                early_stopping_rounds=10,
                verbose_eval=50
            )
        
            mean_merror = pd.Series(cv_results['rmse-mean']).min()
            boost_rounds = pd.Series(cv_results['rmse-mean']).argmin()
            
            if mean_merror < min_merror:
                min_merror = mean_merror
                best_params['num_leaves'] = num_leaves
                best_params['max_depth'] = max_depth
    params['num_leaves'] = best_params['num_leaves']
    params['max_depth'] = best_params['max_depth']
    
    #max_bin, min_child_samples, min_child_weight(沒調)
    for max_bin in range(5,255,5):
        for min_data_in_leaf in range(10,200,5):
            for min_child_weight in [0.001, 0.002, 0.003, 0.004, 0.005]:
                params['max_bin'] = max_bin
                params['min_data_in_leaf'] = min_data_in_leaf
                params['min_child_weight'] = min_child_weight
                cv_results = lgb.cv(
                    params,
                    lgb_train,
                    seed=42,
                    stratified=False,
                    nfold=5,
                    early_stopping_rounds=3,
                    verbose_eval=50
                )
                mean_merror = pd.Series(cv_results['rmse-mean']).min()
                boost_rounds = pd.Series(cv_results['rmse-mean']).argmin()

                if mean_merror < min_merror:
                    min_merror = mean_merror
                    best_params['max_bin'] = max_bin
                    best_params['min_data_in_leaf'] = min_data_in_leaf
                    best_params['min_child_weight'] = min_child_weight
    params['max_bin'] = best_params['max_bin']
    params['min_data_in_leaf'] = best_params['min_data_in_leaf']
    params['min_child_weight'] = best_params['min_child_weight']
    
    #feature_fraction, bagging_fraction, bagging_freq
    for feature_fraction in [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]:
        for bagging_fraction in [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]:
            for bagging_freq in range(0,50,5):
                params['feature_fraction'] = feature_fraction
                params['bagging_fraction'] = bagging_fraction
                params['bagging_freq'] = bagging_freq
                cv_results = lgb.cv(
                    params,
                    lgb_train,
                    seed=42,
                    stratified=False,
                    nfold=5,
                    metrics=['rmse'],
                    early_stopping_rounds=3,
                    verbose_eval=50
                )
                mean_merror = pd.Series(cv_results['rmse-mean']).min()
                boost_rounds = pd.Series(cv_results['rmse-mean']).argmin()

                if mean_merror < min_merror:
                    min_merror = mean_merror
                    best_params['feature_fraction'] = feature_fraction
                    best_params['bagging_fraction'] = bagging_fraction
                    best_params['bagging_freq'] = bagging_freq
    params['feature_fraction'] = best_params['feature_fraction']
    params['bagging_fraction'] = best_params['bagging_fraction']
    params['bagging_freq'] = best_params['bagging_freq']
    
    #lambda_l1, lambda_l2, min_split_gain
    for lambda_l1 in [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]:
        for lambda_l2 in [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]:
            for min_split_gain in [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]:
                params['lambda_l1'] = lambda_l1
                params['lambda_l2'] = lambda_l2
                params['min_split_gain'] = min_split_gain
                cv_results = lgb.cv(
                    params,
                    lgb_train,
                    seed=42,
                    stratified=False,
                    nfold=5,
                    metrics=['rmse'],
                    early_stopping_rounds=3,
                    verbose_eval=50
                )
                mean_merror = pd.Series(cv_results['rmse-mean']).min()
                boost_rounds = pd.Series(cv_results['rmse-mean']).argmin()
            
                if mean_merror < min_merror:
                    min_merror = mean_merror
                    best_params['lambda_l1'] = lambda_l1
                    best_params['lambda_l2'] = lambda_l2
                    best_params['min_split_gain'] = min_split_gain
    params['lambda_l1'] = best_params['lambda_l1']
    params['lambda_l2'] = best_params['lambda_l2']
    params['min_split_gain'] = best_params['min_split_gain']
    return best_params

In [None]:
%%time
lgb_gridmodel(train_Xcv, test_Xcv, train_ycv, test_ycv)

The current behaviour of 'Series.argmin' is deprecated, use 'idxmin'
instead.
The behavior of 'argmin' will be corrected to return the positional
minimum in the future. For now, use 'series.values.argmin' or
'np.argmin(np.array(values))' to get the position of the minimum
row.


In [None]:
best_params

In [None]:
params['learning_rate']=0.4
params

In [None]:
gbm = lgb.train(
    params,                     
    lgb_train,                  
    valid_sets=lgb_eval,       
    num_boost_round=300,    
    early_stopping_rounds=50
)

In [None]:
y_pred_lgb = gbm.predict(y_features, num_iteration = gbm.best_iteration)

In [None]:
#feature importance
%matplotlib inline
import matplotlib.pyplot as plt

ax = lgb.plot_importance(gbm)

In [None]:
submission = pd.DataFrame()
submission['key'] = test.key
submission['fare_amount'] = y_pred_lgb
submission.to_csv('submission_lgbgrid.csv', index = False)

In [None]:
ss = pd.read_csv('../input/sample_submission.csv')
#ss = pd.read_csv('sample_submission.csv')
ss.loc[:, 'fare_amount'] = y_pred_lgb

ss.to_csv('submission_lgbgrid.csv', index=False)

# top

In [None]:
top_params = {
        'boosting_type':'gbdt',
        'objective': 'regression',
        'nthread': -1,
        'verbose': 0,
        'num_leaves': 31,
        'learning_rate': 0.4,
        'max_depth': -1,
        'subsample': 0.8,
        'subsample_freq': 1,
        'colsample_bytree': 0.6,
        'reg_aplha': 1,
        'reg_lambda': 0.001,
        'metric': 'rmse',
        'min_split_gain': 0.5,
        'min_child_weight': 1,
        'min_child_samples': 10,
        'scale_pos_weight':1,
        'verbose':0
    }

In [None]:
gbmtop = lgb.train(
    top_params,                     
    lgb_train,                  
    valid_sets=lgb_eval,       
    num_boost_round=300,    
    early_stopping_rounds=50
)

In [None]:
y_pred_lgbtop = gbmtop.predict(y_features, num_iteration = gbm.best_iteration)

In [None]:
#feature importance
%matplotlib inline
import matplotlib.pyplot as plt

ax = lgb.plot_importance(gbmtop)

In [None]:
submission = pd.DataFrame()
submission['key'] = test.key
submission['fare_amount'] = y_pred_lgbtop
submission.to_csv('submission_lgbtop.csv', index = False)

In [None]:
ss = pd.read_csv('../input/sample_submission.csv')
#ss = pd.read_csv('sample_submission.csv')
ss.loc[:, 'fare_amount'] = y_pred_lgbtop

ss.to_csv('submission_lgbtop.csv', index=False)