# [New York City Taxi Fare Prediction](https://www.kaggle.com/c/new-york-city-taxi-fare-prediction)

## Import packages

In [1]:
import numpy as np 
import pandas as pd

## Import data

In [6]:
%%time
# Load the pre-engineered train/test CSVs; paths are relative to the notebook's
# working directory. The recorded run of this cell took ~13.5 minutes wall time.
train = pd.read_csv("data/train5.csv")
test = pd.read_csv("data/test5.csv")

CPU times: user 5min 52s, sys: 4min 17s, total: 10min 9s
Wall time: 13min 38s


In [7]:
# Show the column labels of the training frame.
train.columns

Index(['key', 'fare_amount', 'pickup_datetime', 'pickup_longitude',
       'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
       'passenger_count', 'abs_long_diff', 'abs_lat_diff', 'manhattan_dist',
       'sqrt_long_diff', 'sqrt_lat_diff', 'euclidean_dist', 'geodesic_km_dist',
       'great_circle_km_dist', 'haversine_dist', 'center_lat', 'center_long',
       'year', 'month', 'week', 'day', 'hour', 'minute', 'day_of_week',
       'day_of_year', 'quarter', 'part_of_day', 'is_weekday', 'is_weekend',
       'is_month_start', 'is_month_end', 'is_quarter_start', 'is_quarter_end',
       'is_year_start', 'is_year_end', 'is_leap_year', 'is_holiday',
       'pickup_place', 'pickup_admin1', 'pickup_admin2', 'dropoff_place',
       'dropoff_admin1', 'dropoff_admin2'],
      dtype='object')

## Convert data type

In [8]:
# Inspect the dtypes pandas inferred from the CSV before any downcasting.
train.dtypes

key                      object
fare_amount             float64
pickup_datetime          object
pickup_longitude        float64
pickup_latitude         float64
dropoff_longitude       float64
dropoff_latitude        float64
passenger_count           int64
abs_long_diff           float64
abs_lat_diff            float64
manhattan_dist          float64
sqrt_long_diff          float64
sqrt_lat_diff           float64
euclidean_dist          float64
geodesic_km_dist        float64
great_circle_km_dist    float64
haversine_dist          float64
center_lat              float64
center_long             float64
year                      int64
month                     int64
week                      int64
day                       int64
hour                      int64
minute                    int64
day_of_week               int64
day_of_year               int64
quarter                   int64
part_of_day               int64
is_weekday                int64
is_weekend                int64
is_month

In [9]:
# Downcast the target from float64 to float32. Done on train only: the test
# frame has no fare_amount column.
train['fare_amount'] = train['fare_amount'].astype('float32')

In [10]:
%%time
def convert_dtype(df):
    """Cast the engineered columns of ``df`` to compact dtypes, in place.

    The frame is modified column by column; nothing is copied or returned
    except the resulting dtype listing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains every column named in the table below. A missing
        column raises ``KeyError`` at the point it is reached, leaving the
        columns processed before it already converted.

    Returns
    -------
    pandas.Series
        ``df.dtypes`` after all conversions.
    """
    # Target dtype -> columns to cast. Dict insertion order is the order in
    # which the conversions are applied.
    columns_by_dtype = {
        'str': ['pickup_datetime'],
        'float32': [
            'pickup_longitude', 'pickup_latitude',
            'dropoff_longitude', 'dropoff_latitude',
            'abs_long_diff', 'abs_lat_diff', 'manhattan_dist',
            'sqrt_long_diff', 'sqrt_lat_diff', 'euclidean_dist',
            'geodesic_km_dist', 'great_circle_km_dist', 'haversine_dist',
            'center_lat', 'center_long',
        ],
        'uint16': ['year', 'day_of_year'],
        'uint8': [
            'passenger_count', 'month', 'week', 'day', 'hour', 'minute',
            'day_of_week', 'quarter', 'part_of_day',
            'is_weekday', 'is_weekend',
            'is_month_start', 'is_month_end',
            'is_quarter_start', 'is_quarter_end',
            'is_year_start', 'is_year_end',
            'is_leap_year', 'is_holiday',
        ],
        'category': [
            'pickup_place', 'pickup_admin1', 'pickup_admin2',
            'dropoff_place', 'dropoff_admin1', 'dropoff_admin2',
        ],
    }

    for dtype_name, column_names in columns_by_dtype.items():
        for column_name in column_names:
            df[column_name] = df[column_name].astype(dtype_name)

    return df.dtypes

# Downcast both frames in place. Only the last call's return value (the test
# frame's dtypes) is displayed as the cell output.
convert_dtype(train)
convert_dtype(test)

CPU times: user 3min 37s, sys: 7min 59s, total: 11min 37s
Wall time: 9min 32s


key                       object
pickup_datetime           object
pickup_longitude         float32
pickup_latitude          float32
dropoff_longitude        float32
dropoff_latitude         float32
passenger_count            uint8
abs_long_diff            float32
abs_lat_diff             float32
manhattan_dist           float32
sqrt_long_diff           float32
sqrt_lat_diff            float32
euclidean_dist           float32
geodesic_km_dist         float32
great_circle_km_dist     float32
haversine_dist           float32
center_lat               float32
center_long              float32
year                      uint16
month                      uint8
week                       uint8
day                        uint8
hour                       uint8
minute                     uint8
day_of_week                uint8
day_of_year               uint16
quarter                    uint8
part_of_day                uint8
is_weekday                 uint8
is_weekend                 uint8
is_month_s

In [12]:
# (rows, columns) of the training frame.
train.shape

(55418166, 45)

In [13]:
# (rows, columns) of the test frame.
test.shape

(9914, 44)

In [14]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,abs_long_diff,abs_lat_diff,...,is_year_start,is_year_end,is_leap_year,is_holiday,pickup_place,pickup_admin1,pickup_admin2,dropoff_place,dropoff_admin1,dropoff_admin2
0,2009-06-15 17:26:21.000000100,4.5,2009-06-15 17:26:21+00:00,-73.844315,40.721317,-73.841614,40.712276,1,0.002701,0.009041,...,0,0,0,0,Borough of Queens,New York,Queens County,Borough of Queens,New York,Queens County
1,2010-01-05 16:52:16.000000200,16.9,2010-01-05 16:52:16+00:00,-74.016045,40.711304,-73.979271,40.782005,1,0.03678,0.070701,...,0,0,0,0,New York City,New York,,Manhattan,New York,New York County
2,2011-08-18 00:35:00.000000490,5.7,2011-08-18 00:35:00+00:00,-73.982735,40.761269,-73.991241,40.750561,2,0.008504,0.010708,...,0,0,0,0,Manhattan,New York,New York County,Weehawken,New Jersey,Hudson County
3,2012-04-21 04:30:42.000000100,7.7,2012-04-21 04:30:42+00:00,-73.987129,40.733143,-73.99157,40.758091,1,0.004437,0.024949,...,0,0,1,0,New York City,New York,,Weehawken,New Jersey,Hudson County
4,2010-03-09 07:51:00.000000135,5.3,2010-03-09 07:51:00+00:00,-73.968094,40.768009,-73.956657,40.783764,1,0.01144,0.015754,...,0,0,0,0,Manhattan,New York,New York County,Manhattan,New York,New York County


In [15]:
# Preview the first rows of the test frame.
test.head()

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,abs_long_diff,abs_lat_diff,manhattan_dist,...,is_year_start,is_year_end,is_leap_year,is_holiday,pickup_place,pickup_admin1,pickup_admin2,dropoff_place,dropoff_admin1,dropoff_admin2
0,2015-01-27 13:08:24.000000200,2015-01-27 13:08:24+00:00,-73.97332,40.763805,-73.98143,40.743835,1,0.00811,0.01997,0.02808,...,0,0,0,0,Manhattan,New York,New York County,Long Island City,New York,Queens County
1,2015-01-27 13:08:24.000000300,2015-01-27 13:08:24+00:00,-73.986862,40.719383,-73.998886,40.739201,1,0.012024,0.019817,0.031841,...,0,0,0,0,New York City,New York,,New York City,New York,
2,2011-10-08 11:53:44.000000200,2011-10-08 11:53:44+00:00,-73.982521,40.751259,-73.979652,40.74614,1,0.00287,0.005121,0.007991,...,0,0,0,0,Long Island City,New York,Queens County,Long Island City,New York,Queens County
3,2012-12-01 21:12:12.000000200,2012-12-01 21:12:12+00:00,-73.981163,40.767807,-73.990448,40.751637,1,0.009288,0.016172,0.02546,...,0,0,1,0,Manhattan,New York,New York County,Weehawken,New Jersey,Hudson County
4,2012-12-01 21:12:12.000000300,2012-12-01 21:12:12+00:00,-73.966049,40.789776,-73.988564,40.744427,1,0.022519,0.045348,0.067867,...,0,0,1,0,Manhattan,New York,New York County,New York City,New York,


In [None]:
def print_null(df):
    """
    Print the number of missing values for each column of ``df`` that has any.

    Columns without missing values are omitted. If no column has missing
    values, an empty Series is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.

    Returns
    -------
    None
        The counts are written to stdout only.
    """
    # Count once and reuse: each isnull().sum() is a full pass over the frame.
    null_counts = df.isnull().sum()
    print(null_counts[null_counts != 0])

# Report missing-value counts for train, then test, separated by a divider line.
print_null(train)
print("------------")
print_null(test)

# Model

In [None]:
# Re-list the available columns before choosing the model features.
train.columns

In [None]:
# Columns fed to the model, and the label column.
# NOTE(review): 'time_interval', 'weekend', 'holiday', 'area_crossing' and
# 'linear_distance' do not appear in the train.columns output shown earlier in
# this notebook (which has e.g. 'is_weekend', 'is_holiday', 'part_of_day' and
# several *_dist columns instead). Selecting them will raise KeyError unless
# they are created elsewhere — confirm the intended column names.
features = ['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
            'passenger_count', 'month', 'week', 'time_interval', 'weekend', 'holiday', 
            'pickup_place', 'dropoff_place', 'area_crossing', 
            'linear_distance']
# Kept as a one-element list, so selecting with it yields a one-column DataFrame.
target = ['fare_amount']

In [None]:
# Split the training frame into model inputs and label, and take the same input
# columns from the test frame. Despite the X_/y_ prefixes, X_target holds the
# training labels and y_features holds the test-set inputs.
X_features = train[features]
X_target = train[target]

y_features = test[features]

In [None]:
# Preview the selected model inputs.
X_features.head()

Reference — how CatBoost transforms categorical features into numerical features: https://tech.yandex.com/catboost/doc/dg/concepts/algorithm-main-stages_cat-to-numberic-docpage/

In [None]:
# Positional indices of every feature column that is NOT floating point; these
# are the columns handed to CatBoost as categorical features.
# The previous test `dtypes != np.float` had two problems:
#   * `np.float` was removed in NumPy 1.24, so the cell raises AttributeError;
#   * on older NumPy it is the builtin float (float64), so the float32 columns
#     produced by convert_dtype compared unequal and were wrongly flagged as
#     categorical.
# is_float_dtype recognises every float width and also accepts pandas extension
# dtypes such as `category`.
categorical_features_indices = np.where(
    [not pd.api.types.is_float_dtype(column_dtype) for column_dtype in X_features.dtypes]
)[0]

In [None]:
# Show which column positions were flagged as categorical.
categorical_features_indices

## Data Splitting

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the training rows for validation; random_state=0 makes the
# split repeatable. X_target is the label frame (see its definition above).
X_train, X_validation, y_train, y_validation = train_test_split(X_features, X_target, test_size=0.3, random_state=0)
# X_test is just another name for the test-set inputs; no copy is made.
X_test = y_features

# CatBoost model

In [None]:
from catboost import CatBoostRegressor, Pool, cv
from sklearn.metrics import accuracy_score

In [None]:
# Wrap the train and validation splits in CatBoost Pool objects, marking the
# categorical columns by position. validate_pool is used as the eval set during
# fitting; train_pool is used later for feature importance.
train_pool = Pool(X_train, y_train, cat_features=categorical_features_indices)
validate_pool = Pool(X_validation, y_validation, cat_features=categorical_features_indices)

In [None]:
# CatBoost regressor with hand-tuned (non-default) hyperparameters, optimised
# and evaluated on RMSE with a fixed seed.
# NOTE(review): the od_* arguments configure CatBoost's overfitting detector —
# confirm the chosen type/threshold/wait values against the CatBoost docs.
model = CatBoostRegressor(
    num_boost_round=700,
    learning_rate=0.4, # library default is 0.03
    depth=10, # library default is 6
    loss_function='RMSE',
    eval_metric='RMSE',
    random_seed=0,
#    logging_level='Verbose',
    use_best_model=True,
    od_type='IncToDec',
    od_pval=0.01,
    od_wait=40
)

In [None]:
%%time
# Fit on the training split and evaluate each iteration on validate_pool.
# The trailing semicolon suppresses the notebook's repr of the returned value.
model.fit(
    X_train, y_train,
    cat_features=categorical_features_indices,
    eval_set=validate_pool,
#    logging_level='Verbose',
    plot=True # show the training plot in the notebook
);

In [None]:
# Disabled cross-validation experiment, kept as a bare string literal so it is
# never executed (the notebook only echoes the string).
# NOTE(review): if this is re-enabled, 'RSME' below does not match the 'RMSE'
# spelling used when the model is constructed — it looks like a typo.
'''
cv_params = model.get_params() 
cv_params.update({
    'loss_function': 'RSME' #改loss_function
})

#進行cross validation，採用別的loss function
cv_data = cv(
    train_pool,
    cv_params,
    plot=True
) 
'''

Reference — loss functions and metrics supported by CatBoost: https://tech.yandex.com/catboost/doc/dg/concepts/loss-functions-docpage/#loss-functions

In [None]:
# Predict fares for the test-set inputs with the fitted model.
y_pred_cb = model.predict(X_test)

In [None]:
# Ask the fitted model for one importance score per feature, computed against
# the training pool.
feature_importances = model.get_feature_importance(train_pool)

feature_names = X_train.columns
# Sort (score, name) pairs in descending order — ties on score fall back to the
# name — and print one "name: score" line per feature.
for score, name in sorted(zip(feature_importances, feature_names), reverse=True):
    print('{}: {}'.format(name, score))

In [None]:
# Assemble the submission table (ride key + predicted fare) in one step and
# write it to disk without the index column.
submission = pd.DataFrame({
    'key': test.key,
    'fare_amount': y_pred_cb,
})
submission.to_csv('submission_cb010.csv', index = False)

In [None]:
# Alternative way to build the submission: load the competition's sample
# submission and overwrite its fare_amount column with the predictions.
# NOTE(review): this writes to the same 'submission_cb010.csv' that the previous
# cell already produced, and the '../input/...' path differs from the 'data/'
# directory the train/test CSVs were read from — confirm which cell is meant to
# run in which environment.
# This assignment is positional, so it assumes the sample file's rows are in the
# same order as the test frame — TODO confirm.
ss = pd.read_csv('../input/new-york-city-taxi-fare-prediction/sample_submission.csv')
#ss = pd.read_csv('sample_submission.csv')
ss.loc[:, 'fare_amount'] = y_pred_cb

ss.to_csv('submission_cb010.csv', index=False)