In [None]:
import pandas as pd
import numpy as np
import zipfile
import datetime
import matplotlib.pyplot as plt

from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_squared_log_error, mean_squared_error
from sklearn.model_selection import cross_val_score, GridSearchCV

import xgboost

In [None]:
# Directory that receives the extracted competition files.
REP_DATA = Path('data')
REP_DATA.mkdir(parents=True, exist_ok=True)

# Unpack the outer competition archive first ...
with zipfile.ZipFile('nyc-taxi-trip-duration.zip', 'r') as outer_archive:
    outer_archive.extractall(REP_DATA)

# ... then unpack every zip file that now sits directly inside REP_DATA.
for nested_path in REP_DATA.glob('*.zip'):
    with zipfile.ZipFile(nested_path, 'r') as nested_archive:
        nested_archive.extractall(REP_DATA)

In [None]:
# Load the training data: pickup_datetime is parsed as a datetime and the
# store-and-forward flag is read as a categorical column.
df = pd.read_csv(
    REP_DATA / 'train.csv',
    parse_dates=['pickup_datetime'],
    dtype={'store_and_fwd_flag': 'category'},
).drop(columns=['dropoff_datetime'])  # Not in test.csv

# df = df.head(10000)  # uncomment to iterate quickly on a small sample

In [None]:
LFV = 'trip_duration'  # "Looking for value": name of the target column the model predicts

In [None]:
# Summary of the training frame: column dtypes, non-null counts, memory usage.
df.info()

In [None]:
# Preview the first rows of the training data as loaded.
df.head()

In [None]:
def Haversine(lat1, lon1, lat2, lon2, **kwarg):
    """
    Great-circle ("as-the-crow-flies") distance between two points, in kilometres.

    Source: https://stackoverflow.com/questions/19412462/getting-distance-between-two-points-based-on-latitude-longitude

    Haversine formula:
        a = sin²(Δφ/2) + cos φ1 ⋅ cos φ2 ⋅ sin²(Δλ/2)
        c = 2 ⋅ atan2(√a, √(1−a))
        d = R ⋅ c
    where φ is latitude, λ is longitude and R is the earth's mean radius.

    Parameters
    ----------
    lat1, lon1 : float or array-like
        Latitude and longitude of the first point, in decimal degrees.
    lat2, lon2 : float or array-like
        Latitude and longitude of the second point, in decimal degrees.
    **kwarg
        Accepted and ignored.

    Returns
    -------
    float or array-like
        Distance in kilometres. Only numpy ufuncs are used, so scalars,
        numpy arrays and pandas Series are all handled element-wise.
    """
    R = 6371.0088  # mean earth radius, km
    # The trig functions below need radians.
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])

    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Floating-point rounding can push `a` a hair outside [0, 1] (e.g. for
    # near-antipodal points), which would turn sqrt(1 - a) into NaN.
    a = np.minimum(np.maximum(a, 0.0), 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    return R * c

In [None]:
def df_transform(df):
    """
    Build the model's feature frame from a raw trips frame.

    Added columns:
      - distance: pickup-to-dropoff distance as returned by `Haversine`
      - pickup_day_of_week: day name of the pickup (categorical)
      - pickup_month: month name of the pickup (categorical)
      - pickup_hour: hour of the pickup (categorical)
      - pickup_timestamp: pickup time of day as fractional hours
        (hour + minute / 60)

    Dropped columns: 'id', 'vendor_id', 'pickup_datetime'.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'id', 'vendor_id', a datetime 'pickup_datetime' and the
        four pickup/dropoff latitude/longitude columns.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is not modified.
    """
    pickup = df['pickup_datetime']

    features = df.assign(
        # Haversine is built from numpy ufuncs, so whole columns are passed at
        # once instead of calling it row by row through DataFrame.apply.
        distance=Haversine(
            df['pickup_latitude'], df['pickup_longitude'],
            df['dropoff_latitude'], df['dropoff_longitude'],
        ),
        pickup_day_of_week=pickup.dt.day_name().astype('category'),
        pickup_month=pickup.dt.month_name().astype('category'),
        pickup_hour=pickup.dt.hour.astype('category'),
        # 60 minutes per hour keeps the value monotonic over the day.
        pickup_timestamp=pickup.dt.hour + pickup.dt.minute / 60,
    )

    return features.drop(columns=['id', 'vendor_id', 'pickup_datetime'])

In [None]:
# Rebinds `df` to the engineered feature frame. Not re-runnable on its own:
# df_transform reads 'pickup_datetime' and then drops it, so running this cell
# a second time raises KeyError. Re-run the loading cell first.
df = df_transform(df)

In [None]:
# Preview the frame after feature engineering.
df.head()

In [None]:
# Hold out 20 % of the rows; the fixed random_state makes the split repeatable.
train_set, test_set = train_test_split(df, test_size=0.2, random_state=952)

In [None]:
# Pairwise correlations between the numeric columns only. The frame also holds
# categorical columns (day and month names) that cannot be converted to float,
# and pandas >= 2.0 no longer drops such columns silently in DataFrame.corr.
corr_matrix = df.select_dtypes('number').corr()

In [None]:
# Correlation of every column in the matrix with the target, largest first.
corr_matrix[LFV].sort_values(ascending=False)

In [None]:
# Separate the target vector from the feature columns of the training split.
train_y = train_set[LFV]
train_X = train_set.drop(columns=LFV)

## Numeric

In [None]:
# Names of the numeric feature columns.
train_X_num = train_X.select_dtypes(include='number').columns.tolist()

In [None]:
# Numeric features only go through standard scaling.
num_pipeline = Pipeline(steps=[('standardscaler', StandardScaler())])

## Categorical

In [None]:
# Names of the categorical feature columns.
train_X_cat = train_X.select_dtypes(include='category').columns.tolist()

In [None]:
# One-hot encode the categorical features. handle_unknown='ignore' encodes a
# category that was absent at fit time as an all-zero block instead of raising
# when the fitted pipeline is later applied to other data.
cat_pipeline = Pipeline([
    ('ohe', OneHotEncoder(handle_unknown='ignore')),
])

In [None]:
# Route each column group through its own preprocessing pipeline.
full_pipeline = ColumnTransformer(transformers=[
    ('num', num_pipeline, train_X_num),
    ('cat', cat_pipeline, train_X_cat),
])

In [None]:
# The model is trained on log1p(target); predictions are mapped back with expm1.
train_y_log = np.log1p(train_y)

# Fit the preprocessing on the training features and encode them in one step.
train_X_prepared = full_pipeline.fit_transform(train_X)

## Random forest

In [None]:
#forest_reg = RandomForestRegressor()
#forest_reg.fit(train_X_prepared, train_y_log)

## XGBoost

In [None]:
# Baseline gradient-boosted regressor; only eval_metric is set explicitly.
xgboost_reg = xgboost.XGBRegressor(eval_metric='rmse')
xgboost_reg.fit(X=train_X_prepared, y=train_y_log)

In [None]:
# 5-fold cross-validation of the baseline model on the log-transformed target.
# `scoring` is left at its default, so each fold is scored with the
# estimator's own `score` method.
xgboost_reg_scores = cross_val_score(
    estimator=xgboost_reg,
    X=train_X_prepared,
    y=train_y_log,
    cv=5,
)

In [None]:
# Hyper-parameter grid. Only n_estimators has more than one candidate; the
# trailing comments keep the wider ranges that can be swapped in.
param_grid = [dict(
    colsample_bytree=[0.4],
    gamma=[0],
    learning_rate=[0.07],      # np.arange(.04, .09, .01)
    max_depth=[3],
    min_child_weight=[1.5],    # np.arange(1, 2, .1)
    n_estimators=[50, 100],
    reg_alpha=[0.75],          # np.arange(.6, .8, .1)
    reg_lambda=[0.45],         # np.arange(.2, .6, .1)
    subsample=[0.6],           # np.arange(.4, .6, .1)
)]

# Exhaustive 5-fold search over the grid, keeping the training scores as well.
grid_search = GridSearchCV(
    estimator=xgboost_reg,
    param_grid=param_grid,
    cv=5,
    return_train_score=True,
)
grid_search.fit(train_X_prepared, train_y_log)

In [None]:
# Parameter combination selected by the grid search.
grid_search.best_params_

In [None]:
# Estimator configured with the selected parameters.
grid_search.best_estimator_

In [None]:
# GridSearchCV is built with its default refit=True, so best_estimator_ has
# already been refitted on the full train_X_prepared / train_y_log with the
# winning parameters. Fitting it again here would only repeat that training.
xgb_model = grid_search.best_estimator_

In [None]:
# Per-fold scores of the baseline (untuned) model from cross_val_score.
xgboost_reg_scores

In [None]:
# Load the test set with the same parsing options as the training data.
df_test_raw = pd.read_csv(
    REP_DATA / 'test.csv',
    parse_dates=['pickup_datetime'],
    dtype={'store_and_fwd_flag': 'category'},
)

# Apply the same feature engineering; the raw frame is kept because its 'id'
# column is needed for the submission file.
df_test = df_transform(df_test_raw)
df_test.head()

In [None]:
# Encode the test features with the already-fitted preprocessing pipeline.
X_test_prepared = full_pipeline.transform(df_test)

# The model predicts log1p(target); expm1 maps back to the original scale.
X_test_predictions_log = xgb_model.predict(X_test_prepared)
X_test_predictions = np.expm1(X_test_predictions_log)

# Attach the predictions as a new target column next to the raw test rows
# (both sides carry the default positional index), then write only the two
# columns needed for the submission.
xgb_model_result = pd.concat(
    [df_test_raw, pd.Series(X_test_predictions, name=LFV)],
    axis=1,
)
xgb_model_result.loc[:, ['id', LFV]].to_csv('submission.csv', index=False)