In [5]:
import csv
import datetime
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import xgboost as xgb

from datetime import datetime, timedelta
from sklearn.metrics import mean_squared_error, r2_score

## XGBoost Model

In [6]:
def build_xgboost_model(X_train, y_train, params):
    """
    Return: xgboost model
    """
    model = xgb.XGBRegressor(
        learning_rate=params['learning_rate'],
        n_estimators=params['n_estimators'],
        booster=params['booster'],
        objective=params['objective'],
        n_jobs=-1,
        subsample=params['subsample'],
        colsample_bytree=params['colsample_bytree'],
        random_state=0,
        silent=params['silent'],
        max_depth=params['max_depth'],
        gamma=params['gamma'],
        min_child_weight=params['min_child_weight'],
        reg_alpha=params['reg_alpha']
    )

    return model.fit(X_train, y_train)

In [7]:
def prediction(model, X_test):
    y_pred = model.predict(X_test)
    return y_pred

In [8]:
def get_mse(y_true, y_pred):
    return mean_squared_error(y_true, y_pred)

## Predict Trip Duration

In [9]:
def read_from_csv(path):
    df = pd.read_csv(path, index_col=0)
    return df

In [10]:
train_path = './data/train.csv'
test_path = './data/test.csv'

X_train, X_test = read_from_csv(train_path, ), read_from_csv(test_path)
len(X_train), len(X_test)

(956868, 239217)

In [11]:
y_train, y_test = pd.DataFrame(X_train['duration']), pd.DataFrame(X_test['duration'])

X_train.drop(columns=['duration'], inplace=True)
X_test.drop(columns=['duration'], inplace=True)

# drop the datetime type columns
X_train.drop(columns=['tpep_dropoff_datetime', 'tpep_pickup_datetime'], inplace=True)
X_test.drop(columns=['tpep_dropoff_datetime', 'tpep_pickup_datetime'], inplace=True)

In [12]:
params = { 
    'booster': 'gbtree',
    'objective':'reg:linear',
    'learning_rate': 0.2,
    'n_estimators': 200,
    'objective': 'reg:linear',  
    'gamma': 0.3,                  # control pruning
    'max_depth':5 ,               
    'lambda': 2,                   # L2 parameter
    'subsample': 0.8,              # random sample 
    'colsample_bytree': 0.7,       # col sample when generate tree
    'min_child_weight': 1,
    'silent': 0,
    'reg_alpha': 0
}

In [13]:
model = build_xgboost_model(X_train, y_train, params)



In [14]:
# Train
y_pred = prediction(model, X_train)

train_loss = get_mse(y_train, y_pred)
print("Train Loss: {}".format(train_loss))

Train Loss: 2.1126842109775263


In [15]:
# Test
y_pred = prediction(model, X_test)

test_loss = get_mse(y_test, y_pred)
print("Test Loss: {}".format(test_loss))

Test Loss: 2.1530154678916693
