# Imports

In [1]:
# (Re)load IPython's autoreload extension and switch it to mode 2 so that
# edits to imported modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [2]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's default styling to subsequent plots.
sns.set()

In [3]:
import lightgbm as lgbm
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, train_test_split

In [4]:
# Directory containing the raw input data; test.csv is read from here.
PATH = 'data/nyc-taxi/'
# Directory the LightGBM submission CSV is written to.
CSV_PATH = 'submissions/LGBM/'

In [5]:
# Load the cached, already-cleaned train and test frames (feather format),
# in that order.
train_df, test_df = (
    pd.read_feather(cache_file)
    for cache_file in ('tmp/taxi-train-clean', 'tmp/taxi-test-clean')
)

# Only the 'key' column of the raw test CSV is needed: it identifies the rows
# in the submission file.
test_df_raw = pd.read_csv(
    f'{PATH}test.csv',
    usecols=['key'],
)

In [6]:
# Sanity check: (rows, columns) of the train and test frames.
train_df.shape, test_df.shape

((54062903, 34), (9914, 33))

In [7]:
%%time
X_train, X_valid, y_train, y_valid = train_test_split(train_df.drop('fare_amount', axis=1), train_df['fare_amount'], test_size=0.05, random_state=111)

X_train.shape, y_train.shape, X_valid.shape, y_valid.shape

CPU times: user 55.5 s, sys: 13 s, total: 1min 8s
Wall time: 1min 8s


In [8]:
# LightGBM training parameters for the fare regression model.
#
# Two entries of the previous version were removed because they were
# misleading without influencing training:
#   * 'subsample': 0.8 -- 'subsample' is an alias of 'bagging_fraction'. With
#     both present LightGBM keeps the primary name ('bagging_fraction': 1) and
#     ignores the alias with a warning, so 0.8 was never applied.
#   * 'scale_pos_weight': 1 -- only used by classification objectives, and 1
#     is its default.
# NOTE(review): if row subsampling at 0.8 was actually intended, set
# 'bagging_fraction' to 0.8 explicitly; that WILL change the trained model.
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'nthread': 16,
    'num_leaves': 31,
    'learning_rate': 0.05,
    'max_depth': -1,           # -1 = no depth limit
    # Bagging is effectively disabled: bagging_freq only takes effect when
    # bagging_fraction is below 1.0.
    'bagging_fraction': 1,
    'max_bin': 5000,
    'bagging_freq': 20,
    'colsample_bytree': 0.6,   # alias of feature_fraction
    'metric': 'rmse',
    'min_split_gain': 0.5,     # alias of min_gain_to_split
    'min_child_weight': 1,     # alias of min_sum_hessian_in_leaf
    'min_child_samples': 10,   # alias of min_data_in_leaf
    'zero_as_missing': True,
    'seed': 0,
}


def LGBMmodel(X_train, X_valid, y_train, y_valid, params,
              num_boost_round=50_000, early_stopping_rounds=500,
              verbose_eval=100):
    """Train a LightGBM booster with early stopping on a validation set.

    Parameters
    ----------
    X_train, y_train
        Training features and target.
    X_valid, y_valid
        Validation features and target; this is the only set evaluated
        during training and the one that drives early stopping.
    params : dict
        LightGBM parameters passed straight to ``lgbm.train``.
    num_boost_round : int, default 50_000
        Upper bound on the number of boosting rounds.
    early_stopping_rounds : int, default 500
        Stop when the validation metric has not improved for this many rounds.
    verbose_eval : int or bool, default 100
        Passed to ``lgbm.train`` to control how often the validation metric
        is logged.

    Returns
    -------
    The trained booster returned by ``lgbm.train``.

    Notes
    -----
    NOTE(review): the ``early_stopping_rounds`` / ``verbose_eval`` keyword
    arguments of ``lgbm.train`` were replaced by callbacks in newer LightGBM
    releases -- confirm against the installed version before upgrading.
    """
    train_set = lgbm.Dataset(X_train, y_train)
    # Tie the validation data to the training set, as LightGBM recommends for
    # validation Datasets.
    valid_set = lgbm.Dataset(X_valid, y_valid, reference=train_set)
    return lgbm.train(
        params=params,
        train_set=train_set,
        num_boost_round=num_boost_round,
        early_stopping_rounds=early_stopping_rounds,
        verbose_eval=verbose_eval,
        valid_sets=[valid_set],
    )

In [9]:
# Note from a previous run: RMSE at the best iteration was 3.25.
# NOTE(review): only the validation set is evaluated during training, so this
# is presumably a validation RMSE -- confirm; the log below is truncated.
model = LGBMmodel(X_train, X_valid, y_train, y_valid, params)

Training until validation scores don't improve for 500 rounds.
[100]	valid_0's rmse: 3.74722
[200]	valid_0's rmse: 3.64835
[300]	valid_0's rmse: 3.60687
[400]	valid_0's rmse: 3.58092
[500]	valid_0's rmse: 3.56401
[600]	valid_0's rmse: 3.55171
[700]	valid_0's rmse: 3.54187
[800]	valid_0's rmse: 3.53397
[900]	valid_0's rmse: 3.52714
[1000]	valid_0's rmse: 3.52178
[1100]	valid_0's rmse: 3.51711
[1200]	valid_0's rmse: 3.51336
[1300]	valid_0's rmse: 3.50958
[1400]	valid_0's rmse: 3.50608
[1500]	valid_0's rmse: 3.50337
[1600]	valid_0's rmse: 3.50083
[1700]	valid_0's rmse: 3.49818
[1800]	valid_0's rmse: 3.49604
[1900]	valid_0's rmse: 3.49401
[2000]	valid_0's rmse: 3.49198
[2100]	valid_0's rmse: 3.49021
[2200]	valid_0's rmse: 3.48861
[2300]	valid_0's rmse: 3.48708
[2400]	valid_0's rmse: 3.48595
[2500]	valid_0's rmse: 3.4845
[2600]	valid_0's rmse: 3.48317
[2700]	valid_0's rmse: 3.48201
[2800]	valid_0's rmse: 3.48074
[2900]	valid_0's rmse: 3.47982
[3000]	valid_0's rmse: 3.47867
[3100]	valid_0's 

In [10]:
# Predict test fares, limiting the booster to its best (early-stopped) iteration.
prediction = model.predict(test_df, num_iteration=model.best_iteration)

# Pair each prediction with the row key taken from the raw test CSV.
submission = pd.DataFrame(
    {'key': test_df_raw.key, 'fare_amount': prediction},
    columns=['key', 'fare_amount'])

# to_csv does not create missing parent directories, so make sure the output
# directory exists (otherwise this cell fails on a fresh checkout).
os.makedirs(CSV_PATH, exist_ok=True)
submission.to_csv(f'{CSV_PATH}lgbm.csv', index=False)

In [11]:
# Write the trained booster to tmp/lgbm_model.txt.
model.save_model('tmp/lgbm_model.txt')

<lightgbm.basic.Booster at 0x7fac2f5fdcf8>