# Imports

In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set()

In [3]:
import lightgbm as lgbm
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, train_test_split

In [4]:
# Directory prefixes (trailing slash included, since they are concatenated
# directly with file names further down).
PATH = 'data/nyc-taxi/'          # location of the raw competition files (test.csv is read from here)
CSV_PATH = 'submissions/LGBM/'   # where the submission CSV is written

In [5]:
# Only the `key` column of the raw test file is needed: it is paired with the
# predictions when the submission frame is assembled.
test_df_raw = pd.read_csv(f'{PATH}test.csv', usecols=['key'])

# Feather snapshots of the train/test frames.
# NOTE(review): presumably produced by an earlier cleaning notebook — confirm
# that `tmp/` is populated before running this cell.
test_df = pd.read_feather('tmp/taxi-test-clean')
train_df = pd.read_feather('tmp/taxi-train-clean')

In [6]:
# (rows, columns) of each frame; train is expected to carry one extra column,
# the `fare_amount` target that is split off below.
(train_df.shape, test_df.shape)

((19503643, 43), (9914, 42))

In [7]:
%%time
X_train, X_valid, y_train, y_valid = train_test_split(train_df.drop('fare_amount', axis=1), train_df['fare_amount'], test_size=0.1, random_state=111)

X_train.shape, y_train.shape, X_valid.shape, y_valid.shape

CPU times: user 21 s, sys: 3.71 s, total: 24.7 s
Wall time: 24.7 s


In [8]:
# LightGBM training configuration for the fare regression model.
#
# Two entries of the previous version were dropped because they had no effect:
#   * 'subsample': 0.8  - `subsample` is an alias of `bagging_fraction`. With
#     both present LightGBM keeps the canonical `bagging_fraction` (1 here),
#     ignores the alias and emits a warning, so 0.8 was never applied.
#   * 'scale_pos_weight': 1 - only used by classification objectives, and 1 is
#     its default value anyway.
# The effective configuration is therefore unchanged.
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    'nthread': 15,               # worker threads; tune to the machine
    'seed': 0,

    # Tree shape
    'num_leaves': 31,
    'max_depth': -1,             # -1 = no depth limit
    'min_split_gain': 0.5,
    'min_child_weight': 1,
    'min_child_samples': 10,

    # Learning / sampling
    'learning_rate': 0.05,
    'colsample_bytree': 0.6,     # fraction of features sampled per tree
    # Row bagging is effectively OFF: bagging only activates when
    # bagging_fraction < 1, so bagging_freq alone does nothing. Lower
    # bagging_fraction (e.g. to 0.8) if row subsampling is actually wanted.
    'bagging_fraction': 1,
    'bagging_freq': 20,

    # Histogram construction
    'max_bin': 5000,
    'zero_as_missing': True,     # zeros in the features are treated as missing values
}

In [9]:
# Wrap both splits in LightGBM datasets. Despite its name, `matrix_test` holds
# the validation split (X_valid / y_valid), not the competition test set.
matrix_train = lgbm.Dataset(data=X_train, label=y_train)
matrix_test = lgbm.Dataset(data=X_valid, label=y_valid)

# Train with a very large round budget and let early stopping pick the actual
# number of trees: training halts once the validation RMSE has not improved
# for 500 rounds; progress is logged every 100 rounds.
# NOTE(review): newer LightGBM releases appear to replace the
# `early_stopping_rounds` / `verbose_eval` keywords with callbacks — confirm
# against the installed version before re-running.
model = lgbm.train(
    params,
    matrix_train,
    num_boost_round=50_000,
    valid_sets=[matrix_test],
    early_stopping_rounds=500,
    verbose_eval=100,
)

Training until validation scores don't improve for 500 rounds.
[100]	valid_0's rmse: 3.65044
[200]	valid_0's rmse: 3.55172
[300]	valid_0's rmse: 3.51059
[400]	valid_0's rmse: 3.48816
[500]	valid_0's rmse: 3.4725
[600]	valid_0's rmse: 3.46117
[700]	valid_0's rmse: 3.45234
[800]	valid_0's rmse: 3.44496
[900]	valid_0's rmse: 3.43943
[1000]	valid_0's rmse: 3.43418
[1100]	valid_0's rmse: 3.42979
[1200]	valid_0's rmse: 3.42655
[1300]	valid_0's rmse: 3.42324
[1400]	valid_0's rmse: 3.42032
[1500]	valid_0's rmse: 3.41783
[1600]	valid_0's rmse: 3.41611
[1700]	valid_0's rmse: 3.41446
[1800]	valid_0's rmse: 3.41193
[1900]	valid_0's rmse: 3.41058
[2000]	valid_0's rmse: 3.40896
[2100]	valid_0's rmse: 3.40758
[2200]	valid_0's rmse: 3.40666
[2300]	valid_0's rmse: 3.40563
[2400]	valid_0's rmse: 3.40456
[2500]	valid_0's rmse: 3.40342
[2600]	valid_0's rmse: 3.40212
[2700]	valid_0's rmse: 3.40147
[2800]	valid_0's rmse: 3.40076
[2900]	valid_0's rmse: 3.40022
[3000]	valid_0's rmse: 3.39931
[3100]	valid_0's 

In [12]:
# Predict fares for the test set using only the trees up to the best
# early-stopping iteration. The test frame is re-indexed to the training
# feature order first: by default the columns are consumed positionally, so a
# differently ordered frame would silently produce wrong predictions, while a
# missing feature now fails loudly with a KeyError.
prediction = model.predict(test_df[X_train.columns], num_iteration=model.best_iteration)

# Pair each prediction with the `key` taken from the raw test file.
submission = pd.DataFrame(
    {'key': test_df_raw.key, 'fare_amount': prediction},
    columns=['key', 'fare_amount'],
)

# Create the output directory if needed so the write cannot fail after the
# long training run just because the folder does not exist yet.
os.makedirs(CSV_PATH, exist_ok=True)
submission.to_csv(f'{CSV_PATH}lgbm.csv', index=False)

In [13]:
# Persist the booster truncated at the best early-stopping iteration, i.e. the
# same trees that produced the submission above. Without `num_iteration` some
# LightGBM versions also write the trees trained after the optimum, so a
# reloaded model would not reproduce the submitted predictions.
# The trailing `;` suppresses the Booster repr that save_model() returns.
model.save_model('tmp/lgbm_model.txt', num_iteration=model.best_iteration);

<lightgbm.basic.Booster at 0x7fbe04c79358>