In [1]:
import numpy as np
import pandas as pd
import gc

# Modeling
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

# Visualization
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Feather files read as the train / test feature tables.
TRAIN_DATA = "tmp/taxi-train-v8-Baseline"
TEST_DATA = "tmp/taxi-test-v8-Baseline"

# Raw test CSV; only its `key` column is loaded from it.
TEST_CSV = "data/nyc-taxi/test.csv"

# Load data

In [3]:
# Read both feather tables in the same order as before: train first, then test.
train_df, test_df = (pd.read_feather(path) for path in (TRAIN_DATA, TEST_DATA))

# Only the `key` column is pulled from the raw test CSV.
test_key = pd.read_csv(TEST_CSV, usecols=['key'])

In [None]:
# Show (rows, columns) for each loaded table.
tuple(frame.shape for frame in (train_df, test_df, test_key))

In [None]:
# Features: every column of the training table except the target.
X_train = train_df.drop(columns='fare_amount')

# Target as a plain NumPy array (positional indexing only; it has no `.iloc`).
y_train = train_df['fare_amount'].values

In [None]:
# Show the shapes of the feature frame and the target array.
tuple(part.shape for part in (X_train, y_train))

In [None]:
# Remember both shapes; the row counts size the prediction buffers later.
trainshape, testshape = X_train.shape, test_df.shape

In [None]:
# Index.equals compares labels and their order, and returns False when the two
# frames have a different number of columns; an elementwise `==` between
# indexes of different length raises ValueError instead of reporting False.
print("Does Train feature equal test feature?: ", X_train.columns.equals(test_df.columns))

## Modeling with LGBM

In [None]:
# free_raw_data=False: do not release the raw feature frame once the Dataset
# has been constructed.
dtrain = lgb.Dataset(
    data=X_train,
    label=y_train,
    free_raw_data=False,
)

In [None]:
print("Light Gradient Boosting Regressor: ")

# Booster settings: gradient-boosted decision trees with a regression
# objective, evaluated with RMSE.
lgbm_params = dict(
    task='train',
    boosting_type='gbdt',
    objective='regression',
    metric='rmse',
)

In [None]:
# Out-of-fold predictions (one slot per training row) and the running
# fold-averaged test predictions (one slot per test row).
oof_preds = np.zeros(shape=trainshape[0])
fold_preds = np.zeros(shape=testshape[0])

# Shuffled 5-fold split with a fixed seed.
folds = KFold(n_splits=5, random_state=1, shuffle=True)

# Build the Dataset up front; the training loop takes row subsets of it.
dtrain.construct()

In [None]:
%%time
for trn_idx, val_idx in folds.split(X_train):
    
    clf = lgb.train(
        params=lgbm_params,
        train_set=dtrain.subset(trn_idx),
        valid_sets=dtrain.subset(val_idx),
        num_boost_round=3500, 
        early_stopping_rounds=125,
        verbose_eval=500
    )
    oof_preds[val_idx] = clf.predict(dtrain.data.iloc[val_idx])
    fold_preds += clf.predict(test_df) / folds.n_splits
    
    print(mean_squared_error(y_train.iloc[val_idx], oof_preds[val_idx]) ** .5)

In [None]:
# Index the submission by the `key` column rather than by the whole test_key
# DataFrame: a DataFrame is 2-D and is not a valid row index. Naming the index
# "key" makes that the header of the index column in the CSV.
submission_index = pd.Index(test_key["key"], name="key")
lgsub = pd.DataFrame(fold_preds, columns=["fare_amount"], index=submission_index)

# Write key + predicted fare, then preview the first rows.
lgsub.to_csv("lgsub.csv", index=True, header=True)
lgsub.head()