In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Enable autoreload so edits to imported modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [2]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
import lightgbm as lgbm
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, train_test_split

In [4]:
# --- Input / output locations ------------------------------------------------
TRAIN_FEATHER = 'tmp/taxi-train-v10-Airport'       # feather file loaded into train_df
TEST_FEATHER = 'tmp/taxi-test-v10-Airport'         # feather file loaded into test_df
SUBM_CSV = 'data/nyc-taxi/sample_submission.csv'   # submission template, indexed by 'key'

# --- Experiment settings ------------------------------------------------------
SEED = 1000              # base random seed; each training round offsets it by the round number
ITERATIONS = 10          # number of sample-and-train rounds
NSAMPLES = 10_000_000    # rows sampled from the training frame per round

In [5]:
# Load the pre-built train/test feature frames and the submission template.
# The submission frame is indexed by 'key' so predictions can be assigned by position later.
train_df, test_df = (pd.read_feather(path) for path in (TRAIN_FEATHER, TEST_FEATHER))
subm_df = pd.read_csv(SUBM_CSV, index_col='key')

In [6]:
# (rows, columns) of the train and test frames, in that order.
tuple(frame.shape for frame in (train_df, test_df))

((54075311, 24), (9914, 23))

# LightGBM

In [7]:
# Parameters handed to lgbm.train for every training round.
# Trailing comments carry the alternative value noted by the author
# (these appear to be the LightGBM defaults — confirm against the installed version).
lgbm_params = dict(
    task='train',
    boosting_type='gbdt',
    objective='regression',
    metric='rmse',
    learning_rate=0.02,       # 0.1
    feature_fraction=0.7,     # 1.0
    bagging_fraction=0.7,     # 1.0
    bagging_freq=5,           # 0
    # Currently disabled options:
    # max_bin=155,            # 255
    # num_leaves=31,          # 31
)

In [8]:
# Bagging loop: each round draws a fresh random subsample of the training frame,
# holds out 1% of it for validation, trains a LightGBM regressor with early
# stopping, and records both the best validation RMSE and the test predictions.
best_val_lst = []    # best validation RMSE, one entry per completed round
test_pred_lst = []   # test-set prediction array, one entry per completed round

# Rounds are numbered from 1 so the printed counter and the seed offset agree.
for i in range(1, ITERATIONS + 1):
    print('Iteration', i)

    # SEED + i makes every round reproducible while still sampling different rows.
    train = train_df.sample(NSAMPLES, random_state=SEED + i)
    y = train.fare_amount.copy()
    # Restrict the features to exactly the columns available at prediction time.
    train = train[test_df.columns]

    x_train, x_val, y_train, y_val = train_test_split(
        train, y, random_state=SEED + i, test_size=0.01
    )

    dtrain = lgbm.Dataset(x_train, label=y_train, free_raw_data=False)
    # The validation set is tied to the training set via `reference` so that it is
    # binned with the training set's feature bins from the start.
    dval = lgbm.Dataset(x_val, label=y_val, reference=dtrain, free_raw_data=False)
    dtrain.construct()
    dval.construct()

    # NOTE(review): `early_stopping_rounds` / `verbose_eval` are keyword arguments of
    # older LightGBM releases; newer releases expect the equivalent callbacks instead.
    # Confirm against the installed version before upgrading.
    model = lgbm.train(
        params=lgbm_params,
        train_set=dtrain,
        valid_sets=[dval],
        valid_names=['validation'],
        num_boost_round=5000,
        early_stopping_rounds=125,
        verbose_eval=500,
    )

    best_val_lst.append(model.best_score['validation']['rmse'])
    test_pred_lst.append(model.predict(test_df))

Iteration 1
Training until validation scores don't improve for 125 rounds.
[500]	validation's rmse: 3.47092
[1000]	validation's rmse: 3.37713
[1500]	validation's rmse: 3.33725
[2000]	validation's rmse: 3.314
[2500]	validation's rmse: 3.29586
[3000]	validation's rmse: 3.28208
[3500]	validation's rmse: 3.27251
[4000]	validation's rmse: 3.26564
[4500]	validation's rmse: 3.25881
[5000]	validation's rmse: 3.25362
Did not meet early stopping. Best iteration is:
[5000]	validation's rmse: 3.25362
Iteration 2
Training until validation scores don't improve for 125 rounds.
[500]	validation's rmse: 3.66164
[1000]	validation's rmse: 3.56765
[1500]	validation's rmse: 3.52476
[2000]	validation's rmse: 3.49678
[2500]	validation's rmse: 3.47904
[3000]	validation's rmse: 3.46675
[3500]	validation's rmse: 3.45711
[4000]	validation's rmse: 3.44927
[4500]	validation's rmse: 3.44443
[5000]	validation's rmse: 3.43834
Did not meet early stopping. Best iteration is:
[4999]	validation's rmse: 3.4383
Iteration 3

KeyboardInterrupt: 

In [9]:
# Average best validation RMSE across the completed training rounds.
np.asarray(best_val_lst).mean()

3.3074136727540955

In [10]:
# Best validation RMSE recorded for each completed training round.
best_val_lst

[3.2536195071113494,
 3.4383047240633853,
 3.474767676112586,
 3.2142679363149766,
 3.156108520168179]

In [12]:
# Inverse-RMSE ensemble weights: dividing the RMSE total by each round's RMSE gives
# lower-error rounds a larger raw score; the scores are then normalised to sum to 1.
rmse_total = np.sum(best_val_lst)
inverse_share = rmse_total / np.asarray(best_val_lst)
scaled_weights = inverse_share / inverse_share.sum()
scaled_weights

array([0.2030135 , 0.19210883, 0.19009291, 0.20549895, 0.2092858 ])

In [14]:
# Turn the weights into a column vector of shape (n_rounds, 1) so they broadcast
# across a (n_rounds, n_test_rows) stack of predictions.
# reshape(-1, 1) infers the number of rounds, so this keeps working when the
# training loop completes a different number of rounds (e.g. after an interrupt).
scaled_weights = scaled_weights.reshape(-1, 1)
scaled_weights

array([[0.2030135 ],
       [0.19210883],
       [0.19009291],
       [0.20549895],
       [0.2092858 ]])

In [15]:
# Weighted ensemble: stack the per-round predictions into (n_rounds, n_test_rows),
# scale each row by its weight, and sum over rounds.
# Relies on scaled_weights already being a column vector of shape (n_rounds, 1).
y_preds1 = (np.vstack(test_pred_lst) * scaled_weights).sum(axis=0)
y_preds1

array([10.24107531, 11.46199007,  4.43339207, ..., 54.29173385,
       20.10354866,  6.7579646 ])

In [None]:
# Unweighted ensemble: plain element-wise mean of the per-round test predictions.
y_preds2 = np.vstack(test_pred_lst).mean(axis=0)

In [None]:
# Display the unweighted-mean ensemble predictions.
y_preds2

In [None]:
def weighted_mean(preds, weights):
    """Return the weighted average of several prediction vectors.

    Parameters
    ----------
    preds : sequence of array-like, or 2-D array
        One prediction vector per model; stacked to shape (n_models, n_rows).
    weights : array-like
        One non-normalised weight per model, either 1-D or an (n_models, 1)
        column. Larger weights give a model more influence, so error metrics
        such as RMSE must be inverted by the caller before being passed in.

    Returns
    -------
    numpy.ndarray
        Array of shape (n_rows,) holding the weighted mean prediction.

    Raises
    ------
    ValueError
        If the number of weights differs from the number of prediction
        vectors, or if the weights sum to zero.
    """
    stacked = np.atleast_2d(np.asarray(preds, dtype=float))
    # Column shape (n_models, 1) so each weight scales one whole prediction row.
    column_weights = np.asarray(weights, dtype=float).reshape(-1, 1)

    if column_weights.shape[0] != stacked.shape[0]:
        raise ValueError(
            f'got {column_weights.shape[0]} weights for {stacked.shape[0]} prediction vectors'
        )

    total = column_weights.sum()
    if total == 0:
        raise ValueError('weights must not sum to zero')

    scaled_weights = column_weights / total
    return np.sum(stacked * scaled_weights, axis=0)

In [None]:
# RMSE is an error (lower is better) and weighted_mean normalises its weights as
# given, so pass the inverse RMSE: this matches the inverse-RMSE scheme used to
# build scaled_weights instead of favouring the worst-scoring rounds.
y_preds1 = weighted_mean(test_pred_lst, 1.0 / np.asarray(best_val_lst))

In [17]:
# Validate the predictions before writing: a None or mis-shaped y_preds1 would
# otherwise be written silently as an empty or broadcast fare_amount column.
submission_preds = np.asarray(y_preds1, dtype=float)
assert submission_preds.shape == (len(subm_df),), (
    f'expected {len(subm_df)} predictions, got shape {submission_preds.shape}'
)
assert np.isfinite(submission_preds).all(), 'predictions contain NaN or inf'

# Fill the submission template and write it out; head() previews the result.
subm_df['fare_amount'] = submission_preds
subm_df.to_csv('submissions/submission_weighted_mean.csv')
subm_df.head()

Unnamed: 0_level_0,fare_amount
key,Unnamed: 1_level_1
2015-01-27 13:08:24.0000002,10.241075
2015-01-27 13:08:24.0000003,11.46199
2011-10-08 11:53:44.0000002,4.433392
2012-12-01 21:12:12.0000002,8.493902
2012-12-01 21:12:12.0000003,15.689723
