In [17]:
import numpy as np
import lightgbm as lgb
import pandas as pd
import os
from sklearn.metrics import mean_squared_error

In [18]:
# Directory holding the regression data files, relative to the notebook.
parent_folder = "../data"

# Build the train/test file paths from the shared data directory.
reg_train, reg_test = (
    os.path.join(parent_folder, file_name)
    for file_name in ("regression.train.txt", "regression.test.txt")
)


In [19]:
# Show which files are available in the data directory. Reuse `parent_folder`
# rather than repeating the literal path so the two cannot drift apart, and
# sort the names because os.listdir() returns them in arbitrary order.
sorted(os.listdir(parent_folder))

['regression.test.txt', 'regression.train.txt']

In [20]:
# The files are tab-separated and have no header row, so pandas labels the
# columns with integers starting at 0.
df_train = pd.read_csv(reg_train, sep='\t', header=None)
df_test = pd.read_csv(reg_test, sep='\t', header=None)

# Column 0 is used as the target; every remaining column is a feature.
X_train, y_train = df_train.drop(columns=0), df_train[0]
X_test, y_test = df_test.drop(columns=0), df_test[0]

In [21]:
# Preview the first five rows of the raw training frame (target in column 0).
df_train.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,20,21,22,23,24,25,26,27,28
0,1,0.869,-0.635,0.226,0.327,-0.69,0.754,-0.249,-1.092,0.0,...,-0.01,-0.046,3.102,1.354,0.98,0.978,0.92,0.722,0.989,0.877
1,1,0.908,0.329,0.359,1.498,-0.313,1.096,-0.558,-1.588,2.173,...,-1.139,-0.001,0.0,0.302,0.833,0.986,0.978,0.78,0.992,0.798
2,1,0.799,1.471,-1.636,0.454,0.426,1.105,1.282,1.382,0.0,...,1.129,0.9,0.0,0.91,1.108,0.986,0.951,0.803,0.866,0.78
3,0,1.344,-0.877,0.936,1.992,0.882,1.786,-1.647,-0.942,0.0,...,-0.678,-1.36,0.0,0.947,1.029,0.999,0.728,0.869,1.027,0.958
4,1,1.105,0.321,1.522,0.883,-1.205,0.681,-1.07,-0.922,0.0,...,-0.374,0.113,0.0,0.756,1.361,0.987,0.838,1.133,0.872,0.808


In [22]:
# Wrap the feature/target pairs in LightGBM Dataset objects.
lgb_train = lgb.Dataset(data=X_train, label=y_train)
# The evaluation set references the training set so that it is constructed
# against the same Dataset.
lgb_eval = lgb.Dataset(data=X_test, label=y_test, reference=lgb_train)


In [23]:
# Training parameters passed to lgb.train().
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    # Use an ordered list rather than a set: the iteration order of a set of
    # strings can change between interpreter runs (hash randomisation), which
    # would make the order of the reported metrics non-reproducible.
    'metric': ['l2', 'l1'],
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

In [24]:
print('Starting training...')
# Early stopping is requested through a callback: the `early_stopping_rounds`
# keyword of lgb.train() was deprecated in LightGBM 3.3 and removed in 4.0,
# whereas the callback form is accepted by both older and newer releases.
train_callbacks = [lgb.early_stopping(stopping_rounds=5)]
# Releases that provide log_evaluation() do not print per-iteration metrics
# by default once callbacks are supplied, so request them explicitly there.
if hasattr(lgb, 'log_evaluation'):
    train_callbacks.append(lgb.log_evaluation(period=1))

# train: at most 20 boosting rounds, stopping early when the validation
# metrics have not improved for 5 consecutive rounds
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=20,
                valid_sets=[lgb_eval],
                callbacks=train_callbacks)

print('Saving model...')
# save model to file (written to the current working directory)
gbm.save_model('simple_example_model.txt')

print('Starting predicting...')
# predict using the best iteration found during training
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
# eval: RMSE is the square root of the mean squared error
print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)

Starting training...
[1]	valid_0's l2: 0.243898	valid_0's l1: 0.492841
Training until validation scores don't improve for 5 rounds
[2]	valid_0's l2: 0.240605	valid_0's l1: 0.489327
[3]	valid_0's l2: 0.236472	valid_0's l1: 0.484931
[4]	valid_0's l2: 0.232586	valid_0's l1: 0.480567
[5]	valid_0's l2: 0.22865	valid_0's l1: 0.475965
[6]	valid_0's l2: 0.226187	valid_0's l1: 0.472861
[7]	valid_0's l2: 0.223738	valid_0's l1: 0.469847
[8]	valid_0's l2: 0.221012	valid_0's l1: 0.466258
[9]	valid_0's l2: 0.218429	valid_0's l1: 0.462751
[10]	valid_0's l2: 0.215505	valid_0's l1: 0.458755
[11]	valid_0's l2: 0.213027	valid_0's l1: 0.455252
[12]	valid_0's l2: 0.210809	valid_0's l1: 0.452051
[13]	valid_0's l2: 0.208612	valid_0's l1: 0.448764
[14]	valid_0's l2: 0.207468	valid_0's l1: 0.446667
[15]	valid_0's l2: 0.206009	valid_0's l1: 0.444211
[16]	valid_0's l2: 0.20465	valid_0's l1: 0.44186
[17]	valid_0's l2: 0.202489	valid_0's l1: 0.438508
[18]	valid_0's l2: 0.200668	valid_0's l1: 0.435919
[19]	valid_0'

In [25]:
# Display the predictions produced by gbm.predict() for X_test.
y_pred

array([0.6591194 , 0.52188659, 0.38268875, 0.5110434 , 0.38197134,
       0.34018169, 0.41396358, 0.39943774, 0.6229651 , 0.47306251,
       0.61847223, 0.70022291, 0.71058276, 0.69914022, 0.42314069,
       0.66783381, 0.43258639, 0.56091826, 0.57046519, 0.56075766,
       0.63319872, 0.63394932, 0.55533024, 0.53623368, 0.41009091,
       0.57012794, 0.6020171 , 0.66496297, 0.55957183, 0.62582147,
       0.57565458, 0.52932965, 0.52485391, 0.53569855, 0.51980793,
       0.44330542, 0.36508028, 0.38498675, 0.54263776, 0.68376726,
       0.23888025, 0.5774767 , 0.42391562, 0.51560301, 0.37661448,
       0.26882572, 0.50617193, 0.53560405, 0.71074658, 0.4956262 ,
       0.7632709 , 0.4300438 , 0.23701538, 0.70333705, 0.69718315,
       0.54938395, 0.36687156, 0.45779262, 0.52895726, 0.6150412 ,
       0.48265423, 0.72754265, 0.7017657 , 0.45027717, 0.71178043,
       0.37888004, 0.57809895, 0.29833862, 0.63017501, 0.66458151,
       0.45961202, 0.68913084, 0.74013178, 0.36842849, 0.73434

In [26]:
# Compute the mean squared error once and derive the RMSE from it.
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
print('The rmse of prediction is:', rmse)
print('The mse of prediction is:', mse)

The rmse of prediction is: 0.44512434910807497
The mse of prediction is: 0.19813568616888738
