In [1]:
import lightgbm as lgb
import pandas as pd
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you won't need to install the gcc compiler anymore.
Instead of that, you'll need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
# Read the training features and targets; the first CSV column is used as the index.
print('Load training data...')
df_x_train = pd.read_csv('X_train.csv', index_col=0, header=0)
df_y_train = pd.read_csv('y_train.csv', index_col=0, header=0)

Load training data...


# Explore the dataset

In [3]:
# Preview the first five rows of the feature table.
df_x_train.head(5)

Unnamed: 0_level_0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x877,x878,x879,x880,x881,x882,x883,x884,x885,x886
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,7077.537454,926681.1,1076365.0,1029.624479,105.87744,180486.413011,1084.60481,296933.955135,104840.111208,10736.380544,...,392180300000.0,1040522.0,-437724.358877,957.470031,988877.2,3830.501971,98.622927,1771.079992,10008.297422,65052.593208
1.0,4406.423818,1172821.0,1052467.0,1000.548328,106.827932,154325.607476,1034.480237,337446.411077,107789.40914,10502.289317,...,36466010000.0,899662.9,-486231.980813,1127.268384,837459.6,1981.239448,97.369765,843.078502,10046.004997,65052.623467
2.0,4460.87869,938366.3,1040878.0,1096.946907,104.058397,197813.871177,,373983.030405,109480.853246,10531.423706,...,91471760000.0,957564.9,,976.904261,921070.6,2334.782218,113.751711,1134.021199,10307.683079,
3.0,7152.388016,923185.7,1009414.0,1044.520384,101.089811,182569.308398,,339951.730003,106597.409809,10070.560056,...,242822600000.0,,-497230.891433,912.933479,1117182.0,3711.834265,98.080257,1553.015493,10952.963586,65052.576387
4.0,6531.930242,1133718.0,1032579.0,1032.803345,102.608707,200161.011648,,332425.590129,107815.955426,10889.035954,...,127373400000.0,1059684.0,-485361.519459,1001.804829,952376.4,3153.167755,100.671564,1271.047509,10048.902807,65052.535207


In [4]:
# Display the four columns the author flagged as dummy features.
print("These columns are dummy!")
df_x_train.loc[:, ["x493", "x523", "x731", "x772"]]

These columns are dummy!


Unnamed: 0_level_0,x493,x523,x731,x772
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.0,0.0,0.0,0.0,0.0
1.0,0.0,0.0,0.0,0.0
2.0,0.0,0.0,0.0,0.0
3.0,0.0,0.0,0.0,0.0
4.0,0.0,,0.0,0.0
5.0,0.0,0.0,0.0,0.0
6.0,0.0,0.0,0.0,0.0
7.0,0.0,0.0,0.0,0.0
8.0,0.0,0.0,0.0,0.0
9.0,0.0,0.0,0.0,0.0


In [5]:
# Preview the first five target values.
df_y_train.head(5)

Unnamed: 0_level_0,y
id,Unnamed: 1_level_1
0.0,75.0
1.0,76.0
2.0,74.0
3.0,70.0
4.0,74.0


# Feature Selection

In [6]:
from numpy import isnan
def count_nan(l):
    """Count the NaN entries in the first column of ``l``.

    ``l`` must expose a 2-D ``.values`` array (for example a one-column
    DataFrame); only the first element of each row is inspected.
    """
    return sum(1 for row in l.values if isnan(row[0]))
# Report how many values are missing in every feature column.
print("number of nan's in each column:")
for column_name in df_x_train.columns:
    missing_total = count_nan(df_x_train[[column_name]])
    print(column_name, missing_total)

number of nan's in each column:
x0 90
x1 72
x2 80
x3 89
x4 79
x5 86
x6 71
x7 90
x8 90
x9 93
x10 85
x11 93
x12 78
x13 100
x14 92
x15 94
x16 77
x17 75
x18 91
x19 82
x20 89
x21 99
x22 96
x23 79
x24 85
x25 86
x26 82
x27 96
x28 87
x29 98
x30 95
x31 88
x32 78
x33 83
x34 91
x35 83
x36 104
x37 80
x38 77
x39 92
x40 92
x41 79
x42 79
x43 91
x44 79
x45 96
x46 90
x47 77
x48 72
x49 85
x50 98
x51 78
x52 82
x53 68
x54 78
x55 83
x56 78
x57 102
x58 69
x59 97
x60 76
x61 97
x62 84
x63 96
x64 97
x65 92
x66 72
x67 93
x68 95
x69 83
x70 76
x71 90
x72 83
x73 102
x74 91
x75 92
x76 100
x77 106
x78 81
x79 103
x80 93
x81 78
x82 85
x83 94
x84 82
x85 93
x86 98
x87 98
x88 86
x89 86
x90 85
x91 74
x92 80
x93 90
x94 66
x95 96
x96 83
x97 104
x98 87
x99 88
x100 76
x101 88
x102 84
x103 78
x104 79
x105 80
x106 81
x107 95
x108 98
x109 78
x110 89
x111 82
x112 85
x113 81
x114 113
x115 98
x116 92
x117 80
x118 89
x119 86
x120 77
x121 78
x122 100
x123 91
x124 90
x125 67
x126 83
x127 84
x128 93
x129 92
x130 83
x131 75
x132 101
x13

In [7]:
from scipy.stats import pearsonr
# Impute missing values with each column's mean so every feature has a value
# in every row before computing correlations.
df_filled = df_x_train.fillna(df_x_train.mean())
# 'id' is already the index (the CSV is read with index_col=0), so every column
# is a feature; the previous iloc[:, 1:] slice silently left x0 out of the ranking.
features = df_filled.columns.tolist()
labels = df_y_train["y"].values
# Pearson r of each feature against the target.
# NOTE(review): constant columns (e.g. the all-zero dummies) presumably yield
# NaN here and cause the division warning — they are dropped by the |r| filters.
correlations = {name: pearsonr(df_filled[name].values, labels)[0] for name in features}
print("Correlations of each feature against age")
data_correlations = pd.DataFrame(correlations, index=['correlation']).T
indices_desc = data_correlations['correlation'].abs().sort_values(ascending=False).index
# Build the mask from the already-sorted frame so it shares the frame's index
# order; masking with the unsorted series forces pandas to reindex the key.
sorted_correlations = data_correlations.loc[indices_desc]
sorted_correlations[sorted_correlations['correlation'].abs() >= 0.2]

Correlations of each feature against age


  r = r_num / r_den
  


Unnamed: 0,correlation
x746,0.455891
x82,0.451206
x685,-0.450200
x333,0.436991
x391,0.432717
x722,0.432330
x882,-0.429216
x751,-0.415688
x291,-0.409631
x96,0.408898


In [322]:
# Keep every feature whose absolute correlation with the target is at least 0.1,
# ordered from strongest to weakest.
abs_corr_sorted = data_correlations['correlation'].abs().sort_values(ascending=False)
selected_features = abs_corr_sorted[abs_corr_sorted >= 0.1].index
selected_features

Index(['x746', 'x82', 'x685', 'x333', 'x391', 'x722', 'x882', 'x751', 'x291',
       'x96',
       ...
       'x361', 'x499', 'x416', 'x725', 'x15', 'x39', 'x747', 'x624', 'x829',
       'x789'],
      dtype='object', length=203)

# Training LightGBM

In [323]:
# Feature matrix restricted to the selected columns, taken from the un-imputed
# frame (so NaNs are still present).
X = df_x_train.loc[:, selected_features].values
X

array([[1.51043839e+03, 7.92533090e+02, 7.57225976e+04, ...,
        2.58807884e+00,            nan, 1.40765036e+06],
       [1.71367954e+03, 2.04384761e+03, 3.45676645e+04, ...,
        2.05220010e+00, 1.40100578e+03, 1.51311443e+06],
       [1.80266925e+03, 1.84017550e+03, 1.02509913e+05, ...,
        2.17282790e+00, 2.23203176e+03, 1.56108941e+06],
       ...,
       [3.64095010e+03, 3.07020894e+03, 5.64951903e+04, ...,
        2.61239274e+00, 2.29902917e+03, 1.63645417e+06],
       [2.08526660e+03, 9.55972468e+02, 1.02771633e+05, ...,
        2.58108680e+00, 2.85602196e+03, 1.63319893e+06],
       [8.51175900e+02, 2.26047823e+02, 1.15412762e+05, ...,
        2.10652134e+00,            nan, 1.40965347e+06]])

In [324]:
# Target vector as a plain NumPy array.
y = df_y_train.loc[:, 'y'].values
y

array([75., 76., 74., ..., 78., 78., 56.])

In [325]:
print('Splitting into training and validation dataset')
# Hold out 20% for validation. The seed is fixed so the split, and every score
# computed from it, is reproducible after Restart & Run All.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=19960503)

Splitting into training and validation dataset


In [326]:
# Wrap both splits as LightGBM datasets; the validation set references the training set.
lgb_train = lgb.Dataset(data=X_train, label=y_train)
lgb_eval = lgb.Dataset(data=X_val, label=y_val, reference=lgb_train)

In [327]:
# LightGBM training configuration. No 'metric' entry is set
# (the {'l1', 'l2'} option stays disabled).
params = dict(
    task='train',
    boosting_type='gbdt',
    objective='regression',
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.5,
    bagging_fraction=0.8,
    bagging_freq=5,
    verbose=0,
)

def custom_r2(preds, train_data):
    """Evaluation callback that reports the R^2 score.

    Returns a ``(name, value, is_higher_better)`` triple: the metric name
    'r2', the R^2 of ``preds`` against the labels stored in ``train_data``,
    and True.
    """
    score = r2_score(train_data.get_label(), preds)
    return 'r2', score, True

print('Start training...')
# Train with early stopping after 20 rounds without improvement.
# valid_sets is an ordered list: iterating a set literal yields the datasets in
# arbitrary order, so the 'valid_N' labels in the log could change between runs.
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=500,
                feval=custom_r2,
                valid_sets=[lgb_train, lgb_eval],
                early_stopping_rounds=20)

Start training...
[1]	training's l2: 92.6531	training's r2: 0.055962	valid_0's l2: 97.3166	valid_0's r2: 0.0381836
Training until validation scores don't improve for 20 rounds.
[2]	training's l2: 87.4497	training's r2: 0.108979	valid_0's l2: 93.1089	valid_0's r2: 0.0797694
[3]	training's l2: 82.6618	training's r2: 0.157763	valid_0's l2: 89.6966	valid_0's r2: 0.113495
[4]	training's l2: 78.1701	training's r2: 0.203529	valid_0's l2: 86.5238	valid_0's r2: 0.144853
[5]	training's l2: 74.126	training's r2: 0.244734	valid_0's l2: 83.7886	valid_0's r2: 0.171886
[6]	training's l2: 70.3712	training's r2: 0.282992	valid_0's l2: 80.9353	valid_0's r2: 0.200086
[7]	training's l2: 67.073	training's r2: 0.316596	valid_0's l2: 78.1	valid_0's r2: 0.228109
[8]	training's l2: 63.8559	training's r2: 0.349376	valid_0's l2: 76.0091	valid_0's r2: 0.248774
[9]	training's l2: 60.9439	training's r2: 0.379045	valid_0's l2: 74.492	valid_0's r2: 0.263768
[10]	training's l2: 58.2648	training's r2: 0.406344	valid_0'

[91]	training's l2: 7.50602	training's r2: 0.923522	valid_0's l2: 43.2576	valid_0's r2: 0.572469
[92]	training's l2: 7.39312	training's r2: 0.924672	valid_0's l2: 43.3311	valid_0's r2: 0.571743
[93]	training's l2: 7.28125	training's r2: 0.925812	valid_0's l2: 43.406	valid_0's r2: 0.571002
[94]	training's l2: 7.17248	training's r2: 0.92692	valid_0's l2: 43.3514	valid_0's r2: 0.571541
[95]	training's l2: 7.08328	training's r2: 0.927829	valid_0's l2: 43.4466	valid_0's r2: 0.570601
[96]	training's l2: 6.99366	training's r2: 0.928742	valid_0's l2: 43.42	valid_0's r2: 0.570864
[97]	training's l2: 6.89805	training's r2: 0.929716	valid_0's l2: 43.3855	valid_0's r2: 0.571205
[98]	training's l2: 6.80529	training's r2: 0.930661	valid_0's l2: 43.3933	valid_0's r2: 0.571128
[99]	training's l2: 6.73	training's r2: 0.931428	valid_0's l2: 43.4157	valid_0's r2: 0.570906
[100]	training's l2: 6.67058	training's r2: 0.932034	valid_0's l2: 43.3228	valid_0's r2: 0.571825
[101]	training's l2: 6.57019	trainin

# Predict and output submission

In [328]:
print('Load testing data...')
# Read the test features and keep the same columns the model was trained on.
df_x_test = pd.read_csv('X_test.csv', index_col=0, header=0)
X_test = df_x_test.loc[:, selected_features].values
# Predict using the booster's recorded best iteration.
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
y_pred

Load testing data...


array([73.95905989, 71.07851743, 73.50576307, 58.14165241, 73.408844  ,
       65.78543127, 78.56009763, 66.40367125, 74.74709526, 64.69820868,
       77.29132841, 61.57130182, 69.83793383, 72.25170893, 75.64706206,
       74.04871991, 61.23418049, 74.0606731 , 72.03516297, 68.31445902,
       64.36151674, 63.57373242, 78.22729283, 72.39271542, 69.7098291 ,
       66.08867651, 60.89273874, 79.70099579, 77.67479169, 74.21764574,
       53.59425283, 72.01649804, 62.68205172, 63.72751212, 65.15659754,
       66.98520607, 77.62733706, 58.79106664, 66.9327608 , 71.54270884,
       76.25857437, 76.54439783, 62.94351222, 70.44438894, 69.00643244,
       63.89062846, 80.09141316, 75.74538033, 76.38627439, 77.57410333,
       63.38218739, 61.33966117, 59.32463539, 63.08320001, 56.46820588,
       65.22223101, 82.31690896, 57.13400309, 54.06622226, 75.2455487 ,
       76.00242895, 75.1337097 , 75.88531324, 61.46634293, 67.25121429,
       68.45077976, 78.41629315, 77.47698102, 60.41465375, 61.40

In [255]:
# Write one "id,y" row per prediction, using the prediction's position as id.
# The context manager closes the file even if a write raises.
with open("submission.csv", "w") as f:
    f.write("id,y\n")
    for i, x in enumerate(y_pred):
        f.write("{},{}\n".format(i, x))

# Randomly search for the best train/validation split

In [None]:
# Re-split the data 100 times, retrain on each split, and write a submission
# whenever a split beats the best validation R^2 seen so far.
# `params` and `custom_r2` come from the training cell earlier in the notebook;
# they were previously redefined, unchanged, on every iteration.
# NOTE(review): choosing the best of many random splits selects on validation
# noise, so `best` is likely an optimistic estimate — confirm this is intended.
best = 0
for trial in range(100):
    print('Splitting into training and validation dataset')
    # Seed each split with the trial number: splits still differ between
    # trials, but the winning one can be reproduced later.
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.15, random_state=trial)
    # create dataset for lightgbm
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train)

    print('Start training...')
    # valid_sets is an ordered list; a set literal iterates in arbitrary order.
    gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=500,
                    feval=custom_r2,
                    valid_sets=[lgb_train, lgb_eval],
                    early_stopping_rounds=20)
    y_val_p = gbm.predict(X_val, num_iteration=gbm.best_iteration)

    result = r2_score(y_val, y_val_p)
    if result > best:
        print("New Best {}".format(result))
        best = result
        y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
        # The context manager closes the file even if a write raises; the inner
        # loop no longer reuses the outer loop variable.
        with open("submission{0:.4f}.csv".format(best), "w") as f:
            f.write("id,y\n")
            for row_id, prediction in enumerate(y_pred):
                f.write("{},{}\n".format(row_id, prediction))