In [15]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn.linear_model.bayes
import lightgbm as lgb
import seaborn as sns
from sklearn.model_selection import KFold, RepeatedKFold


In [16]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [17]:
#rel = train.corr()
#k = 200
#cols = rel.nlargest(k, 'target')['target'].index
cols = train.columns

In [18]:
print(cols)

Index(['id', 'target', '0', '1', '2', '3', '4', '5', '6', '7',
       ...
       '290', '291', '292', '293', '294', '295', '296', '297', '298', '299'],
      dtype='object', length=302)


In [19]:
x = train[[f for f in cols if f not in ['target']]]
y = train['target']
ids = test['id']
del test['id']

In [20]:
param = {'num_leaves': 120,
         'min_data_in_leaf': 30, 
         'objective':'regression',
         'max_depth': -1,
         'learning_rate': 0.01,
         "min_child_samples": 30,
         "boosting": "gbdt",
         "feature_fraction": 0.9,
         "bagging_freq": 1,
         "bagging_fraction": 0.9,
         "bagging_seed": 11,
         "metric": 'mse',
         "lambda_l1": 0.1,
         "verbosity": -1}

folds = KFold(n_splits=5, shuffle=True, random_state=2019)
pred = np.zeros(len(test))

for fold_, (trn_idx, val_idx) in enumerate(folds.split(x, y)):
    print("fold n°{}".format(fold_+1))
    trn_data = lgb.Dataset(x.iloc[trn_idx], y.iloc[trn_idx])
    val_data = lgb.Dataset(x.iloc[val_idx], y.iloc[val_idx])

    clf = lgb.train(param, 
                    trn_data, 
                    num_boost_round=20000, 
                    valid_sets = [trn_data, val_data], 
                    verbose_eval = 200, 
                    early_stopping_rounds = 1000)
    pred += clf.predict(test, num_iteration=clf.best_iteration) / folds.n_splits

fold n°1
Training until validation scores don't improve for 1000 rounds.
[200]	training's l2: 0.103981	valid_1's l2: 0.196934
[400]	training's l2: 0.0523686	valid_1's l2: 0.188782
[600]	training's l2: 0.027871	valid_1's l2: 0.185426
[800]	training's l2: 0.0151719	valid_1's l2: 0.182213
[1000]	training's l2: 0.00857539	valid_1's l2: 0.180959
[1200]	training's l2: 0.00496318	valid_1's l2: 0.180668
[1400]	training's l2: 0.00298091	valid_1's l2: 0.179884
[1600]	training's l2: 0.00183749	valid_1's l2: 0.180508
[1800]	training's l2: 0.00117241	valid_1's l2: 0.180653
[2000]	training's l2: 0.000774604	valid_1's l2: 0.180789
[2200]	training's l2: 0.000524981	valid_1's l2: 0.180806
Early stopping, best iteration is:
[1372]	training's l2: 0.00319677	valid_1's l2: 0.179726
fold n°2
Training until validation scores don't improve for 1000 rounds.
[200]	training's l2: 0.10193	valid_1's l2: 0.222348
[400]	training's l2: 0.0516708	valid_1's l2: 0.213496
[600]	training's l2: 0.027347	valid_1's l2: 0.210

In [21]:
submit = pd.read_csv("sample_submission.csv")
submit['target'] = pred
submit.to_csv("simple.csv", index=None)