In [1]:
## Importing libraries

import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb

from google.colab import drive
drive.mount('/content/drive', force_remount=True)

import warnings
warnings.filterwarnings('ignore')

SEED = 42

Mounted at /content/drive


In [None]:
## Importing data

df_train = pd.read_csv("/content/drive/MyDrive/Data Science Project - Team D/data/fe1-sum/train_fe1_sum_lgbm.csv")
df_test = pd.read_csv("/content/drive/MyDrive/Data Science Project - Team D/data/fe1-sum/test_fe1_sum_lgbm.csv")

In [None]:
## Oversampling 

def oversampling(x, y, t=2):
    
    xs, xn = [], []
    
    for i in range(t // 2):
        mask = y == 0
        x1 = x[mask].copy()
        ids = np.arange(x1.shape[0])
        featnum = x1.shape[1] // 200 - 1

        for c in range(200):
            np.random.shuffle(ids)
            x1[:, [c] + [200 + featnum * c + idc for idc in range(featnum)]] = x1[ids][:, [c] + [200 + featnum * c + idc for idc in range(featnum)]]
        xn.append(x1)
    
    for i in range(t):
        mask = y > 0
        x1 = x[mask].copy()
        ids = np.arange(x1.shape[0])
        featnum = x1.shape[1] // 200 - 1
        
        for c in range(200):
            np.random.shuffle(ids)
            x1[:, [c] + [200 + featnum * c + idc for idc in range(1)]] = x1[ids][:, [c] + [200 + featnum * c + idc for idc in range(1)]]
        xs.append(x1)

    xs = np.vstack(xs)
    xn = np.vstack(xn)
    ys = np.ones(xs.shape[0])
    yn = np.zeros(xn.shape[0])
    x = np.vstack([x, xs, xn])
    y = np.concatenate([y, ys, yn])
    
    return x, y

In [None]:
gbdt_param1 = {
    # Core Parameters
    'learning_rate': 0.1,
    'num_leaves': 31,
    'max_depth': -1,
    'min_data_in_leaf': 20,
    'bagging_fraction': 1.0,
    'bagging_freq': 0,
    'feature_fraction': 1.0,
    'lambda_l1': 0,
    
    # Side Parameters
    'num_threads': 8,
    'seed': SEED,
    'tree_learner': 'serial',
    'objective': 'binary',
    'boosting': 'gbdt',
    'bagging_seed': SEED,
    'verbosity ': 1,
    'boost_from_average': False,
    'metric': 'auc',
}

gbdt_param2 = {
    # Core Parameters
    'learning_rate': 0.1,
    'num_leaves': 15,
    'max_depth': -1,
    'min_data_in_leaf': 80,
    'min_sum_hessian_in_leaf': 10,
    'bagging_fraction': 0.6,
    'bagging_freq': 5,
    'feature_fraction': 0.05,
    'lambda_l1': 1.,
    
    # Side Parameters
    'num_threads': 8,
    'seed': SEED,
    'tree_learner': 'serial',
    'objective': 'binary',
    'boosting': 'gbdt',
    'bagging_seed': SEED,
    'verbosity ': 1,
    'boost_from_average': False,
    'metric': 'auc',
}

gbdt_param3 = {
    # Core Parameters
    'learning_rate': 0.01,
    'num_leaves': 31,
    'max_depth': -1,
    'min_data_in_leaf': 20,  
    'bagging_fraction': 1.0,
    'bagging_freq': 0,
    'feature_fraction': 0.5,
    'lambda_l1': 0,
    
    # Side Parameters
    'num_threads': 8,
    'seed': SEED,
    'tree_learner': 'serial',
    'objective': 'binary',
    'boosting': 'gbdt',
    'bagging_seed': SEED,
    'verbosity ': 1,
    'boost_from_average': False,
    'metric': 'auc',
}

gbdt_param4 = {
    # Core Parameters
    'learning_rate': 0.01,
    'num_leaves': 15,
    'max_depth': -1,
    'min_data_in_leaf': 80,
    'min_sum_hessian_in_leaf': 10,  
    'bagging_fraction': 0.6,
    'bagging_freq': 5,
    'feature_fraction': 0.5,
    'lambda_l1': 1.,
    
    # Side Parameters
    'num_threads': 8,
    'seed': SEED,
    'tree_learner': 'serial',
    'objective': 'binary',
    'boosting': 'gbdt',
    'bagging_seed': SEED,
    'verbosity ': 1,
    'boost_from_average': False,
    'metric': 'auc',
}

gbdt_param5 = {
    # Core Parameters
    'learning_rate': 0.01,
    'num_leaves': 15,
    'max_depth': -1,
    'min_data_in_leaf': 50,
    'min_sum_hessian_in_leaf': 10,  
    'bagging_fraction': 0.6,
    'bagging_freq': 5,
    'feature_fraction': 0.05,
    'lambda_l1': 1.,
    
    # Side Parameters
    'num_threads': 8,
    'seed': SEED,
    'tree_learner': 'serial',
    'objective': 'binary',
    'boosting': 'gbdt',
    'bagging_seed': SEED,
    'verbosity ': 1,
    'boost_from_average': False,
    'metric': 'auc',
}

In [None]:
predictors = df_train.columns.tolist()[2:]
X_test = df_test[predictors]

n_splits = 5
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

predictions = df_test[['ID_code']]

In [None]:
# Parameter 1

for fold, (train_ind, val_ind) in enumerate(skf.split(df_train, df_train.target.values)):
    
    X_train, y_train = df_train.iloc[train_ind][predictors], df_train.iloc[train_ind]['target']
    X_valid, y_valid = df_train.iloc[val_ind][predictors], df_train.iloc[val_ind]['target']

    N = 1
    p_valid, yp = 0, 0
        
    for i in range(N):
        print('\nFold {} - N {}'.format(fold + 1, i + 1))
        
        X_t, y_t = oversampling(X_train.values, y_train.values)
        weights = np.array([0.8] * X_t.shape[0])
        weights[:X_train.shape[0]] = 1.0
        print('Shape of X_train after augment: {}\nShape of y_train after augment: {}'.format(X_t.shape, y_t.shape))
        
        X_t = pd.DataFrame(X_t)
        X_t = X_t.add_prefix('var_')
    
        trn_data = lgb.Dataset(X_t, label=y_t, weight=weights)
        val_data = lgb.Dataset(X_valid, label=y_valid)
        evals_result = {}
        
        lgb_clf = lgb.train(gbdt_param1, trn_data, 100000, valid_sets=[trn_data, val_data], early_stopping_rounds=5000, verbose_eval=1000, evals_result=evals_result)
        p_valid += lgb_clf.predict(X_valid)
        yp += lgb_clf.predict(X_test)
        

    predictions['fold{}'.format(fold + 1)] = yp / N

predictions['target'] = np.mean(predictions[[col for col in predictions.columns if col not in ['ID_code', 'target']]].values, axis=1)
predictions.to_csv('predictions.csv', index=None)
sub_df = pd.DataFrame({"ID_code":df_test["ID_code"].values})
sub_df["target"] = predictions['target']

sub_df.to_csv("/content/drive/MyDrive/Data Science Project - Team D/submission/submission_parameter1.csv", index=False)


Fold 1 - N 1
Shape of X_train after augment: (336079, 925)
Shape of y_train after augment: (336079,)
Training until validation scores don't improve for 5000 rounds.
[1000]	training's auc: 0.999981	valid_1's auc: 0.914691
[2000]	training's auc: 1	valid_1's auc: 0.91344
[3000]	training's auc: 1	valid_1's auc: 0.91307
[4000]	training's auc: 1	valid_1's auc: 0.91287
[5000]	training's auc: 1	valid_1's auc: 0.913039
Early stopping, best iteration is:
[648]	training's auc: 0.999311	valid_1's auc: 0.915438

Fold 2 - N 1
Shape of X_train after augment: (336079, 925)
Shape of y_train after augment: (336079,)
Training until validation scores don't improve for 5000 rounds.
[1000]	training's auc: 0.999983	valid_1's auc: 0.914685
[2000]	training's auc: 1	valid_1's auc: 0.913819
[3000]	training's auc: 1	valid_1's auc: 0.913505
[4000]	training's auc: 1	valid_1's auc: 0.912914
[5000]	training's auc: 1	valid_1's auc: 0.913342
Early stopping, best iteration is:
[574]	training's auc: 0.99876	valid_1's au

In [None]:
# Parameter 2

for fold, (train_ind, val_ind) in enumerate(skf.split(df_train, df_train.target.values)):
    
    X_train, y_train = df_train.iloc[train_ind][predictors], df_train.iloc[train_ind]['target']
    X_valid, y_valid = df_train.iloc[val_ind][predictors], df_train.iloc[val_ind]['target']

    N = 1
    p_valid, yp = 0, 0
        
    for i in range(N):
        print('\nFold {} - N {}'.format(fold + 1, i + 1))
        
        X_t, y_t = oversampling(X_train.values, y_train.values)
        weights = np.array([0.8] * X_t.shape[0])
        weights[:X_train.shape[0]] = 1.0
        print('Shape of X_train after augment: {}\nShape of y_train after augment: {}'.format(X_t.shape, y_t.shape))
        
        X_t = pd.DataFrame(X_t)
        X_t = X_t.add_prefix('var_')
    
        trn_data = lgb.Dataset(X_t, label=y_t, weight=weights)
        val_data = lgb.Dataset(X_valid, label=y_valid)
        evals_result = {}
        
        lgb_clf = lgb.train(gbdt_param2, trn_data, 100000, valid_sets=[trn_data, val_data], early_stopping_rounds=5000, verbose_eval=1000, evals_result=evals_result)
        p_valid += lgb_clf.predict(X_valid)
        yp += lgb_clf.predict(X_test)
        

    predictions['fold{}'.format(fold + 1)] = yp / N

predictions['target'] = np.mean(predictions[[col for col in predictions.columns if col not in ['ID_code', 'target']]].values, axis=1)
predictions.to_csv('predictions.csv', index=None)
sub_df = pd.DataFrame({"ID_code":df_test["ID_code"].values})
sub_df["target"] = predictions['target']

sub_df.to_csv("/content/drive/MyDrive/Data Science Project - Team D/submission/submission_parameter2.csv", index=False)


Fold 1 - N 1
Shape of X_train after augment: (336079, 925)
Shape of y_train after augment: (336079,)
Training until validation scores don't improve for 5000 rounds.
[1000]	training's auc: 0.986425	valid_1's auc: 0.920051
[2000]	training's auc: 0.99541	valid_1's auc: 0.918751
[3000]	training's auc: 0.998887	valid_1's auc: 0.917847
[4000]	training's auc: 0.999828	valid_1's auc: 0.916852
[5000]	training's auc: 0.999984	valid_1's auc: 0.916367
[6000]	training's auc: 1	valid_1's auc: 0.916164
Early stopping, best iteration is:
[1221]	training's auc: 0.988936	valid_1's auc: 0.920277

Fold 2 - N 1
Shape of X_train after augment: (336079, 925)
Shape of y_train after augment: (336079,)
Training until validation scores don't improve for 5000 rounds.
[1000]	training's auc: 0.987129	valid_1's auc: 0.919626
[2000]	training's auc: 0.995476	valid_1's auc: 0.91925
[3000]	training's auc: 0.998942	valid_1's auc: 0.917566
[4000]	training's auc: 0.999842	valid_1's auc: 0.916651
[5000]	training's auc: 0.9

In [None]:
# Parameter 3

for fold, (train_ind, val_ind) in enumerate(skf.split(df_train, df_train.target.values)):
    
    X_train, y_train = df_train.iloc[train_ind][predictors], df_train.iloc[train_ind]['target']
    X_valid, y_valid = df_train.iloc[val_ind][predictors], df_train.iloc[val_ind]['target']

    N = 1
    p_valid, yp = 0, 0
        
    for i in range(N):
        print('\nFold {} - N {}'.format(fold + 1, i + 1))
        
        X_t, y_t = oversampling(X_train.values, y_train.values)
        weights = np.array([0.8] * X_t.shape[0])
        weights[:X_train.shape[0]] = 1.0
        print('Shape of X_train after augment: {}\nShape of y_train after augment: {}'.format(X_t.shape, y_t.shape))
        
        X_t = pd.DataFrame(X_t)
        X_t = X_t.add_prefix('var_')
    
        trn_data = lgb.Dataset(X_t, label=y_t, weight=weights)
        val_data = lgb.Dataset(X_valid, label=y_valid)
        evals_result = {}
        
        lgb_clf = lgb.train(gbdt_param3, trn_data, 100000, valid_sets=[trn_data, val_data], early_stopping_rounds=5000, verbose_eval=1000, evals_result=evals_result)
        p_valid += lgb_clf.predict(X_valid)
        yp += lgb_clf.predict(X_test)
        

    predictions['fold{}'.format(fold + 1)] = yp / N

predictions['target'] = np.mean(predictions[[col for col in predictions.columns if col not in ['ID_code', 'target']]].values, axis=1)
predictions.to_csv('predictions.csv', index=None)
sub_df = pd.DataFrame({"ID_code":df_test["ID_code"].values})
sub_df["target"] = predictions['target']

sub_df.to_csv("/content/drive/MyDrive/Data Science Project - Team D/submission/submission_parameter3.csv", index=False)


Fold 1 - N 1
Shape of X_train after augment: (336079, 925)
Shape of y_train after augment: (336079,)
Training until validation scores don't improve for 5000 rounds.
[1000]	training's auc: 0.970859	valid_1's auc: 0.880913
[2000]	training's auc: 0.986642	valid_1's auc: 0.905425
[3000]	training's auc: 0.991906	valid_1's auc: 0.914256
[4000]	training's auc: 0.995076	valid_1's auc: 0.91812
[5000]	training's auc: 0.997286	valid_1's auc: 0.919448
[6000]	training's auc: 0.998563	valid_1's auc: 0.919828
[7000]	training's auc: 0.999272	valid_1's auc: 0.919931
[8000]	training's auc: 0.999653	valid_1's auc: 0.920045
[9000]	training's auc: 0.999847	valid_1's auc: 0.919994
[10000]	training's auc: 0.999939	valid_1's auc: 0.919918
[11000]	training's auc: 0.999979	valid_1's auc: 0.919847
[12000]	training's auc: 0.999994	valid_1's auc: 0.919812
[13000]	training's auc: 0.999999	valid_1's auc: 0.919858
Early stopping, best iteration is:
[8041]	training's auc: 0.999666	valid_1's auc: 0.920062

Fold 2 - N 

In [None]:
# Parameter 4

for fold, (train_ind, val_ind) in enumerate(skf.split(df_train, df_train.target.values)):
    
    X_train, y_train = df_train.iloc[train_ind][predictors], df_train.iloc[train_ind]['target']
    X_valid, y_valid = df_train.iloc[val_ind][predictors], df_train.iloc[val_ind]['target']

    N = 1
    p_valid, yp = 0, 0
        
    for i in range(N):
        print('\nFold {} - N {}'.format(fold + 1, i + 1))
        
        X_t, y_t = oversampling(X_train.values, y_train.values)
        weights = np.array([0.8] * X_t.shape[0])
        weights[:X_train.shape[0]] = 1.0
        print('Shape of X_train after augment: {}\nShape of y_train after augment: {}'.format(X_t.shape, y_t.shape))
        
        X_t = pd.DataFrame(X_t)
        X_t = X_t.add_prefix('var_')
    
        trn_data = lgb.Dataset(X_t, label=y_t, weight=weights)
        val_data = lgb.Dataset(X_valid, label=y_valid)
        evals_result = {}
        
        lgb_clf = lgb.train(gbdt_param4, trn_data, 100000, valid_sets=[trn_data, val_data], early_stopping_rounds=5000, verbose_eval=1000, evals_result=evals_result)
        p_valid += lgb_clf.predict(X_valid)
        yp += lgb_clf.predict(X_test)
        

    predictions['fold{}'.format(fold + 1)] = yp / N

predictions['target'] = np.mean(predictions[[col for col in predictions.columns if col not in ['ID_code', 'target']]].values, axis=1)
predictions.to_csv('predictions.csv', index=None)
sub_df = pd.DataFrame({"ID_code":df_test["ID_code"].values})
sub_df["target"] = predictions['target']

sub_df.to_csv("/content/drive/MyDrive/Data Science Project - Team D/submission/submission_parameter4.csv", index=False)


Fold 1 - N 1
Shape of X_train after augment: (336079, 925)
Shape of y_train after augment: (336079,)
Training until validation scores don't improve for 5000 rounds.
[1000]	training's auc: 0.95899	valid_1's auc: 0.871865
[2000]	training's auc: 0.977545	valid_1's auc: 0.899313
[3000]	training's auc: 0.984191	valid_1's auc: 0.91081
[4000]	training's auc: 0.987338	valid_1's auc: 0.916571
[5000]	training's auc: 0.989273	valid_1's auc: 0.919781
[6000]	training's auc: 0.990762	valid_1's auc: 0.921121
[7000]	training's auc: 0.992066	valid_1's auc: 0.922018
[8000]	training's auc: 0.993247	valid_1's auc: 0.922464
[9000]	training's auc: 0.994306	valid_1's auc: 0.922538
[10000]	training's auc: 0.99523	valid_1's auc: 0.922678
[11000]	training's auc: 0.996052	valid_1's auc: 0.922595
[12000]	training's auc: 0.996759	valid_1's auc: 0.922524
[13000]	training's auc: 0.997374	valid_1's auc: 0.922499
[14000]	training's auc: 0.997882	valid_1's auc: 0.922363
[15000]	training's auc: 0.998315	valid_1's auc: 

In [None]:
# Parameter 5

for fold, (train_ind, val_ind) in enumerate(skf.split(df_train, df_train.target.values)):
    
    X_train, y_train = df_train.iloc[train_ind][predictors], df_train.iloc[train_ind]['target']
    X_valid, y_valid = df_train.iloc[val_ind][predictors], df_train.iloc[val_ind]['target']

    N = 1
    p_valid, yp = 0, 0
        
    for i in range(N):
        print('\nFold {} - N {}'.format(fold + 1, i + 1))
        
        X_t, y_t = oversampling(X_train.values, y_train.values)
        weights = np.array([0.8] * X_t.shape[0])
        weights[:X_train.shape[0]] = 1.0
        print('Shape of X_train after augment: {}\nShape of y_train after augment: {}'.format(X_t.shape, y_t.shape))
        
        X_t = pd.DataFrame(X_t)
        X_t = X_t.add_prefix('var_')
    
        trn_data = lgb.Dataset(X_t, label=y_t, weight=weights)
        val_data = lgb.Dataset(X_valid, label=y_valid)
        evals_result = {}
        
        lgb_clf = lgb.train(gbdt_param4, trn_data, 100000, valid_sets=[trn_data, val_data], early_stopping_rounds=5000, verbose_eval=1000, evals_result=evals_result)
        p_valid += lgb_clf.predict(X_valid)
        yp += lgb_clf.predict(X_test)
        

    predictions['fold{}'.format(fold + 1)] = yp / N

predictions['target'] = np.mean(predictions[[col for col in predictions.columns if col not in ['ID_code', 'target']]].values, axis=1)
predictions.to_csv('predictions.csv', index=None)
sub_df = pd.DataFrame({"ID_code":df_test["ID_code"].values})
sub_df["target"] = predictions['target']

sub_df.to_csv("/content/drive/MyDrive/Data Science Project - Team D/submission/submission_parameter5.csv", index=False)


Fold 1 - N 1
Shape of X_train after augment: (336079, 925)
Shape of y_train after augment: (336079,)
Training until validation scores don't improve for 5000 rounds.
[1000]	training's auc: 0.957508	valid_1's auc: 0.871449
[2000]	training's auc: 0.976957	valid_1's auc: 0.898887
[3000]	training's auc: 0.983754	valid_1's auc: 0.910567
[4000]	training's auc: 0.987115	valid_1's auc: 0.916465
[5000]	training's auc: 0.989097	valid_1's auc: 0.919707
[6000]	training's auc: 0.990635	valid_1's auc: 0.921197
[7000]	training's auc: 0.991945	valid_1's auc: 0.921957
[8000]	training's auc: 0.99315	valid_1's auc: 0.922445
[9000]	training's auc: 0.99423	valid_1's auc: 0.922485
[10000]	training's auc: 0.995162	valid_1's auc: 0.922614
[11000]	training's auc: 0.995988	valid_1's auc: 0.922537
[12000]	training's auc: 0.996702	valid_1's auc: 0.922593
[13000]	training's auc: 0.997324	valid_1's auc: 0.922589
[14000]	training's auc: 0.997844	valid_1's auc: 0.922512
[15000]	training's auc: 0.998282	valid_1's auc: