In [None]:
import os

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
import optuna 
from optuna.visualization import plot_intermediate_values

import missingno as msno

# Switch matplotlib to seaborn's default theme for every figure drawn below.
sns.set()

In [None]:
# Processed feature table; later cells treat rows with a missing TARGET as the
# test set and the remaining rows as the training set.
features = pd.read_csv('../data/processed/features.csv')
# Only one row of the raw application table is read: it is used solely for its
# column names, to tell raw application columns apart from the others.
app_train_sample = pd.read_csv('../data/raw/application_train.csv', nrows=1)

In [None]:
# Names of the categorical application columns.
# NOTE(review): this list is not referenced by any cell visible in this part of
# the notebook -- confirm whether it is still needed.
categ_var = [
    'NAME_CONTRACT_TYPE',
    'CODE_GENDER',
    'OCCUPATION_TYPE',
]

In [None]:
# Nullity matrix of the feature table as loaded, before any value is filled.
msno.matrix(features)

In [None]:
# Columns that do not appear in the raw application_train header get their gaps
# filled with 0. Columns that do appear in that header are left untouched here.
# NOTE(review): presumably the zero-filled columns were joined in from other
# tables, where a missing value means "no record" -- confirm.
#
# The filled block is assigned back to `features` instead of calling
# `features[col].fillna(0, inplace=True)`: that form operates on a temporary
# Series, which raises a FutureWarning on recent pandas 2.x releases and leaves
# `features` unchanged once Copy-on-Write is enabled.
cols_to_zero_fill = [col for col in features.columns if col not in app_train_sample.columns]
if cols_to_zero_fill:
    features[cols_to_zero_fill] = features[cols_to_zero_fill].fillna(0)

In [None]:
# Same nullity matrix after the zero-fill above: any gaps still visible belong to
# columns that also exist in the raw application table.
msno.matrix(features)

In [None]:
# Features excluded from the model, together with the label and the row identifier.
no_significant = ['CREDIT_ACTIVE_Sold', 'CREDIT_ACTIVE_Bad debt', 'AMT_INCOME_TOTAL']

# Rows with a known TARGET form the training set; rows without one form the test set.
has_target = features['TARGET'].notna()
excluded_cols = ['TARGET', 'SK_ID_CURR'] + no_significant

target = features.loc[has_target, 'TARGET']
train = features.loc[has_target].drop(columns=excluded_cols)
test = features.loc[~has_target].drop(columns=excluded_cols)

In [None]:
# Median-impute the remaining gaps, then rescale every feature to [0, 1].
# Both transformers learn their statistics from the training rows only and are
# then applied unchanged to the test rows.
imputer = SimpleImputer(strategy='median')
scaler = MinMaxScaler(feature_range=(0, 1))

train = imputer.fit_transform(train)
test = imputer.transform(test)

# The scaler is fitted on the imputed matrix, i.e. on exactly what it transforms.
# Fitting it on the un-imputed DataFrame (as before) ties it to the DataFrame's
# column names and width, which no longer match the ndarray returned by the
# imputer if the imputer drops a column that is entirely NaN.
train = scaler.fit_transform(train)
test = scaler.transform(test)

features_names = list(features.drop(columns=['TARGET', 'SK_ID_CURR'] + no_significant).columns)

# The names are passed to LightGBM later, so they must match the matrix width.
if train.shape[1] != len(features_names):
    raise ValueError(
        f'{train.shape[1]} columns after preprocessing but {len(features_names)} feature names; '
        'a column that is entirely NaN was probably dropped by the imputer'
    )

print(f'train set shape : {train.shape}')
print(f'test set shape : {test.shape}')

In [None]:
# Quick look at the preprocessed training matrix (a NumPy array at this point).
train

In [None]:
# Hold out part of the labelled rows for early stopping and hyper-parameter
# selection. `X_test` / `y_test` are this validation split, not the unlabelled
# `test` matrix.
# A fixed `random_state` makes the split (and therefore every validation AUC
# computed on it) reproducible across kernel restarts; `stratify` keeps the
# TARGET class ratio identical in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    train, target, random_state=42, stratify=target
)

print(f'train set shape : {X_train.shape}')
print(f'test set shape : {X_test.shape}')

In [None]:
# Datasets for the hyper-parameter search: fit on the training split, evaluate on
# the held-out split (which reuses the training split's binning via `reference`).
train_data = lgb.Dataset(data=X_train, label=y_train, feature_name=features_names)
test_data = lgb.Dataset(
    data=X_test,
    label=y_test,
    feature_name=features_names,
    reference=train_data,
)

# Dataset over all labelled rows, used for the final fit after the search.
# NOTE: this rebinds `train` from an array to a Dataset, so the cell cannot be
# re-run without first re-running the preprocessing cells.
train = lgb.Dataset(data=train, label=target, feature_name=features_names)

In [None]:
def objective(trial):
    """Optuna objective: fit LightGBM with sampled hyper-parameters, return validation AUC.

    Trains on the module-level `train_data` with early stopping monitored on
    `test_data` (at most 1000 rounds, patience 30).

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.

    Returns
    -------
    float
        AUC on `test_data` at the best iteration found by early stopping.
        The best iteration itself is stored on the trial as the user attribute
        ``'best_iteration'``.
    """
    param = {
        'objective': 'binary',
        'metric': 'auc',
        'verbosity': -1,
        # `train_data` is constructed once and shared by all trials. With
        # LightGBM's default feature pre-filtering, features are filtered using
        # the first trial's `min_data_in_leaf`, which makes later trials with a
        # different value unreliable (LightGBM refuses to lower it), so
        # pre-filtering is switched off while this parameter is being tuned.
        'feature_pre_filter': False,
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        # `min_child_samples` is only an alias of `min_data_in_leaf` in LightGBM;
        # sampling both made them conflict and wasted a search dimension, so
        # only the canonical name is tuned.
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1000, 100000),
    }

    bst = lgb.train(param, train_data, verbose_eval=-1,
                    valid_sets=[train_data, test_data], early_stopping_rounds=30,
                    num_boost_round=1000)

    # Keep the early-stopped round count with the trial so it can be inspected
    # or reused once the study has finished.
    trial.set_user_attr('best_iteration', bst.best_iteration)

    # No `valid_names` are given, so the hold-out set (index 1 of `valid_sets`)
    # is reported under LightGBM's default name 'valid_1'.
    return bst.best_score['valid_1']['auc']

In [None]:
# Maximise the validation AUC returned by `objective` over 100 trials.
# The sampler is seeded so that the sequence of sampled hyper-parameters is the
# same after a kernel restart.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)

In [None]:
# Trial with the highest objective value (the study direction is 'maximize').
study.best_trial

In [None]:
# Hyper-parameter values sampled by that best trial.
study.best_params

In [None]:
# Objective value of every trial, in trial order.
from optuna.visualization import plot_optimization_history 
plot_optimization_history(study)

In [None]:
# Parallel-coordinate view of all sampled hyper-parameters against the objective.
# The name imported here is reused by the following plotting cells.
from optuna.visualization import plot_parallel_coordinate
plot_parallel_coordinate(study)

In [None]:
# Same view restricted to the two bagging parameters.
plot_parallel_coordinate(study, params=['bagging_fraction', 'bagging_freq'])

In [None]:
# Same view restricted to the L1 / L2 regularisation strengths.
plot_parallel_coordinate(study, params=['lambda_l1', 'lambda_l2'])

In [None]:
# Contour of the objective over the two regularisation strengths.
# The name imported here is reused by the following contour cell.
from optuna.visualization import plot_contour
plot_contour(study, params=['lambda_l1', 'lambda_l2'])

In [None]:
# Contour of the objective over the row- and column-subsampling fractions.
plot_contour(study, params=['bagging_fraction', 'feature_fraction'])

In [None]:
# Refit on all labelled rows with the best hyper-parameters found by the study.
# `best_params` only holds the sampled values, so the fixed settings used during
# the search are added back.
param = study.best_params
param['metric'] = 'auc'
param['objective'] = 'binary'
param['verbosity'] = -1
evals_results = dict()

# There is no validation set here, hence no early stopping: the number of rounds
# must be given explicitly. Use the early-stopped round count of the best trial
# when it was recorded as a trial user attribute; otherwise fall back to 250.
final_rounds = study.best_trial.user_attrs.get('best_iteration', 250)

bst = lgb.train(param, train, num_boost_round=final_rounds)

In [None]:
# Model scores for the unlabelled test rows.
bst.predict(test)

In [None]:
# Build the submission file: one row per test application with its predicted score.
app_test = pd.read_csv('../data/raw/application_test.csv')

# Predictions are attached to the IDs purely by row position, so the IDs read
# from application_test.csv must be in the same order as the rows of `test`
# (the rows of `features` whose TARGET is missing). Fail loudly if they are not,
# rather than writing a silently scrambled submission.
test_ids = features.loc[features['TARGET'].isna(), 'SK_ID_CURR'].to_numpy()
if not np.array_equal(app_test['SK_ID_CURR'].to_numpy(), test_ids):
    raise ValueError(
        'SK_ID_CURR in application_test.csv does not match the order of the test rows '
        'in the feature table; predictions would be assigned to the wrong applications'
    )

res = app_test[['SK_ID_CURR']].copy()
res['TARGET'] = bst.predict(test)

# Create the output directory on first use instead of assuming it exists.
report_dir = os.path.abspath('../reports/')
os.makedirs(report_dir, exist_ok=True)
path = os.path.join(report_dir, 'lgbm_opt_15_var.csv')
res.to_csv(path, index=False)

Kaggle score: 0.74005

Update: 0.74148 (15 variables)

In [None]:
import pickle