<a href="https://colab.research.google.com/github/tokuton1024/Financecirculation/blob/master/TabularPlayground_Series_Sep2021.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
# Reference notebook: https://www.kaggle.com/hiro5299834/tps-sep-2021-single-lgbm

In [11]:
# Colab-only: mount Google Drive at /content/drive so the CSV paths used below resolve.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [12]:
import pandas as pd
import numpy as np
import random
import time
import os
import gc

from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

import lightgbm as lgb

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter('ignore')

In [13]:
# Notebook-wide configuration for cross-validation and LightGBM training.
N_SPLITS = 5                   # number of KFold splits
N_ESTIMATORS = 20_000          # value passed as LightGBM's `n_estimators`
EARLY_STOPPING_ROUNDS = 200    # value passed as `early_stopping_rounds` to fit()
VERBOSE = 1_000                # value passed as `verbose` to fit()
SEED = 2021                    # seed used for RNGs, KFold shuffling and the model

In [14]:
def seed_everything(seed=42):
    """Seed the Python and NumPy global RNGs and record the seed in the environment.

    Args:
        seed: Value handed to ``random.seed`` and ``np.random.seed``; its string
            form is also stored in the ``PYTHONHASHSEED`` environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

# Apply the notebook-wide seed before any data is loaded or any model is trained.
seed_everything(SEED)

In [15]:
# Directory on the mounted Drive that holds the competition CSV files.
INPUT = "/content/drive/My Drive/Tabular Playground Series - Sep 2021/Input/"

# Read the training table first, then the test table, from INPUT.
train, test = (pd.read_csv(INPUT + csv_name) for csv_name in ("train.csv", "test.csv"))


In [16]:
# Directory on the mounted Drive that holds the sample submission file.
Data = "/content/drive/My Drive/Tabular Playground Series - Sep 2021/Data/"
submission = pd.read_csv(Data + "sample_solution.csv")

# Name of the label column in `train`.
TARGET = 'claim'

# Model inputs: every column of `test` whose name contains the letter 'f'.
features = list(filter(lambda name: 'f' in name, test.columns))

# Independent copy of the labels, so later edits to `train` do not affect them.
target = train[TARGET].copy()

In [17]:
# Row-wise summary features computed from the raw feature columns.
# The derived columns are excluded from the inputs, and only names that are not
# already listed are appended, so re-running this cell does not duplicate
# entries in `features` or feed 'n_missing'/'std' back into their own computation.
derived_cols = ['n_missing', 'std']
raw_features = [col for col in features if col not in derived_cols]

# Number of missing values per row.
train['n_missing'] = train[raw_features].isna().sum(axis=1)
test['n_missing'] = test[raw_features].isna().sum(axis=1)

# Per-row standard deviation across the raw features (pandas skips NaNs by default).
train['std'] = train[raw_features].std(axis=1)
test['std'] = test[raw_features].std(axis=1)

# Extend the list in place so existing references to `features` see the new names.
features += [col for col in derived_cols if col not in features]

In [18]:
# Replace remaining NaNs column by column; each frame is filled with its OWN column means.
# NOTE(review): test is imputed with test-set means rather than train-set means —
# confirm this asymmetry is intended.
for frame in (train, test):
    frame[features] = frame[features].fillna(frame[features].mean())

In [19]:
# Standardise the feature columns: the scaler is fitted on the training frame only
# and the same fitted transform is then applied to both frames.
scaler = StandardScaler()
scaler.fit(train[features])
train[features] = scaler.transform(train[features])
test[features] = scaler.transform(test[features])

In [20]:
# Sanity check: display (rows, columns) of both frames after the preprocessing cells above.
train.shape, test.shape

((957919, 122), (493474, 121))

# LGBMClassifier

In [21]:
# Keyword arguments unpacked into lgb.LGBMClassifier for every fold.
lgb_params = dict(
    objective='binary',
    n_estimators=N_ESTIMATORS,
    random_state=SEED,
    learning_rate=5e-3,
    # Row / column subsampling settings.
    subsample=0.6,
    subsample_freq=1,
    colsample_bytree=0.4,
    # L1 / L2 regularisation strengths.
    reg_alpha=10.0,
    reg_lambda=1e-1,
    # Leaf-size constraints.
    min_child_weight=256,
    min_child_samples=20,
    # Makes `feature_importances_` report gain rather than split counts.
    importance_type='gain',
)

In [None]:
# K-fold cross-validated LightGBM training.
# The complete per-fold pipeline (fit -> feature importances -> out-of-fold and
# test predictions -> fold score) lives inside one loop in one cell, so every
# fold's model is used before the next fold overwrites `model`.

# Out-of-fold predictions for the training rows and fold-averaged predictions for test.
lgb_oof = np.zeros(train.shape[0])
lgb_pred = np.zeros(test.shape[0])

# One importance table per fold; concatenated once after the loop.
fold_importances = []

# The test matrix is the same for every fold, so it is built once.
X_test = test[features]

kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

for fold, (trn_idx, val_idx) in enumerate(kf.split(X=train)):
    print(f"===== fold {fold} =====")
    X_train = train[features].iloc[trn_idx]
    y_train = target.iloc[trn_idx]
    X_valid = train[features].iloc[val_idx]
    y_valid = target.iloc[val_idx]

    start = time.time()
    model = lgb.LGBMClassifier(**lgb_params)
    # NOTE(review): `early_stopping_rounds` and `verbose` are fit() keyword
    # arguments in the LightGBM release this notebook was run with; LightGBM 4.x
    # removed them in favour of callbacks — revisit when upgrading.
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_valid, y_valid)],
        eval_metric='auc',
        early_stopping_rounds=EARLY_STOPPING_ROUNDS,
        verbose=VERBOSE,
    )

    # Feature names are read from the fitted booster; background:
    # https://stackoverflow.com/questions/60323854/attributeerror-lgbmregressor-object-has-no-attribute-feature-name
    fi_tmp = pd.DataFrame({
        'feature': model.booster_.feature_name(),
        'importance': model.feature_importances_,
        'fold': fold,
        'seed': SEED,
    })
    fold_importances.append(fi_tmp)

    # Positive-class probability: stored for the held-out rows, averaged over folds for test.
    lgb_oof[val_idx] = model.predict_proba(X_valid)[:, -1]
    lgb_pred += model.predict_proba(X_test)[:, -1] / N_SPLITS

    elapsed = time.time() - start
    auc = roc_auc_score(y_valid, lgb_oof[val_idx])
    print(f"fold {fold} - lgb auc: {auc:.6f}, elapsed time: {elapsed:.2f}sec\n")

# Single concatenation instead of DataFrame.append, which pandas 2.0 removed.
lgb_importances = pd.concat(fold_importances)

print(f"oof lgb roc = {roc_auc_score(target, lgb_oof)}")

# Persist the predictions in the current working directory.
np.save("lgb_oof.npy", lgb_oof)
np.save("lgb_pred.npy", lgb_pred)