# TPS-Feb-2022

In [None]:
# Notebook identifiers: NB is this notebook's number, dataset_NB is the number
# of the preprocessed dataset it works on. Both are kept as zero-padded strings.
NB, dataset_NB = "104", "002"

## Import libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set(style='white', context='notebook', palette='deep')


from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, train_test_split, cross_validate, GridSearchCV
from sklearn.metrics import accuracy_score
from scipy.stats import mode
import time

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier, early_stopping

## Load and check data

In [None]:
# Load data
# Read the sample submission template and the preprocessed train / test
# frames (zip-compressed pickles selected by dataset_NB).
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
submission = pd.read_csv('../data/raw/sample_submission.csv')

test = pd.read_pickle(
    f"../data/processed/nb{dataset_NB}_test.pkl",
    compression='zip',
)
train = pd.read_pickle(
    f"../data/processed/nb{dataset_NB}_train.pkl",
    compression='zip',
)

In [None]:
# Remember the number of training rows and the identifiers of the test rows.
train_len = train.shape[0]
test_id = test['row_id']

# Disabled experiment: stack train and test into a single frame.
# reset_index renumbers the index starting from 0; with drop=True the old
# index is discarded, with drop=False it would be kept as a new "index" column.
#dataset = pd.concat(objs=[train, test], axis=0).reset_index(drop=True)
#dataset = dataset.drop(columns=['row_id'])
#train = train.drop(columns=['row_id'])
#dataset.head()

## Parameter Setting

In [None]:
# Experiment-wide settings: label column, number of CV folds and RNG seed.
TARGET = 'target'
FOLDS = 4
RANDOM_STATE = 13

# Every column except the row identifier and the label is used as a feature.
FEATURES = [name for name in train.columns if name != 'row_id' and name != TARGET]

# Display the feature list as the cell output.
FEATURES

## Modeling

### 目的変数（target）を数値に変換する

In [None]:
# Learn the label -> integer mapping from the target column, then overwrite the
# column with the encoded values. The fitted encoder is kept so predictions can
# be mapped back to the original labels later.
encoder = LabelEncoder()
encoder.fit(train[TARGET])
train[TARGET] = encoder.transform(train[TARGET])

In [None]:
# Preview the first rows of the training frame after label encoding.
train.head(5)

### LightGBM

In [None]:
# Settings shared by every LightGBM model built during the grid search.
lgb_fixed_params = dict(
    objective='multiclass',
    metric='multi_logloss',
    n_estimators=3000,
    learning_rate=0.1,
)

# Hyper-parameter grid explored by GridSearchCV.
# Candidates that are currently disabled:
#   learning_rate           : [0.1, 0.05]
#   num_leaves              : [12, 15, 18]
#   max_depth               : [6, 9, 15]
#   min_gain_to_split       : [0, 0.1, 0.2]
#   feature_fraction        : [0.5, 0.7, 1]
#   bagging_fraction        : [0.7, 0.9, 1]
#   min_sum_hessian_in_leaf : [1, 2, 4]
# A fit_params dict (early_stopping_rounds=100 plus an eval_set) was also
# tried and is disabled in favour of the callback below.
lgb_param_grid = dict(
    num_leaves=[31, 63],
    max_depth=[15, 21],
)

# Early-stopping callback (50 rounds) handed to fit().
callbacks = [early_stopping(50)]

In [None]:
# Accumulators filled by the cross-validation loop below:
#   lgb_result             - per-fold artefacts keyed by 'Fold{fold}_...'
#   lgb_scores             - validation accuracy of each fold
#   lgb_predictions        - test-set predictions of each fold
#   lgb_feature_importance - one single-column importance frame per fold
lgb_result = {}
lgb_scores = []
lgb_predictions = []
lgb_feature_importance = []

skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=RANDOM_STATE)

for fold, (train_idx, val_idx) in enumerate(skf.split(train[FEATURES], train[TARGET])):

    print(10*"=", f"Fold={fold+1}", 10*"=")
    start_time = time.time()

    ### Build this fold's training / validation split
    X_train = train.iloc[train_idx][FEATURES]
    X_val = train.iloc[val_idx][FEATURES]
    y_train = train[TARGET].iloc[train_idx]
    y_val = train[TARGET].iloc[val_idx]

    ### Training: grid search over lgb_param_grid on the fold's training part
    lgb_model = LGBMClassifier(**lgb_fixed_params)
    gs_lgb = GridSearchCV(
        lgb_model,
        param_grid=lgb_param_grid,
        cv=FOLDS,
        n_jobs=-1,
        verbose=2,
    )
    # NOTE(review): the outer validation fold is also the early-stopping eval
    # set, so the accuracy measured on it below may be optimistic.
    # NOTE(review): newer LightGBM releases reportedly no longer accept
    # `verbose` in fit() - confirm against the installed version.
    gs_lgb.fit(X_train, y_train, eval_set=(X_val, y_val), verbose=0, callbacks=callbacks)

    ### Store the grid-search outcome (keys use the 0-based fold index)
    lgb_result.update({
        f'Fold{fold}_best_estimator': gs_lgb.best_estimator_,
        f'Fold{fold}_best_score': gs_lgb.best_score_,
        f'Fold{fold}_best_grid_params': gs_lgb.best_params_,
        f'Fold{fold}_best_all_params': gs_lgb.best_estimator_.get_params(),
        f'Fold{fold}_cv_result': pd.DataFrame(gs_lgb.cv_results_),
    })

    ### Best Score
    print(f"Best Score: {gs_lgb.best_score_}")
    print(f"Best Param: {gs_lgb.best_params_}")

    ### Inference (validation)
    preds_val = gs_lgb.predict(X_val)
    acc = accuracy_score(y_val, preds_val)
    lgb_scores.append(acc)

    ### Store the validation outcome
    lgb_result.update({
        f'Fold{fold}_preds_val': preds_val,
        f'Fold{fold}_y_val': y_val,
        f'Fold{fold}_acc': acc,
    })

    ### Feature importance of the best estimator, one column named after the fold
    feat_imp = pd.DataFrame(
        data=gs_lgb.best_estimator_.feature_importances_,
        index=FEATURES,
        columns=[f'{fold}_importance'],
    )
    lgb_feature_importance.append(feat_imp)

    ### Inference (test)
    test_preds = gs_lgb.predict(test[FEATURES])
    lgb_predictions.append(test_preds)

    ### Report
    run_time = time.time() - start_time
    print(f"Fold={fold+1}, Accuracy: {acc:.5f}, Run Time: {run_time:.2f}s")

print(10*"=", "Cross Validation finished.", 10*"=")
print("Mean Accuracy :", np.mean(lgb_scores))
print(lgb_result)

### feature importances

In [None]:
# Join the per-fold importance columns (one row per feature) and keep the 15
# features with the highest importance in the '1_importance' column.
# Previously .head(15) ran before sorting, which plotted the first 15 features
# in column order rather than the most important ones.
lgbm_fis_df = (
    pd.concat(lgb_feature_importance, axis=1)
    .nlargest(15, '1_importance')
    # Ascending order so the most important feature ends up at the top of the
    # horizontal bar chart.
    .sort_values('1_importance')
)
lgbm_fis_df.plot(kind='barh', figsize=(15, 10), title='Feature Importance Across Folds')
plt.show()

## Submission

In [None]:
lgb_submission = submission.copy()

# Majority vote across folds:
#   1. stack the per-fold prediction arrays as columns (rows = test samples),
#   2. take the most frequent class code in each row,
#   3. drop any length-1 axis and cast to int,
#   4. map the integer codes back to the original labels.
# (Skip step 4 to submit the raw integer codes instead.)
fold_votes = np.column_stack(lgb_predictions)
majority_vote = np.squeeze(mode(fold_votes, axis=1)[0]).astype('int')
lgb_submission[TARGET] = encoder.inverse_transform(majority_vote)

lgb_submission.to_csv(f"../data/submission/nb{NB}_LGBM.csv",index=False)
lgb_submission.head()

## 検証メモ

In [None]:
# Scratch memo: grid search over an XGBoost classifier with early stopping.
# NOTE(review): `X_test`, `y_test` and `params` are not defined in the visible
# part of this notebook - define them before running this cell.
# NOTE(review): recent XGBoost releases reportedly expect
# `early_stopping_rounds` on the estimator constructor rather than in fit();
# confirm against the installed version.
fit_params = {"early_stopping_rounds": 100,
              "eval_set": [[X_test, y_test]]}

# Use the class imported at the top of the notebook; no `xgb` alias is imported.
xgb_model = XGBClassifier()
gs = GridSearchCV(xgb_model,
                  params,
                  cv=10,
                  n_jobs=-1,
                  verbose=2)
# Estimator fit parameters go through fit(); GridSearchCV no longer takes a
# `fit_params` constructor argument.
gs.fit(X_train, y_train, **fit_params)

In [None]:
from copy import deepcopy
from itertools import product
from collections import defaultdict

def GridSearchCV_XGB_early_stoppping(param_grid, param_fixed, scorer, cv, X, y):
    """Grid-search XGBRegressor hyper-parameters, optionally with early stopping.

    For every split produced by ``cv`` and every combination of the values in
    ``param_grid``, an ``XGBRegressor`` is fitted on the training part with the
    validation part as ``eval_set`` and scored with ``scorer``. The combination
    with the highest mean score wins.

    Args:
        param_grid (dict): Maps a constructor parameter name to the list of
            values to try. An empty dict evaluates the default model only.
        param_fixed (dict): Keyword arguments forwarded to ``fit`` for every
            model. It is not modified; ``eval_set`` is set per split on a copy.
            If it contains ``'early_stopping_rounds'``, the model's
            ``best_iteration`` is used to derive the number of trees.
        scorer (callable): Called as ``scorer(model, X_val, y_val)``; higher is
            better.
        cv: Object with a ``split(X, y)`` method yielding
            ``(train_index, test_index)`` pairs.
        X (DataFrame or ndarray): Input data matrix.
        y (Series or ndarray): Ground-truth target.

    Returns:
        dict: The best parameter combination plus ``'n_estimators'``, the
        rounded mean number of boosting rounds used across the splits.

    Raises:
        ValueError: If ``cv`` yields no splits, so nothing was evaluated.
    """
    # Imported locally because the notebook's import cell only brings in
    # XGBClassifier, not XGBRegressor.
    from xgboost import XGBRegressor

    if isinstance(X, pd.DataFrame):
        X = X.to_numpy()
    if isinstance(y, pd.Series):
        y = y.to_numpy()

    # Built without tuple-unpacking so an empty grid does not crash.
    param_names = tuple(param_grid)
    param_values = tuple(param_grid.values())
    uses_early_stopping = 'early_stopping_rounds' in param_fixed

    cv_best_iterations = defaultdict(list)
    cv_results = defaultdict(list)

    for train_index, test_index in cv.split(X, y):
        gscv_x_train, gscv_x_val = X[train_index], X[test_index]
        gscv_y_train, gscv_y_val = y[train_index], y[test_index]

        # Copy so the caller's dict is never mutated by the per-split eval_set.
        param_fixed_cv = deepcopy(param_fixed)
        param_fixed_cv['eval_set'] = [(gscv_x_val, gscv_y_val)]

        for value_combination in product(*param_values):
            # A tuple of (name, value) pairs is hashable and serves as dict key.
            param_grid_cv = tuple(zip(param_names, value_combination))
            model = XGBRegressor(**dict(param_grid_cv))

            model.fit(gscv_x_train, gscv_y_train, **param_fixed_cv)
            if uses_early_stopping:
                # best_iteration is 0-based, so the number of trees is one more.
                n_rounds = model.best_iteration + 1
            else:
                n_rounds = model.get_num_boosting_rounds()
            cv_best_iterations[param_grid_cv].append(n_rounds)

            score = scorer(model, gscv_x_val, gscv_y_val)
            cv_results[param_grid_cv].append(score)

    if not cv_results:
        raise ValueError("cv produced no splits; no parameter combination was evaluated")

    best_params_xgb, score_list = max(cv_results.items(), key=lambda item: np.mean(item[1]))

    # The model may stop early, so replace the planned n_estimators with the
    # mean number of boosting rounds actually used across the splits.
    n_estimators = int(round(np.mean(cv_best_iterations[best_params_xgb])))

    best_params_xgb = dict(best_params_xgb)
    best_params_xgb['n_estimators'] = n_estimators

    print("Best score: {:.3f}".format(np.mean(score_list)))
    print("Best Parameters: {}".format(best_params_xgb))

    return best_params_xgb