In [1]:
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import lightgbm as lgb
from sklearn import metrics
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold

# Hide all warnings.
# NOTE(review): this is a blanket filter, so deprecation notices from the
# libraries used below are silenced as well.
warnings.filterwarnings("ignore")
%matplotlib inline

In [2]:
# Name of the target column.
y_col = "MIS_Status"

# Columns handled as categorical variables (label + count encoded in
# preprocessing and passed to LightGBM as categorical features).
cat_col = ["RevLineCr", "LowDoc", "Sector", "State", "BankState", "FranchiseCode"]

In [3]:
# Load the train/test tables, using the first CSV column as the index,
# and the sample submission file, which is read without a header row.
train_data = pd.read_csv("train.csv", index_col=0)
test_data = pd.read_csv("test.csv", index_col=0)
ss = pd.read_csv("sample_submission.csv", header=None)

In [4]:
# train_data.head()

In [5]:
def basic_info(df):
    """Summarise every column of ``df``.

    Returns a DataFrame with one row per column holding the column name
    ("col"), its dtype ("type"), the number of missing values ("num_NaN")
    and the number of distinct values, missing included ("val_warm").
    """
    summary = [
        [name, df[name].dtype, df[name].isnull().sum(), df[name].nunique(dropna=False)]
        for name in df.columns
    ]
    return pd.DataFrame(summary, columns=["col", "type", "num_NaN", "val_warm"])

In [6]:
# Per-column dtype, missing-value count and distinct-value count of the loaded training table.
basic_info(train_data)

Unnamed: 0,col,type,num_NaN,val_warm
0,Term,int64,0,228
1,NoEmp,int64,0,196
2,NewExist,float64,0,2
3,CreateJob,int64,0,49
4,RetainedJob,int64,0,83
5,FranchiseCode,int64,0,271
6,RevLineCr,object,1079,5
7,LowDoc,object,531,7
8,DisbursementDate,object,150,917
9,MIS_Status,int64,0,2


In [7]:
def preprocessing(df, replace_dict=None, ce_dict=None, cat_cols=None):
    """
    Clean a raw loan data frame and derive the model features.

    The function works in two modes:

    * training mode (``replace_dict`` is None): the label-encoding and
      count-encoding tables are built from ``df`` itself and returned;
    * inference mode (``replace_dict`` given): the supplied tables are
      applied to ``df``. Categories that are absent from the tables are
      encoded as -1. The supplied tables are never modified.

    Parameters:
    - df (pd.DataFrame): frame to preprocess. It is not modified; all work
      is done on the new frame produced by dropping "City".
    - replace_dict (dict, optional): label-encoding tables,
      ``{column: {category: integer code}}``.
    - ce_dict (dict, optional): count-encoding tables,
      ``{column: mapping/Series of category -> number of rows}``.
      Required whenever ``replace_dict`` is given.
    - cat_cols (list of str, optional): categorical columns to encode.
      Defaults to the notebook-level ``cat_col`` list.

    Returns:
    - training mode: tuple ``(df, replace_dict, ce_dict)``.
    - inference mode: the preprocessed ``pd.DataFrame`` only.

    Raises:
    - ValueError: if ``replace_dict`` is given without ``ce_dict``.
    """
    if cat_cols is None:
        # Fall back to the list of categorical columns defined in the notebook.
        cat_cols = cat_col
    cat_cols = list(cat_cols)

    is_train = replace_dict is None
    if not is_train and ce_dict is None:
        raise ValueError("ce_dict must be provided together with replace_dict")

    # City is considered too specific to generalise, so it is dropped.
    # drop() returns a new frame, so the caller's frame stays untouched.
    df = df.drop("City", axis=1)

    # Sector: merge codes 32/33 -> 31, 45 -> 44, 49 -> 48.
    df["Sector"] = df["Sector"].replace({32: 31, 33: 31, 45: 44, 49: 48})

    # RevLineCr / LowDoc: the listed codes other than Y/N become NaN.
    df["RevLineCr"] = df["RevLineCr"].replace({'0': np.nan, 'T': np.nan})
    df["LowDoc"] = df["LowDoc"].replace(
        {'C': np.nan, '0': np.nan, 'S': np.nan, 'A': np.nan}
    )

    # Keep only the disbursement year; both raw date columns are dropped.
    df["DisbursementYear"] = pd.to_datetime(
        df["DisbursementDate"], format='%d-%b-%y'
    ).dt.year
    df = df.drop(["DisbursementDate", "ApprovalDate"], axis=1)

    # Currency strings such as "$1,000.00 " -> integer amounts.
    for money_col in ["DisbursementGross", "GrAppv", "SBA_Appv"]:
        df[money_col] = (
            df[money_col]
            .str.strip()
            .str.replace('$', '', regex=False)
            .str.replace(',', '', regex=False)
            .astype(float)
            .astype(int)
        )

    # Feature engineering.
    df["FY_Diff"] = df["ApprovalFY"] - df["DisbursementYear"]
    # Explicit cast instead of replace({True: 1, False: 0}) so the dtype is
    # always integer and does not rely on implicit downcasting.
    df["State_is_BankState"] = (df["State"] == df["BankState"]).astype(int)

    df['SBA_Portion'] = df['SBA_Appv'] / df['GrAppv']
    df["DisbursementGrossRatio"] = df["DisbursementGross"] / df["GrAppv"]
    # NOTE(review): a Term of 0 would yield inf here - confirm whether such rows exist.
    df["MonthlyRepayment"] = df["GrAppv"] / df["Term"]
    # Counted before the categorical NaNs are filled below.
    df["NullCount"] = df.isnull().sum(axis=1)

    # Missing categorical values are represented by the category -1.
    df[cat_cols] = df[cat_cols].fillna(-1)

    if is_train:
        # Build the encoders from this frame:
        #   ce_dict[col]      : category -> number of rows (value_counts)
        #   replace_dict[col] : category -> rank in value_counts order
        ce_dict = {}
        replace_dict = {}
        for col in cat_cols:
            counts = df[col].value_counts()
            ce_dict[col] = counts
            replace_dict[col] = {cat: code for code, cat in enumerate(counts.index)}

    for col in cat_cols:
        # map() leaves categories missing from the tables as NaN; they are
        # then encoded as -1 without touching the caller's tables.
        df[f"{col}_CountEncode"] = df[col].map(ce_dict[col]).fillna(-1).astype(int)
        df[col] = df[col].map(replace_dict[col]).fillna(-1).astype(int)

    if is_train:
        return df, replace_dict, ce_dict
    return df

In [8]:
# Build the encoding tables on the training frame, then apply the same tables to the test frame.
# NOTE(review): train_data / test_data are overwritten here, so running this cell a second
# time without reloading the CSVs fails (preprocessing drops "City", which is then already gone).
train_data, replace_dict, ce_dict = preprocessing(train_data)
test_data = preprocessing(test_data, replace_dict=replace_dict, ce_dict=ce_dict)

In [9]:
# test_data.head()

In [10]:
# Check the result of the preprocessing
basic_info(train_data)

Unnamed: 0,col,type,num_NaN,val_warm
0,Term,int64,0,228
1,NoEmp,int64,0,196
2,NewExist,float64,0,2
3,CreateJob,int64,0,49
4,RetainedJob,int64,0,83
5,FranchiseCode,int64,0,271
6,RevLineCr,int64,0,3
7,LowDoc,int64,0,3
8,MIS_Status,int64,0,2
9,Sector,int64,0,20


In [11]:
def MIS_Status_corr_confirm(df, y_col):
    """Tabulate each column's correlation with ``y_col``.

    Returns a DataFrame indexed by column name with the "Pearson" and
    "Spearman" correlation against ``y_col``, ordered by the mean of the two
    coefficients (largest first). The row for ``y_col`` itself is removed.
    """
    per_method = {
        "Pearson": df.corr("pearson")[y_col].sort_values(),
        "Spearman": df.corr("spearman")[y_col].sort_values(),
    }
    corr_table = pd.concat(per_method, axis=1)

    # Order rows by the mean of both coefficients, descending
    ranking = corr_table.mean(axis=1).sort_values(ascending=False).index
    return corr_table.loc[ranking].drop(y_col)

In [12]:
# Pearson / Spearman correlation of every column with the target, sorted by their mean.
MIS_Status_corr_confirm(train_data, y_col)

Unnamed: 0,Pearson,Spearman
RevLineCr_CountEncode,0.13885,0.127572
NoEmp,0.09294,0.171855
Term,0.122125,0.119292
LowDoc_CountEncode,0.107665,0.114348
Sector_CountEncode,0.10451,0.105012
DisbursementGrossRatio,0.047301,0.032333
FY_Diff,0.034412,0.042713
State,0.024825,0.02683
BankState,0.009405,0.005933
DisbursementGross,0.000481,0.000447


In [13]:
# Separate the target column from the feature columns.
y_train = train_data[y_col]
X_train = train_data.drop(columns=[y_col])

In [14]:
# LightGBM hyper-parameters passed to LGBMClassifier
params_lgb = dict(
    n_estimators=3000,
    learning_rate=0.01,
    colsample_bytree=0.8,
    subsample_freq=1,
    subsample=0.8,
    random_seed=0,
)

In [15]:
# Find the probability cutoff that gives the best macro F1-score
def f1_optimization(val_y, preds_y_proba):
    """Return ``(best macro F1, cutoff achieving it)``.

    The candidate cutoffs are the thresholds reported by
    ``metrics.roc_curve``; a sample is labelled 1 when its probability is
    strictly greater than the cutoff.
    """
    _, _, candidate_cutoffs = metrics.roc_curve(val_y, preds_y_proba)
    proba = np.asarray(preds_y_proba)
    macro_f1_scores = [
        f1_score(val_y, (proba > cutoff).astype(int), average='macro')
        for cutoff in candidate_cutoffs
    ]
    best_pos = np.argmax(macro_f1_scores)
    return np.max(macro_f1_scores), candidate_cutoffs[best_pos]

In [16]:
# Per-fold results: validation AUC, best macro F1, the cutoff achieving it, and the fitted model.
list_metrics_auc = []
list_metrics_f1 = []
list_cutoff = []
list_models = []
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)
for fold, (trn_idx, val_idx) in enumerate(cv.split(X_train, y_train), start=1):
    # cv.split yields positional indices, so both the features and the target
    # must be sliced with .iloc (label-based y_train[idx] is only correct when
    # the index happens to be 0..n-1).
    trn_x = X_train.iloc[trn_idx]
    trn_y = y_train.iloc[trn_idx]
    val_x = X_train.iloc[val_idx]
    val_y = y_train.iloc[val_idx]
    model_lgb = lgb.LGBMClassifier(**params_lgb)
    model_lgb.fit(
        trn_x, trn_y,
        eval_set=[(val_x, val_y)],
        # stop when the validation metric has not improved for 100 rounds
        callbacks=[lgb.early_stopping(100, verbose=True)],
        categorical_feature=cat_col,
    )
    list_models.append(model_lgb)
    preds_y_proba = model_lgb.predict_proba(val_x)[:, 1]
    auc = roc_auc_score(val_y, preds_y_proba)
    f1, threshold = f1_optimization(val_y, preds_y_proba)
    list_metrics_auc.append(auc)
    list_metrics_f1.append(f1)
    list_cutoff.append(threshold)
    print(f"Fold: {fold}, AUC: {auc}, f1 score: {f1} Threshold: {threshold}")

[LightGBM] [Info] Number of positive: 25178, number of negative: 3026
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002135 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2476
[LightGBM] [Info] Number of data points in the train set: 28204, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.892710 -> initscore=2.118729
[LightGBM] [Info] Start training from score 2.118729
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[507]	valid_0's binary_logloss: 0.286519
Fold: 1, AUC: 0.7644520551323192, f1 score: 0.6675970219878118 Threshold: 0.7538001356190227
[LightGBM] [Info] Number of positive: 25178, number of negative: 3027
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001604 seconds.
You can set `force_row_wise=true` to rem

In [17]:
# Mean AUC, mean F1-score and median cutoff over the folds
mean_auc = np.mean(list_metrics_auc)
mean_f1 = np.mean(list_metrics_f1)
median_cutoff = np.median(list_cutoff)
print(mean_auc, mean_f1, median_cutoff)

0.7735209741638948 0.674562094711921 0.7237334150076405


In [19]:
# Average the positive-class probability of all fold models on the test set
# and binarise it with the median of the per-fold cutoffs.
threshold = np.median(list_cutoff)
n_models = len(list_models)
preds_y_proba = np.zeros(len(test_data))
for model in list_models:
    fold_proba = model.predict_proba(test_data[model.feature_name_])[:, 1]
    preds_y_proba += fold_proba / n_models
preds_y = (preds_y_proba > threshold).astype(int).tolist()

In [20]:
# Store the integer predictions in column 1 and save without header or index.
ss[1] = np.asarray(preds_y).astype(int)
ss.to_csv("submit.csv", header=False, index=False)

In [21]:
# Number of rows predicted for each class in the submission.
ss[1].value_counts()

1    39433
0     2875
Name: 1, dtype: int64

In [22]:
%run model_lgb.py

[LightGBM] [Info] Number of positive: 25178, number of negative: 3026
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005124 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2474
[LightGBM] [Info] Number of data points in the train set: 28204, number of used features: 28
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.892710 -> initscore=2.118729
[LightGBM] [Info] Start training from score 2.118729
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[545]	valid_0's binary_logloss: 0.286546
Fold: 1, AUC: 0.7643170323466011, f1 score: 0.6681857310163786 Threshold: 0.7618115348767347
[LightGBM] [Info] Number of positive: 25178, number of negative: 3027
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001818 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `forc

Unnamed: 0,Feature,Importance
24,State,1512.666667
1,BankState,1305.333333
26,Term,1285.0
15,NoEmp,1080.333333
13,MonthlyRepayment,1025.666667
6,DisbursementYear,968.666667
7,FY_Diff,932.0
20,SBA_Appv,887.333333
0,ApprovalFY,875.333333
4,DisbursementGross,790.0
