In [1]:
import numpy as np
import pandas as pd
import os

## 資料探索與前處理

### 訓練資料的目錄

In [2]:
projDir = "/kaggle/input/home-credit-credit-risk-model-stability/"
trainDir = os.path.join(projDir, "csv_files/train/")

### Join Table

In [3]:
# 讀取訓練數據

# base.csv
path = os.path.join(trainDir, 'train_base.csv')
train_base = pd.read_csv(path)

# static (depth=0)
path = os.path.join(trainDir, 'train_static_0_0.csv')
train_static0 = pd.read_csv(path)
path = os.path.join(trainDir, 'train_static_0_1.csv')
train_static1 = pd.read_csv(path)
train_static = pd.concat([train_static0, train_static1], ignore_index=True)
del train_static0, train_static1

# static_cb (depth=0)
path = os.path.join(trainDir, 'train_static_cb_0.csv')
train_static_cb = pd.read_csv(path)

# Join Table
train_data = pd.merge(train_base, train_static, how="left", on="case_id")
train_data = pd.merge(train_data, train_static_cb, how="left", on="case_id")
del train_base, train_static, train_static_cb

  train_static0 = pd.read_csv(path)
  train_static1 = pd.read_csv(path)
  train_static_cb = pd.read_csv(path)


### 刪除非數字的column

In [4]:
for col in train_data.columns:
    if not (train_data[col].dtype in (np.int64, np.float64)):
        train_data = train_data.drop(col, axis=1)

### 將目標變量和特徵變量分開

In [5]:
X = train_data.drop(columns=['target'])
y = train_data['target']

### 處理缺失值

In [6]:
X

Unnamed: 0,case_id,MONTH,WEEK_NUM,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,applicationcnt_361L,applications30d_658L,applicationscnt_1086L,...,pmtaverage_4527227A,pmtaverage_4955615A,pmtcount_4527229L,pmtcount_4955617L,pmtcount_693L,pmtscount_423L,pmtssum_45A,riskassesment_940T,secondquarter_766L,thirdquarter_1082L
0,0,201901,0,,,1917.6000,0.0,0.0,0.0,0.0,...,,,,,,,,,,
1,1,201901,0,,,3134.0000,0.0,0.0,0.0,0.0,...,,,,,,,,,,
2,2,201901,0,,,4937.0000,0.0,0.0,0.0,0.0,...,,,,,,,,,,
3,3,201901,0,,,4643.6000,0.0,0.0,1.0,0.0,...,,,,,,,,,,
4,4,201901,0,,,3390.2000,0.0,0.0,1.0,0.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1526654,2703450,202010,91,0.0,176561.36,3675.4001,0.0,0.0,0.0,0.0,...,,12155.0,,12.0,,,,,1.0,1.0
1526655,2703451,202010,91,0.0,301276.47,7088.6000,6191.6,0.0,0.0,5.0,...,,22904.6,,12.0,,,,,1.0,2.0
1526656,2703452,202010,91,0.0,14232.40,7788.8003,0.0,0.0,0.0,0.0,...,,,,,,,,,0.0,4.0
1526657,2703453,202010,91,0.0,197371.58,1195.4000,2827.2,0.0,0.0,36.0,...,,15792.4,,14.0,,,,,2.0,1.0


In [7]:
# 删除缺失值較多的row

# 輸出原始資料集資訊
print("原始資料集形狀：", X.shape)

threshold = 0.5  # 設定缺失比例的閾值，超過閾值的rowc和column將被刪除

# 計算每column的缺失值比例
missing_ratios = X.isnull().mean(axis=0)

# 找出缺失比例超過閾值的column
columns_to_drop = missing_ratios[missing_ratios > threshold].index

# 刪除缺失比例超過閾值的column
X = X.drop(columns=columns_to_drop, axis=1)

# 輸出刪除column後的資料集資訊
print("刪除缺失比例過高的列後的資料集形狀：", X.shape)

# 計算每row的缺失值比例
missing_ratios = X.isnull().mean(axis=1)

# 找出缺失比例超過閾值的row
rows_to_drop = missing_ratios[missing_ratios > threshold].index

# 刪除缺失比例超過閾值的row
X = X.drop(index=rows_to_drop)
y = y.drop(index=rows_to_drop)

# 輸出刪除row後的資料集資訊
print("刪除缺失比例過高的行後的資料集形狀：", X.shape)

原始資料集形狀： (1526659, 167)
刪除缺失比例過高的列後的資料集形狀： (1526659, 110)
刪除缺失比例過高的行後的資料集形狀： (1447840, 110)


In [8]:
# 刪除有極端值的row

# Define a function to detect outliers in a given column
def detect_outliers(column, threshold=3):
    mean = np.mean(column)
    std_dev = np.std(column)
    outliers = (column - mean) / std_dev
    return np.abs(outliers) > threshold

# Define a function to remove rows with outliers in X
def remove_rows_with_outliers(X, y, threshold=3):
    outliers_mask = np.any(np.apply_along_axis(detect_outliers, 0, X, threshold=threshold), axis=1)
    return X[~outliers_mask], y[~outliers_mask]

X, y = remove_rows_with_outliers(X, y)

print("刪除有極端值的行後的資料集形狀：", X.shape)

  outliers = (column - mean) / std_dev


刪除有極端值的行後的資料集形狀： (1010378, 110)


In [9]:
# 用平均數填充缺失值

from sklearn.impute import SimpleImputer

print("填充前X的datatype", type(X))

# Create the imputer object with strategy='mean'
imputer = SimpleImputer(strategy='mean')

# Fit the imputer to X and transform X
X = imputer.fit_transform(X)

print("填充後X的datatype:", type(X))

填充前X的datatype <class 'pandas.core.frame.DataFrame'>
填充後X的datatype: <class 'numpy.ndarray'>


### Oversampling (smote)

In [10]:
# 過採樣: 增加target=1的data

from imblearn.over_sampling import SMOTE

# 初始化 SMOTE
smote = SMOTE(sampling_strategy='not majority')

# 使用 SMOTE 生成合成目標樣本
X, y = smote.fit_resample(X, y)

# 檢查過採樣後的資料量
print("過採樣後的資料量：", len(X))

# 檢查過採樣後的目標資料分佈
unique, counts = np.unique(y, return_counts=True)
print("過採樣後的目標資料分佈：", dict(zip(unique, counts)))

過採樣後的資料量： 1964182
過採樣後的目標資料分佈： {0: 982091, 1: 982091}


### 將數據劃分為訓練集和測試集

In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
del X
del y

## 數據分析方法與模型選擇

### Model1. 梯度提升樹（Gradient Boosting Decision Tree，GBDT）
##### 優點：
##### 1. 高效性： LightGBM 是一種高效的梯度提升樹模型，具有優秀的訓練速度和預測速度。它使用了基於直方圖的方法來加速訓練過程，並且通常比其他梯度提升樹庫（如XGBoost）更快。
##### 2. 低內存使用： LightGBM 使用了一種稱為GOSS（Gradient-based One-Side Sampling）的方法，在訓練過程中減少了內存使用。這使得它可以處理大型數據集而不會出現內存不足的問題。
##### 3. 高準確性： LightGBM 在處理大型和高維數據集時通常表現出色，並且提供了很好的預測性能。
##### 4. 支持並行處理： LightGBM 支持並行處理，可以在多核 CPU 上進行訓練，從而加速了訓練過程。
##### 5. 自動處理類別特徵： LightGBM 可以自動處理類別特徵，無需額外的編碼。
##### 缺點：
##### 1. 對參數敏感： 與其他梯度提升樹模型一樣，LightGBM 的性能取決於參數的調整，對參數較為敏感，需要一些經驗和時間來進行調參。
##### 2. 需要較多數據： LightGBM 在處理小型數據集時可能不如其他模型表現好，因為它的高效性主要體現在處理大型數據集時

In [12]:
import lightgbm as lgb

# 創建 LightGBM 的訓練資料集
lgb_train = lgb.Dataset(X_train, y_train)
lgb_test = lgb.Dataset(X_test, y_test, reference=lgb_train)
del X_train
del y_train

# 設置參數
params = {
    'boosting_type': 'gbdt',         # 使用梯度提升樹
    'objective': 'binary',           # 二元分類任務
    'metric': 'binary_error',        # 評估指標為二元分類錯誤率
    'num_leaves': 31,                # 每棵樹的最大葉子數量
    'learning_rate': 0.05,           # 學習速率
    'feature_fraction': 0.9,         # 特徵抽樣比例
    'bagging_fraction': 0.8,         # 樣本抽樣比例
    'bagging_freq': 5,               # 樣本抽樣的頻率
    'verbose': 0,                    # 顯示訓練信息
    'early_stopping_rounds': 10
}

# 訓練模型
num_round = 100                      # 迭代輪次
bst = lgb.train(params, lgb_train, num_round, valid_sets=[lgb_train, lgb_test])

# 預測
y_pred = bst.predict(X_test, num_iteration=bst.best_iteration)
y_pred_binary = [1 if pred > 0.5 else 0 for pred in y_pred]

### Model2. RNN
##### RNN的優點：
##### 1. 能夠處理序列數據，保留先前時間步的信息。
##### 2. 模型相對較簡單，易於理解和實現。
##### RNN的缺點：
##### 3. 容易出現梯度消失或梯度爆炸的問題，尤其在處理長序列數據時效果不佳。
##### 4. 難以捕捉長期依賴關係

In [13]:
# def focal_loss(y_true, y_pred, alpha=0.25, gamma=2.0):
#     # Calculate cross entropy loss
#     ce_loss = tf.keras.losses.binary_crossentropy(y_true, y_pred, from_logits=False)
    
#     # Calculate focal weights
#     p_t = y_true * y_pred + (1 - y_true) * (1 - y_pred)
#     focal_weight = alpha * tf.pow(1 - p_t, gamma)
    
#     # Apply focal weights to cross entropy loss
#     focal_loss = focal_weight * ce_loss
    
#     # Return the mean loss over the batch
#     return tf.reduce_mean(focal_loss)

In [14]:
# import tensorflow as tf
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import SimpleRNN, Dense, Dropout
# from tensorflow.keras.callbacks import EarlyStopping
# from tensorflow.keras.optimizers import AdamW
# # from sklearn.utils.class_weight import compute_class_weight

# # 將資料轉換成3D形狀
# X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
# X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

# # 創建 Sequential 模型
# model = Sequential()

# # 添加第一層 SimpleRNN 層
# model.add(SimpleRNN(50, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2]), activation='relu'))
# model.add(Dropout(0.3))  # 添加 dropout 避免過擬合

# # 添加第二層 SimpleRNN 層
# model.add(SimpleRNN(50, return_sequences=True, activation='relu'))
# model.add(Dropout(0.3))

# # 添加第三層 SimpleRNN 層
# model.add(SimpleRNN(50, activation='relu'))
# model.add(Dropout(0.3))

# # 添加輸出層
# model.add(Dense(1, activation='sigmoid'))

# # 定義 AdamW 優化器，設置初始學習率和權重衰減
# # 嘗試不同的優化器 ：AdamW是Adam的一個變種，引入了一個額外的權重衰減（Weight Decay）項目，有助於減小參數的數值大小，從而減少過擬合的風險。
# # 調整學習率（Learning Rate）：嘗試不同的學習率值，有時降低學習率可以幫助模型更好地收斂，特別是在資料不平衡的情況下。
# optimizer = AdamW(learning_rate=0.001, weight_decay=1e-4)

# # 編譯模型
# # focal_loss 損失函數，可以自動為少數類別分配更大的權重
# model.compile(optimizer=optimizer, loss=focal_loss, metrics=['accuracy'])

# # 定義 EarlyStopping
# # 調整Early Stopping的參數：嘗試不同的patience值，以及monitor參數，這可能會影響到模型的停止訓練的時機。
# early_stopping = EarlyStopping(patience=10, monitor='val_loss', restore_best_weights=True)

# # 設置類別加權，例如給予類別0權重1，給予類別1權重100
# # 1. 使用 np.unique(y_train) 函數獲取訓練數據中所有類別的標籤。
# # 2. 使用 compute_class_weight 函數計算每個類別的權重，參數 'balanced' 指定使用平衡策略。
# # 3. 將計算出的類別權重存儲在 class_weight 字典中
# # class_weight = compute_class_weight('balanced', np.unique(y_train), y_train)
# class_weight = {0: 1, 1: 100}

# # 訓練模型
# # 批量大小: 嘗試減小批量大小，可以讓模型在每個更新步驟中看到更多樣化的樣本，可能有助於緩解數據不平衡問題。
# history = model.fit(X_train, y_train, epochs=3, batch_size=128, validation_data=(X_test, y_test), callbacks=[early_stopping], class_weight=class_weight) # class_weight='balanced' not work

# # 評估
# model.evaluate(X_test, y_test)

# # 預測
# y_pred = model.predict(X_test)
# y_pred_binary = [1 if pred > 0.5 else 0 for pred in y_pred]

### Model3. LSTM
##### LSTM的優點：
##### 1. 能夠有效地解決梯度消失或梯度爆炸的問題，能夠處理長序列數據。
##### 2. 能夠捕捉長期依賴關係，適用於需要考慮長期記憶的任務。
##### LSTM的缺點：
##### 1. 模型相對複雜，訓練時間較長。
##### 2. 需要調參和優化，有時可能會出現過擬合的問題。

In [15]:
# import tensorflow as tf
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import LSTM, Dense, Dropout
# from tensorflow.keras.callbacks import EarlyStopping

# # 將 DataFrame 轉換為 NumPy 陣列
# X_train = X_train.to_numpy()
# X_test = X_test.to_numpy()

# # 將資料轉換成3D形狀
# X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
# X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

# # 創建 Sequential 模型
# model = Sequential()

# # 添加第一層 LSTM 層
# model.add(LSTM(50, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2]), activation='relu'))
# model.add(Dropout(0.3))  # 添加 dropout 避免過擬合

# # 添加第二層 LSTM 層
# model.add(LSTM(50, return_sequences=True, activation='relu'))
# model.add(Dropout(0.3))  # 添加 dropout 避免過擬合

# # 添加第三層 LSTM 層
# model.add(LSTM(50, activation='relu'))
# model.add(Dropout(0.3))  # 添加 dropout 避免過擬合

# # 添加輸出層
# model.add(Dense(1, activation='sigmoid'))

# # 編譯模型
# model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# # 定義 EarlyStopping
# early_stopping = EarlyStopping(patience=5, monitor='val_loss', restore_best_weights=True)

# # 設置類別加權，例如給予類別0權重1，給予類別1權重100
# class_weight = {0: 1, 1: 100}

# # 訓練模型
# history = model.fit(X_train, y_train, epochs=1, batch_size=2048, validation_data=(X_test, y_test), callbacks=[early_stopping], class_weight=class_weight)

# # 預測
# y_pred = model.predict(X_test)
# y_pred_binary = [1 if pred > 0.5 else 0 for pred in y_pred]

## 評估模型

In [16]:
from sklearn.metrics import accuracy_score, classification_report

# 評估模型
accuracy = accuracy_score(y_test, y_pred_binary)
print("準確率:", accuracy)

# 顯示分類報告
print("分類報告:")
print(classification_report(y_test, y_pred_binary))

準確率: 0.979984064637496
分類報告:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98    196307
           1       1.00      0.96      0.98    196530

    accuracy                           0.98    392837
   macro avg       0.98      0.98      0.98    392837
weighted avg       0.98      0.98      0.98    392837



## Submission

In [17]:
# 測試資料的目錄
testDir = os.path.join(projDir, "csv_files/test/")

# 讀取測試數據

# base.csv
path = os.path.join(testDir, 'test_base.csv')
test_base = pd.read_csv(path)

# static(depth=0)
path = os.path.join(testDir, 'test_static_0_0.csv')
test_static0 = pd.read_csv(path)
path = os.path.join(testDir, 'test_static_0_1.csv')
test_static1 = pd.read_csv(path)
path = os.path.join(testDir, 'test_static_0_2.csv')
test_static2 = pd.read_csv(path)
test_static = pd.concat([test_static0, test_static1, test_static2], ignore_index=True)
del test_static0, test_static1, test_static2

# static_cb
path = os.path.join(testDir, 'test_static_cb_0.csv')
test_static_cb = pd.read_csv(path)

# Join Table
X_test = pd.merge(test_base, test_static, how="left", on="case_id")
X_test = pd.merge(X_test, test_static_cb, how="left", on="case_id")
del test_base, test_static, test_static_cb

# 刪非數col
for col in X_test.columns:
    if not (X_test[col].dtype in (np.int64, np.float64)):
        X_test = X_test.drop(col, axis=1)

In [18]:
# LGBM
y_test_pred = bst.predict(X_test, num_iteration=bst.best_iteration, predict_disable_shape_check=True)

# # RNN, LSTM
# X_test_array = X_test.to_numpy()
# X_test_rnn = X_test_array.reshape(X_test_array.shape[0], X_test_array.shape[1], 1)
# y_test_pred = model.predict(X_test_rnn)

# 轉成0或1
y_test_pred_binary = [1 if pred > 0.5 else 0 for pred in y_test_pred]
y_test_pred_binary

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [19]:
submission = pd.DataFrame({
    "case_id": X_test["case_id"].to_numpy(),
    "score": y_test_pred_binary
}).set_index('case_id')
submission.to_csv("./submission.csv")

pd.read_csv("./submission.csv")

Unnamed: 0,case_id,score
0,57543,0
1,57549,0
2,57551,0
3,57552,0
4,57569,0
5,57630,0
6,57631,0
7,57632,0
8,57633,0
9,57634,0
