In [1]:
import numpy as np
import pandas as pd
import os

import polars as pl

## 資料探索與前處理

### 資料概覽
##### 列出主要的數據特徵及統計摘要

In [2]:
# 訓練資料的目錄
projDir = "/kaggle/input/home-credit-credit-risk-model-stability/"
trainDir = os.path.join(projDir, "csv_files/train/")

# 列出目錄下所有文件
files = os.listdir(trainDir)

# 依檔名排序
files = sorted(files)

# # 印出文件名稱
# for file in files:
#     print(file)
    
# 計算文件數量
num_files = len(files)
print("目錄下有", num_files, "個訓練資料文件")

del num_files

目錄下有 32 個訓練資料文件


In [3]:
# # 遍歷每個文件，僅顯示前幾行資料概況
# for file in files:
#     # 組合完整的檔案路徑
#     file_path = os.path.join(trainDir, file)
    
#     # 讀取文件的前幾行
#     df_head = pd.read_csv(file_path, nrows=3)
    
#     print("文件名稱:", file)
#     print("前3行資料:")
#     display(df_head)
#     print("\n")
    
del files

### 特徵工程
##### 提取、轉換或建立新特徵以優化模型

In [4]:
# 讀取訓練數據
train_base = pl.read_csv(trainDir + 'train_base.csv')

static_dfs = [
    pd.read_csv(trainDir + 'train_static_0_0.csv'),
    pd.read_csv(trainDir + 'train_static_0_1.csv')
]
pd_df = pd.concat(static_dfs, ignore_index=True)
del static_dfs
train_static = pl.from_pandas(pd_df)

train_static_cb = pl.read_csv(trainDir + 'train_static_cb_0.csv')
train_person_1 = pl.read_csv(trainDir + 'train_person_1.csv') 
train_credit_bureau_b_2 = pl.read_csv(trainDir + 'train_credit_bureau_b_2.csv') 

# Merge all dataframes using join with how='inner'
train_data = train_base.join(
    train_static, on='case_id', how='inner'
).join(
    train_static_cb, on='case_id', how='inner'
).join(
    train_person_1, on='case_id', how='inner'
).join(
    train_credit_bureau_b_2, on='case_id', how='inner'
)
del train_base
del train_static
del train_static_cb
del train_person_1
del train_credit_bureau_b_2

# 將目標變量和特徵變量分開
X = train_data.drop(columns=['target'])
y = train_data['target']
del train_data

print(X.dtypes)

  pd.read_csv(trainDir + 'train_static_0_0.csv'),
  pd.read_csv(trainDir + 'train_static_0_1.csv')


[Int64, String, Int64, Int64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, String, String, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, String, Float64, Float64, String, String, String, Float64, Float64, Float64, String, Float64, String, Float64, Boolean, Boolean, String, String, Float64, Float64, String, Float64, Float64, Boolean, Boolean, Boolean, String, String, String, String, Float64, String, String, String, Float64, Float64, Float64, String, String, Float64, String, String, String, String, String, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, String, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float6

  X = train_data.drop(columns=['target'])


In [5]:
# # X存在非數字的column，無法輸入model訓練
# X.dtypes

In [6]:
for col in X.columns:
    if not (X[col].dtype in (pl.Int64, pl.Float64)):
        X = X.drop(col)

In [7]:
# X.dtypes

In [8]:
from sklearn.model_selection import train_test_split

# 將數據劃分為訓練集和測試集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
del X
del y

## 數據分析方法與模型選擇

In [9]:
# 取得最後三十萬筆資料(ram不夠，sample不work)
X_train = X_train.tail(300000)
y_train = y_train.tail(300000)

# bst = lgb.train(params, lgb_train, num_round, valid_sets=[lgb_train, lgb_test])
# TypeError: Wrong type(Series) for label. It should be list, numpy 1-D array or pandas Series
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

In [10]:
# # 看target值的分布
# np.unique(y_train, return_counts=True)

### Oversampling

In [11]:
# # 過取樣: 增加target=1的data

# from sklearn.impute import SimpleImputer
# from imblearn.over_sampling import SMOTE

# # 創建 SimpleImputer 對象，用於填補NaN值
# imputer = SimpleImputer(strategy='mean')

# # 填補NaN值
# X_train = imputer.fit_transform(X_train)

# # 設置合成目標樣本數量
# synthetic_target_count = 279555

# # 初始化 SMOTE
# smote = SMOTE(sampling_strategy={1: synthetic_target_count})

# # 使用 SMOTE 生成合成少數類樣本
# X_train, y_train = smote.fit_resample(X_train, y_train)

# # 檢查合成後的資料量
# print("合成後的資料量：", len(X_train))

# # 檢查合成後的目標資料分佈
# unique, counts = np.unique(y_train, return_counts=True)
# print("合成後的目標資料分佈：", dict(zip(unique, counts)))

### Undersampling

In [12]:
# 欠採樣: 減少target=0的data

from sklearn.utils import resample

# 將 DataFrame 轉換為 NumPy 陣列
X_train = X_train.to_numpy()
X_test = X_test.to_numpy()

# 隨機抽取與目標樣本數量相同的非目標樣本
X_train_0, y_train_0 = resample(X_train[y_train == 0], y_train[y_train == 0], replace=False, n_samples=np.sum(y_train == 1))

# 合併目標樣本和非目標樣本
X_train = np.concatenate((X_train_0, X_train[y_train == 1]), axis=0)
y_train = np.concatenate((y_train_0, y_train[y_train == 1]), axis=0)

# 檢查欠採樣後的資料量
print("欠採樣後的資料量：", len(X_train))

# 檢查欠採樣後的目標資料分佈
unique, counts = np.unique(y_train, return_counts=True)
print("欠採樣後的目標資料分佈：", dict(zip(unique, counts)))

欠採樣後的資料量： 40890
欠採樣後的目標資料分佈： {0: 20445, 1: 20445}


### Model1. 梯度提升樹（Gradient Boosting Decision Tree，GBDT）
##### 優點：
##### 1. 高效性： LightGBM 是一種高效的梯度提升樹模型，具有優秀的訓練速度和預測速度。它使用了基於直方圖的方法來加速訓練過程，並且通常比其他梯度提升樹庫（如XGBoost）更快。
##### 2. 低內存使用： LightGBM 使用了一種稱為GOSS（Gradient-based One-Side Sampling）的方法，在訓練過程中減少了內存使用。這使得它可以處理大型數據集而不會出現內存不足的問題。
##### 3. 高準確性： LightGBM 在處理大型和高維數據集時通常表現出色，並且提供了很好的預測性能。
##### 4. 支持並行處理： LightGBM 支持並行處理，可以在多核 CPU 上進行訓練，從而加速了訓練過程。
##### 5. 自動處理類別特徵： LightGBM 可以自動處理類別特徵，無需額外的編碼。
##### 缺點：
##### 1. 對參數敏感： 與其他梯度提升樹模型一樣，LightGBM 的性能取決於參數的調整，對參數較為敏感，需要一些經驗和時間來進行調參。
##### 2. 需要較多數據： LightGBM 在處理小型數據集時可能不如其他模型表現好，因為它的高效性主要體現在處理大型數據集時

In [13]:
# import lightgbm as lgb

# # 創建 LightGBM 的訓練資料集
# lgb_train = lgb.Dataset(X_train, y_train)
# lgb_test = lgb.Dataset(X_test, y_test, reference=lgb_train)
# del X_train
# del y_train

# # 設置參數
# params = {
#     'boosting_type': 'gbdt',         # 使用梯度提升樹
#     'objective': 'binary',           # 二元分類任務
#     'metric': 'binary_error',        # 評估指標為二元分類錯誤率
#     'num_leaves': 31,                # 每棵樹的最大葉子數量
#     'learning_rate': 0.05,           # 學習速率
#     'feature_fraction': 0.9,         # 特徵抽樣比例
#     'bagging_fraction': 0.8,         # 樣本抽樣比例
#     'bagging_freq': 5,               # 樣本抽樣的頻率
#     'verbose': 0,                    # 顯示訓練信息
#     'early_stopping_rounds': 10
# }

# # 訓練模型
# num_round = 100                      # 迭代輪次
# bst = lgb.train(params, lgb_train, num_round, valid_sets=[lgb_train, lgb_test])

# # 預測
# y_pred = bst.predict(X_test, num_iteration=bst.best_iteration)
# y_pred_binary = [1 if pred > 0.5 else 0 for pred in y_pred]

### Model2. RNN
##### RNN的優點：
##### 1. 能夠處理序列數據，保留先前時間步的信息。
##### 2. 模型相對較簡單，易於理解和實現。
##### RNN的缺點：
##### 3. 容易出現梯度消失或梯度爆炸的問題，尤其在處理長序列數據時效果不佳。
##### 4. 難以捕捉長期依賴關係

In [14]:
def focal_loss(y_true, y_pred, alpha=0.25, gamma=2.0):
    # Calculate cross entropy loss
    ce_loss = tf.keras.losses.binary_crossentropy(y_true, y_pred, from_logits=False)
    
    # Calculate focal weights
    p_t = y_true * y_pred + (1 - y_true) * (1 - y_pred)
    focal_weight = alpha * tf.pow(1 - p_t, gamma)
    
    # Apply focal weights to cross entropy loss
    focal_loss = focal_weight * ce_loss
    
    # Return the mean loss over the batch
    return tf.reduce_mean(focal_loss)

In [15]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import AdamW
# from sklearn.utils.class_weight import compute_class_weight

# 將資料轉換成3D形狀
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

# 創建 Sequential 模型
model = Sequential()

# 添加第一層 SimpleRNN 層
model.add(SimpleRNN(50, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2]), activation='relu'))
model.add(Dropout(0.3))  # 添加 dropout 避免過擬合

# 添加第二層 SimpleRNN 層
model.add(SimpleRNN(50, return_sequences=True, activation='relu'))
model.add(Dropout(0.3))

# 添加第三層 SimpleRNN 層
model.add(SimpleRNN(50, activation='relu'))
model.add(Dropout(0.3))

# 添加輸出層
model.add(Dense(1, activation='sigmoid'))

# 定義 AdamW 優化器，設置初始學習率和權重衰減
# 嘗試不同的優化器 ：AdamW是Adam的一個變種，引入了一個額外的權重衰減（Weight Decay）項目，有助於減小參數的數值大小，從而減少過擬合的風險。
# 調整學習率（Learning Rate）：嘗試不同的學習率值，有時降低學習率可以幫助模型更好地收斂，特別是在資料不平衡的情況下。
optimizer = AdamW(learning_rate=0.001, weight_decay=1e-4)

# 編譯模型
# focal_loss 損失函數，可以自動為少數類別分配更大的權重
model.compile(optimizer=optimizer, loss=focal_loss, metrics=['accuracy'])

# 定義 EarlyStopping
# 調整Early Stopping的參數：嘗試不同的patience值，以及monitor參數，這可能會影響到模型的停止訓練的時機。
early_stopping = EarlyStopping(patience=10, monitor='val_loss', restore_best_weights=True)

# 設置類別加權，例如給予類別0權重1，給予類別1權重100
# 1. 使用 np.unique(y_train) 函數獲取訓練數據中所有類別的標籤。
# 2. 使用 compute_class_weight 函數計算每個類別的權重，參數 'balanced' 指定使用平衡策略。
# 3. 將計算出的類別權重存儲在 class_weight 字典中
# class_weight = compute_class_weight('balanced', np.unique(y_train), y_train)
class_weight = {0: 1, 1: 100}

# 訓練模型
# 批量大小: 嘗試減小批量大小，可以讓模型在每個更新步驟中看到更多樣化的樣本，可能有助於緩解數據不平衡問題。
history = model.fit(X_train, y_train, epochs=3, batch_size=128, validation_data=(X_test, y_test), callbacks=[early_stopping], class_weight=class_weight) # class_weight='balanced' not work

# 評估
model.evaluate(X_test, y_test)

# 預測
y_pred = model.predict(X_test)
y_pred_binary = [1 if pred > 0.5 else 0 for pred in y_pred]

2024-03-27 16:23:04.224581: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-27 16:23:04.224689: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-27 16:23:04.393990: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
  super().__init__(**kwargs)


Epoch 1/3
[1m  2/320[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m25s[0m 79ms/step - accuracy: 0.5215 - loss: nan 

I0000 00:00:1711556606.190353      77 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m320/320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 182ms/step - accuracy: 0.5001 - loss: nan - val_accuracy: 0.9315 - val_loss: nan
Epoch 2/3
[1m320/320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 153ms/step - accuracy: 0.4992 - loss: nan - val_accuracy: 0.9315 - val_loss: nan
Epoch 3/3
[1m320/320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 153ms/step - accuracy: 0.5014 - loss: nan - val_accuracy: 0.9315 - val_loss: nan
[1m17989/17989[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m147s[0m 8ms/step - accuracy: 0.9320 - loss: nan
[1m17989/17989[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m125s[0m 7ms/step


### Model3. LSTM
##### LSTM的優點：
##### 1. 能夠有效地解決梯度消失或梯度爆炸的問題，能夠處理長序列數據。
##### 2. 能夠捕捉長期依賴關係，適用於需要考慮長期記憶的任務。
##### LSTM的缺點：
##### 1. 模型相對複雜，訓練時間較長。
##### 2. 需要調參和優化，有時可能會出現過擬合的問題。

In [16]:
# import tensorflow as tf
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import LSTM, Dense, Dropout
# from tensorflow.keras.callbacks import EarlyStopping

# # 將 DataFrame 轉換為 NumPy 陣列
# X_train = X_train.to_numpy()
# X_test = X_test.to_numpy()

# # 將資料轉換成3D形狀
# X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
# X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

# # 創建 Sequential 模型
# model = Sequential()

# # 添加第一層 LSTM 層
# model.add(LSTM(50, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2]), activation='relu'))
# model.add(Dropout(0.3))  # 添加 dropout 避免過擬合

# # 添加第二層 LSTM 層
# model.add(LSTM(50, return_sequences=True, activation='relu'))
# model.add(Dropout(0.3))  # 添加 dropout 避免過擬合

# # 添加第三層 LSTM 層
# model.add(LSTM(50, activation='relu'))
# model.add(Dropout(0.3))  # 添加 dropout 避免過擬合

# # 添加輸出層
# model.add(Dense(1, activation='sigmoid'))

# # 編譯模型
# model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# # 定義 EarlyStopping
# early_stopping = EarlyStopping(patience=5, monitor='val_loss', restore_best_weights=True)

# # 設置類別加權，例如給予類別0權重1，給予類別1權重100
# class_weight = {0: 1, 1: 100}

# # 訓練模型
# history = model.fit(X_train, y_train, epochs=1, batch_size=2048, validation_data=(X_test, y_test), callbacks=[early_stopping], class_weight=class_weight)

# # 預測
# y_pred = model.predict(X_test)
# y_pred_binary = [1 if pred > 0.5 else 0 for pred in y_pred]

## 評估模型

In [17]:
from sklearn.metrics import accuracy_score, classification_report

# 評估模型
accuracy = accuracy_score(y_test, y_pred_binary)
print("準確率:", accuracy)

# 顯示分類報告
print("分類報告:")
print(classification_report(y_test, y_pred_binary))

準確率: 0.9315008286400228
分類報告:


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.93      1.00      0.96    536211
           1       0.00      0.00      0.00     39431

    accuracy                           0.93    575642
   macro avg       0.47      0.50      0.48    575642
weighted avg       0.87      0.93      0.90    575642



  _warn_prf(average, modifier, msg_start, len(result))


## Submission

In [18]:
submissionDir = os.path.join(projDir, "csv_files/test/")
submission_path = os.path.join(submissionDir, 'test_base.csv')
X_submission = pd.read_csv(submission_path)

X_submission = X_submission.drop('date_decision', axis=1)
X_submission

Unnamed: 0,case_id,MONTH,WEEK_NUM
0,57543,202201,100
1,57549,202201,100
2,57551,202201,100
3,57552,202201,100
4,57569,202201,100
5,57630,202201,100
6,57631,202201,100
7,57632,202201,100
8,57633,202201,100
9,57634,202201,100


In [19]:
# # LGBM
# y_submission_pred = bst.predict(X_submission, num_iteration=bst.best_iteration, predict_disable_shape_check=True)

# RNN, LSTM
X_submission_array = X_submission.to_numpy()
X_submission_rnn = X_submission_array.reshape(X_submission_array.shape[0], X_submission_array.shape[1], 1)
y_submission_pred = model.predict(X_submission_rnn)

# 轉成0或1
y_submission_pred_binary = [1 if pred > 0.5 else 0 for pred in y_submission_pred]
y_submission_pred_binary

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 830ms/step


[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [20]:
submission = pd.DataFrame({
    "case_id": X_submission["case_id"].to_numpy(),
    "score": y_submission_pred_binary
}).set_index('case_id')
submission.to_csv("./submission.csv")

In [21]:
pd.read_csv("./submission.csv")

Unnamed: 0,case_id,score
0,57543,0
1,57549,0
2,57551,0
3,57552,0
4,57569,0
5,57630,0
6,57631,0
7,57632,0
8,57633,0
9,57634,0
