In [1]:
import numpy as np
import pandas as pd
import os

import polars as pl

## 資料探索與前處理

### 資料概覽
##### 列出主要的數據特徵及統計摘要

In [2]:
# 訓練資料的目錄
projDir = "/kaggle/input/home-credit-credit-risk-model-stability/"
trainDir = os.path.join(projDir, "csv_files/train/")

# 列出目錄下所有文件
files = os.listdir(trainDir)

# 依檔名排序
files = sorted(files)

# # 印出文件名稱
# for file in files:
#     print(file)
    
# 計算文件數量
num_files = len(files)
print("目錄下有", num_files, "個訓練資料文件")

del num_files

目錄下有 32 個訓練資料文件


In [3]:
# # 遍歷每個文件，僅顯示前幾行資料概況
# for file in files:
#     # 組合完整的檔案路徑
#     file_path = os.path.join(trainDir, file)
    
#     # 讀取文件的前幾行
#     df_head = pd.read_csv(file_path, nrows=3)
    
#     print("文件名稱:", file)
#     print("前3行資料:")
#     display(df_head)
#     print("\n")
    
del files

### 特徵工程
##### 提取、轉換或建立新特徵以優化模型

In [4]:
# 讀取訓練數據
train_base = pl.read_csv(trainDir + 'train_base.csv')

static_dfs = [
    pd.read_csv(trainDir + 'train_static_0_0.csv'),
    pd.read_csv(trainDir + 'train_static_0_1.csv')
]
pd_df = pd.concat(static_dfs, ignore_index=True)
del static_dfs
train_static = pl.from_pandas(pd_df)

train_static_cb = pl.read_csv(trainDir + 'train_static_cb_0.csv')
train_person_1 = pl.read_csv(trainDir + 'train_person_1.csv') 
train_credit_bureau_b_2 = pl.read_csv(trainDir + 'train_credit_bureau_b_2.csv') 

# Merge all dataframes using join with how='inner'
train_data = train_base.join(
    train_static, on='case_id', how='inner'
).join(
    train_static_cb, on='case_id', how='inner'
).join(
    train_person_1, on='case_id', how='inner'
).join(
    train_credit_bureau_b_2, on='case_id', how='inner'
)
del train_base
del train_static
del train_static_cb
del train_person_1
del train_credit_bureau_b_2

# 將目標變量和特徵變量分開
X = train_data.drop(columns=['target'])
y = train_data['target']
del train_data

print(X.dtypes)

  pd.read_csv(trainDir + 'train_static_0_0.csv'),
  pd.read_csv(trainDir + 'train_static_0_1.csv')


[Int64, String, Int64, Int64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, String, String, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, String, Float64, Float64, String, String, String, Float64, Float64, Float64, String, Float64, String, Float64, Boolean, Boolean, String, String, Float64, Float64, String, Float64, Float64, Boolean, Boolean, Boolean, String, String, String, String, Float64, String, String, String, Float64, Float64, Float64, String, String, Float64, String, String, String, String, String, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, String, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float6

  X = train_data.drop(columns=['target'])


In [5]:
# # X存在非數字的數字，無法輸入model訓練
# X.dtypes

In [6]:
for col in X.columns:
    if not (X[col].dtype in (pl.Int64, pl.Float64)):
        X = X.drop(col)

In [7]:
# X.dtypes

In [8]:
from sklearn.model_selection import train_test_split

# 將數據劃分為訓練集和測試集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
del X
del y

## 數據分析方法與模型選擇

### 梯度提升樹（Gradient Boosting Decision Tree，GBDT）

In [9]:
import lightgbm as lgb
from sklearn.metrics import accuracy_score, classification_report

# 取得最後三十萬筆資料(ram不夠，sample不work)
X_train = X_train.tail(300000)
y_train = y_train.tail(300000)

y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

In [10]:
# 看target值的分布
np.unique(y_train, return_counts=True)

(array([0, 1]), array([279555,  20445]))

In [11]:
# 過取樣: 增加target=1的data

from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE

# 創建 SimpleImputer 對象，用於填補NaN值
imputer = SimpleImputer(strategy='mean')

# 填補NaN值
X_train = imputer.fit_transform(X_train)

# 設置合成目標樣本數量
synthetic_target_count = 279555

# 初始化 SMOTE
smote = SMOTE(sampling_strategy={1: synthetic_target_count})

# 使用 SMOTE 生成合成少數類樣本
X_train, y_train = smote.fit_resample(X_train, y_train)

# 檢查合成後的資料量
print("合成後的資料量：", len(X_train))

# 檢查合成後的目標資料分佈
unique, counts = np.unique(y_train, return_counts=True)
print("合成後的目標資料分佈：", dict(zip(unique, counts)))

合成後的資料量： 559110
合成後的目標資料分佈： {0: 279555, 1: 279555}


In [12]:
# 創建 LightGBM 的訓練資料集
lgb_train = lgb.Dataset(X_train, y_train)
lgb_test = lgb.Dataset(X_test, y_test, reference=lgb_train)
del X_train
del y_train

# 設置參數
params = {
    'boosting_type': 'gbdt',         # 使用梯度提升樹
    'objective': 'binary',           # 二元分類任務
    'metric': 'binary_error',        # 評估指標為二元分類錯誤率
    'num_leaves': 31,                # 每棵樹的最大葉子數量
    'learning_rate': 0.05,           # 學習速率
    'feature_fraction': 0.9,         # 特徵抽樣比例
    'bagging_fraction': 0.8,         # 樣本抽樣比例
    'bagging_freq': 5,               # 樣本抽樣的頻率
    'verbose': 0,                    # 顯示訓練信息
    'early_stopping_rounds': 10
}

In [13]:
# 訓練模型
num_round = 100                      # 迭代輪次
bst = lgb.train(params, lgb_train, num_round, valid_sets=[lgb_train, lgb_test])

In [14]:
# 預測
y_pred = bst.predict(X_test, num_iteration=bst.best_iteration)
y_pred_binary = [1 if pred > 0.5 else 0 for pred in y_pred]

# 評估模型
accuracy = accuracy_score(y_test, y_pred_binary)
print("準確率:", accuracy)

# 顯示分類報告
print("分類報告:")
print(classification_report(y_test, y_pred_binary))



準確率: 0.9024463121176008
分類報告:
              precision    recall  f1-score   support

           0       0.95      0.95      0.95    536211
           1       0.29      0.28      0.28     39431

    accuracy                           0.90    575642
   macro avg       0.62      0.61      0.62    575642
weighted avg       0.90      0.90      0.90    575642



## Submission

In [15]:
# 讀取數據
submissionDir = os.path.join(projDir, "csv_files/test/")
submission_base = pl.read_csv(submissionDir + 'test_base.csv')

static_dfs = [
    pd.read_csv(submissionDir + 'test_static_0_0.csv'),
    pd.read_csv(submissionDir + 'test_static_0_1.csv')
]
pd_df = pd.concat(static_dfs, ignore_index=True)
del static_dfs
submission_static = pl.from_pandas(pd_df)
del pd_df

submission_static_cb = pl.read_csv(submissionDir + 'test_static_cb_0.csv')
submission_person_1 = pl.read_csv(submissionDir + 'test_person_1.csv') 
submission_credit_bureau_b_2 = pl.read_csv(submissionDir + 'test_credit_bureau_b_2.csv') 

# Merge all dataframes using join with how='inner'
X_submission = submission_base.join(
    submission_static, on='case_id', how='inner'
).join(
    submission_static_cb, on='case_id', how='inner'
).join(
    submission_person_1, on='case_id', how='inner'
)
# .join(
#     submission_credit_bureau_b_2, on='case_id', how='inner'
# )

del submission_base
del submission_static
del submission_static_cb
del submission_person_1
del submission_credit_bureau_b_2

# Drop datatype非數字的data
for col in X_submission.columns:
    if not (X_submission[col].dtype in (pl.Int64, pl.Float64)):
        X_submission = X_submission.drop(col)
        
X_submission

case_id,MONTH,WEEK_NUM,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,applicationcnt_361L,applications30d_658L,applicationscnt_1086L,applicationscnt_464L,applicationscnt_629L,applicationscnt_867L,avgdbddpdlast24m_3658932P,avgdbddpdlast3m_4187120P,avgdbdtollast24m_4525197P,avgdpdtolclosure24_3658938P,avginstallast24m_3658937A,avglnamtstart24m_4525187A,avgmaxdpdlast9m_3716943P,avgoutstandbalancel6m_4187114A,avgpmtlast12m_4525200A,clientscnt12m_3712952L,clientscnt3m_3712950L,clientscnt6m_3712949L,clientscnt_100L,clientscnt_1022L,clientscnt_1071L,clientscnt_1130L,clientscnt_136L,clientscnt_157L,clientscnt_257L,clientscnt_304L,clientscnt_360L,clientscnt_493L,clientscnt_533L,clientscnt_887L,…,pctinstlsallpaidearl3d_427L,pctinstlsallpaidlat10d_839L,pctinstlsallpaidlate1d_3546856L,pctinstlsallpaidlate4d_3546849L,pctinstlsallpaidlate6d_3546844L,pmtnum_254L,posfpd10lastmonth_333P,posfpd30lastmonth_3976960P,posfstqpd30lastmonth_3976962P,price_1097A,sellerplacecnt_915L,sellerplacescnt_216L,sumoutstandtotal_3546847A,sumoutstandtotalest_4493215A,totaldebt_9A,totalsettled_863A,totinstallast1m_4525188A,validfrom_1069D,contractssum_5085716L,days120_123L,days180_256L,days30_165L,days360_512L,days90_310L,firstquarter_103L,fourthquarter_440L,numberofqueries_373L,pmtaverage_4955615A,pmtcount_4955617L,secondquarter_766L,thirdquarter_1082L,childnum_185L,mainoccupationinc_384A,num_group1,personindex_1023L,persontype_1072L,persontype_792L
i64,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,f64,f64,f64
57543,202201,100,0.0,191767.36,3674.6,1218.2001,0.0,0.0,0.0,0.0,0.0,9.0,1.0,2.0,1.0,1.0,16049.4,17054.4,2.0,14554.4,24482.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.35417,0.0,0.32292,0.07292,0.05208,6.0,0.0,0.0,0.0,0.0,0.0,5.0,12154.4,12154.4,12154.4,456031.1,17859.6,,151364.0,2.0,4.0,1.0,8.0,2.0,4.0,9.0,8.0,,,2.0,3.0,,34000.0,0,0.0,1.0,1.0
57543,202201,100,0.0,191767.36,3674.6,1218.2001,0.0,0.0,0.0,0.0,0.0,9.0,1.0,2.0,1.0,1.0,16049.4,17054.4,2.0,14554.4,24482.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.35417,0.0,0.32292,0.07292,0.05208,6.0,0.0,0.0,0.0,0.0,0.0,5.0,12154.4,12154.4,12154.4,456031.1,17859.6,,151364.0,2.0,4.0,1.0,8.0,2.0,4.0,9.0,8.0,,,2.0,3.0,,,1,1.0,5.0,5.0
57549,202201,100,0.0,129704.4,5742.6,3546.6,0.0,2.0,0.0,0.0,0.0,10.0,0.0,0.0,-1.0,0.0,32426.201,118964.805,0.0,13681.714,32426.201,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.34091,0.11628,0.29545,0.18605,0.13953,18.0,0.0,0.0,0.0,,2.0,7.0,10638.2,10638.2,10638.2,373720.84,126058.0,,1563100.0,6.0,9.0,3.0,12.0,4.0,9.0,5.0,12.0,26815.6,14.0,8.0,2.0,,49800.0,0,0.0,1.0,1.0
57549,202201,100,0.0,129704.4,5742.6,3546.6,0.0,2.0,0.0,0.0,0.0,10.0,0.0,0.0,-1.0,0.0,32426.201,118964.805,0.0,13681.714,32426.201,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.34091,0.11628,0.29545,0.18605,0.13953,18.0,0.0,0.0,0.0,,2.0,7.0,10638.2,10638.2,10638.2,373720.84,126058.0,,1563100.0,6.0,9.0,3.0,12.0,4.0,9.0,5.0,12.0,26815.6,14.0,8.0,2.0,,,1,1.0,5.0,5.0
57551,202201,100,0.0,71036.4,2844.6,0.0,0.0,1.0,0.0,0.0,0.0,2.0,-1.0,,-1.0,1.0,8357.2,,1.0,0.0,9551.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.33333,0.0,0.11111,0.11111,0.0,12.0,,,,27095.201,1.0,2.0,0.0,0.0,0.0,75219.0,,,2926195.3,1.0,3.0,1.0,4.0,1.0,3.0,2.0,4.0,,,5.0,5.0,,59600.0,0,0.0,1.0,1.0
57552,202201,100,0.0,183992.0,6298.8003,12155.4,0.0,0.0,0.0,0.0,0.0,9.0,-9.0,-7.0,-9.0,0.0,7440.4,,0.0,199322.4,9148.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.83871,0.0,0.0,0.0,0.0,24.0,0.0,0.0,0.0,,0.0,6.0,191269.61,191269.61,191269.61,284213.0,18889.0,,747031.73,2.0,2.0,0.0,5.0,0.0,3.0,2.0,5.0,23402.8,14.0,7.0,1.0,,112000.0,0,0.0,1.0,1.0
57569,202201,100,0.0,0.0,4682.6,0.0,0.0,1.0,0.0,0.0,0.0,6.0,2824.0,,2824.0,2517.0,,,,,10796.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.33333,0.6,0.66667,0.66667,0.6,24.0,0.0,,,,2.0,3.0,0.0,0.0,0.0,95348.42,,,,4.0,4.0,1.0,4.0,4.0,0.0,0.0,4.0,17333.6,14.0,1.0,3.0,0.0,,1,1.0,5.0,5.0
57569,202201,100,0.0,0.0,4682.6,0.0,0.0,1.0,0.0,0.0,0.0,6.0,2824.0,,2824.0,2517.0,,,,,10796.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.33333,0.6,0.66667,0.66667,0.6,24.0,0.0,,,,2.0,3.0,0.0,0.0,0.0,95348.42,,,,4.0,4.0,1.0,4.0,4.0,0.0,0.0,4.0,17333.6,14.0,1.0,3.0,,58000.0,0,0.0,1.0,1.0
57630,202201,100,0.0,0.0,8905.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,,,0.0,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.42857,0.0,0.28571,0.0,0.0,12.0,0.0,0.0,0.0,96174.0,0.0,1.0,0.0,0.0,0.0,9677.601,,,499975.0,1.0,2.0,1.0,5.0,1.0,1.0,3.0,5.0,,,4.0,1.0,,60000.0,0,0.0,1.0,1.0
57630,202201,100,0.0,0.0,8905.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,,,0.0,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.42857,0.0,0.28571,0.0,0.0,12.0,0.0,0.0,0.0,96174.0,0.0,1.0,0.0,0.0,0.0,9677.601,,,499975.0,1.0,2.0,1.0,5.0,1.0,1.0,3.0,5.0,,,4.0,1.0,,,1,1.0,4.0,4.0


In [16]:
y_submission_pred = bst.predict(X_submission, num_iteration=bst.best_iteration, predict_disable_shape_check=True)
y_submission_pred_binary = [1 if pred > 0.5 else 0 for pred in y_submission_pred]
y_submission_pred_binary



[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [17]:
submission = pd.DataFrame({
    "case_id": X_submission["case_id"].to_numpy(),
    "score": y_submission_pred_binary
}).set_index('case_id')
submission.to_csv("./submission.csv")

In [18]:
pd.read_csv("./submission.csv")

Unnamed: 0,case_id,score
0,57543,0
1,57543,0
2,57549,0
3,57549,0
4,57551,0
5,57552,0
6,57569,0
7,57569,0
8,57630,0
9,57630,0
