決策樹、隨機森林、極度梯度提升樹訓練

安裝相關套件的指令

``` bash
pip install scikit-learn
pip install xgboost
pip install pyarrow
pip install fastparquet
```

引入相關模組


In [120]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
from xgboost import XGBClassifier

import numpy as np
import pandas as pd

設立全域變數

In [121]:
# Session-wide accumulators: one dict per evaluated model is appended to each
# list (metric summary / top-feature row) before they are written to Excel.
historical_results, historical_feature = [], []

In [122]:
# Show every column and use a wide console so printed frames are not truncated.
for option, value in (('display.max_columns', None), ('display.width', 1000)):
    pd.set_option(option, value)

DATA_DIR = './storage/train_set.parquet'

# Load the training parquet and discard the two identifier columns, leaving
# only the label and the feature columns for inspection.
test = pd.read_parquet(DATA_DIR).drop(columns=['article_id', 'customer_id'])

# Report the dtype of each remaining column.
print(test.dtypes)

label                           int8
days_last_buy                  int16
trend                        float16
user_cat_affinity            float16
user_color_affinity          float16
price_diff                   float16
FN                           float16
Active                       float16
club_member_status              int8
fashion_news_frequency          int8
age_group                       int8
product_type_name              int16
product_group_name              int8
colour_group_name               int8
index_group_name                int8
section_name                    int8
graphical_appearance_name       int8
price_group                     int8
season_score                 float16
dtype: object


合併並讀入測試資料 此為最後驗證

In [123]:
# Read the 28 test-set shards and stack them into a single frame with a fresh
# 0..n-1 index.
test_set = pd.concat(
    [pd.read_parquet(f'./storage/test_set_part_{part}.parquet') for part in range(28)],
    ignore_index=True,
)

# Feature columns are everything except the label and the two identifiers.
non_feature_cols = ('label', 'article_id', 'customer_id')
features_col = [col for col in test_set.columns if col not in non_feature_cols]

X_test = test_set[features_col]

合併並讀入驗證資料 此為訓練模型驗證用

In [None]:
# Read the 28 validation shards and stack them into a single frame with a
# fresh 0..n-1 index.
valid_set = pd.concat(
    [pd.read_parquet(f'./storage/valid_set_part_{part}.parquet') for part in range(28)],
    ignore_index=True,
)

# Split into target and features; the identifiers are not model inputs.
y_val = valid_set['label']
X_val = valid_set.drop(columns=['label', 'article_id', 'customer_id'])

['customer_id', 'article_id', 'days_last_buy', 'trend', 'user_cat_affinity', 'user_color_affinity', 'price_diff', 'FN', 'Active', 'club_member_status', 'fashion_news_frequency', 'age_group', 'product_type_name', 'product_group_name', 'colour_group_name', 'index_group_name', 'section_name', 'graphical_appearance_name', 'price_group', 'season_score', 'label']


讀入訓練資料 並設定訓練變數(y)

In [125]:
data = './storage/train_set.parquet'
df_train = pd.read_parquet(data)

# Target vector, and the feature matrix with the target and both identifier
# columns removed.
non_feature_columns = ['label', 'article_id', 'customer_id']
y_train = df_train['label']
X_train = df_train.drop(columns=non_feature_columns)

建立決策樹模型並訓練

In [None]:
model_name = '決策樹'

# Hyper-parameters for a single decision tree.
tree_params = dict(
    class_weight='balanced',
    criterion='gini',        # split criterion; alternatives: "entropy", "log_loss"
    max_depth=10,            # maximum depth of the tree
    min_samples_split=300,   # an internal node needs at least this many samples to be split
    min_samples_leaf=100,    # every leaf must keep at least this many samples
    random_state=42,         # controls the randomness of the estimator
)

model = DecisionTreeClassifier(**tree_params)
model.fit(X_train, y_train)

print('已完成')

建立隨機森林模型並訓練

In [None]:
model_name = '隨機森林'

# Hyper-parameters for the random forest.
forest_params = dict(
    n_estimators=200,          # 200 trees vote on each prediction
    class_weight='balanced',   # keep the balanced class weighting
    max_depth=12,              # close to, or slightly deeper than, the single tree
    min_samples_leaf=50,       # carried-over noise-resistance setting
    min_samples_split=150,
    n_jobs=-1,                 # use every available core
    random_state=42,
)

model = RandomForestClassifier(**forest_params)
model.fit(X_train, y_train)

print('已完成')

建立極度梯度提升樹模型並訓練

In [137]:
model_name = '極度梯度提升樹'

xgb_params = dict(
    # --- hardware acceleration ---
    device='cuda',
    tree_method='hist',
    n_jobs=-1,
    random_state=42,
    # --- core parameters ---
    n_estimators=10000,
    learning_rate=0.01,
    max_depth=7,
    # --- noise resistance and regularisation ---
    min_child_weight=100,
    gamma=2.0,
    subsample=0.8,
    colsample_bytree=0.6,
    # --- class-imbalance handling ---
    # scale_pos_weight stays at 1; positives are up-weighted per row below.
    scale_pos_weight=1,
    early_stopping_rounds=100,
    # --- advanced options ---
    enable_categorical=True,
    eval_metric='auc',
)
model = XGBClassifier(**xgb_params)

# AUC is reported on both the training and the validation split each round.
eval_set = [(X_train, y_train), (X_val, y_val)]

# Every row weighs 1.0, except positive rows whose days_last_buy is below 50,
# which weigh 35.0.
boosted_rows = ((y_train == 1) & (X_train['days_last_buy'] < 50)).to_numpy()
weights = np.where(boosted_rows, 35.0, 1.0)

model.fit(X_train, y_train, eval_set=eval_set, sample_weight=weights, verbose=True)

print('已完成')

[0]	validation_0-auc:0.75678	validation_1-auc:0.64225
[1]	validation_0-auc:0.80159	validation_1-auc:0.67214
[2]	validation_0-auc:0.80936	validation_1-auc:0.68986
[3]	validation_0-auc:0.82676	validation_1-auc:0.68700
[4]	validation_0-auc:0.82911	validation_1-auc:0.68803
[5]	validation_0-auc:0.83192	validation_1-auc:0.68484
[6]	validation_0-auc:0.83202	validation_1-auc:0.68554
[7]	validation_0-auc:0.83076	validation_1-auc:0.68343
[8]	validation_0-auc:0.83787	validation_1-auc:0.68786
[9]	validation_0-auc:0.83650	validation_1-auc:0.68754
[10]	validation_0-auc:0.84667	validation_1-auc:0.68656
[11]	validation_0-auc:0.84942	validation_1-auc:0.68530
[12]	validation_0-auc:0.85319	validation_1-auc:0.68668
[13]	validation_0-auc:0.85388	validation_1-auc:0.68721
[14]	validation_0-auc:0.85333	validation_1-auc:0.68622
[15]	validation_0-auc:0.85246	validation_1-auc:0.68679
[16]	validation_0-auc:0.85592	validation_1-auc:0.68856
[17]	validation_0-auc:0.85494	validation_1-auc:0.68736
[18]	validation_0-au

加入特徵重要性排序

In [138]:
# Pair every training column with the importance reported by the fitted model
# and order the table from most to least important.
feature_names = X_train.columns
importances = model.feature_importances_

ranking = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
feature_imp_df = ranking.sort_values(by='Importance', ascending=False)

print("\n=== 特徵權重排行榜 (Top 15) ===")
print(feature_imp_df.head(15))



=== 特徵權重排行榜 (Top 15) ===
                      Feature  Importance
0               days_last_buy    0.334879
1                       trend    0.211172
4                  price_diff    0.064612
11         product_group_name    0.052097
14               section_name    0.051198
13           index_group_name    0.042818
16                price_group    0.037760
12          colour_group_name    0.037613
17               season_score    0.032920
10          product_type_name    0.031676
2           user_cat_affinity    0.029886
3         user_color_affinity    0.025913
15  graphical_appearance_name    0.024233
7          club_member_status    0.006502
8      fashion_news_frequency    0.004488


顯示模型訓練各項指標

In [139]:
# Positive-class probabilities (for AUC) and hard class predictions (for the
# other metrics) on the validation split.
y_val_proba = model.predict_proba(X_val)[:, 1]
y_val_pred = model.predict(X_val)

acc, prec, rec, f1 = (
    metric(y_val, y_val_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
auc = roc_auc_score(y_val, y_val_proba)

print(classification_report(y_val, y_val_pred))

summary_lines = (
    "-" * 80,
    f"正確率 (Accuracy):  {acc:.2%}",
    f"精確率 (Precision): {prec:.2%}",
    f"召回率 (Recall):    {rec:.2%}",
    f"F1 分數 (F1-Score): {f1:.2%}",
    f"AUC 分數:           {auc:.2%}",
)
for line in summary_lines:
    print(line)

              precision    recall  f1-score   support

           0       1.00      0.97      0.98  27347643
           1       0.00      0.21      0.01     14931

    accuracy                           0.97  27362574
   macro avg       0.50      0.59      0.49  27362574
weighted avg       1.00      0.97      0.98  27362574

--------------------------------------------------------------------------------
正確率 (Accuracy):  96.51%
精確率 (Precision): 0.33%
召回率 (Recall):    21.14%
F1 分數 (F1-Score): 0.66%
AUC 分數:           70.46%


In [140]:

# Row for the feature log: the model name plus its 15 most important features,
# stored under the integer keys 1..15 as "<feature>: <importance>".
top15_df = feature_imp_df.head(15).reset_index(drop=True)

row_data = {'Model': model_name}
ranked_features = zip(top15_df['Feature'], top15_df['Importance'])
for rank, (feature, importance) in enumerate(ranked_features, start=1):
    row_data[rank] = f"{feature}: {importance:.4f}"

historical_feature.append(row_data)

# Row for the metric log: every score rendered as a two-decimal percentage.
metric_values = {
    'Accuracy': acc,
    'Precision': prec,
    'Recall': rec,
    'F1-Score': f1,
    'AUC': auc,
}
summary_row = {'Model': model_name}
summary_row.update({name: f"{value:.2%}" for name, value in metric_values.items()})

historical_results.append(summary_row)

In [142]:
def _append_rows_to_excel(rows, path):
    """Append *rows* to the Excel log at *path*, creating the file if needed.

    Args:
        rows: List of dicts, one per row to add.
        path: Location of the ``.xlsx`` log file.

    Returns:
        ``(new_rows, combined)``: the frame built from *rows* and the frame
        that was written back to *path*.
    """
    new_rows = pd.DataFrame(rows)
    try:
        previous = pd.read_excel(path)
    except FileNotFoundError:
        # First run: there is no earlier log to extend, so start a new one.
        combined = new_rows
    else:
        combined = pd.concat([previous, new_rows], ignore_index=True)
    combined.to_excel(path, index=False)
    return new_rows, combined


# NOTE(review): the whole in-memory history is appended on every run, so
# executing this cell twice in one session writes the same rows twice.
new_df, final_df = _append_rows_to_excel(historical_results, 'modal_train_result.xlsx')
new_feature, final_feature = _append_rows_to_excel(historical_feature, 'feature.xlsx')

In [143]:
import numpy as np
from sklearn.metrics import f1_score, precision_score, recall_score

# 1. Positive-class probabilities (0-1) for the validation split.
y_val_proba = model.predict_proba(X_val)[:, 1]

print(f"預測機率最大值: {y_val_proba.max():.4f}")
print(f"預測機率平均值: {y_val_proba.mean():.4f}")

# 2. Sweep decision thresholds from 0.05 to just under 0.50 in steps of 0.01.
thresholds = np.arange(0.05, 0.50, 0.01)
best_f1 = 0
best_threshold = 0.5  # fallback that is reported when no threshold reaches F1 > 0

print("-" * 50)
print(f"{'Threshold':<10} | {'Precision':<10} | {'Recall':<10} | {'F1-Score':<10}")
print("-" * 50)

for thresh in thresholds:
    # A row is predicted positive only when its probability exceeds the threshold.
    y_pred_custom = (y_val_proba > thresh).astype(int)

    # Loop-local names keep the sweep from overwriting the global prec/rec/f1
    # computed at the model's default decision threshold.
    thresh_prec = precision_score(y_val, y_pred_custom, zero_division=0)
    thresh_rec = recall_score(y_val, y_pred_custom, zero_division=0)
    thresh_f1 = f1_score(y_val, y_pred_custom, zero_division=0)

    print(f"{thresh:.2f}       | {thresh_prec:.2%}      | {thresh_rec:.2%}      | {thresh_f1:.4f}")

    if thresh_f1 > best_f1:
        best_f1 = thresh_f1
        # Store the winner under the same name that is reported below.
        best_threshold = thresh

print("-" * 50)
print(f"你的模型最佳門檻是: {best_threshold:.2f}")

預測機率最大值: 0.7594
預測機率平均值: 0.2052
--------------------------------------------------
Threshold  | Precision  | Recall     | F1-Score  
--------------------------------------------------
0.05       | 0.05%      | 100.00%      | 0.0011
0.06       | 0.05%      | 100.00%      | 0.0011
0.07       | 0.05%      | 100.00%      | 0.0011
0.08       | 0.05%      | 100.00%      | 0.0011
0.09       | 0.05%      | 100.00%      | 0.0011
0.10       | 0.05%      | 100.00%      | 0.0011
0.11       | 0.05%      | 100.00%      | 0.0011
0.12       | 0.05%      | 100.00%      | 0.0011
0.13       | 0.05%      | 100.00%      | 0.0011
0.14       | 0.05%      | 99.94%      | 0.0011
0.15       | 0.06%      | 98.63%      | 0.0011
0.16       | 0.06%      | 89.14%      | 0.0013
0.17       | 0.07%      | 79.40%      | 0.0014
0.18       | 0.09%      | 69.65%      | 0.0017
0.19       | 0.10%      | 64.60%      | 0.0020
0.20       | 0.11%      | 58.63%      | 0.0022
0.21       | 0.13%      | 53.73%      | 0.0026
0.22    

In [144]:
# 1. 取得預測分數 (不是 0/1，而是機率)
# Score the validation split with the positive-class probability rather than
# a hard 0/1 prediction.
y_val_proba = model.predict_proba(X_val)[:, 1]

# Scorecard pairing the true outcome with the model score, ordered so the
# highest-scored rows come first.
df_result = (
    pd.DataFrame({'real_buy': y_val, 'prob': y_val_proba})
    .sort_values(by='prob', ascending=False)
)

# 4. Lift of the top-scored slice over the overall positive rate.
def calculate_lift(df, top_n_percent):
    """Compare the hit rate of the first ``top_n_percent`` % of rows with the overall rate.

    Args:
        df: Frame with a ``real_buy`` column. The first rows are taken as the
            "top" slice, so the caller must sort it by score beforehand.
        top_n_percent: Size of the slice as a percentage of all rows. Values
            outside 0-100 are clamped to the available rows.

    Returns:
        ``(precision, lift, hits, top_n_count, baseline)`` where ``hits`` is
        the sum of ``real_buy`` in the slice, ``precision`` is
        ``hits / top_n_count``, ``baseline`` is the mean of ``real_buy`` over
        the whole frame and ``lift`` is ``precision / baseline``. ``precision``
        is 0.0 when the slice is empty, ``baseline`` is 0.0 for an empty frame
        and ``lift`` is 0.0 whenever the baseline is not positive.
    """
    total_rows = len(df)

    # Multiply before dividing: rows * percent is exact for integers, whereas
    # percent / 100 first can round down and lose a row (100 rows, 29 % -> 28).
    top_n_count = int(total_rows * top_n_percent / 100)
    top_n_count = max(0, min(total_rows, top_n_count))
    top_users = df.iloc[:top_n_count]

    # Number of actual positives inside the slice.
    hits = top_users['real_buy'].sum()

    # Hit rate inside the slice; an empty slice has no hit rate, so report 0.
    precision = hits / top_n_count if top_n_count > 0 else 0.0

    # Hit rate of the whole frame.
    baseline = df['real_buy'].mean() if total_rows > 0 else 0.0

    # How many times better the slice is than the whole frame.
    lift = precision / baseline if baseline > 0 else 0.0

    return precision, lift, hits, top_n_count, baseline

# --- 輸出最終報告 ---
# --- Final report: lift of the top 5 % highest-scored rows ---
prec, lift, hits, count, base = calculate_lift(df_result, 5)

report_lines = [
    "=" * 40,
    "【 最終商業驗收報告 】",
    f"AUC 分數: {auc:.2%}",
    "-" * 40,
    "如果我們對「分數最高的前 5%」客戶行銷：",
    f"1. 行銷人數: {count} 人",
    f"2. 成功抓到: {hits} 個購買者",
    f"3. 名單精準度: {prec:.2%} (原本平均只有 {base:.2%})",
    f"結論：模型讓行銷效率提升了 {lift:.2f} 倍！",
    "=" * 40,
]
for report_line in report_lines:
    print(report_line)

【 最終商業驗收報告 】
AUC 分數: 70.46%
----------------------------------------
如果我們對「分數最高的前 5%」客戶行銷：
1. 行銷人數: 1368128 人
2. 成功抓到: 4828 個購買者
3. 名單精準度: 0.35% (原本平均只有 0.05%)
結論：模型讓行銷效率提升了 6.47 倍！


In [145]:
def _format_article_ids(article_ids):
    """Join article ids as zero-padded 10-digit numbers separated by spaces."""
    return ' '.join(f'{int(article_id):010d}' for article_id in article_ids)


# --- 1. Score the test data ---
# X_test holds the test features; test_set is the full frame that still
# carries customer_id and article_id.
# Make sure the X_test columns are in exactly the same order as X_train.
print("正在預測全體測試資料分數...")
test_probs = model.predict_proba(X_test)[:, 1]

# Attach the scores to the full frame.
test_set['score'] = test_probs

# --- 2. Keep each customer's 12 best-scored rows ---
print("正在為每位顧客挑選 Top 12 商品...")

# Customers ascending, scores descending, so each customer's best rows come first.
test_set = test_set.sort_values(['customer_id', 'score'], ascending=[True, False])
top12_df = test_set.groupby('customer_id').head(12)

# --- 3. One row per customer, articles joined into a single string ---
print("正在格式化輸出...")

predictions = top12_df.groupby('customer_id')['article_id'].apply(_format_article_ids)
submission = predictions.reset_index()
submission.columns = ['customer_id', 'prediction']

# --- 4. Customers with fewer than 12 candidates (optional) ---
# Nothing is padded here: such customers simply get a shorter list. Popular
# items could be used to fill the remaining slots if a full 12 is required.

# --- 5. Export to Excel ---
print("正在匯出 Excel...")
submission.to_excel("submission_top12.xlsx", index=False)
print("✅ 成功產生 submission_top12.xlsx")

正在預測全體測試資料分數...
正在為每位顧客挑選 Top 12 商品...
正在格式化輸出...
正在匯出 Excel...
✅ 成功產生 submission_top12.xlsx
