In [1]:
# Data-handling and plotting libraries used by the notebook.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Let displayed DataFrames show up to 60 columns before pandas truncates them.
pd.set_option('display.max_columns', 60)
# NOTE(review): presumably a project-local helper module; it is not referenced
# in the cells visible here — confirm it is still needed.
import data_processing
import warnings
# NOTE(review): this silences *every* warning for the rest of the session,
# including deprecation/convergence warnings from pandas, sklearn and xgboost.
warnings.filterwarnings("ignore")
from pathlib import Path

In [2]:
def _read_user_table(csv_path):
    """Read one CSV file and use its `user_id` column as the row index."""
    return pd.read_csv(csv_path, index_col="user_id")


# Label tables shipped in the `data_analytics_datagame` folder.
l_train_s = _read_user_table('data_analytics_datagame/light_train_source_labels.csv')
l_train_t = _read_user_table('data_analytics_datagame/light_train_target_labels.csv')
l_test_s = _read_user_table('data_analytics_datagame/light_test_source_labels.csv')

# `slot_*` tables read from the current working directory.
slot_train_pr = _read_user_table("slot_train_pr.csv")
slot_test_pr = _read_user_table("slot_test_pr.csv")
slot_train_log = _read_user_table("slot_train_log.csv")
slot_test_log = _read_user_table("slot_test_log.csv")

# Fine-Tuning: Tree Structure

In [3]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, balanced_accuracy_score, average_precision_score, roc_auc_score, roc_curve, auc
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import train_test_split

In [30]:
# Only the trailing columns of each slot-log table are used as extra features.
N_LOG_COLS = 112
train_log_tail = slot_train_log.iloc[:, -N_LOG_COLS:]
test_log_tail = slot_test_log.iloc[:, -N_LOG_COLS:]

# Source labels and the slot-log tail are placed side by side (aligned on user_id).
X_train = pd.concat([l_train_s, train_log_tail], axis=1)
X_test = pd.concat([l_test_s, test_log_tail], axis=1)
y_train = l_train_t

In [None]:
# Base estimator cloned by both searches below; only the tree-shape
# parameters are varied, everything else stays at these values.
model = xgb.XGBClassifier(
    device='cuda',
    objective='binary:logistic',
    eval_metric='auc',
    scale_pos_weight=7,
)

# Step 1: coarse random search (20 sampled combinations, 3-fold CV, ROC AUC).
param_dist = {
    "max_depth": [3, 5, 7],
    "min_child_weight": [1, 3, 5, 7, 9],
    "gamma": [0.1, 0.2, 0.3, 0.4, 0.5],
}
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    n_iter=20,
    scoring="roc_auc",
    cv=3,
    verbose=3,
    random_state=42,
)
random_search.fit(X_train, y_train)
best_params = random_search.best_params_
print("Best parameters from RandomizedSearchCV:", best_params)

# Step 2: narrow the range and fine-tune with an exhaustive grid search.
# Each parameter is tried at (best - step, best, best + step); negative
# candidates are dropped.
search_steps = {"max_depth": 1, "min_child_weight": 1, "gamma": 0.05}
param_grid = {
    key: [
        v
        for v in (best_params[key] - step, best_params[key], best_params[key] + step)
        if v >= 0
    ]
    for key, step in search_steps.items()
}
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring="roc_auc",
    cv=3,
    verbose=3,
)
grid_search.fit(X_train, y_train)

In [7]:
# Keep the estimator that GridSearchCV refitted with the winning combination.
best_model = grid_search.best_estimator_
print("Best parameters from GridSearchCV:", grid_search.best_params_)

# Predict and evaluate. This is an in-sample score: the AUC is computed on
# the same X_train / y_train that the search was fitted on.
y_pred_train = best_model.predict_proba(X_train)
train_auc = roc_auc_score(y_true=y_train, y_score=y_pred_train)
print(f"Final Model Train AUC: {train_auc:.4f}")

Best parameters from GridSearchCV: {'gamma': 0.05, 'max_depth': 2, 'min_child_weight': 8}
Final Model Train AUC: 0.8413


# Fine-Tuning: Learning Rate & N Estimators

In [None]:
# Tree-structure settings from the previous tuning stage, hard-coded from
# its printed result so this cell does not depend on that search object.
best_tree_params = dict(
    max_depth=2,
    min_child_weight=8,
    gamma=0.05,
    scale_pos_weight=7,
)

# Search space: every learning-rate / ensemble-size pair below is evaluated.
param_grid = {
    "learning_rate": [0.1, 0.2, 0.3],
    "n_estimators": [300, 500, 1000],
}

# XGBoost classifier with the tree-structure settings applied.
xgb_model = xgb.XGBClassifier(
    **best_tree_params,
    objective="binary:logistic",
    eval_metric="auc",
    device="cuda",
)

# Exhaustive grid search (3-fold CV, scored by ROC AUC).
# NOTE(review): this stage fits on `slot_train_pr`, whereas the tree-structure
# stage was fitted on a different feature matrix — confirm this is intended.
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring="roc_auc",
    cv=3,
    verbose=3,
)
grid_search.fit(slot_train_pr, l_train_t)

In [None]:
# Take the refitted best estimator, then report the winning parameters.
best_model = grid_search.best_estimator_
print("Best parameters from GridSearchCV:", grid_search.best_params_)

# Predict and evaluate on the training rows (in-sample score).
y_pred_train = best_model.predict_proba(slot_train_pr)
train_auc = roc_auc_score(y_true=y_train, y_score=y_pred_train)
print(f"Final Model Train AUC: {train_auc:.4f}")

Best parameters from GridSearchCV: {'learning_rate': 0.1, 'n_estimators': 300}
Final Model Train AUC: 0.8440


# Fine-Tuning: Subsample & Column Sampling

In [None]:
# Settings fixed by the earlier tuning stages (tree structure, learning rate
# and number of estimators), hard-coded from their printed results.
best_tree_eta_params = dict(
    learning_rate=0.1,
    n_estimators=300,
    max_depth=2,
    min_child_weight=8,
    gamma=0.05,
    scale_pos_weight=7,
)

# Search space: row subsampling and the two column-subsampling ratios.
sampling_ratios = [0.3, 0.5, 0.7]
param_grid = {
    'subsample': list(sampling_ratios),
    'colsample_bytree': list(sampling_ratios),
    'colsample_bylevel': list(sampling_ratios),
}

# XGBoost classifier with all previously tuned settings applied.
xgb_model = xgb.XGBClassifier(
    **best_tree_eta_params,
    objective="binary:logistic",
    eval_metric="auc",
    device="cuda",
)

# Exhaustive grid search (3-fold CV, scored by ROC AUC).
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring="roc_auc",
    cv=3,
    verbose=3,
)
grid_search.fit(slot_train_pr, l_train_t)

In [10]:
# Take the refitted best estimator, then report the winning parameters.
best_model = grid_search.best_estimator_
print("Best parameters from GridSearchCV:", grid_search.best_params_)

# Predict and evaluate on the training rows (in-sample score).
y_pred_train = best_model.predict_proba(slot_train_pr)
train_auc = roc_auc_score(y_true=y_train, y_score=y_pred_train)
print(f"Final Model Train AUC: {train_auc:.4f}")

Best parameters from GridSearchCV: {'colsample_bylevel': 0.3, 'colsample_bytree': 0.3, 'subsample': 0.3}
Final Model Train AUC: 0.8322


In [74]:
# Switch the working matrices to the full slot-log tables; targets are unchanged.
X_train, X_test = slot_train_log, slot_test_log
y_train = l_train_t

In [79]:
# Full set of tuned hyper-parameters collected from the fine-tuning sections.
best_params = dict(
    learning_rate=0.1,
    n_estimators=300,
    max_depth=2,
    min_child_weight=8,
    gamma=0.05,
    scale_pos_weight=7,
    colsample_bylevel=0.3,
    colsample_bytree=0.3,
    subsample=0.3,
)

# Build the classifier with the tuned settings and fit it on whatever
# X_train / y_train currently hold.
xgb_model = xgb.XGBClassifier(
    **best_params,
    objective="binary:logistic",
    eval_metric="auc",
    device="cuda",
)
xgb_model.fit(X_train, y_train)

# In-sample AUC on the rows the model was just trained on.
y_pred_train = xgb_model.predict_proba(X_train)
train_auc = roc_auc_score(y_true=y_train, y_score=y_pred_train)
print(f"Final Model Train AUC: {train_auc:.4f}")

Final Model Train AUC: 0.8507


In [76]:
# Turn the fitted model's predicted probabilities into feature tables that
# can be joined onto the source-label tables.
train_proba = xgb_model.predict_proba(X_train).astype("float64")
test_proba = xgb_model.predict_proba(X_test).astype("float64")

# One column per predicted output; derived from the prediction width rather
# than hard-coded so a change in the number of targets cannot break the cell.
slot_log_cols = [f"time_slot_log{i}" for i in range(train_proba.shape[1])]

# Rows of predict_proba follow the row order of the matrix that was passed in,
# so each frame is labelled with that matrix's own index. Labelling them with
# another table's index would silently mis-assign predictions whenever the two
# CSVs are not in the same row order; the later pd.concat aligns on user_id.
log_predict_train = pd.DataFrame(train_proba,
                                 index=X_train.index,
                                 columns=slot_log_cols)

log_predict_test = pd.DataFrame(test_proba,
                                index=X_test.index,
                                columns=slot_log_cols)

In [78]:
# Stacked feature matrices: source labels side by side with the model's
# predicted probabilities (aligned on the row index).
train_parts = [l_train_s, log_predict_train]
test_parts = [l_test_s, log_predict_test]

X_train = pd.concat(train_parts, axis=1)
X_test = pd.concat(test_parts, axis=1)
y_train = l_train_t

In [80]:
# Fit the second-stage model on the stacked features before predicting.
# Run top to bottom, `xgb_model` was fitted on the slot-log columns only, so
# calling it on the stacked X_test fails with a feature mismatch; the notebook
# previously relied on re-running the fitting cell out of order. Fitting here
# makes the cell work under Restart & Run All and leaves `xgb_model` untouched.
# NOTE(review): the stacked training features are in-sample predictions of the
# first-stage model, which can leak the targets; consider out-of-fold
# predictions — confirm with the author.
stack_model = xgb.XGBClassifier(
    objective="binary:logistic",
    eval_metric="auc",
    device="cuda",
    **best_params,
)
stack_model.fit(X_train, y_train)

# Label each prediction row with the index of the matrix it was computed from.
test_prediction = pd.DataFrame(stack_model.predict_proba(X_test),
                               index=X_test.index,
                               columns=[f"time_slot_{i}" for i in range(28)])

In [81]:
# Write the test-set predictions (index included) to the working directory.
test_prediction.to_csv(path_or_buf='test_prediction.csv')