# Background
- **Author**: `<郭伊軒>`
- **Created At**: `<2025-11-1>`
- **Path to Training Data： discount-timing-DE.csv**
- **Path to Testing Data： discount-timing-DE.csv**
- **Model Specification 
  - Method：XGBoost classifer
  - Variables：   
    ['Age','AccumulatedPositiveRate', 'MultiPlayer', 'SalePeriod', 'DiscountFreq3M',    
    'PlayerGrowthRate1W_lag0', 'PlayerGrowthRate1W_lag7', 'PlayerGrowthRate1W_lag14',   
    'FollowersGrowthRate1W_lag0', 'FollowersGrowthRate1W_lag7', 'FollowersGrowthRate1W_lag14',   
    'PositiveRateGrowthRate1W_lag0', 'PositiveRateGrowthRate1W_lag7', 'PositiveRateGrowthRate1W_lag14',   
    'DLC_sum_1W_lag0', 'DLC_sum_1W_lag7', 'DLC_sum_1W_lag14',   
    'Sequel_sum_1W_lag0', 'Sequel_sum_1W_lag7', 'Sequel_sum_1W_lag14']
  - Tuning Parameters：    
    ['n_estimators', 'max_depth', 'learning_rate', 'min_child_weight', gamma', 'subsample', 'colsample_bytree',          'scale_pos_weight']
  - Optimization Method：  
    - 非季節折扣  
      {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 100, 'scale_pos_weight': 96.25, 'subsample': 1.0}
    - 季節折扣  
      {'colsample_bytree': 0.8, 'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 4, 'min_child_weight': 3, 'n_estimators': 100, 'scale_pos_weight': 94.62011173184358, 'subsample': 0.8}
    
- **Main Findings and Takeaways：**
  - In-sample `<AUC>`:   
    DiscountOutOfSale(`0.9273`), DiscountDuringSale(`0.9733`)
  - Out-sample `<AUC>`:  
    DiscountOutOfSale(`0.7628`), DiscountDuringSale(`0.9720`)
  - Feature Importance Ranking:
    - 非季節折扣  
      | 1 | DiscountFreq3M  
      | 2 | SalePeriod   
      | 3 | PlayerGrowthRate1W_lag0  
      | 4 | PlayerGrowthRate1W_lag14  
      | 5 | FollowersGrowthRate1W_lag0      
    - 季節折扣  
      | 1 | SalePeriod  
      | 2 | PlayerGrowthRate1W_lag0   
      | 3 | DiscountFreq3M  
      | 4 | PositiveRateGrowthRate1W_lag0  
      | 5 | PlayerGrowthRate1W_lag14         
- **Future Direciton：**

### Pre-processing

In [48]:
# Load packages here
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
from xgboost import XGBClassifier, plot_importance
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit, cross_val_score
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, make_scorer
from sklearn.inspection import permutation_importance


In [49]:
# Load the TRAINING data here and please finish all the data manipulation here.
#input_data_file = "/Users/10610/Desktop/114-1 資料/steam-project/discount-timing-DE.csv"
input_data_file = "/Users/user/Desktop/114-1 資料/steam-project/discount-timing-DE.csv"
df = pd.read_csv(input_data_file)
df_dummies = pd.get_dummies(df, columns=['GameID'], drop_first=True)
df_dummies.dropna(inplace=True)

train = df_dummies[df_dummies['Date'] < '2025-01-01']
test = df_dummies[df_dummies['Date'] >= '2025-01-01']

def prepare_xy(df, feature_cols, target_col):
    X = df[feature_cols].copy()
    y = df[target_col].copy()
    # 將 bool 欄轉成 int
    X = X.astype({col: 'int' for col in X.select_dtypes(bool).columns})
    return X, y


In [50]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
GameID,23938.0,461376.742,298559.181056,10.0,244850.0,431730.0,644930.0,1145360.0
MultiPlayer,23938.0,0.464241,0.49873,0.0,0.0,0.0,1.0,1.0
ConstantDiscount,23938.0,0.214387,0.410405,0.0,0.0,0.0,0.0,1.0
DiscountOrNot,23938.0,0.019885,0.139607,0.0,0.0,0.0,0.0,1.0
DiscountDuration,23938.0,0.221196,1.715483,0.0,0.0,0.0,0.0,32.0
DiscountFreq3M,23938.0,1.797644,1.043279,0.0,1.0,2.0,3.0,6.0
Age,23938.0,7.634427,4.458471,2.389041,4.95137,6.323288,8.479452,24.84658
AccumulatedPositiveRate,23938.0,0.928061,0.064186,0.738751,0.905517,0.953165,0.972651,0.9929734
SalePeriod,23938.0,0.14642,0.353534,0.0,0.0,0.0,0.0,1.0
DiscountDuringSale,23938.0,0.008647,0.09259,0.0,0.0,0.0,0.0,1.0


## function

In [69]:
def evaluate_model(name, model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)

    '''fig, axes = plt.subplots(1, 2, figsize=(15, 6))

    # --- 第一張圖：Weight (使用頻率) ---
    xgb.plot_importance(model, importance_type='weight', ax=axes[0])
    axes[0].set_title("Feature Importance (Weight/Frequency)")
    axes[0].set_xlabel("Frequency (Count)")

    # --- 第二張圖：Gain (增益/影響力) ---
    xgb.plot_importance(model, importance_type='gain', ax=axes[1])
    axes[1].set_title("Feature Importance (Gain/Impact)")
    axes[1].set_xlabel("Gain (Average improvement)")

    plt.tight_layout() # 自動調整佈局，避免標籤重疊
    plt.show()'''



    y_pred_train = model.predict(X_train)
    y_prob_train = model.predict_proba(X_train)[:, 1]

    y_pred_test = model.predict(X_test)
    y_prob_test = model.predict_proba(X_test)[:, 1]

    acc_train = accuracy_score(y_train, y_pred_train)
    f1_train = f1_score(y_train, y_pred_train)
    auc_train = roc_auc_score(y_train, y_prob_train)

    acc_test = accuracy_score(y_test, y_pred_test)
    f1_test = f1_score(y_test, y_pred_test)
    auc_test = roc_auc_score(y_test, y_prob_test)
    cm = confusion_matrix(y_test, y_pred_test)

    results = {
        'Accuracy': [round(acc_train, 4), round(acc_test, 4)],
        'F1 score': [round(f1_train, 4), round(f1_test, 4)],
        'AUC': [round(auc_train, 4), round(auc_test, 4)]
    }

    row_names = ['train', 'test']

    result = pd.DataFrame(results, index=row_names)


    print(f"\n=== {name} ===")
    print("Confusion matrix:\n", cm)
    return result


In [52]:
def find_best_params_grid_searchCV(X_train, y_train, X_test, y_test, param_grid):
    # 1. 初始化 XGBClassifier
    xgb_clf = XGBClassifier(
        random_state=71, 
        objective="binary:logistic", 
        use_label_encoder=False, 
        eval_metric='logloss' # 設置一個預設的評估指標以避免警告
    )
    
    # 2. 時間序列交叉驗證
    tscv = TimeSeriesSplit(n_splits=5)

    # 3. 定義評分標準
    scorer = make_scorer(roc_auc_score, needs_proba=True)
    
    # 4. 初始化 GridSearchCV
    grid_search = GridSearchCV(
        estimator=xgb_clf,
        param_grid=param_grid,
        scoring=scorer,       # 使用定義好的評分標準
        cv=tscv,               # 使用分層交叉驗證
        verbose=1,            # 顯示進度
        n_jobs=-1             # 使用所有可用的 CPU 核心進行並行計算
    )
    
    # 5. 執行網格搜索
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_

    y_pred_train = best_model.predict(X_train)
    y_pred_test = best_model.predict(X_test)

    y_prob_train = best_model.predict_proba(X_train)[:, 1]
    y_prob_test = best_model.predict_proba(X_test)[:, 1]


    acc_train = accuracy_score(y_train, y_pred_train)
    f1_train = f1_score(y_train, y_pred_train)
    auc_train = roc_auc_score(y_train, y_prob_train)

    acc_test = accuracy_score(y_test, y_pred_test)
    f1_test = f1_score(y_test, y_pred_test)
    auc_test = roc_auc_score(y_test, y_prob_test)


    results = {
        'Accuracy': [round(acc_train, 4), round(acc_test, 4)],
        'F1 score': [round(f1_train, 4), round(f1_test, 4)],
        'AUC': [round(auc_train, 4), round(auc_test, 4)]
    }

   
    row_names = ['train', 'test']

    result = pd.DataFrame(results, index=row_names)
    
    # 返回最佳模型
    return grid_search.best_params_, result



# 1W

In [53]:
feature_cols = [
    'Age', 'SalePeriod', 'AccumulatedPositiveRate', "MultiPlayer", 'DiscountFreq3M', 
    'PlayerGrowthRate1W_lag0', 'PlayerGrowthRate1W_lag7', 'PlayerGrowthRate1W_lag14',
    'FollowersGrowthRate1W_lag0', 'FollowersGrowthRate1W_lag7', 'FollowersGrowthRate1W_lag14',
    'PositiveRateGrowthRate1W_lag0', 'PositiveRateGrowthRate1W_lag7', 'PositiveRateGrowthRate1W_lag14',
    'DLC_sum_1W_lag0', 'DLC_sum_1W_lag7', 'DLC_sum_1W_lag14',
    'Sequel_sum_1W_lag0', 'Sequel_sum_1W_lag7', 'Sequel_sum_1W_lag14'
] + [col for col in df_dummies.columns if col.startswith('GameID_')]

baseline_model = XGBClassifier( 
    random_state=71, 
    objective="binary:logistic", 
    use_label_encoder=False, 
    eval_metric='logloss'
)

### 非季節性折扣

In [54]:
X_train, y_train = prepare_xy(train, feature_cols, 'DiscountOutOfSale')
X_test, y_test = prepare_xy(test, feature_cols, 'DiscountOutOfSale')

#### 調參數

In [55]:
pos_weight = (len(y_train) - sum(y_train)) / sum(y_train)
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5, 6, 7],
    'learning_rate': [0.1, 0.2],
    'min_child_weight': [1, 2, 3],
    'gamma': [0, 0.1],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'scale_pos_weight': [pos_weight]
}

best_param, result = find_best_params_grid_searchCV(X_train, y_train, X_test, y_test, param_grid)
print(result)
print(best_param)



Fitting 5 folds for each of 720 candidates, totalling 3600 fits
       Accuracy  F1 score     AUC
train    0.7946    0.0901  0.9486
test     0.6961    0.0556  0.7627
{'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 100, 'scale_pos_weight': 96.25, 'subsample': 1.0}


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


#### 用Gain Importance過濾變數

In [None]:
best_xgb = XGBClassifier( 
    random_state=71, 
    objective="binary:logistic",  
    eval_metric='auc',
    n_estimators = 100,
    max_depth = 3,
    min_child_weight = 2,
    learning_rate = 0.1,
    gamma = 0,
    subsample = 1,
    colsample_bytree = 0.8,
    scale_pos_weight = pos_weight
)

best_xgb.fit(X_train, y_train)

booster = best_xgb.get_booster()
gain_score = booster.get_score(importance_type='gain')

gain_df = pd.DataFrame({
    "feature": list(gain_score.keys()),
    "gain": list(gain_score.values())
}).sort_values(by="gain", ascending=False)

gain_threshold = 0.001

selected_gain = gain_df[gain_df['gain'] > gain_threshold]['feature'].tolist()

print("=== Gain Importance ===")
print(gain_df)

=== Gain Importance ===
                           feature        gain
1                       SalePeriod  635.675720
3                   DiscountFreq3M  481.303650
9      FollowersGrowthRate1W_lag14  208.823654
7       FollowersGrowthRate1W_lag0  206.195236
6         PlayerGrowthRate1W_lag14  198.026489
4          PlayerGrowthRate1W_lag0  182.768723
5          PlayerGrowthRate1W_lag7  177.648041
2          AccumulatedPositiveRate  155.686554
12  PositiveRateGrowthRate1W_lag14  150.847000
11   PositiveRateGrowthRate1W_lag7  143.248367
10   PositiveRateGrowthRate1W_lag0  140.089920
8       FollowersGrowthRate1W_lag7  135.298141
14                     GameID_4000  131.549103
15                   GameID_244850  122.915207
17                   GameID_582660  110.958168
0                              Age  104.469177
19                   GameID_814380   71.842880
13                 DLC_sum_1W_lag0   48.424881
18                   GameID_644930   44.885479
16                   GameID_431960  

#### 排序變數重要程度

In [None]:
best_xgb.fit(X_train[selected_gain], y_train)

r = permutation_importance(
    best_xgb, X_train[selected_gain], y_train,
    scoring='roc_auc',
    n_repeats=10,
    random_state=42
)

perm_df = pd.DataFrame({
    "feature": selected_gain,
    "importance": r.importances_mean
}).sort_values(by="importance", ascending=False)

perm_selected = perm_df[perm_df["importance"] > 0]["feature"].tolist()

print("=== Permutation Importance ===")
print(perm_df)

=== Permutation Importance ===
                           feature    importance
1                   DiscountFreq3M  2.277777e-01
0                       SalePeriod  1.098838e-01
5          PlayerGrowthRate1W_lag0  4.228646e-02
4         PlayerGrowthRate1W_lag14  2.981507e-02
3       FollowersGrowthRate1W_lag0  2.751266e-02
6          PlayerGrowthRate1W_lag7  2.724563e-02
10   PositiveRateGrowthRate1W_lag0  2.206083e-02
8   PositiveRateGrowthRate1W_lag14  1.461441e-02
15                             Age  1.430845e-02
9    PositiveRateGrowthRate1W_lag7  1.278703e-02
11      FollowersGrowthRate1W_lag7  1.073149e-02
7          AccumulatedPositiveRate  1.049411e-02
2      FollowersGrowthRate1W_lag14  6.159037e-03
13                   GameID_244850  3.252690e-03
16                   GameID_814380  1.674593e-03
17                 DLC_sum_1W_lag0  2.067290e-04
19                   GameID_431960  5.551115e-17
12                     GameID_4000  0.000000e+00
14                   GameID_582660  0.

#### 刪除多餘的變數

In [67]:
tscv = TimeSeriesSplit(n_splits=5)

def get_auc(cols):
    return cross_val_score(
        best_xgb,
        X_train[cols], y_train,
        cv=tscv, scoring='roc_auc'
    ).mean()

auc_base = get_auc(perm_selected)
print("baseline AUC:", auc_base)

retain = []
auc_threshold = 0.003

for col in perm_selected:
    reduced = [c for c in perm_selected if c != col]
    auc = cross_val_score(best_xgb, X_train[reduced], y_train, cv=tscv, scoring='roc_auc').mean()

    if auc_base - auc >= auc_threshold:
        retain.append(col)

print("Keep after AUC drop test:", retain)


baseline AUC: 0.7506076536144286
Keep after AUC drop test: ['DiscountFreq3M', 'SalePeriod', 'PlayerGrowthRate1W_lag0']


#### 模型效果

In [70]:
X_train_final = X_train[retain]
X_test_final = X_test[retain]

result1 = evaluate_model('baseline', baseline_model, X_train_final, y_train, X_test_final, y_test)
result2 = evaluate_model('selection', best_xgb, X_train_final, y_train, X_test_final, y_test)


combined_results = pd.concat([result1, result2], keys=['baseline', 'selection'])
print("\n模型比較結果:")
print(combined_results)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



=== baseline ===
Confusion matrix:
 [[6729    0]
 [  93    0]]

=== selection ===
Confusion matrix:
 [[4263 2466]
 [  24   69]]

模型比較結果:
                 Accuracy  F1 score     AUC
baseline  train    0.9898    0.0113  0.9348
          test     0.9864    0.0000  0.7132
selection train    0.7246    0.0617  0.8786
          test     0.6350    0.0525  0.7576


### 季節性折扣

In [71]:
X_train, y_train = prepare_xy(train, feature_cols, 'DiscountDuringSale')
X_test, y_test = prepare_xy(test, feature_cols, 'DiscountDuringSale')

#### 調參數

In [72]:
pos_weight = (len(y_train) - sum(y_train)) / sum(y_train)
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5, 6, 7],
    'learning_rate': [0.1, 0.2],
    'min_child_weight': [1, 2, 3],
    'gamma': [0, 0.1],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'scale_pos_weight': [pos_weight]
}

best_param, result = find_best_params_grid_searchCV(X_train, y_train, X_test, y_test, param_grid)
print(result)
print(best_param)



Fitting 5 folds for each of 720 candidates, totalling 3600 fits
       Accuracy  F1 score     AUC
train    0.9465    0.2810  0.9952
test     0.9483    0.1108  0.9708
{'colsample_bytree': 0.8, 'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 4, 'min_child_weight': 3, 'n_estimators': 100, 'scale_pos_weight': 94.62011173184358, 'subsample': 0.8}


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


#### 用Gain Importance過濾變數

In [73]:
best_xgb = XGBClassifier( 
    random_state=71, 
    objective="binary:logistic",  
    eval_metric='auc',
    n_estimators = 100,
    max_depth = 4,
    min_child_weight = 3,
    learning_rate = 0.1,
    gamma = 0,
    subsample = 0.8,
    colsample_bytree = 0.8,
    scale_pos_weight = pos_weight
)

best_xgb.fit(X_train, y_train)

booster = best_xgb.get_booster()
gain_score = booster.get_score(importance_type='gain')

gain_df = pd.DataFrame({
    "feature": list(gain_score.keys()),
    "gain": list(gain_score.values())
}).sort_values(by="gain", ascending=False)

gain_threshold = 0.001

selected_gain = gain_df[gain_df['gain'] > gain_threshold]['feature'].tolist()

print("=== Gain Importance ===")
print(gain_df)

=== Gain Importance ===
                           feature         gain
1                       SalePeriod  2391.855469
4                   DiscountFreq3M   112.973991
5          PlayerGrowthRate1W_lag0    89.153542
12   PositiveRateGrowthRate1W_lag7    78.362717
32                   GameID_881100    75.454437
30                   GameID_814380    74.470993
6          PlayerGrowthRate1W_lag7    69.034073
7         PlayerGrowthRate1W_lag14    66.182602
24                   GameID_457140    65.730629
2          AccumulatedPositiveRate    63.280010
8       FollowersGrowthRate1W_lag0    62.953403
17                   GameID_242760    59.131981
14                     GameID_3590    58.362545
13  PositiveRateGrowthRate1W_lag14    56.797211
10     FollowersGrowthRate1W_lag14    56.512192
25                   GameID_477160    56.059166
27                   GameID_582660    51.396744
3                      MultiPlayer    49.314743
19                   GameID_323190    48.800068
11   PositiveRat

#### 排序變數重要程度

In [74]:
best_xgb.fit(X_train[selected_gain], y_train)

r = permutation_importance(
    best_xgb, X_train[selected_gain], y_train,
    scoring='roc_auc',
    n_repeats=10,
    random_state=42
)

perm_df = pd.DataFrame({
    "feature": selected_gain,
    "importance": r.importances_mean
}).sort_values(by="importance", ascending=False)

perm_selected = perm_df[perm_df["importance"] > 0]["feature"].tolist()

print("=== Permutation Importance ===")
print(perm_df)

=== Permutation Importance ===
                           feature    importance
0                       SalePeriod  1.672710e-01
2          PlayerGrowthRate1W_lag0  2.456187e-02
1                   DiscountFreq3M  1.114294e-02
19   PositiveRateGrowthRate1W_lag0  6.575469e-03
7         PlayerGrowthRate1W_lag14  6.144938e-03
10      FollowersGrowthRate1W_lag0  5.829292e-03
14     FollowersGrowthRate1W_lag14  4.964091e-03
13  PositiveRateGrowthRate1W_lag14  4.917369e-03
3    PositiveRateGrowthRate1W_lag7  4.566941e-03
9          AccumulatedPositiveRate  3.804338e-03
6          PlayerGrowthRate1W_lag7  2.619616e-03
20      FollowersGrowthRate1W_lag7  2.424166e-03
24                             Age  2.315861e-03
21                   GameID_703080  5.408805e-04
8                    GameID_457140  4.636802e-04
23                   GameID_233860  4.068973e-04
12                     GameID_3590  3.808725e-04
11                   GameID_242760  2.033167e-04
15                   GameID_477160  1.

#### 刪除多餘的變數

In [75]:
tscv = TimeSeriesSplit(n_splits=5)

def get_auc(cols):
    return cross_val_score(
        best_xgb,
        X_train[cols], y_train,
        cv=tscv, scoring='roc_auc'
    ).mean()

auc_base = get_auc(perm_selected)
print("baseline AUC:", auc_base)

retain = []
auc_threshold = 0.003

for col in perm_selected:
    reduced = [c for c in perm_selected if c != col]
    auc = cross_val_score(best_xgb, X_train[reduced], y_train, cv=tscv, scoring='roc_auc').mean()

    if auc_base - auc >= auc_threshold:
        retain.append(col)

print("Keep after AUC drop test:", retain)


baseline AUC: 0.9524824821439466
Keep after AUC drop test: ['SalePeriod', 'PlayerGrowthRate1W_lag0', 'DiscountFreq3M']


#### 模型效果

In [76]:
X_train_final = X_train[retain]
X_test_final = X_test[retain]

result1 = evaluate_model('baseline', baseline_model, X_train_final, y_train, X_test_final, y_test)
result2 = evaluate_model('selection', best_xgb, X_train_final, y_train, X_test_final, y_test)


combined_results = pd.concat([result1, result2], keys=['baseline', 'selection'])
print("\n模型比較結果:")
print(combined_results)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



=== baseline ===
Confusion matrix:
 [[6794    0]
 [  28    0]]

=== selection ===
Confusion matrix:
 [[6316  478]
 [   0   28]]

模型比較結果:
                 Accuracy  F1 score     AUC
baseline  train    0.9898    0.0838  0.9850
          test     0.9959    0.0000  0.9718
selection train    0.9076    0.1846  0.9733
          test     0.9299    0.1049  0.9720
