# Background
- **Author**: `<郭伊軒>`
- **Created At**: `<2025-11-1>`
- **Path to Training Data： discount-timing-DE.csv**
- **Path to Testing Data： discount-timing-DE.csv**
- **Model Specification 
    - Method：logistic regression
    - Variables：  
    ['Age', 'PlayerGrowthRate1W', 'FollowersGrowthRate1W', 'PositiveRateGrowthRate1W', 'SalePeriod', 'AccumulatedPositiveRate', 'DLC_sum_1W', 'Sequel_sum_1W']
    - Tuning Parameters：
    - Optimization Method：
- **Main Findings and Takeaways：**
    - In-sample `<metric>`:
    - Out-sample `<metric>`:
- **Future Direciton：**

In [118]:
# Load packages here
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
import statsmodels.api as sm
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix
from statsmodels.stats.outliers_influence import variance_inflation_factor



In [119]:
# Load the TRAINING data here and please finish all the data manipulation here.
input_data_file = "/Users/10610/Desktop/114-1 資料/steam-project/discount-timing-DE.csv"
df = pd.read_csv(input_data_file)
df_dummies = pd.get_dummies(df, columns=['GameID'], drop_first=True)

train = df_dummies[df_dummies['Date'] < '2025-01-01']
test = df_dummies[df_dummies['Date'] >= '2025-01-01']

def prepare_xy(df, feature_cols, target_col):
    X = df[feature_cols].copy()
    y = df[target_col].copy()
    # 將 bool 欄轉成 int
    X = X.astype({col: 'int' for col in X.select_dtypes(bool).columns})
    X = sm.add_constant(X)
    return X, y


In [120]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
GameID,23938.0,461376.742,298559.181056,10.0,244850.0,431730.0,644930.0,1145360.0
MultiPlayer,23938.0,0.464241,0.49873,0.0,0.0,0.0,1.0,1.0
ConstantDiscount,23938.0,0.214387,0.410405,0.0,0.0,0.0,0.0,1.0
DiscountOrNot,23938.0,0.019885,0.139607,0.0,0.0,0.0,0.0,1.0
DiscountDuration,23938.0,0.221196,1.715483,0.0,0.0,0.0,0.0,32.0
DiscountFreq3M,23938.0,1.797644,1.043279,0.0,1.0,2.0,3.0,6.0
Age,23938.0,7.634427,4.458471,2.389041,4.95137,6.323288,8.479452,24.84658
AccumulatedPositiveRate,23938.0,0.928061,0.064186,0.738751,0.905517,0.953165,0.972651,0.9929734
SalePeriod,23938.0,0.14642,0.353534,0.0,0.0,0.0,0.0,1.0
DiscountDuringSale,23938.0,0.008647,0.09259,0.0,0.0,0.0,0.0,1.0


### The actual modeling starts below
For the remaining blocks, make sure you have followed the guidelines as specified in [專案資料夾結構、檔案命名與文件規範](https://docs.google.com/document/d/1sl6gEFMdmiGsiNjLe17UmZ30xKxq15U0Mb2B-Jvusxg/edit?tab=t.33iie8ybx7s4).


In [121]:
def evaluate_model(name, model, X_test, y_test):
    y_prob = model.predict(X_test)
    y_pred = (y_prob >= 0.5).astype(int)

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob)
    cm = confusion_matrix(y_test, y_pred)

    print(f"\n=== {name} ===")
    print(f"Accuracy: {acc:.4f}")
    print(f"F1-score: {f1:.4f}")
    print(f"AUC: {auc:.4f}")
    print("Confusion matrix:\n", cm)
    return {"Model": name, "Accuracy": acc, "F1": f1, "AUC": auc}

# 1W

In [151]:
feature_cols = [
    'Age', 'PlayerGrowthRate1W', 'FollowersGrowthRate1W', 'PositiveRateGrowthRate1W', 
    'SalePeriod', 'DLC_sum_1W', 'Sequel_sum_1W'
] + [col for col in df_dummies.columns if col.startswith('GameID_')]


### 所有折扣

#### model summary

In [152]:
X_train, y_train = prepare_xy(train, feature_cols, 'DiscountOrNot')
X_test, y_test = prepare_xy(test, feature_cols, 'DiscountOrNot') 

logit_model = sm.Logit(y_train, X_train).fit(method='bfgs', maxiter=100)
print(logit_model.summary())

  res = _minimize_bfgs(f, x0, args, fprime, callback=callback, **opts)


         Current function value: 0.090318
         Iterations: 100
         Function evaluations: 104
         Gradient evaluations: 104
                           Logit Regression Results                           
Dep. Variable:          DiscountOrNot   No. Observations:                17116
Model:                          Logit   Df Residuals:                    17081
Method:                           MLE   Df Model:                           34
Date:                Sun, 09 Nov 2025   Pseudo R-squ.:                  0.1049
Time:                        22:37:24   Log-Likelihood:                -1545.9
converged:                      False   LL-Null:                       -1727.1
Covariance Type:            nonrobust   LLR p-value:                 1.352e-56
                               coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------
const                       -4.3054      2.75

PlayerGrowthRate1W、SalePeriod顯著

#### 共線性

In [149]:
#檢查共線性 AccumulatedPositiveRate 和 Age 有共線性問題
vif_data = pd.DataFrame()
vif_data["feature"] = X_train.columns[1:]  # 跳過常數項 'const'
vif_data["VIF"] = [
    variance_inflation_factor(X_train.iloc[:, 1:].values, i)
    for i in range(X_train.shape[1] - 1)
]
print(vif_data)

                     feature       VIF
0                        Age  4.034748
1         PlayerGrowthRate1W  1.217203
2      FollowersGrowthRate1W  4.855014
3   PositiveRateGrowthRate1W  1.525385
4                 SalePeriod  1.329102
5             DiscountFreq3M  7.956036
6                 DLC_sum_1W  1.116680
7              Sequel_sum_1W  1.016864
8                GameID_3590  1.676434
9                GameID_4000  1.556673
10             GameID_108600  1.576192
11             GameID_233860  1.500148
12             GameID_242760  1.171224
13             GameID_244210  1.945881
14             GameID_244850  1.229033
15             GameID_294100  1.240747
16             GameID_323190  1.473674
17             GameID_367520  1.195352
18             GameID_376210  1.335757
19             GameID_381210  1.476543
20             GameID_413150  1.446608
21             GameID_431730  1.483115
22             GameID_431960  1.292654
23             GameID_457140  1.115180
24             GameID_477

#### Wald test

In [153]:
# 1. 取得所有 dummy variable 的名稱列表
game_cols = [col for col in df_dummies.columns if col.startswith('GameID_')]
game_cnt = len(game_cols)
variable_cnt = len(feature_cols) + 1 # 包含常數項及其他變數的總數

# 2. 初始化 R 矩陣
R_matrix = np.zeros([game_cnt, variable_cnt])

# 3. 找出這些變數在模型參數列表中的位置，並設定 R 矩陣
for i, var_name in enumerate(game_cols):
    # 找到該變數在 model.params 中的索引位置
    param_index = logit_model.params.index.get_loc(var_name)
    R_matrix[i, param_index] = 1


print('\n unbalance')
print(logit_model.wald_test(R_matrix))


 unbalance
<Wald test (chi2): statistic=[[30.78915724]], p-value=0.27988385993194714, df_denom=27>




沒有明顯個體差異

#### 模型效果

In [126]:
smote = SMOTE(random_state=42)
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)
logit_model_sm = sm.Logit(y_train_sm, X_train_sm).fit(method='bfgs', maxiter=100)
result1 = evaluate_model('unbalance', logit_model, X_test, y_test)
result2 = evaluate_model('balance', logit_model_sm, X_test, y_test)

results = pd.DataFrame([result1, result2])
print("\n模型比較結果:")
print(results.sort_values(by="F1", ascending=False))


         Current function value: 0.582864
         Iterations: 100
         Function evaluations: 103
         Gradient evaluations: 103

=== unbalance ===
Accuracy: 0.9823
F1-score: 0.0000
AUC: 0.6676
Confusion matrix:
 [[6701    0]
 [ 121    0]]

=== balance ===
Accuracy: 0.7573
F1-score: 0.0633
AUC: 0.6708
Confusion matrix:
 [[5110 1591]
 [  65   56]]

模型比較結果:
       Model  Accuracy        F1       AUC
1    balance  0.757256  0.063348  0.670842
0  unbalance  0.982263  0.000000  0.667612


  res = _minimize_bfgs(f, x0, args, fprime, callback=callback, **opts)


有經過平衡處理的模型表現比較好

### 季節性折扣

#### model summary

In [127]:
X_train, y_train = prepare_xy(train, feature_cols, 'DiscountDuringSale')
X_test, y_test = prepare_xy(test, feature_cols, 'DiscountDuringSale')
logit_model = sm.Logit(y_train, X_train).fit(method='bfgs', maxiter=100)
print(logit_model.summary())

         Current function value: 0.033763
         Iterations: 100
         Function evaluations: 106
         Gradient evaluations: 106
                           Logit Regression Results                           
Dep. Variable:     DiscountDuringSale   No. Observations:                17116
Model:                          Logit   Df Residuals:                    17081
Method:                           MLE   Df Model:                           34
Date:                Sun, 09 Nov 2025   Pseudo R-squ.:                  0.4188
Time:                        20:16:14   Log-Likelihood:                -577.89
converged:                      False   LL-Null:                       -994.37
Covariance Type:            nonrobust   LLR p-value:                5.460e-153
                               coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------
const                      -12.5249      5.89

  res = _minimize_bfgs(f, x0, args, fprime, callback=callback, **opts)


PlayerGrowthRate1W顯著 salePeriod(0.009)

#### 模型效果

In [128]:
smote = SMOTE(random_state=42)
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)
logit_model_sm = sm.Logit(y_train_sm, X_train_sm).fit()
result1 = evaluate_model('unbalance', logit_model, X_test, y_test)
result2 = evaluate_model('balance', logit_model_sm, X_test, y_test)

results = pd.DataFrame([result1, result2])
print("\n模型比較結果:")
print(results.sort_values(by="F1", ascending=False))


         Current function value: 0.162262
         Iterations: 35

=== unbalance ===
Accuracy: 0.9959
F1-score: 0.0000
AUC: 0.9701
Confusion matrix:
 [[6794    0]
 [  28    0]]

=== balance ===
Accuracy: 0.9227
F1-score: 0.0929
AUC: 0.9728
Confusion matrix:
 [[6268  526]
 [   1   27]]

模型比較結果:
       Model  Accuracy        F1       AUC
1    balance  0.922750  0.092943  0.972770
0  unbalance  0.995896  0.000000  0.970126




有經過平衡處理的模型表現比較好

### 非季節性折扣

#### model summary

In [129]:
X_train, y_train = prepare_xy(train, feature_cols, 'DiscountOutOfSale')
X_test, y_test = prepare_xy(test, feature_cols, 'DiscountOutOfSale')
logit_model = sm.Logit(y_train, X_train).fit(method='bfgs', maxiter=100)
print(logit_model.summary())

         Current function value: 0.052374
         Iterations: 100
         Function evaluations: 104
         Gradient evaluations: 104
                           Logit Regression Results                           
Dep. Variable:      DiscountOutOfSale   No. Observations:                17116
Model:                          Logit   Df Residuals:                    17081
Method:                           MLE   Df Model:                           34
Date:                Sun, 09 Nov 2025   Pseudo R-squ.:                 0.08593
Time:                        20:16:15   Log-Likelihood:                -896.43
converged:                      False   LL-Null:                       -980.69
Covariance Type:            nonrobust   LLR p-value:                 9.619e-20
                               coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------
const                       -4.0640      3.87

  res = _minimize_bfgs(f, x0, args, fprime, callback=callback, **opts)


DLC_sum_1W(0.001)、SalePeriod(0.018)

#### 模型效果

In [130]:
smote = SMOTE(random_state=42)
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)
logit_model_sm = sm.Logit(y_train_sm, X_train_sm).fit(method='bfgs', maxiter=100)
result1 = evaluate_model('unbalance', logit_model, X_test, y_test)
result2 = evaluate_model('balance', logit_model_sm, X_test, y_test)

results = pd.DataFrame([result1, result2])
print("\n模型比較結果:")
print(results.sort_values(by="F1", ascending=False))


         Current function value: 0.501236
         Iterations: 100
         Function evaluations: 103
         Gradient evaluations: 103

=== unbalance ===
Accuracy: 0.9864
F1-score: 0.0000
AUC: 0.6767
Confusion matrix:
 [[6729    0]
 [  93    0]]

=== balance ===
Accuracy: 0.6774
F1-score: 0.0476
AUC: 0.6693
Confusion matrix:
 [[4566 2163]
 [  38   55]]

模型比較結果:
       Model  Accuracy        F1      AUC
1    balance  0.677367  0.047598  0.66933
0  unbalance  0.986368  0.000000  0.67674


  res = _minimize_bfgs(f, x0, args, fprime, callback=callback, **opts)


# 2W

In [131]:
feature_cols = [
    'Age', 'PlayerGrowthRate2W', 'FollowersGrowthRate2W', 'PositiveRateGrowthRate2W', 
    'SalePeriod', 'DLC_sum_2W', 'Sequel_sum_2W'
] + [col for col in df_dummies.columns if col.startswith('GameID_')]

### 所有折扣

#### model summary

In [132]:
X_train, y_train = prepare_xy(train, feature_cols, 'DiscountOrNot')
X_test, y_test = prepare_xy(test, feature_cols, 'DiscountOrNot')
logit_model = sm.Logit(y_train, X_train).fit(method='bfgs', maxiter=100)
print(logit_model.summary())


         Current function value: 0.091056
         Iterations: 100
         Function evaluations: 105
         Gradient evaluations: 105
                           Logit Regression Results                           
Dep. Variable:          DiscountOrNot   No. Observations:                17116
Model:                          Logit   Df Residuals:                    17081
Method:                           MLE   Df Model:                           34
Date:                Sun, 09 Nov 2025   Pseudo R-squ.:                 0.09764
Time:                        20:16:17   Log-Likelihood:                -1558.5
converged:                      False   LL-Null:                       -1727.1
Covariance Type:            nonrobust   LLR p-value:                 1.298e-51
                               coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------
const                       -4.2500      2.73

  res = _minimize_bfgs(f, x0, args, fprime, callback=callback, **opts)


PlayerGrowthRate2W、SalePeriod顯著

#### 共線性

In [133]:
#檢查共線性 AccumulatedPositiveRate 有共線性問題
vif_data = pd.DataFrame()
vif_data["feature"] = X_train.columns[1:]  # 跳過常數項 'const'
vif_data["VIF"] = [
    variance_inflation_factor(X_train.iloc[:, 1:].values, i)
    for i in range(X_train.shape[1] - 1)
]
print(vif_data)

                     feature       VIF
0                        Age  3.815768
1         PlayerGrowthRate2W  1.264848
2      FollowersGrowthRate2W  6.184148
3   PositiveRateGrowthRate2W  1.740996
4                 SalePeriod  1.216721
5                 DLC_sum_2W  1.239650
6              Sequel_sum_2W  1.034641
7                GameID_3590  1.773400
8                GameID_4000  1.553381
9              GameID_108600  1.676252
10             GameID_233860  1.170403
11             GameID_242760  1.100615
12             GameID_244210  1.980919
13             GameID_244850  1.055709
14             GameID_294100  1.096492
15             GameID_323190  1.087423
16             GameID_367520  1.180141
17             GameID_376210  1.322967
18             GameID_381210  1.290005
19             GameID_413150  1.384933
20             GameID_431730  1.598567
21             GameID_431960  1.370347
22             GameID_457140  1.064835
23             GameID_477160  1.111121
24             GameID_548

#### Wald test

In [134]:
# 1. 取得所有 dummy variable 的名稱列表
game_cols = [col for col in df_dummies.columns if col.startswith('GameID_')]
game_cnt = len(game_cols)
variable_cnt = len(feature_cols) + 1 # 包含常數項及其他變數的總數

# 2. 初始化 R 矩陣
R_matrix = np.zeros([game_cnt, variable_cnt])

# 3. 找出這些變數在模型參數列表中的位置，並設定 R 矩陣
for i, var_name in enumerate(game_cols):
    # 找到該變數在 model.params 中的索引位置
    param_index = logit_model.params.index.get_loc(var_name)
    R_matrix[i, param_index] = 1


print('\n unbalance')
print(logit_model.wald_test(R_matrix))



 unbalance
<Wald test (chi2): statistic=[[34.04343982]], p-value=0.1647882324046394, df_denom=27>




個體沒有明顯差異

#### 模型效果

In [135]:
smote = SMOTE(random_state=42)
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)
logit_model_sm = sm.Logit(y_train_sm, X_train_sm).fit()
result1 = evaluate_model('unbalance', logit_model, X_test, y_test)
result2 = evaluate_model('balance', logit_model_sm, X_test, y_test)

results = pd.DataFrame([result1, result2])
print("\n模型比較結果:")
print(results.sort_values(by="F1", ascending=False))


         Current function value: 0.567112
         Iterations: 35

=== unbalance ===
Accuracy: 0.9823
F1-score: 0.0000
AUC: 0.6770
Confusion matrix:
 [[6701    0]
 [ 121    0]]

=== balance ===
Accuracy: 0.7675
F1-score: 0.0649
AUC: 0.6689
Confusion matrix:
 [[5181 1520]
 [  66   55]]

模型比較結果:
       Model  Accuracy        F1       AUC
1    balance  0.767517  0.064858  0.668907
0  unbalance  0.982263  0.000000  0.676987




### 季節性折扣

#### model summary

In [136]:
X_train, y_train = prepare_xy(train, feature_cols, 'DiscountDuringSale')
X_test, y_test = prepare_xy(test, feature_cols, 'DiscountDuringSale')
logit_model = sm.Logit(y_train, X_train).fit(method='bfgs', maxiter=100)
print(logit_model.summary())

         Current function value: 0.036569
         Iterations: 100
         Function evaluations: 105
         Gradient evaluations: 105
                           Logit Regression Results                           
Dep. Variable:     DiscountDuringSale   No. Observations:                17116
Model:                          Logit   Df Residuals:                    17081
Method:                           MLE   Df Model:                           34
Date:                Sun, 09 Nov 2025   Pseudo R-squ.:                  0.3705
Time:                        20:16:21   Log-Likelihood:                -625.92
converged:                      False   LL-Null:                       -994.37
Covariance Type:            nonrobust   LLR p-value:                5.565e-133
                               coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------
const                      -13.2889      8.97

  res = _minimize_bfgs(f, x0, args, fprime, callback=callback, **opts)


PlayerGrowthRate2W顯著

#### 模型效果

In [137]:
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)
logit_model_sm = sm.Logit(y_train_sm, X_train_sm).fit()
result1 = evaluate_model('unbalance', logit_model, X_test, y_test)
result2 = evaluate_model('balance', logit_model_sm, X_test, y_test)

results = pd.DataFrame([result1, result2])
print("\n模型比較結果:")
print(results.sort_values(by="F1", ascending=False))


         Current function value: 0.184827
         Iterations: 35

=== unbalance ===
Accuracy: 0.9959
F1-score: 0.0000
AUC: 0.9668
Confusion matrix:
 [[6794    0]
 [  28    0]]

=== balance ===
Accuracy: 0.9163
F1-score: 0.0835
AUC: 0.9688
Confusion matrix:
 [[6225  569]
 [   2   26]]

模型比較結果:
       Model  Accuracy        F1      AUC
1    balance  0.916300  0.083467  0.96878
0  unbalance  0.995896  0.000000  0.96683




### 非季節性折扣

#### model summary

In [138]:
X_train, y_train = prepare_xy(train, feature_cols, 'DiscountOutOfSale')
X_test, y_test = prepare_xy(test, feature_cols, 'DiscountOutOfSale')
logit_model = sm.Logit(y_train, X_train).fit(method='bfgs', maxiter=100)
print(logit_model.summary())

         Current function value: 0.052292
         Iterations: 100
         Function evaluations: 104
         Gradient evaluations: 104
                           Logit Regression Results                           
Dep. Variable:      DiscountOutOfSale   No. Observations:                17116
Model:                          Logit   Df Residuals:                    17081
Method:                           MLE   Df Model:                           34
Date:                Sun, 09 Nov 2025   Pseudo R-squ.:                 0.08735
Time:                        20:16:23   Log-Likelihood:                -895.03
converged:                      False   LL-Null:                       -980.69
Covariance Type:            nonrobust   LLR p-value:                 3.081e-20
                               coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------
const                       -4.0765      3.91

  res = _minimize_bfgs(f, x0, args, fprime, callback=callback, **opts)


PlayerGrowthRate2W(0.022)、SalePeriod(0.019)、DLC_sum_2W(0.011)

#### 模型效果

In [139]:
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)
logit_model_sm = sm.Logit(y_train_sm, X_train_sm).fit(method='bfgs', maxiter=100)
result1 = evaluate_model('unbalance', logit_model, X_test, y_test)
result2 = evaluate_model('balance', logit_model_sm, X_test, y_test)

results = pd.DataFrame([result1, result2])
print("\n模型比較結果:")
print(results.sort_values(by="F1", ascending=False))


         Current function value: 0.497677
         Iterations: 100
         Function evaluations: 103
         Gradient evaluations: 103

=== unbalance ===
Accuracy: 0.9864
F1-score: 0.0000
AUC: 0.6849
Confusion matrix:
 [[6729    0]
 [  93    0]]

=== balance ===
Accuracy: 0.7001
F1-score: 0.0493
AUC: 0.6766
Confusion matrix:
 [[4723 2006]
 [  40   53]]

模型比較結果:
       Model  Accuracy        F1       AUC
1    balance  0.700088  0.049257  0.676603
0  unbalance  0.986368  0.000000  0.684904


  res = _minimize_bfgs(f, x0, args, fprime, callback=callback, **opts)


# 增加遊戲類別

### 1W

In [140]:
feature_cols = ['Age', "MultiPlayer", 'PlayerGrowthRate1W', 'FollowersGrowthRate1W', 'PositiveRateGrowthRate1W', 
                'SalePeriod', 'DiscountFreq3M', 'DLC_sum_1W', 'Sequel_sum_1W']

#### model summry

In [141]:
# 應變數與自變數
X_train, y_train = prepare_xy(train, feature_cols, 'DiscountOrNot')
X_test, y_test = prepare_xy(test, feature_cols, 'DiscountOrNot')
logit_model = sm.Logit(y_train, X_train).fit(method='bfgs', maxiter=100)
print(logit_model.summary())


Optimization terminated successfully.
         Current function value: 0.087920
         Iterations: 71
         Function evaluations: 75
         Gradient evaluations: 75
                           Logit Regression Results                           
Dep. Variable:          DiscountOrNot   No. Observations:                17116
Model:                          Logit   Df Residuals:                    17106
Method:                           MLE   Df Model:                            9
Date:                Sun, 09 Nov 2025   Pseudo R-squ.:                  0.1287
Time:                        20:16:24   Log-Likelihood:                -1504.8
converged:                       True   LL-Null:                       -1727.1
Covariance Type:            nonrobust   LLR p-value:                 4.036e-90
                               coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------
const     

PlayerGrowthRate1W、SalePeriod、DiscountFreq3M、DLC_sum_1W(0.079)

#### 模型效果

In [142]:
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)
logit_model_sm = sm.Logit(y_train_sm, X_train_sm).fit(method='bfgs', maxiter=100)

result1 = evaluate_model('unbalance', logit_model, X_test, y_test)
result2 = evaluate_model('balance', logit_model_sm, X_test, y_test)

results = pd.DataFrame([result1, result2])
print("\n模型比較結果:")
print(results.sort_values(by="F1", ascending=False))

Optimization terminated successfully.
         Current function value: 0.532743
         Iterations: 74
         Function evaluations: 77
         Gradient evaluations: 77

=== unbalance ===
Accuracy: 0.9823
F1-score: 0.0000
AUC: 0.7374
Confusion matrix:
 [[6701    0]
 [ 121    0]]

=== balance ===
Accuracy: 0.7060
F1-score: 0.0764
AUC: 0.7369
Confusion matrix:
 [[4733 1968]
 [  38   83]]

模型比較結果:
       Model  Accuracy        F1       AUC
1    balance  0.705951  0.076427  0.736874
0  unbalance  0.982263  0.000000  0.737377


### 2W

In [143]:
feature_cols = ['Age', "MultiPlayer", 'PlayerGrowthRate2W', 'FollowersGrowthRate2W', 'PositiveRateGrowthRate2W', 
                'SalePeriod', 'DiscountFreq3M', 'DLC_sum_2W', 'Sequel_sum_2W']

#### model summry

In [144]:
X_train, y_train = prepare_xy(train, feature_cols, 'DiscountOrNot')
X_test, y_test = prepare_xy(test, feature_cols, 'DiscountOrNot')
logit_model = sm.Logit(y_train, X_train).fit()
print(logit_model.summary())

         Current function value: 0.088255
         Iterations: 35
                           Logit Regression Results                           
Dep. Variable:          DiscountOrNot   No. Observations:                17116
Model:                          Logit   Df Residuals:                    17106
Method:                           MLE   Df Model:                            9
Date:                Sun, 09 Nov 2025   Pseudo R-squ.:                  0.1254
Time:                        20:16:25   Log-Likelihood:                -1510.6
converged:                      False   LL-Null:                       -1727.1
Covariance Type:            nonrobust   LLR p-value:                 1.151e-87
                               coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------
const                       -5.4719      0.220    -24.890      0.000      -5.903      -5.041
Age                    



#### 模型效果

In [145]:
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)
logit_model_sm = sm.Logit(y_train_sm, X_train_sm).fit(method='bfgs', maxiter=100)
result1 = evaluate_model('unbalance', logit_model, X_test, y_test)
result2 = evaluate_model('balance', logit_model_sm, X_test, y_test)

results = pd.DataFrame([result1, result2])
print("\n模型比較結果:")
print(results.sort_values(by="F1", ascending=False))

Optimization terminated successfully.
         Current function value: 0.534179
         Iterations: 71
         Function evaluations: 73
         Gradient evaluations: 73

=== unbalance ===
Accuracy: 0.9823
F1-score: 0.0000
AUC: 0.7412
Confusion matrix:
 [[6701    0]
 [ 121    0]]

=== balance ===
Accuracy: 0.6970
F1-score: 0.0727
AUC: 0.7399
Confusion matrix:
 [[4674 2027]
 [  40   81]]

模型比較結果:
       Model  Accuracy        F1       AUC
1    balance  0.697010  0.072678  0.739930
0  unbalance  0.982263  0.000000  0.741173
