# Background
- **Author**: `<林宜萱>`
- **Created At**: `<2025-10-26>`
- **Path to Training Data：extent-of-discount-rate-DE_train.csv**
- **Path to Testing Data：extent-of-discount-rate-DE_test.csv**
- **Model Specification 
    - Method：Linear Model
    - Variables：
    Dependent Variable (y): DiscountRate  
    Independent Variables (X):  
    ["Age", "AccumulatedPositiveRate", "SalePeriod","DiscountFreq3M",
 "PlayerGrowthRate1W", "PlayerGrowthRate2W", "PlayerGrowthRate1M",
 "FollowersGrowthRate1W", "FollowersGrowthRate2W", "FollowersGrowthRate1M",
 "PositiveRateGrowthRate1W", "PositiveRateGrowthRate2W", "PositiveRateGrowthRate1M",
 "DLC_since_last_discount", "Sequel_since_last_discount"]
    - Tuning Parameters：
    test_size = 0.2  
    random_state = 42
    - Optimization Method：
    Ordinary Least Squares (OLS) estimation  
    Implemented via LinearRegression() from scikit-learn
- **Main Findings and Takeaways：**
    - In-sample `< R², RMSE>`:  
    1w(0.0276, 0.1875)、2w(0.0299, 0.1873)、1m(0.0292, 0.1873)
    - Out-sample `< R², RMSE>`:  
    1w(0.0545, 0.1496)、2w(0.0569, 0.1494)、1m(0.0594, 0.1492)
    - Interpretation:  

- **Future Direciton：嘗試使用非線性模型（如 Random Forest、XGBoost）以捕捉特徵間交互作用。**

In [69]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, Lasso
import numpy as np

### - 1 week

In [72]:
# 讀取資料
df = pd.read_csv('../data/processed/extent-of-discount-rate-DE.csv')


# 定義特徵與目標變數
X = df[["Age", "AccumulatedPositiveRate", "SalePeriod","DiscountFreq3M",
        "PlayerGrowthRate1W", "FollowersGrowthRate1W",
        "PositiveRateGrowthRate1W", "DLC_since_last_discount",
        "Sequel_since_last_discount"]]
y = df["DiscountRate"]

# 切分訓練與測試資料
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 顯示訓練資料前 10 筆
print("Training Data Preview:")
display(X_train.head(10))

Training Data Preview:


Unnamed: 0,Age,AccumulatedPositiveRate,SalePeriod,DiscountFreq3M,PlayerGrowthRate1W,FollowersGrowthRate1W,PositiveRateGrowthRate1W,DLC_since_last_discount,Sequel_since_last_discount
108,8.972603,0.925576,0,3,-0.053141,0.003913,0.000119,0,0
377,5.315068,0.910768,0,3,-0.025657,0.000722,2.6e-05,0,0
247,6.939726,0.947181,1,4,-0.162881,0.000351,-1.9e-05,0,0
175,7.630137,0.803694,0,3,-0.097686,9.5e-05,4.5e-05,0,0
3,23.150685,0.974675,1,2,-0.029452,0.00079,1.6e-05,0,0
18,11.060274,0.941803,1,2,-0.083322,0.002566,3.8e-05,0,0
405,4.180822,0.952784,1,3,-0.073015,0.000598,-1.5e-05,0,0
400,3.482192,0.953422,0,2,0.072053,0.001135,2.2e-05,0,0
181,8.356164,0.803685,0,2,0.133237,0.000625,-6.5e-05,1,0
63,4.487671,0.982653,1,3,-0.073645,0.000419,-3e-06,0,0


In [73]:
print("Testing Data Preview:")
display(X_test.head(10))

Testing Data Preview:


Unnamed: 0,Age,AccumulatedPositiveRate,SalePeriod,DiscountFreq3M,PlayerGrowthRate1W,FollowersGrowthRate1W,PositiveRateGrowthRate1W,DLC_since_last_discount,Sequel_since_last_discount
172,7.350685,0.800858,0,3,0.23683,0.000357,2.1e-05,0,0
137,5.750685,0.884745,1,3,-0.076967,0.00052,2.8e-05,0,0
126,4.660274,0.883921,1,1,0.167426,0.000369,2.9e-05,0,0
94,5.564384,0.954689,1,2,-0.334482,0.000638,4.2e-05,0,0
72,5.016438,0.95234,0,3,-0.058257,0.001142,5e-06,0,0
33,3.260274,0.815636,1,3,-0.030299,0.000837,0.000512,0,0
379,5.427397,0.910925,0,4,-0.065791,0.000806,3.9e-05,0,0
223,7.353425,0.992643,1,1,0.034206,0.003179,4e-05,0,0
341,4.512329,0.851182,1,4,-0.047289,0.000206,5e-05,0,0
227,8.350685,0.991291,1,1,-0.000311,0.003272,-1.8e-05,0,0


#### The actual modeling starts below

In [74]:
# 建立線性迴歸模型
model = LinearRegression()

# 使用訓練資料進行模型擬合
model.fit(X_train, y_train)

# 輸出模型截距與係數
print("Intercept:", model.intercept_)
print("Coefficients:")
for feature, coef in zip(X.columns, model.coef_):
    print(f"  {feature}: {coef:.4f}")

Intercept: 0.8379349439900005
Coefficients:
  Age: 0.0050
  AccumulatedPositiveRate: -0.3441
  SalePeriod: -0.0243
  DiscountFreq3M: 0.0148
  PlayerGrowthRate1W: -0.1890
  FollowersGrowthRate1W: -5.7000
  PositiveRateGrowthRate1W: -191.2908
  DLC_since_last_discount: -0.0804
  Sequel_since_last_discount: 0.1160


In [75]:
# 訓練資料預測
y_train_pred = model.predict(X_train)

# 評估模型表現
r2_train = r2_score(y_train, y_train_pred)
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))

print("Training Performance:")
print(f"  R²: {r2_train:.4f}")
print(f"  RMSE: {rmse_train:.4f}")

Training Performance:
  R²: 0.0665
  RMSE: 0.1895


In [76]:
# 測試資料預測
y_test_pred = model.predict(X_test)

# 評估模型表現
r2_test = r2_score(y_test, y_test_pred)
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))

print("Testing Performance:")
print(f"  R²: {r2_test:.4f}")
print(f"  RMSE: {rmse_test:.4f}")

Testing Performance:
  R²: -0.0141
  RMSE: 0.1843


### - 2 week

In [101]:
# 讀取資料
df = pd.read_csv('../data/processed/extent-of-discount-rate-DE.csv')


# 定義特徵與目標變數
X = df[["Age", "AccumulatedPositiveRate", "SalePeriod","DiscountFreq3M",
        "PlayerGrowthRate2W", "FollowersGrowthRate2W",
        "PositiveRateGrowthRate2W", "DLC_since_last_discount",
        "Sequel_since_last_discount"]]
y = df["DiscountRate"]

# 切分訓練與測試資料
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 顯示訓練資料前 10 筆
print("Training Data Preview:")
display(X_train.head(10))

Training Data Preview:


Unnamed: 0,Age,AccumulatedPositiveRate,SalePeriod,DiscountFreq3M,PlayerGrowthRate2W,FollowersGrowthRate2W,PositiveRateGrowthRate2W,DLC_since_last_discount,Sequel_since_last_discount
108,8.972603,0.925576,0,3,0.130636,0.013138,0.0006442504,0,0
377,5.315068,0.910768,0,3,-0.088498,0.001517,9.851996e-05,0,0
247,6.939726,0.947181,1,4,-0.391481,0.000725,-3.363982e-05,0,0
175,7.630137,0.803694,0,3,-0.087062,0.000388,1.23597e-05,0,0
3,23.150685,0.974675,1,2,-0.033895,0.001743,1.407597e-05,0,0
18,11.060274,0.941803,1,2,-0.12154,0.005281,6.320325e-05,0,0
405,4.180822,0.952784,1,3,-0.116096,0.001326,4.048527e-07,0,0
400,3.482192,0.953422,0,2,0.093497,0.002142,8.517998e-05,0,0
181,8.356164,0.803685,0,2,0.090745,0.001214,-9.334816e-05,1,0
63,4.487671,0.982653,1,3,-0.095878,0.001051,-3.588388e-06,0,0


In [102]:
print("Testing Data Preview:")
display(X_test.head(10))

Testing Data Preview:


Unnamed: 0,Age,AccumulatedPositiveRate,SalePeriod,DiscountFreq3M,PlayerGrowthRate2W,FollowersGrowthRate2W,PositiveRateGrowthRate2W,DLC_since_last_discount,Sequel_since_last_discount
172,7.350685,0.800858,0,3,0.10373,0.000554,1.6e-05,0,0
137,5.750685,0.884745,1,3,-0.18492,0.001139,6.3e-05,0,0
126,4.660274,0.883921,1,1,0.016719,0.000801,0.000134,0,0
94,5.564384,0.954689,1,2,-0.695663,0.001265,6.7e-05,0,0
72,5.016438,0.95234,0,3,-0.050778,0.002133,-2.1e-05,0,0
33,3.260274,0.815636,1,3,-0.210916,0.001681,0.00124,0,0
379,5.427397,0.910925,0,4,-0.104151,0.00181,4.6e-05,0,0
223,7.353425,0.992643,1,1,-0.020817,0.006225,-5e-06,0,0
341,4.512329,0.851182,1,4,-0.087033,0.000409,-6.5e-05,0,0
227,8.350685,0.991291,1,1,-0.028793,0.006616,1.4e-05,0,0


#### The actual modeling starts below

In [103]:
# 建立線性迴歸模型
model = LinearRegression()

# 使用訓練資料進行模型擬合
model.fit(X_train, y_train)

# 輸出模型截距與係數
print("Intercept:", model.intercept_)
print("Coefficients:")
for feature, coef in zip(X.columns, model.coef_):
    print(f"  {feature}: {coef:.4f}")

Intercept: 0.7829311919443198
Coefficients:
  Age: 0.0053
  AccumulatedPositiveRate: -0.3023
  SalePeriod: -0.0206
  DiscountFreq3M: 0.0181
  PlayerGrowthRate2W: -0.0640
  FollowersGrowthRate2W: -0.9203
  PositiveRateGrowthRate2W: -67.6974
  DLC_since_last_discount: -0.0791
  Sequel_since_last_discount: 0.1081


In [104]:
# 訓練資料預測
y_train_pred = model.predict(X_train)

# 評估模型表現
r2_train = r2_score(y_train, y_train_pred)
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))

print("Training Performance:")
print(f"  R²: {r2_train:.4f}")
print(f"  RMSE: {rmse_train:.4f}")

Training Performance:
  R²: 0.0409
  RMSE: 0.1920


In [105]:
# 測試資料預測
y_test_pred = model.predict(X_test)

# 評估模型表現
r2_test = r2_score(y_test, y_test_pred)
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))

print("Testing Performance:")
print(f"  R²: {r2_test:.4f}")
print(f"  RMSE: {rmse_test:.4f}")

Testing Performance:
  R²: -0.0141
  RMSE: 0.1843


### - 1 month

In [106]:
# 讀取資料
df = pd.read_csv('../data/processed/extent-of-discount-rate-DE.csv')


# 定義特徵與目標變數
X = df[["Age", "AccumulatedPositiveRate", "SalePeriod","DiscountFreq3M",
        "PlayerGrowthRate1M", "FollowersGrowthRate1M",
        "PositiveRateGrowthRate1M", "DLC_since_last_discount",
        "Sequel_since_last_discount"]]
y = df["DiscountRate"]


# 切分訓練與測試資料
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 顯示訓練資料前 10 筆
print("Training Data Preview:")
display(X_train.head(10))

Training Data Preview:


Unnamed: 0,Age,AccumulatedPositiveRate,SalePeriod,DiscountFreq3M,PlayerGrowthRate1M,FollowersGrowthRate1M,PositiveRateGrowthRate1M,DLC_since_last_discount,Sequel_since_last_discount
108,8.972603,0.925576,0,3,0.016386,0.023131,0.001276,0,0
377,5.315068,0.910768,0,3,-0.221324,0.003985,0.000232,0,0
247,6.939726,0.947181,1,4,0.401201,0.002101,-3.4e-05,0,0
175,7.630137,0.803694,0,3,-0.110737,0.001194,4.4e-05,0,0
3,23.150685,0.974675,1,2,-0.006214,0.004781,-1.1e-05,0,0
18,11.060274,0.941803,1,2,0.102063,0.012877,-1.6e-05,0,0
405,4.180822,0.952784,1,3,-0.124507,0.00394,0.000609,0,0
400,3.482192,0.953422,0,2,0.289667,0.004669,-3.2e-05,0,0
181,8.356164,0.803685,0,2,0.0222,0.002269,-0.000199,1,0
63,4.487671,0.982653,1,3,-0.092566,0.002345,-3.2e-05,0,0


In [107]:
print("Testing Data Preview:")
display(X_test.head(10))

Testing Data Preview:


Unnamed: 0,Age,AccumulatedPositiveRate,SalePeriod,DiscountFreq3M,PlayerGrowthRate1M,FollowersGrowthRate1M,PositiveRateGrowthRate1M,DLC_since_last_discount,Sequel_since_last_discount
172,7.350685,0.800858,0,3,-0.105184,0.001093,-7.3e-05,0,0
137,5.750685,0.884745,1,3,-0.02664,0.002766,0.000235,0,0
126,4.660274,0.883921,1,1,-0.034252,0.001809,0.000205,0,0
94,5.564384,0.954689,1,2,2.322294,0.007217,0.000148,0,0
72,5.016438,0.95234,0,3,-0.148136,0.005379,0.000323,0,0
33,3.260274,0.815636,1,3,-0.046926,0.003939,0.002761,0,0
379,5.427397,0.910925,0,4,-0.069808,0.004583,0.000157,0,0
223,7.353425,0.992643,1,1,-0.002962,0.013023,0.00014,0,0
341,4.512329,0.851182,1,4,-0.034328,0.001014,7.7e-05,0,0
227,8.350685,0.991291,1,1,0.00159,0.0151,3.8e-05,0,0


#### The actual modeling starts below

In [108]:
# 建立線性迴歸模型
model = LinearRegression()

# 使用訓練資料進行模型擬合
model.fit(X_train, y_train)

# 輸出模型截距與係數
print("Intercept:", model.intercept_)
print("Coefficients:")
for feature, coef in zip(X.columns, model.coef_):
    print(f"  {feature}: {coef:.4f}")

Intercept: 0.7588688892817459
Coefficients:
  Age: 0.0055
  AccumulatedPositiveRate: -0.2700
  SalePeriod: -0.0183
  DiscountFreq3M: 0.0161
  PlayerGrowthRate1M: 0.0013
  FollowersGrowthRate1M: -0.9691
  PositiveRateGrowthRate1M: -14.3258
  DLC_since_last_discount: -0.0764
  Sequel_since_last_discount: 0.1016


In [109]:
# 訓練資料預測
y_train_pred = model.predict(X_train)

# 評估模型表現
r2_train = r2_score(y_train, y_train_pred)
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))

print("Training Performance:")
print(f"  R²: {r2_train:.4f}")
print(f"  RMSE: {rmse_train:.4f}")

Training Performance:
  R²: 0.0307
  RMSE: 0.1931


In [110]:
# 測試資料預測
y_test_pred = model.predict(X_test)

# 評估模型表現
r2_test = r2_score(y_test, y_test_pred)
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))

print("Testing Performance:")
print(f"  R²: {r2_test:.4f}")
print(f"  RMSE: {rmse_test:.4f}")

Testing Performance:
  R²: 0.0069
  RMSE: 0.1824


### 　嘗試優化回歸模型

1. 「正則化迴歸」（Ridge / Lasso）

In [111]:
ridge = Ridge(alpha=0.1, random_state=42)
ridge.fit(X_train_scaled, y_train)

y_train_pred = ridge.predict(X_train_scaled)
y_test_pred = ridge.predict(X_test_scaled)

print("Ridge - Training R²:", r2_score(y_train, y_train_pred))
print("Ridge - Testing R²:", r2_score(y_test, y_test_pred))
print("Ridge - RMSE:", np.sqrt(mean_squared_error(y_test, y_test_pred)))

Ridge - Training R²: 0.025665411515462422
Ridge - Testing R²: 0.014679157907352303
Ridge - RMSE: 0.18166289106208514


2. 變數選擇（RFE: Recursive Feature Elimination）

調整 Ridge 超參數 α
小 α → 模型更接近線性回歸；大 α → 模型更平滑。

In [112]:
from sklearn.feature_selection import RFE

selector = RFE(estimator=LinearRegression(), n_features_to_select=5)
selector.fit(X_train_scaled, y_train)

selected_features = X.columns[selector.support_]
print("Selected Features:", selected_features)


IndexError: boolean index did not match indexed array along dimension 0; dimension is 9 but corresponding boolean dimension is 8

In [None]:
# 讀取資料
df = pd.read_csv('../data/processed/extent-of-discount-rate-DE.csv')

# 定義特徵與目標變數
features = ["Age", "AccumulatedPositiveRate", "SalePeriod",
            "PositiveRateGrowthRate1M", "DLC_since_last_discount"]
X = df[features]
y = df["DiscountRate"]

# 切分訓練與測試資料
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 建立 RidgeCV 模型，使用 5 折交叉驗證自動尋找最佳 alpha
ridge_cv = RidgeCV(alphas=[0.01, 0.1, 1, 10, 100], cv=5)
ridge_cv.fit(X_train, y_train)

# 預測訓練與測試集
y_train_pred = ridge_cv.predict(X_train)
y_test_pred = ridge_cv.predict(X_test)

# 輸出結果
print("Best alpha:", ridge_cv.alpha_)
print(f"R² (Train): {r2_score(y_train, y_train_pred):.4f}")
print(f"R² (Test): {r2_score(y_test, y_test_pred):.4f}")
print(f"RMSE (Test): {np.sqrt(mean_squared_error(y_test, y_test_pred)):.4f}")

# 顯示係數
coef_df = pd.DataFrame({
    "Feature": features,
    "Coefficient": ridge_cv.coef_
})
print("\nModel Coefficients:")
print(coef_df)


Best alpha: 100.0
R² (Train): 0.0098
R² (Test): 0.0116
RMSE (Test): 0.1819

Model Coefficients:
                    Feature  Coefficient
0                       Age     0.004346
1   AccumulatedPositiveRate    -0.002810
2                SalePeriod    -0.010079
3  PositiveRateGrowthRate1M    -0.000026
4   DLC_since_last_discount    -0.004067


RidgeCV 模型在訓練資料的決定係數為 R² = 0.0098，測試資料為 R² = 0.0116，顯示模型幾乎無法以線性方式捕捉折扣率的變化。
儘管模型穩定且未過擬合，但說明力非常低，表示折扣率的形成機制高度非線性。
在各變數中，遊戲年齡（Age）與累積好評率（AccumulatedPositiveRate）對折扣方向有微弱影響，顯示舊遊戲與評價較低的遊戲可能會提供較高的折扣。