# Background
- **Author**: `<林宜萱>`
- **Created At**: `<2025-10-26>`
- **Path to Training Data：extent-of-discount-rate-DE_train.csv**
- **Path to Testing Data：extent-of-discount-rate-DE_test.csv**
- **Model Specification 
    - Method：Linear Model
    - Variables：
    Dependent Variable (y): DiscountRate  
    Independent Variables (X):  
    ["Age", "AccumulatedPositiveRate", "SalePeriod",
 "PlayerGrowthRate1W", "PlayerGrowthRate2W", "PlayerGrowthRate1M",
 "FollowersGrowthRate1W", "FollowersGrowthRate2W", "FollowersGrowthRate1M",
 "PositiveRateGrowthRate1W", "PositiveRateGrowthRate2W", "PositiveRateGrowthRate1M",
 "DLC_since_last_discount", "Sequel_since_last_discount"]
    - Tuning Parameters：
    test_size = 0.2  
    random_state = 42
    - Optimization Method：
    Ordinary Least Squares (OLS) estimation  
    Implemented via LinearRegression() from scikit-learn
- **Main Findings and Takeaways：**
    - In-sample `< R², RMSE>`:  
    1w(0.0276, 0.1875)、2w(0.0299, 0.1873)、1m(0.0292, 0.1873)
    - Out-sample `< R², RMSE>`:  
    1w(0.0545, 0.1496)、2w(0.0569, 0.1494)、1m(0.0594, 0.1492)
    - Interpretation:  

- **Future Direciton：嘗試使用非線性模型（如 Random Forest、XGBoost）以捕捉特徵間交互作用。**

In [65]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np

### - 1 week

In [66]:
# 讀取資料
df = pd.read_csv('../data/processed/extent-of-discount-rate-DE.csv')


# 定義特徵與目標變數
X = df[["Age", "AccumulatedPositiveRate", "SalePeriod",
        "PlayerGrowthRate1W", "FollowersGrowthRate1W",
        "PositiveRateGrowthRate1W", "DLC_since_last_discount",
        "Sequel_since_last_discount"]]
y = df["DiscountRate"]

# 切分訓練與測試資料
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 顯示訓練資料前 10 筆
print("Training Data Preview:")
display(X_train.head(10))

Training Data Preview:


Unnamed: 0,Age,AccumulatedPositiveRate,SalePeriod,PlayerGrowthRate1W,FollowersGrowthRate1W,PositiveRateGrowthRate1W,DLC_since_last_discount,Sequel_since_last_discount
18,11.060274,0.941803,1,-0.083322,0.002566,3.8e-05,0,0
203,18.378082,0.967921,1,0.006607,0.000635,5e-06,0,0
351,5.252055,0.851617,0,0.024654,0.000248,1.5e-05,0,0
275,8.356164,0.94703,1,-0.097654,0.0002,1.6e-05,0,0
63,4.564384,0.951567,1,-0.095179,0.001978,7.4e-05,0,0
249,4.791781,0.967286,0,-0.064399,0.000422,5e-06,0,0
302,6.254795,0.749023,0,-0.080332,0.001233,0.000263,0,0
108,9.884932,0.92499,0,-0.067359,0.00281,4e-05,0,0
90,6.079452,0.955034,0,0.015896,0.000419,-5e-06,0,0
234,8.857534,0.444776,1,0.0,-0.001718,0.0,0,0


In [67]:
print("Testing Data Preview:")
display(X_test.head(10))

Testing Data Preview:


Unnamed: 0,Age,AccumulatedPositiveRate,SalePeriod,PlayerGrowthRate1W,FollowersGrowthRate1W,PositiveRateGrowthRate1W,DLC_since_last_discount,Sequel_since_last_discount
285,3.526027,0.969657,1,-0.07758,0.001286,6.2e-05,0,0
281,3.090411,0.970458,0,0.0997,0.001003,-1e-05,0,0
33,3.260274,0.815636,1,-0.030299,0.000837,0.000512,0,0
211,8.178082,0.982537,0,-0.097956,0.00331,0.001163,0,0
93,6.583562,0.955034,1,-0.293778,0.00062,6e-06,0,0
84,5.167123,0.954422,1,-0.023813,0.000412,1.7e-05,0,0
391,3.10137,0.953009,1,-0.09884,0.001803,3.7e-05,0,0
94,6.643836,0.955118,1,-0.416542,0.000617,6e-06,0,0
225,7.824658,0.45122,0,5921.714286,0.0,0.003049,0,0
126,5.750685,0.884745,1,-0.076967,0.00052,2.8e-05,0,0


#### The actual modeling starts below

In [68]:
# 建立線性迴歸模型
model = LinearRegression()

# 使用訓練資料進行模型擬合
model.fit(X_train, y_train)

# 輸出模型截距與係數
print("Intercept:", model.intercept_)
print("Coefficients:")
for feature, coef in zip(X.columns, model.coef_):
    print(f"  {feature}: {coef:.4f}")

Intercept: 0.5110281196362887
Coefficients:
  Age: 0.0043
  AccumulatedPositiveRate: 0.0530
  SalePeriod: -0.0367
  PlayerGrowthRate1W: -0.0000
  FollowersGrowthRate1W: -3.4044
  PositiveRateGrowthRate1W: -19.0066
  DLC_since_last_discount: 0.0161
  Sequel_since_last_discount: 0.1549


In [69]:
# 訓練資料預測
y_train_pred = model.predict(X_train)

# 評估模型表現
r2_train = r2_score(y_train, y_train_pred)
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))

print("Training Performance:")
print(f"  R²: {r2_train:.4f}")
print(f"  RMSE: {rmse_train:.4f}")

Training Performance:
  R²: 0.0276
  RMSE: 0.1875


In [70]:
# 測試資料預測
y_test_pred = model.predict(X_test)

# 評估模型表現
r2_test = r2_score(y_test, y_test_pred)
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))

print("Testing Performance:")
print(f"  R²: {r2_test:.4f}")
print(f"  RMSE: {rmse_test:.4f}")

Testing Performance:
  R²: 0.0545
  RMSE: 0.1496


### - 2 week

In [71]:
# 讀取資料
df = pd.read_csv('../data/processed/extent-of-discount-rate-DE.csv')


# 定義特徵與目標變數
X = df[["Age", "AccumulatedPositiveRate", "SalePeriod",
        "PlayerGrowthRate2W", "FollowersGrowthRate2W",
        "PositiveRateGrowthRate2W", "DLC_since_last_discount",
        "Sequel_since_last_discount"]]
y = df["DiscountRate"]

# 切分訓練與測試資料
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 顯示訓練資料前 10 筆
print("Training Data Preview:")
display(X_train.head(10))

Training Data Preview:


Unnamed: 0,Age,AccumulatedPositiveRate,SalePeriod,PlayerGrowthRate2W,FollowersGrowthRate2W,PositiveRateGrowthRate2W,DLC_since_last_discount,Sequel_since_last_discount
18,11.060274,0.941803,1,-0.12154,0.005281,6.3e-05,0,0
203,18.378082,0.967921,1,-0.04237,0.001301,1.8e-05,0,0
351,5.252055,0.851617,0,0.059666,0.000512,-3e-05,0,0
275,8.356164,0.94703,1,-0.423671,0.000458,1.4e-05,0,0
63,4.564384,0.951567,1,0.262779,0.00443,-7.5e-05,0,0
249,4.791781,0.967286,0,-0.048033,0.001024,3.6e-05,0,0
302,6.254795,0.749023,0,-0.166326,0.002349,0.000467,0,0
108,9.884932,0.92499,0,-0.049449,0.005543,0.000171,0,0
90,6.079452,0.955034,0,-0.10132,0.0007,4e-06,0,0
234,8.857534,0.444776,1,-0.176471,-0.001718,0.0,0,0


In [72]:
print("Testing Data Preview:")
display(X_test.head(10))

Testing Data Preview:


Unnamed: 0,Age,AccumulatedPositiveRate,SalePeriod,PlayerGrowthRate2W,FollowersGrowthRate2W,PositiveRateGrowthRate2W,DLC_since_last_discount,Sequel_since_last_discount
285,3.526027,0.969657,1,0.045287,0.006364,3.9e-05,0,0
281,3.090411,0.970458,0,0.051584,0.002086,-5.7e-05,0,0
33,3.260274,0.815636,1,-0.210916,0.001681,0.00124,0,0
211,8.178082,0.982537,0,-0.176385,0.006844,-0.000321,0,0
93,6.583562,0.955034,1,-0.721785,0.001203,2.4e-05,0,0
84,5.167123,0.954422,1,0.028463,0.000875,3.3e-05,0,0
391,3.10137,0.953009,1,-0.107192,0.004042,7.5e-05,0,0
94,6.643836,0.955118,1,-0.348077,0.00118,6e-05,0,0
225,7.824658,0.45122,0,-0.324636,0.0,0.003049,0,0
126,5.750685,0.884745,1,-0.18492,0.001139,6.3e-05,0,0


#### The actual modeling starts below

In [73]:
# 建立線性迴歸模型
model = LinearRegression()

# 使用訓練資料進行模型擬合
model.fit(X_train, y_train)

# 輸出模型截距與係數
print("Intercept:", model.intercept_)
print("Coefficients:")
for feature, coef in zip(X.columns, model.coef_):
    print(f"  {feature}: {coef:.4f}")

Intercept: 0.532271587161748
Coefficients:
  Age: 0.0043
  AccumulatedPositiveRate: 0.0325
  SalePeriod: -0.0361
  PlayerGrowthRate2W: -0.0000
  FollowersGrowthRate2W: -2.4543
  PositiveRateGrowthRate2W: -17.2003
  DLC_since_last_discount: 0.0136
  Sequel_since_last_discount: 0.1550


In [74]:
# 訓練資料預測
y_train_pred = model.predict(X_train)

# 評估模型表現
r2_train = r2_score(y_train, y_train_pred)
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))

print("Training Performance:")
print(f"  R²: {r2_train:.4f}")
print(f"  RMSE: {rmse_train:.4f}")

Training Performance:
  R²: 0.0299
  RMSE: 0.1873


In [75]:
# 測試資料預測
y_test_pred = model.predict(X_test)

# 評估模型表現
r2_test = r2_score(y_test, y_test_pred)
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))

print("Testing Performance:")
print(f"  R²: {r2_test:.4f}")
print(f"  RMSE: {rmse_test:.4f}")

Testing Performance:
  R²: 0.0569
  RMSE: 0.1494


### - 1 month

In [76]:
# 讀取資料
df = pd.read_csv('../data/processed/extent-of-discount-rate-DE.csv')


# 定義特徵與目標變數
X = df[["Age", "AccumulatedPositiveRate", "SalePeriod",
        "PlayerGrowthRate1M", "FollowersGrowthRate1M",
        "PositiveRateGrowthRate1M", "DLC_since_last_discount",
        "Sequel_since_last_discount"]]
y = df["DiscountRate"]

# 切分訓練與測試資料
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 顯示訓練資料前 10 筆
print("Training Data Preview:")
display(X_train.head(10))

Training Data Preview:


Unnamed: 0,Age,AccumulatedPositiveRate,SalePeriod,PlayerGrowthRate1M,FollowersGrowthRate1M,PositiveRateGrowthRate1M,DLC_since_last_discount,Sequel_since_last_discount
18,11.060274,0.941803,1,0.102063,0.012877,-1.6e-05,0,0
203,18.378082,0.967921,1,-0.04557,0.003205,3.9e-05,0,0
351,5.252055,0.851617,0,-0.088322,0.000775,8.6e-05,0,0
275,8.356164,0.94703,1,-0.021211,0.001693,-2.1e-05,0,0
63,4.564384,0.951567,1,-0.088474,0.013172,-3.3e-05,0,0
249,4.791781,0.967286,0,-0.165792,0.002303,0.000106,0,0
302,6.254795,0.749023,0,-0.09625,0.005604,0.000558,0,0
108,9.884932,0.92499,0,0.019425,0.011309,0.000313,0,0
90,6.079452,0.955034,0,-0.295558,0.001365,3e-05,0,0
234,8.857534,0.444776,1,-0.326531,-0.003431,-0.002985,0,0


In [77]:
print("Testing Data Preview:")
display(X_test.head(10))

Testing Data Preview:


Unnamed: 0,Age,AccumulatedPositiveRate,SalePeriod,PlayerGrowthRate1M,FollowersGrowthRate1M,PositiveRateGrowthRate1M,DLC_since_last_discount,Sequel_since_last_discount
285,3.526027,0.969657,1,-0.029158,0.012,5.2e-05,0,0
281,3.090411,0.970458,0,-0.16869,0.004481,4.1e-05,0,0
33,3.260274,0.815636,1,-0.046926,0.003939,0.002761,0,0
211,8.178082,0.982537,0,0.473536,0.01649,-0.000212,0,0
93,6.583562,0.955034,1,4.533607,0.007444,-3.2e-05,0,0
84,5.167123,0.954422,1,0.052761,0.001671,7.4e-05,0,0
391,3.10137,0.953009,1,-0.177216,0.007496,0.000187,0,0
94,6.643836,0.955118,1,-0.362682,0.005238,9.3e-05,0,0
225,7.824658,0.45122,0,0.066555,-0.001718,0.006803,0,0
126,5.750685,0.884745,1,-0.02664,0.002766,0.000235,0,0


#### The actual modeling starts below

In [78]:
# 建立線性迴歸模型
model = LinearRegression()

# 使用訓練資料進行模型擬合
model.fit(X_train, y_train)

# 輸出模型截距與係數
print("Intercept:", model.intercept_)
print("Coefficients:")
for feature, coef in zip(X.columns, model.coef_):
    print(f"  {feature}: {coef:.4f}")

Intercept: 0.5320855229899142
Coefficients:
  Age: 0.0042
  AccumulatedPositiveRate: 0.0361
  SalePeriod: -0.0351
  PlayerGrowthRate1M: -0.0001
  FollowersGrowthRate1M: -1.4883
  PositiveRateGrowthRate1M: -10.0715
  DLC_since_last_discount: 0.0138
  Sequel_since_last_discount: 0.1561


In [79]:
# 訓練資料預測
y_train_pred = model.predict(X_train)

# 評估模型表現
r2_train = r2_score(y_train, y_train_pred)
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))

print("Training Performance:")
print(f"  R²: {r2_train:.4f}")
print(f"  RMSE: {rmse_train:.4f}")

Training Performance:
  R²: 0.0292
  RMSE: 0.1873


In [80]:
# 測試資料預測
y_test_pred = model.predict(X_test)

# 評估模型表現
r2_test = r2_score(y_test, y_test_pred)
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))

print("Testing Performance:")
print(f"  R²: {r2_test:.4f}")
print(f"  RMSE: {rmse_test:.4f}")

Testing Performance:
  R²: 0.0594
  RMSE: 0.1492
