# README

- **Author**: `方珮潔`
- **Created At**: `2025-10-12`
- **Last Modified At**: `2025-10-12`

---

## What does this file do?

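- Inspects the merged game table (shape, dtypes, missing values, numeric and categorical distributions) and converts columns to their proper types.
- Generates derived features: discount rate, game age, accumulated positive-review rate, per-day review counts, a platform sale-period flag, and rolling-window player / positive-rate growth rates.
- Reshapes the rows into discount events (with DLC / sequel counts since the previous discount) and exports the result as a CSV file.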

---

## What does this file take?

- **Source Data Sets**:  
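  1. `./all_games_merged.csv` 
    - Description: `Merged game table; the code expects the columns GameID, Date, ReleaseDate, OriginPrice, Final price, Players, Followers, Positive reviews, Negative reviews, DLC and Sequel.` 
  2. `<Add more input files if needed>`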
  
---

## What does this file output?

- `./processed_data.csv`  
  - Description: `Rows whose DiscountRate is not 0 (discount events) with the derived feature columns, written as UTF-8 with BOM and without the index.`
- `<Add more output files if needed>`

# 程式碼

## 資料檢查與前處理

### 前置作業

In [None]:
# 後續所需套件引入
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the merged table from the local CSV file; keep the raw frame untouched
# in `merged_df` and do all further work on the copy `df`.
merged_df = pd.read_csv("./all_games_merged.csv")
df = merged_df.copy()

### 資料表形狀與前五列

In [None]:
# Table shape as (rows, columns)
print("\n--- Shape ---")
print(df.shape)

# First five rows of the table
print("\n--- Head 5 rows ---")
print(df.head())

### 資料型態與缺失值

In [None]:
# Column dtypes and non-null counts (shows where values are missing).
print("\n--- Data information ---")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()

In [None]:
# Convert specific columns to their proper dtypes.
# errors='coerce' turns unparseable date strings into NaT instead of raising.
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
df['ReleaseDate'] = pd.to_datetime(df['ReleaseDate'], errors='coerce')
df['GameID'] = df['GameID'].astype(str)

# Cast every remaining column (neither object/category nor a date) to float.
# `cat_cols` and `num_cols` are reused by the summary cells further down.
cat_cols = df.select_dtypes(include=['object', 'category']).columns
exclude_cols = list(cat_cols) + ['Date', 'ReleaseDate']
num_cols = df.columns.difference(exclude_cols)
for col in num_cols:
    try:
        df[col] = df[col].astype(float)
    except (ValueError, TypeError) as e:
        # Best effort: report the column that cannot be cast and leave it as
        # is. Only conversion errors are caught so unrelated failures surface.
        print(f"欄位 {col} 轉換失敗: {e}")

# Verify the conversion result. DataFrame.info() prints by itself and returns
# None, so it is not wrapped in print().
df.info()

In [None]:
# Missing-value imputation (placeholder: no imputation is implemented in this cell yet)

### 數值欄位分佈

In [None]:
# Descriptive statistics for the columns that were selected for float conversion.
print("\n--- Numerical Summary ---")
print(df[num_cols].describe())

### 類別欄位分佈

In [None]:
# For every object/category column: print the frequency table and draw a
# horizontal count plot ordered from most to least frequent value.
print("\n--- Categorical Summary ---")
for col in cat_cols:
    counts = df[col].value_counts()
    print(f"\nValue counts for {col}:")
    print(counts)

    plt.figure(figsize=(6, 4))
    ax = sns.countplot(y=df[col], order=counts.index, palette="pastel")
    ax.set_title(f"Category Distribution of {col}")
    ax.set_xlabel("Count")
    ax.set_ylabel(col)
    plt.show()

## 進階欄位生成

### 折扣率

In [None]:
# Discount rate = (original price - current price) / original price,
# so a positive rate means the current price is below the original price.
origin_price = df['OriginPrice']
raw_rate = (origin_price - df['Final price']) / origin_price
# A zero original price (e.g. a free game) would produce 0/0 = NaN or
# x/0 = +/-inf. NaN compares as "!= 0", so such rows would later be treated
# as discount events; a zero-priced row is given a rate of 0 instead.
# Rows whose original price is missing keep NaN.
df['DiscountRate'] = raw_rate.where(origin_price != 0, 0.0)

### 遊戲年齡

In [None]:
# Game age in years = (record date - release date) in days / 365.
# Rows where either date is NaT yield NaN.
df['Age'] = (df['Date'] - df['ReleaseDate']).dt.days / 365

### 累積評價正面率、單日正負評價數

In [None]:
# Accumulated positive rate = positive / (positive + negative).
# The denominator is the total number of reviews; subtracting the negative
# count would give values above 1 or a negative / infinite rate.
# Rows with no reviews at all yield NaN (0 / 0).
total_reviews = df['Positive reviews'] + df['Negative reviews']
df['AccumulatedPositiveRate'] = df['Positive reviews'] / total_reviews

# Per-day review counts = change of the cumulative count versus the previous
# record of the SAME game. The difference is taken per GameID in chronological
# order so that the first record of a game is NaN rather than a difference
# against the last record of whichever game precedes it in the table.
# The result is aligned back to `df` by index (assumes the index is unique,
# as it is for a frame freshly read from CSV).
reviews_by_game = df.sort_values(['GameID', 'Date']).groupby('GameID')
df['Positive reviews on that day'] = reviews_by_game['Positive reviews'].diff()
df['Negative reviews on that day'] = reviews_by_game['Negative reviews'].diff()

### 是否為平台促銷期

In [None]:
# Table of platform sale windows, one (name, first day, last day) per row.
steam_sales = pd.DataFrame(
    [
        ('Lunar New Year Sale 2023', '2023-01-18', '2023-01-27'),
        ('Summer 2023', '2023-06-29', '2023-07-13'),
        ('Autumn 2023', '2023-11-21', '2023-11-28'),
        ('Winter 2023', '2023-12-21', '2024-01-04'),
        ('Spring 2024', '2024-03-14', '2024-03-21'),
        ('Summer 2024', '2024-06-27', '2024-07-11'),
        ('Autumn 2024', '2024-11-26', '2024-12-03'),
        ('Winter 2024', '2024-12-19', '2025-01-02'),
        ('Spring 2025', '2025-03-13', '2025-03-20'),
        ('Summer 2025', '2025-06-26', '2025-07-10'),
    ],
    columns=['sale_name', 'start_date', 'end_date'],
)
for date_col in ('start_date', 'end_date'):
    steam_sales[date_col] = pd.to_datetime(steam_sales[date_col])

# Flag column: 0 by default, 1 when the record date falls inside any sale
# window (both boundary days included).
df['SalePeriod'] = 0
for sale in steam_sales.itertuples(index=False):
    in_window = df['Date'].between(sale.start_date, sale.end_date)
    df.loc[in_window, 'SalePeriod'] = 1

### 玩家數、正面率的指定期間移動平均增長率

In [None]:
def process_all_games(df):
    """Add rolling-window player and review features for every game.

    For each ``GameID`` the rows are sorted by ``Date`` and, for the windows
    1W (7 days), 2W (14 days) and 1M (30 days), these columns are added:

    * ``AvgPlayers<W>``: time-based rolling mean of ``Players``.
    * ``PlayersGrowthRate1W`` / ``PlayerGrowthRate2W`` / ``PlayerGrowthRate1M``:
      percentage change of the rolling mean versus 7 / 14 / 30 rows earlier.
      The column names are kept exactly as they were, including the
      inconsistent "Players"/"Player" prefix.
    * ``PositiveReview<W>`` / ``NegativeReview<W>``: rolling sums of the
      per-day review counts.
    * ``PositiveRate<W>``: positive / (positive + negative) within the
      window, or 0 when the window contains no reviews.
    * ``PositiveRateGrowthRate<W>``: percentage change of that rate.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``GameID``, a datetime ``Date``, ``Players``,
        ``Positive reviews on that day`` and ``Negative reviews on that day``.

    Returns
    -------
    pandas.DataFrame
        All games concatenated in ``GameID`` order, with ``Date`` restored as
        a regular column. An input without any game is returned as a copy.
    """
    # (column suffix, rolling time offset, rows to look back for growth)
    windows = (('1W', '7D', 7), ('2W', '14D', 14), ('1M', '30D', 30))
    player_growth_cols = {
        '1W': 'PlayersGrowthRate1W',
        '2W': 'PlayerGrowthRate2W',
        '1M': 'PlayerGrowthRate1M',
    }

    def process_game(group):
        # Time-based rolling windows need a sorted DatetimeIndex.
        group = group.sort_values('Date').set_index('Date')

        # Moving average of players.
        for label, offset, _ in windows:
            group[f'AvgPlayers{label}'] = group['Players'].rolling(window=offset).mean()
        # Growth rate of the moving average. NOTE: `periods` counts rows,
        # not calendar days, so this equals N days only for daily records.
        for label, _, periods in windows:
            group[player_growth_cols[label]] = (
                group[f'AvgPlayers{label}'].pct_change(periods=periods)
            )

        # Rolling review totals.
        for label, offset, _ in windows:
            group[f'PositiveReview{label}'] = (
                group['Positive reviews on that day'].rolling(window=offset).sum()
            )
            group[f'NegativeReview{label}'] = (
                group['Negative reviews on that day'].rolling(window=offset).sum()
            )
        # Positive rate within the window: the denominator is the total
        # number of reviews (positive + negative), not their difference.
        for label, _, _ in windows:
            positive = group[f'PositiveReview{label}']
            total = positive + group[f'NegativeReview{label}']
            group[f'PositiveRate{label}'] = np.where(total == 0, 0, positive / total)
        # Growth rate of the positive rate (row-based look-back, see above).
        for label, _, periods in windows:
            group[f'PositiveRateGrowthRate{label}'] = (
                group[f'PositiveRate{label}'].pct_change(periods=periods)
            )

        return group

    # Iterate over the groups explicitly instead of GroupBy.apply: apply on a
    # frame that still contains the grouping column is deprecated and newer
    # pandas versions exclude that column, which would drop GameID.
    processed = [process_game(game_rows) for _, game_rows in df.groupby('GameID')]
    if not processed:
        return df.copy()
    return pd.concat(processed).reset_index()

In [None]:
# Build the rolling-window feature columns for every game.
df_final = process_all_games(df)

### 欄位生成結果統整

In [None]:
# Inspect the first rows to check the generated columns.
print(df_final.head())

In [None]:
# List all column labels after feature generation.
print(df_final.columns)

In [None]:
# Descriptive statistics of the numeric columns (shown as the cell output).
df_final.describe()

In [None]:
# Remove the raw inputs and intermediate helper columns that are no longer
# needed once the derived features exist.
columns_to_drop = [
    'Players',
    'Final price',
    'Followers',
    'Positive reviews',
    'Negative reviews',
    'ReleaseDate',
    'OriginPrice',
    'Positive reviews on that day',
    'Negative reviews on that day',
    'AvgPlayers1W',
    'AvgPlayers2W',
    'AvgPlayers1M',
    'PositiveReview1W',
    'NegativeReview1W',
    'PositiveReview2W',
    'NegativeReview2W',
    'PositiveReview1M',
    'NegativeReview1M',
]
df_final.drop(columns=columns_to_drop, inplace=True)

In [None]:
# List the remaining column labels after the drop.
print(df_final.columns)

## 將列改為折扣事件導向

In [None]:
# Order the rows by game and then by date, and renumber the index.
df_final = df_final.sort_values(['GameID', 'Date']).reset_index(drop=True)

# Running per-game totals of the DLC and Sequel columns.
for source_col in ('DLC', 'Sequel'):
    df_final[f'cumulative_{source_col}'] = df_final.groupby('GameID')[source_col].cumsum()

def calc_since_last_discount(group):
    """Add the DLC / sequel counts gained since the previous discount row.

    Walks the rows of one game in their given order. On every row whose
    ``DiscountRate`` is not 0, the increase of ``cumulative_DLC`` and
    ``cumulative_Sequel`` since the previous such row (or since 0 for the
    first one) is recorded and the baseline moves to the current totals.
    All other rows get a missing value. The results are written to the new
    columns ``DLC_since_last_discount`` and ``Sequel_since_last_discount``
    of ``group``, which is modified in place and returned.
    """
    prev_dlc = 0
    prev_seq = 0
    dlc_deltas = []
    seq_deltas = []

    rows = zip(group['DiscountRate'], group['cumulative_DLC'], group['cumulative_Sequel'])
    for rate, dlc_total, seq_total in rows:
        if rate == 0:
            # Not a discount row: nothing to report, baseline unchanged.
            dlc_deltas.append(None)
            seq_deltas.append(None)
            continue
        # Discount row: report the gain, then move the baseline forward.
        dlc_deltas.append(dlc_total - prev_dlc)
        seq_deltas.append(seq_total - prev_seq)
        prev_dlc = dlc_total
        prev_seq = seq_total

    group['DLC_since_last_discount'] = dlc_deltas
    group['Sequel_since_last_discount'] = seq_deltas
    return group

# Apply the function to every game. The groups are iterated explicitly
# instead of using GroupBy.apply: apply on a frame that still contains the
# grouping column is deprecated and newer pandas versions exclude that
# column, which would drop GameID from the result. Each group is copied
# because the function writes new columns into the frame it receives.
per_game = [
    calc_since_last_discount(game_rows.copy())
    for _, game_rows in df_final.groupby('GameID')
]
if per_game:
    # df_final is already ordered by GameID, so concatenating the groups in
    # key order keeps the existing row order.
    df_final = pd.concat(per_game)
else:
    # No rows at all: still add the two columns so the schema stays the same.
    df_final = calc_since_last_discount(df_final)

# Keep only the discount events (rows whose DiscountRate is not 0).
# NOTE(review): a NaN DiscountRate also satisfies "!= 0" and is therefore
# kept here, matching the condition used in calc_since_last_discount.
df_result = df_final[df_final['DiscountRate'] != 0].reset_index(drop=True)

## 輸出結果檔案

In [None]:
# 欄位順序調整

In [None]:
# Write the discount-event table to CSV without the index; 'utf-8-sig' adds a
# byte-order mark at the start of the file.
df_result.to_csv('processed_data.csv', index=False, encoding='utf-8-sig')