<a href="https://colab.research.google.com/github/Kagiany/2025/blob/main/GenAI_ML_HW5_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 2025 GENAI-ML-HW5
# Regression
In this assignment, you will use linear regression and a multi-layer perceptron (MLP) model to predict the Metacritic score of games on the Steam platform. For more information, please check the homework slides.
HW5 Slide Link :

https://docs.google.com/presentation/d/1ysys__L1HKLPV2LX0u-KMP0LD1XamhNCY_hq29k-I0A/edit?usp=sharing



## Check GPU Status

In [None]:
# Run nvidia-smi in the shell to report the GPU attached to this runtime.
!nvidia-smi

Wed Oct 22 04:15:49 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   45C    P8             11W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

# Import Modules


In [None]:
import random
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from tqdm import tqdm
import matplotlib.pyplot as plt


# Set Random Seed

In [None]:
# ===== Seed the random number generators so results are repeatable =====
def set_seed(seed):
    """Apply one seed to Python's, NumPy's and PyTorch's generators.

    Also sets ``torch.backends.cudnn.deterministic`` to True and
    ``torch.backends.cudnn.benchmark`` to False.

    Args:
        seed: Value handed to every seeding function.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

# TODO - choose your own random seed
set_seed(20251024)

# Download Dataset

In [1]:
# Clone the data repository only when it is not already present, and copy (rather
# than move) the CSV files, so this cell can be re-run without errors.
!test -d GENAI-ML-2025-HW5-Data || git clone https://github.com/murphy-cthsu/GENAI-ML-2025-HW5-Data.git
!cp GENAI-ML-2025-HW5-Data/*.csv .



Cloning into 'GENAI-ML-2025-HW5-Data'...
remote: Enumerating objects: 10, done.
remote: Counting objects: 100% (10/10), done.
remote: Compressing objects: 100% (7/7), done.
remote: Total 10 (delta 1), reused 10 (delta 1), pack-reused 0 (from 0)
Receiving objects: 100% (10/10), 3.75 MiB | 10.34 MiB/s, done.
Resolving deltas: 100% (1/1), done.


# Preview Training Data

In [None]:
# ===== Load the training set =====
train_df = pd.read_csv("train.csv")

# Preview a single record, one column per line.
row_index = 0
row = train_df.iloc[row_index]
for col, val in row.items():
    # Short or non-string values are printed as they are.
    if not isinstance(val, str) or len(val) <= 100:
        print(f"{col:25}: {val}")
        continue
    print(f"{col:25}: {val[:100]}...")   # truncate long text

# Numeric columns (the target column excluded) and text columns.
numeric_columns = train_df.select_dtypes(include=['number']).columns
numeric_features = [name for name in numeric_columns if name != 'metacritic_score']
print("All available numeric features :", numeric_features)
text_features = train_df.select_dtypes(include=['object']).columns.tolist()
print("All available text features :", text_features)

# Load Data

In [None]:


# TODO(Boss baseline) - Feature Selection: choose the features to train on; the CSV columns list the options
# To check all available numeric features, uncomment the line below :
# print("All available numeric features :", train_df.select_dtypes(include=['number']).columns.tolist())
numeric_features = ['recommendations', 'positive', 'negative','price']

# Missing values are replaced by 0 in both the features and the target.
# NOTE(review): a missing metacritic_score therefore becomes a score of 0 - confirm train.csv has none.
X_numeric = train_df[numeric_features].fillna(0).values
y = train_df['metacritic_score'].fillna(0).values.reshape(-1, 1)


# By default only the numeric features are used.
X = X_numeric

# TODO(Boss baseline) - Feature Selection: use embeddings of the text columns as additional features
"""
# 如果想要使用embedding，可以取消以下程式碼註解
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA

# 要進行embedding的文字欄位
text_columns = ['reviews','short_description']

embedder = SentenceTransformer('all-mpnet-base-v2')

def extract_embeddings(df, col):
    print(f"Embedding column: {col}")
    texts = df[col].fillna("").astype(str).tolist()
    emb = embedder.encode(texts, batch_size=64, show_progress_bar=True, convert_to_numpy=True)
    return emb

def reduce_dim_with_pca(embeddings, n_components=64):
    pca = PCA(n_components=n_components)
    reduced = pca.fit_transform(embeddings)
    return reduced, pca

# 建立embeddings並用PCA降維
embeddings_reduced = []
pca_models = {}
for col in text_columns:
    emb = extract_embeddings(train_df, col)
    emb_reduced, pca_model = reduce_dim_with_pca(emb, n_components=100)
    embeddings_reduced.append(emb_reduced)
    pca_models[col] = pca_model

X_text_reduced = np.concatenate(embeddings_reduced, axis=1)

# 將數值特徵與文字特徵結合
X = np.concatenate([X_numeric, X_text_reduced], axis=1)
"""

# ===== Split train/valid first: the ratio can be adjusted =====
# Splitting before scaling keeps validation rows out of the scaler statistics.
# With the same random_state and row count the partition is the same as when
# the already-scaled arrays were split.
X_train_raw, X_dev_raw, y_train_raw, y_dev_raw = train_test_split(X, y, test_size=0.25, random_state=42)

# ===== Standardize, fitting on the training rows only =====
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_dev = scaler.transform(X_dev_raw)
target_scaler = StandardScaler()
y_train = target_scaler.fit_transform(y_train_raw)
y_dev = target_scaler.transform(y_dev_raw)

# Full-data scaled arrays, kept under their previous names for any cell that still reads them.
X_scaled = scaler.transform(X)
y_scaled = target_scaler.transform(y)

# Dataset Class


In [None]:
# ===== Dataset =====
class SteamDataset(Dataset):
    """Map-style dataset pairing each feature row with its target.

    Both inputs are converted to ``torch.float32`` tensors on construction.

    Args:
        features: Array-like holding one entry per sample.
        targets: Array-like holding one entry per sample, aligned with ``features``.

    Raises:
        ValueError: If ``features`` and ``targets`` contain a different number of samples.
    """
    def __init__(self, features, targets):
        self.features = torch.tensor(features, dtype=torch.float32)
        self.targets = torch.tensor(targets, dtype=torch.float32)
        # Fail at construction rather than with an IndexError in the middle of an epoch.
        if len(self.features) != len(self.targets):
            raise ValueError(
                f"features and targets must have the same length, "
                f"got {len(self.features)} and {len(self.targets)}"
            )

    def __len__(self):
        """Return the number of samples."""
        return len(self.features)

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair stored at ``idx``."""
        return self.features[idx], self.targets[idx]

# Model Class

In [None]:
# TODO(Strong baseline) - Model architecture: adjust the architecture to improve model capacity
class LinearModel(nn.Module):
    """Linear regression: a single ``nn.Linear`` from ``input_dim`` features to one output."""

    def __init__(self, input_dim):
        super(LinearModel, self).__init__()
        self.fc = nn.Linear(in_features=input_dim, out_features=1)

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the result."""
        prediction = self.fc(x)
        return prediction

class MLPModel(nn.Module):
    """Multi-layer perceptron regressor with a configurable hidden stack.

    Each entry of ``hidden_dims`` adds a ``Linear -> ReLU -> Dropout`` group;
    a final ``Linear`` maps the last hidden width to a single output. With the
    default ``hidden_dims=(128, 64)`` the layer layout and parameter names
    (``net.0``, ``net.3``, ``net.6``) match the fixed two-hidden-layer network.

    Args:
        input_dim: Number of input features.
        hidden_dims: Width of each hidden layer, in order. May have any length;
            an empty sequence yields a single linear layer.
        dropout: Probability passed to every ``nn.Dropout``.
    """
    def __init__(self, input_dim, hidden_dims=(128, 64), dropout=0.3):
        super().__init__()
        layers = []
        in_dim = input_dim
        for width in hidden_dims:
            layers.append(nn.Linear(in_dim, width))  # affine map into this hidden layer
            layers.append(nn.ReLU())                 # non-linearity lets the model fit more complex patterns
            layers.append(nn.Dropout(dropout))       # dropout helps against overfitting
            in_dim = width
        layers.append(nn.Linear(in_dim, 1))          # output layer: last hidden width -> 1
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the prediction."""
        return self.net(x)

# Hyperparameter


In [None]:
# TODO(Medium baseline) - Training Hyperparameters: tune these values to improve model performance
shuffle_data = True      # whether the training data is reshuffled every epoch
batch_size = 128         # number of samples per training step
weight_decay = 0         # L2 regularization strength, mitigates overfitting. Useful Reference:https://medium.com/analytics-vidhya/deep-learning-basics-weight-decay-3c68eb4344e9
learning_rate = 1e-5     # optimizer learning rate
n_epochs = 1000          # number of training epochs


# Train

In [None]:

# ===== Wrap the train/validation splits in DataLoaders =====
train_set = SteamDataset(X_train, y_train)
dev_set = SteamDataset(X_dev, y_dev)
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=shuffle_data)
dev_loader = DataLoader(dev_set, batch_size=batch_size, shuffle=False)

# ===== Model / Optimizer / Loss =====
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LinearModel(input_dim=X_train.shape[1]).to(device)  # TODO(Strong baseline) - MLPModel can be used here instead
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
loss_fn = nn.MSELoss()

# ===== Training loop =====
train_loss_history = []   # per-epoch mean training loss (on the scaled target)
val_rmse_history = []     # per-epoch validation RMSE (after inverse-transforming the target)
for epoch in tqdm(range(n_epochs), desc="Training Progress"):
    model.train()
    total_loss = 0.0
    n_samples = 0
    for x_batch, y_batch in train_loader:
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)
        preds = model(x_batch)
        loss = loss_fn(preds, y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Weight each batch's mean loss by its size so a shorter final batch
        # does not count as much as a full one in the epoch average.
        batch_len = x_batch.size(0)
        total_loss += loss.item() * batch_len
        n_samples += batch_len
    avg_loss = total_loss / n_samples
    train_loss_history.append(avg_loss)

    # Validation
    model.eval()
    preds_list, y_true_list = [], []
    with torch.no_grad():
        for x_batch, y_batch in dev_loader:
            x_batch = x_batch.to(device)
            y_batch = y_batch.to(device)
            preds = model(x_batch)
            preds_list.append(preds.cpu().numpy())  # move back to CPU for numpy
            y_true_list.append(y_batch.cpu().numpy())
    y_pred_raw = np.vstack(preds_list)
    y_true_raw = np.vstack(y_true_list)

    # Undo the target scaling so the RMSE is reported in score units.
    y_pred_inv = target_scaler.inverse_transform(y_pred_raw)
    y_true_inv = target_scaler.inverse_transform(y_true_raw)

    rmse = np.sqrt(mean_squared_error(y_true_inv, y_pred_inv))
    val_rmse_history.append(rmse)

    if (epoch + 1) % 50 == 0:
        # tqdm.write prints the message without breaking the active progress bar.
        tqdm.write(f"Epoch {epoch+1}: Train Loss={avg_loss:.4f}, Val RMSE={rmse:.4f}")

# ===== Plot Curves =====
epochs = np.arange(1, n_epochs + 1)

plt.figure()
plt.plot(epochs, train_loss_history, label="Train Loss")
plt.xlabel("Epoch")
plt.ylabel("MSE Loss")
plt.title("Training Loss Curve")
plt.grid(True, alpha=0.3)
plt.legend()
plt.show()

plt.figure()
plt.plot(epochs, val_rmse_history, label="Validation RMSE")
plt.xlabel("Epoch")
plt.ylabel("RMSE (on original scale)")
plt.title("Validation RMSE Curve")
plt.grid(True, alpha=0.3)
plt.legend()
plt.show()

# Inference on Test Dataset

In [None]:
# ===== Load the test set =====
test_df = pd.read_csv("test.csv")

# Numeric test features, with missing values filled by 0 as in training.
X_test_num = test_df[numeric_features].fillna(0).values
X_test_combined = np.hstack([X_test_num])

# TODO: If sentence embedding used.
"""
# 如果有使用embedding，請取消以下程式碼註解
# 處理文字embedding
reviews_embed = embedder.encode(test_df["reviews"].fillna("").astype(str).tolist(),
                               batch_size=64, show_progress_bar=True, convert_to_numpy=True)
short_desc_embed = embedder.encode(test_df["short_description"].fillna("").astype(str).tolist(),
                                   batch_size=64, show_progress_bar=True, convert_to_numpy=True)

# 用train時的PCA來降維
reviews_embed_reduced = pca_models['reviews'].transform(reviews_embed)
short_desc_embed_reduced = pca_models['short_description'].transform(short_desc_embed)

# 合併numeric+embedding
X_test_combined = np.hstack([X_test_num, reviews_embed_reduced, short_desc_embed_reduced])
"""
# Transform with the scaler fitted during training (feature dimensions must match training).
X_test_scaled = scaler.transform(X_test_combined)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32).to(device)

# ===== Model inference =====
model.eval()
with torch.no_grad():
    preds_scaled = model(X_test_tensor).cpu().numpy()
    # ravel() always gives a 1-D array, even when the test set has a single row.
    preds = target_scaler.inverse_transform(preds_scaled).ravel()

# The regressor's output is unbounded; keep submitted scores inside the valid range.
# NOTE(review): assumes metacritic_score is on the usual 0-100 scale - confirm against train.csv.
scores = np.clip(np.round(preds), 0, 100).astype(int)

submission = pd.DataFrame({
    "appid": test_df["appid"],
    "metacritic_score": scores
})
submission.to_csv("submission.csv", index=False)
print("Output saved to submission.csv")