# 1. 讀取檔案

In [60]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

# 使用 pandas 的 read_csv 函式讀取訓練資料
df = pd.read_csv("data/train.csv")

# 2.資料預處理

In [61]:
# 直接刪除任何含有缺失值的整行資料
# HINT：有更好的預處理填補方式嗎？
df_clean = df[['mpg', 'weight', 'acceleration', 'model_year', 'cylinders', 'displacement', 'horsepower']]
df_clean.info()
df_clean = df_clean.fillna(df_clean.mean())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 318 entries, 0 to 317
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           318 non-null    float64
 1   weight        318 non-null    int64  
 2   acceleration  318 non-null    float64
 3   model_year    318 non-null    int64  
 4   cylinders     318 non-null    int64  
 5   displacement  318 non-null    float64
 6   horsepower    313 non-null    float64
dtypes: float64(4), int64(3)
memory usage: 17.5 KB


# 3.特徵工程

In [62]:
# 定義要用來預測的特徵欄位
# HINT：這串列遺漏了部分特徵，另外，有方法額外加上新的特徵嗎？
features = ['weight', 'acceleration', 'model_year', 'cylinders', 'displacement', 'horsepower']
# 定義我們要預測的目標欄位
target = 'mpg'

# 從乾淨的資料中選取 X 和 y
X = df_clean[features]
y = df_clean[target]

# 確保 interactions 檢查欄位存在（避免 KeyError）
def add_interactions(X):
    X = X.copy()
    # 用 get 保險地讀欄位
    X['hp_per_weight'] = X.get('horsepower', 0) / (X.get('weight', 1) + 1e-6)
    X['disp_per_cyl'] = X.get('displacement', 0) / (X.get('cylinders', 1) + 1e-6)
    X['car_age'] = 82 - X.get('model_year', 82)
    return X

# features 要與後續 pipeline 一致
numeric_features = ['weight', 'acceleration', 'model_year', 'displacement', 'horsepower', 'cylinders']
categorical_features = []  # 若 'cylinders' 視為數值，可放在 numeric_features；若視為類別，放到這裡

pre_num = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

pre_cat = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ('num', pre_num, numeric_features),
    # ('cat', pre_cat, categorical_features),
])

full_pipeline = Pipeline([
    ('feature_add', FunctionTransformer(add_interactions)),
    ('preproc', preprocessor),
    ('model', RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1))
])

# 交叉驗證評估
scores = -cross_val_score(full_pipeline, X, y, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
print("CV RMSE:", np.sqrt(scores).mean())
# 最終 fit
full_pipeline.fit(X, y)


CV RMSE: 3.0698013262938


0,1,2
,steps,"[('feature_add', ...), ('preproc', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,func,<function add...x7128fa38c4a0>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,transformers,"[('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,n_estimators,200
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


# 4. 訓練模型

In [63]:
# 分割訓練集與測試集 (用於本地評估模型好壞)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 建立並訓練模型
model = LinearRegression()
model.fit(X_train, y_train)

# 進行預測與評估
train_predictions = model.predict(X_train)
test_predictions = model.predict(X_test)
train_rmse = np.sqrt(mean_squared_error(y_train, train_predictions))
test_rmse = np.sqrt(mean_squared_error(y_test, test_predictions))

print("模型已訓練完成！")
print(f"訓練誤差 (Train RMSE): {train_rmse:.4f} MPG")
print(f"測試誤差 (Test RMSE):  {test_rmse:.4f} MPG")
print("\n--- 模型學到的關係 ---")

# 各個特徵的權重
for feature, coef in zip(features, model.coef_):
    print(f"特徵 '{feature}' 的權重: {coef:.4f}")

模型已訓練完成！
訓練誤差 (Train RMSE): 3.4523 MPG
測試誤差 (Test RMSE):  3.7481 MPG

--- 模型學到的關係 ---
特徵 'weight' 的權重: -0.0066
特徵 'acceleration' 的權重: 0.0180
特徵 'model_year' 的權重: 0.8090
特徵 'cylinders' 的權重: 0.0119
特徵 'displacement' 的權重: 0.0038
特徵 'horsepower' 的權重: -0.0130


# 5.輸出提交檔案

In [64]:
# 讀取需要進行預測的測試檔案 test.csv
df_test = pd.read_csv("data/test.csv")

# 對測試資料進行預處理
# HINT：如果前面使用了其他的預處理方式，這邊要如何修改？
df_test = df_test.fillna(0)

# 使用訓練好的模型，對測試資料進行預測
predictions = full_pipeline.predict(df_test[features])

# 建立一個新的 DataFrame
submission_df = pd.DataFrame({'Id':df_test['id'], 'mpg': predictions})

# 保存為 submission.csv
submission_df.to_csv('submission.csv', index=False)
print("提交文件 'submission.csv' 已成功生成！")

提交文件 'submission.csv' 已成功生成！


# 6. 報告

姓名：__________ 學號：__________

第一部分：準確度分數 (Accuracy Scores) (1分)  
我的準確度分數：__________  

第二部分：我的實驗記錄 (My Experiment Log) (3分)  
請記錄你做了哪些嘗試來提升分數，至少記錄兩次不同的嘗試。  
【實驗 1】  
    我做的修改：__________________________________________________________________________________  
    結果與觀察 (分數變化、心得等)：__________________________________________________________________  
    該次實驗分數： ____________  
【實驗 2】  
    我做的修改：__________________________________________________________________________________  
    結果與觀察 (分數變化、心得等)：__________________________________________________________________  
    該次實驗分數： ____________  

第三部分：總結與心得 (Conclusion & Reflection) (2分)  
請撰寫一段約 50-100 字的心得總結。內容需包含：  
(1) 你認為本次實驗中，提升準確率最有效的修改是什麼。  
(2) 這次不斷嘗試與修正的過程，帶給你最大的學習與啟發。  
內容：______________________________________________    