In [None]:
# 使用线性回归 和决策树回归预测学生期末成绩，并比较模型性能。数据集采用UCI机器学习库中的"学生表现数据集"。  数据文件：student-mat.csv
#
# 实现步骤：
# 1. 将数据加载到Pandas DataFrame
# 2. 分离特征(X)和目标变量(y)
# 3. 按80-20划分训练测试集(random_state=42)
# 4. 使用StandardScaler标准化特征
# 5. 训练以下模型：
#    - 线性回归
#    - 决策树回归
#    - 多项式回归(degree=2)
# 6. 计算各模型的MSE分数
# 7. 输出评估指标

In [1]:
# 1. 将数据加载到Pandas DataFrame
import pandas as pd
data = pd.read_csv("student-mat.csv", delimiter=";")
data.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10


In [2]:
data.describe()         # 可以看到数据量级不同，所以需要标准化

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
count,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0
mean,16.696203,2.749367,2.521519,1.448101,2.035443,0.334177,3.944304,3.235443,3.108861,1.481013,2.291139,3.55443,5.708861,10.908861,10.713924,10.41519
std,1.276043,1.094735,1.088201,0.697505,0.83924,0.743651,0.896659,0.998862,1.113278,0.890741,1.287897,1.390303,8.003096,3.319195,3.761505,4.581443
min,15.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,3.0,0.0,0.0
25%,16.0,2.0,2.0,1.0,1.0,0.0,4.0,3.0,2.0,1.0,1.0,3.0,0.0,8.0,9.0,8.0
50%,17.0,3.0,2.0,1.0,2.0,0.0,4.0,3.0,3.0,1.0,2.0,4.0,4.0,11.0,11.0,11.0
75%,18.0,4.0,3.0,2.0,2.0,0.0,5.0,4.0,4.0,2.0,3.0,5.0,8.0,13.0,13.0,14.0
max,22.0,4.0,4.0,4.0,4.0,3.0,5.0,5.0,5.0,5.0,5.0,5.0,75.0,19.0,19.0,20.0


In [3]:
# 调用此方法可以看到data是否有缺失值，来决定是否进行缺失值填充策略
# 还可以看到每列的数据的数据类型，来决定是否对某些列进行独热编码
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395 entries, 0 to 394
Data columns (total 33 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   school      395 non-null    object
 1   sex         395 non-null    object
 2   age         395 non-null    int64 
 3   address     395 non-null    object
 4   famsize     395 non-null    object
 5   Pstatus     395 non-null    object
 6   Medu        395 non-null    int64 
 7   Fedu        395 non-null    int64 
 8   Mjob        395 non-null    object
 9   Fjob        395 non-null    object
 10  reason      395 non-null    object
 11  guardian    395 non-null    object
 12  traveltime  395 non-null    int64 
 13  studytime   395 non-null    int64 
 14  failures    395 non-null    int64 
 15  schoolsup   395 non-null    object
 16  famsup      395 non-null    object
 17  paid        395 non-null    object
 18  activities  395 non-null    object
 19  nursery     395 non-null    object
 20  higher    

In [4]:
# 2. 分离特征(X)和目标变量(y)
X, y = data.iloc[:, :-3], data.iloc[:, -1]          # 最后三列均为标签

## 一、基础版本，只取数据集的数值列，做对G3的预测

In [5]:
import numpy as np
X_num = X.select_dtypes(np.number)         # 这里只取数值列，做预测
print(X_num.shape)
print(y.shape)

(395, 13)
(395,)


In [6]:
# 3. 按80-20划分训练测试集(random_state=42)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_num, y, test_size=0.2, random_state=42)
print(f"样本的百分之80为： {X_num.shape[0] * 0.8}")
print(f"样本的百分之20为： {X_num.shape[0] * 0.2}")
print(f"训练集数据的长度为：{len(X_train)}")
print(f"训练集标签的长度为：{len(y_train)}")
print(f"测试集样本的长度为：{len(X_test)}")
print(f"测试集标签的长度为：{len(y_test)}")

样本的百分之80为： 316.0
样本的百分之20为： 79.0
训练集数据的长度为：316
训练集标签的长度为：316
测试集样本的长度为：79
测试集标签的长度为：79


In [7]:
# 4. 使用StandardScaler标准化特征
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector

# 创建标准化的流水线
sta_scal_pipelin = make_pipeline(StandardScaler())
preprocessing = ColumnTransformer([
    ("sta_pipeline", sta_scal_pipelin, make_column_selector(dtype_include=np.number))])

In [8]:
# 5. 训练以下模型：
#    - 线性回归
from sklearn.linear_model import LinearRegression

lin_reg = make_pipeline(preprocessing, LinearRegression())
lin_reg.fit(X_train, y_train)

In [9]:
#    - 决策树回归
from sklearn.tree import DecisionTreeRegressor
decision_tree_reg = make_pipeline(preprocessing, DecisionTreeRegressor())
decision_tree_reg.fit(X_train, y_train)

In [10]:
#    - 多项式回归(degree=2)
from sklearn.preprocessing import PolynomialFeatures
poly_lin_reg = make_pipeline(preprocessing, PolynomialFeatures(degree=2), LinearRegression())
poly_lin_reg.fit(X_train, y_train)
poly_decision_tree_reg = make_pipeline(preprocessing, PolynomialFeatures(degree=2), LinearRegression())
poly_decision_tree_reg.fit(X_train, y_train)

In [11]:
# 6. 计算各模型的MSE 和 R2 分数
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
# MSE
print(f"=================MSE 均方误差====================")
print(f"线性回归：{mean_squared_error(y_test, lin_reg.predict(X_test))}")
print(f"决策树回归：{mean_squared_error(y_test, decision_tree_reg.predict(X_test))}")
print(f"多项式线性回归：{mean_squared_error(y_test, poly_lin_reg.predict(X_test))}")
print(f"多项式决策树回归：{mean_squared_error(y_test, poly_decision_tree_reg.predict(X_test))}")

# R2分数
print(f"=================R2分数====================")
print(f"线性回归：{r2_score(y_test, lin_reg.predict(X_test))}")
print(f"决策树回归：{r2_score(y_test, decision_tree_reg.predict(X_test))}")
print(f"多项式线性回归：{r2_score(y_test, poly_lin_reg.predict(X_test))}")
print(f"多项式决策树回归：{r2_score(y_test, poly_decision_tree_reg.predict(X_test))}")

线性回归：18.089574993693358
决策树回归：31.27848101265823
多项式线性回归：29.143047419426544
多项式决策树回归：29.143047419426544
线性回归：0.11779891276497811
决策树回归：-0.5254039946238238
多项式线性回归：-0.4212621428487562
多项式决策树回归：-0.4212621428487562


## 二、进阶版本，考虑某些分类特征
6 Pstatus - parent's cohabitation status (binary: 'T' - living together or 'A' - apart)

14 studytime - weekly study time (numeric: 1 - <2 hours, 2 - 2 to 5 hours, 3 - 5 to 10 hours, or 4 - >10 hours)

15 failures - number of past class failures (numeric: n if 1<=n<3, else 4)

21 higher - wants to take higher education (binary: yes or no)

27 Dalc - workday alcohol consumption (numeric: from 1 - very low to 5 - very high)

30 absences - number of school absences (numeric: from 0 to 93)

In [12]:
# 使用原始的X和y重新分离数据集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
X_train.info()      # 可以看到有316条，大约是训练集的百分之八十

<class 'pandas.core.frame.DataFrame'>
Index: 316 entries, 181 to 102
Data columns (total 30 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   school      316 non-null    object
 1   sex         316 non-null    object
 2   age         316 non-null    int64 
 3   address     316 non-null    object
 4   famsize     316 non-null    object
 5   Pstatus     316 non-null    object
 6   Medu        316 non-null    int64 
 7   Fedu        316 non-null    int64 
 8   Mjob        316 non-null    object
 9   Fjob        316 non-null    object
 10  reason      316 non-null    object
 11  guardian    316 non-null    object
 12  traveltime  316 non-null    int64 
 13  studytime   316 non-null    int64 
 14  failures    316 non-null    int64 
 15  schoolsup   316 non-null    object
 16  famsup      316 non-null    object
 17  paid        316 non-null    object
 18  activities  316 non-null    object
 19  nursery     316 non-null    object
 20  higher      3

In [14]:
# 使用独热编码，将cat_attribute 指定的列进行编码
from sklearn.preprocessing import OneHotEncoder
cat_attribute = ['Pstatus', 'studytime', 'failures', 'higher', 'Dalc', 'absences']
one_hot_pipeline = make_pipeline(OneHotEncoder(handle_unknown="ignore"))
preprocessing_with_cat = ColumnTransformer([
    ("sta_scal", sta_scal_pipelin, make_column_selector(dtype_include=np.number)),
     ("cat", one_hot_pipeline, cat_attribute)
])

In [15]:
# 线性回归跑
lin_reg = make_pipeline(
    preprocessing_with_cat, LinearRegression()
)
lin_reg.fit(X_train, y_train)

In [16]:
# 决策树回归跑
decision_tree_reg = make_pipeline(
    preprocessing_with_cat, DecisionTreeRegressor()
)
decision_tree_reg.fit(X_train, y_train)

In [68]:
# 多项式特征, 线性回归
poly_lin_reg = make_pipeline(
    preprocessing_with_cat, PolynomialFeatures(degree=2), LinearRegression()
)
poly_lin_reg.fit(X_train, y_train)

# 多项式特征, 决策树回归
poly_decision_tree_reg = make_pipeline(
    preprocessing_with_cat, PolynomialFeatures(degree=2), DecisionTreeRegressor()
)
poly_decision_tree_reg.fit(X_train, y_train)

In [17]:
# 6. 计算各模型的MSE 和 R2 分数
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
# MSE
print(f"=================MSE 均方误差====================")
print(f"线性回归：{mean_squared_error(y_test, lin_reg.predict(X_test))}")
print(f"决策树回归：{mean_squared_error(y_test, decision_tree_reg.predict(X_test))}")
print(f"多项式线性回归：{mean_squared_error(y_test, poly_lin_reg.predict(X_test))}")
print(f"多项式决策树回归：{mean_squared_error(y_test, poly_decision_tree_reg.predict(X_test))}")

# R2分数
print(f"=================R2分数====================")
print(f"线性回归：{r2_score(y_test, lin_reg.predict(X_test))}")
print(f"决策树回归：{r2_score(y_test, decision_tree_reg.predict(X_test))}")
print(f"多项式线性回归：{r2_score(y_test, poly_lin_reg.predict(X_test))}")
print(f"多项式决策树回归：{r2_score(y_test, poly_decision_tree_reg.predict(X_test))}")

线性回归：8.616756310602755e+19
决策树回归：23.860759493670887
多项式线性回归：29.143047419426544
多项式决策树回归：29.143047419426544
线性回归：-4.202261130127824e+18
决策树回归：-0.1636529865908165
多项式线性回归：-0.4212621428487562
多项式决策树回归：-0.4212621428487562
