In [5]:
'''
任务：
使用线性回归、决策树回归和多项式回归预测学生期末成绩，并比较模型性能。数据集采用UCI机器学习库中的"学生表现数据集"。
数据文件：student-mat.csv
实现步骤：
1. 将数据加载到Pandas DataFrame
2. 分离特征(X)和目标变量(y)
3. 按80-20划分训练测试集(random_state=42)
4. 使用StandardScaler标准化特征
5. 训练以下模型：
   - 线性回归
   - 决策树回归
   - 多项式回归
6. 计算各模型的MSE分数
7. 输出评估指标
'''

# 数据读取
import pandas as pd
import numpy as np
# Location of the semicolon-separated student-mat.csv file.
# Set the STUDENT_MAT_CSV environment variable to run the notebook on another
# machine; when it is unset, the original local path below is used unchanged.
import os
DATA_PATH = os.environ.get(
    'STUDENT_MAT_CSV',
    '/Users/paradice/PyCharmMiscProject/面试题/student-mat.csv',
)

# Load the data and discard the G1/G2 columns so they cannot be used as features
# (presumably the earlier period grades -- confirm against the dataset docs).
original_student = pd.read_csv(DATA_PATH, sep=';').drop(columns=['G1', 'G2'])

original_X = original_student.drop(columns=['G3'])  # raw feature set
original_y = original_student['G3']                 # raw target: final grade G3
original_y.head()

0     6
1     6
2    10
3    15
4    10
Name: G3, dtype: int64

In [10]:
'''
未通过管道封装的前期代码（仅作参考保留：整段位于字符串中，不会被执行；已由下一单元格中基于 Pipeline / ColumnTransformer 的实现取代）
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(original_X, original_y, test_size = 0.2, random_state = 42, shuffle = True)
from sklearn.preprocessing import OneHotEncoder, StandardScaler
cat_encoder = OneHotEncoder()
X_train_cat = X_train[['school', 'sex', 'address', 'famsize', 'Pstatus', 'Mjob', 'Fjob', 'reason', 'guardian', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic']]
X_train_num = X_train.drop(['school', 'sex', 'address', 'famsize', 'Pstatus', 'Mjob', 'Fjob', 'reason', 'guardian', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic'], axis = 1)
X_train_cat_1hot = cat_encoder.fit_transform(X_train_cat)
scaler = StandardScaler()
X_train_num_scaled = scaler.fit_transform(X_train_num)
X_train_num_scaled_df = pd.DataFrame(X_train_num_scaled, columns = X_train_num.columns)
from scipy import sparse
student_X_prepared = sparse.hstack([X_train_num_scaled_df,X_train_cat_1hot])
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
lin_reg = LinearRegression()
tree_reg = DecisionTreeRegressor()
lin_reg.fit(student_X_prepared, y_train)
tree_reg.fit(student_X_prepared, y_train)
'''

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

# Hold out 20% of the rows as the test set (fixed seed, shuffled)
X_train, X_test, y_train, y_test = train_test_split(
    original_X,
    original_y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)


# Split the training frame into string-valued and numeric feature columns
_categorical_columns = [
    'school', 'sex', 'address', 'famsize',
    'Pstatus', 'Mjob', 'Fjob', 'reason', 'guardian',
    'schoolsup', 'famsup', 'paid', 'activities',
    'nursery', 'higher', 'internet', 'romantic',
]

X_train_cat = X_train[_categorical_columns]

# Everything that is not in the categorical list is treated as numeric
X_train_num = X_train.drop(columns=_categorical_columns)


# Feature engineering, polynomial variant: expand the numeric columns with
# degree-2 polynomial terms (no bias column), then standardise them.
from sklearn.preprocessing import PolynomialFeatures
poly_sca_pipeline = Pipeline([
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('StandardScaler', StandardScaler()),
])
num_attribs = list(X_train_num)  # column names of the numeric training frame
cat_attribs = ['school', 'sex', 'address', 'famsize',
               'Pstatus', 'Mjob', 'Fjob', 'reason',
               'guardian', 'schoolsup', 'famsup', 'paid',
               'activities', 'nursery', 'higher', 'internet', 'romantic']

# handle_unknown='ignore': a category that appears only in the test split is
# encoded as all zeros instead of raising an error during transform().
poly_full_pipeline = ColumnTransformer([
    ('polynum', poly_sca_pipeline, num_attribs),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_attribs),
])

# Fit on the training split only, then apply the fitted transformer to the test split
student_X_poly_prepared = poly_full_pipeline.fit_transform(X_train)
student_X_test_poly_prepared = poly_full_pipeline.transform(X_test)

# Feature engineering, baseline variant: standardise the numeric columns and
# one-hot encode the categorical columns.
num_pipeline = Pipeline([
    ('StandardScaler', StandardScaler()),
])
# Standalone scaled copy of the numeric training columns; it is not referenced
# by the model-training code visible in this notebook.
X_train_num_scaled = num_pipeline.fit_transform(X_train_num)

# handle_unknown='ignore': a category that appears only in the test split is
# encoded as all zeros instead of raising an error during transform().
full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attribs),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_attribs),
])

# Fit on the training split only, then apply the fitted transformer to the test split
student_X_prepared = full_pipeline.fit_transform(X_train)
student_X_test_prepared = full_pipeline.transform(X_test)


# Model training
model_Lin = LinearRegression()
model_poly_Lin = LinearRegression()
# Seed the tree so the grid search and the reported tree metrics are
# reproducible from run to run (same seed as the train/test split).
model_TreeReg = DecisionTreeRegressor(random_state=42)

# Plain and polynomial linear regressions are fit on their respective feature matrices
model_Lin.fit(student_X_prepared, y_train)
model_poly_Lin.fit(student_X_poly_prepared, y_train)

# Tune the tree depth with 3-fold cross-validation, scored by negative MSE
grid_search_TreeReg = GridSearchCV(
    model_TreeReg,
    param_grid={'max_depth': [3, 5, 10, None]},
    cv=3,
    scoring='neg_mean_squared_error',
    return_train_score=True,
)

grid_search_TreeReg.fit(student_X_prepared, y_train)
# Keep the estimator selected by the search for evaluation on the test split
tree_training_model = grid_search_TreeReg.best_estimator_


# Model evaluation: score the three fitted models on the held-out test split
from sklearn.metrics import r2_score

# Predictions; each model receives the test matrix produced by its own preprocessing
y_pred_lin = model_Lin.predict(student_X_test_prepared)
y_pred_poly_lin = model_poly_Lin.predict(student_X_test_poly_prepared)
y_pred_tree = tree_training_model.predict(student_X_test_prepared)

# Mean squared error (lower is better)
mse_lin = mean_squared_error(y_test, y_pred_lin)
mse_poly_lin = mean_squared_error(y_test, y_pred_poly_lin)
mse_tree = mean_squared_error(y_test, y_pred_tree)

# R^2 score (higher is better)
r2_lin = r2_score(y_test, y_pred_lin)
r2_poly_lin = r2_score(y_test, y_pred_poly_lin)
r2_tree = r2_score(y_test, y_pred_tree)

# The last label previously read "均方r2得分", mixing the MSE and R^2 names;
# it now matches the other two R^2 labels.
print('线性回归预测均方误差：', mse_lin, '\n'
      '线性回归预测r2得分:', r2_lin, '\n'
      '多项式线性回归预测均方误差:', mse_poly_lin, '\n'
      '多项式线性回归预测r2得分:', r2_poly_lin, '\n'
      '决策树回归预测均方误差:', mse_tree, '\n'
      '决策树回归预测r2得分:', r2_tree, '\n')

线性回归预测均方误差： 17.603516168232204 
线性回归预测r2得分: 0.14150326316743367 
多项式线性回归预测均方误差: 24.240166579620748 
多项式线性回归预测r2得分: -0.18215609370341213 
决策树回归预测均方误差: 19.470112974467874 
决策树回归预测均方r2得分: 0.05047217302492746 

