In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import *
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.svm import SVR
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import warnings
warnings.filterwarnings("ignore")


In [11]:
# Load the student scores CSV; the path is relative to the notebook's working directory.
df = pd.read_csv("data/stud.csv")

In [12]:
# Preview the first rows to confirm the columns loaded as expected.
df.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [13]:
# Separate the target from the predictors: math_score is the value to predict,
# every remaining column is used as an input feature.
y = df["math_score"]
X = df.drop(columns=["math_score"])

In [14]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Partition the feature columns by dtype: object columns are treated as
# categorical, everything else as numeric.
cat_features = X.select_dtypes(include="object").columns
num_features = X.select_dtypes(exclude="object").columns

# Numeric (float/int) columns are standardized; categorical columns are
# one-hot encoded.
oh_transformer = OneHotEncoder()
numeric_transformer = StandardScaler()

# Combine both transformers so each one is applied only to its own column group.
preprocessor = ColumnTransformer(
    transformers=[
        ("OneHotEncoder", oh_transformer, cat_features),
        ("StandardScaler", numeric_transformer, num_features),
    ]
)

In [16]:
# Fit the preprocessor and encode the features. The input is rebuilt from `df`
# instead of being read from `X`, so re-running this cell does not feed the
# already-transformed matrix back into the ColumnTransformer.
# NOTE(review): the preprocessor is fitted on all rows before the train/test
# split further down, so the scaler statistics include test rows; consider
# splitting first and fitting on the training portion only.
X = preprocessor.fit_transform(df.drop(columns=["math_score"]))

In [19]:
# Check the dimensions of the encoded feature matrix.
X.shape

(1000, 19)

In [20]:
# Split into training and test sets: 20% of the rows are held out for testing,
# and the fixed seed keeps the split reproducible.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [21]:
# Check the size of the training split.
X_train.shape

(800, 19)

In [22]:
## Helper for evaluating a model's predictions
def evaluate_model(true,predicted):
    """Print MAE, MSE, RMSE and R2 for a set of predictions.

    Args:
        true: Ground-truth target values.
        predicted: Model predictions aligned with ``true``.

    Returns:
        None. Each metric is written to stdout on its own line.
    """
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mse)  # RMSE is derived from the MSE computed above
    r2 = r2_score(true, predicted)

    report = (("MAE", mae), ("MSE", mse), ("RMSE", rmse), ("R2", r2))
    for label, value in report:
        print(f"{label}: {value}")


In [25]:
# Candidate regressors, keyed by the label printed in the report below.
# The randomized scikit-learn estimators are seeded so that re-running this
# cell on a fresh kernel reproduces the same scores.
models = {
    "KNeighborsRegressor": KNeighborsRegressor(),
    "DecisionTreeRegressor": DecisionTreeRegressor(random_state=42),
    "RandomForestRegressor": RandomForestRegressor(random_state=42),
    "AdaBoostRegressor": AdaBoostRegressor(random_state=42),
    "LinearRegression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "SVR": SVR(),
    "CatBoostRegressor": CatBoostRegressor(verbose=0),
    "XGBRegressor": XGBRegressor(eval_metric="rmse", verbosity=0),
}

# Names of the models in the order they were evaluated.
model_list = []
for name, model in models.items():
    model.fit(X_train, y_train)

    # Predict on both splits so train and test metrics can be compared.
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Report: model name, training metrics, separator, test metrics.
    print(name)
    model_list.append(name)
    evaluate_model(y_train, y_train_pred)
    print("-----------------------------------")
    evaluate_model(y_test, y_test_pred)

    print("===================================")
    print("\n\n")

KNeighborsRegressor
MAE: 4.527
MSE: 32.6859
RMSE: 5.71715838507208
R2: 0.8550176780012468
-----------------------------------
MAE: 5.627999999999999
MSE: 52.6388
RMSE: 7.255260160738552
R2: 0.7836806685669011



DecisionTreeRegressor
MAE: 0.01875
MSE: 0.078125
RMSE: 0.2795084971874737
R2: 0.9996534669718089
-----------------------------------
MAE: 6.01
MSE: 60.71
RMSE: 7.791662210337407
R2: 0.7505120441327797



RandomForestRegressor
MAE: 1.8165354166666667
MSE: 5.214634135416666
RMSE: 2.283557342265936
R2: 0.9768698501394586
-----------------------------------
MAE: 4.643462500000001
MSE: 36.16784428125001
RMSE: 6.013970758263629
R2: 0.8513681183025367



AdaBoostRegressor
MAE: 4.765885519555345
MSE: 33.823823791816494
RMSE: 5.815825288969442
R2: 0.8499702773301568
-----------------------------------
MAE: 4.7214897580798185
MSE: 36.58607667563786
RMSE: 6.048642548178712
R2: 0.8496493908251314



LinearRegression
MAE: 4.266711846071957
MSE: 28.334870380648585
RMSE: 5.323050852720513
R2: