In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

## 处理数据

In [11]:
# Read the training and test sheets from the same Excel workbook.
file_path = './附件1/回归预测.xlsx'
train_data, test_data = (
    pd.read_excel(file_path, sheet_name=sheet)
    for sheet in ('训练集', '测试集')
)

# Report the (rows, columns) shape of each sheet as a quick sanity check.
print(f"训练集形状: {train_data.shape}")
print(f"测试集形状: {test_data.shape}")

# Split by position: the first 31 columns are the features and the
# 32nd column (position 31) is the regression target.
X_train, y_train = train_data.iloc[:, :31], train_data.iloc[:, 31]
X_test, y_test = test_data.iloc[:, :31], test_data.iloc[:, 31]


训练集形状: (549, 32)
测试集形状: (136, 32)


In [20]:
# Print the feature column at position 30 (the last feature) to inspect its values.
# NOTE(review): the recorded output shows this Series is named "奥氮平", which is
# also one of its values. That may mean the sheet has no header row and read_excel
# consumed the first data row as column names — confirm against the workbook.
print(X_train.iloc[:,30])

0       奥氮平
1      氟哌啶醇
2       奥氮平
3      齐拉西酮
4      氟哌啶醇
       ... 
544    齐拉西酮
545     利培酮
546     利培酮
547    阿立哌唑
548     利培酮
Name: 奥氮平, Length: 549, dtype: object


In [21]:
# The data mixes numerical and categorical columns, so it must be preprocessed.
def create_preprocessor(numerical_features=None, categorical_features=None):
    """Build the (unfitted) ColumnTransformer used to prepare the feature matrix.

    Numerical columns are standardized with ``StandardScaler`` and categorical
    columns are one-hot encoded with the first level dropped. Columns listed in
    neither argument are dropped (ColumnTransformer's default ``remainder``).

    Parameters
    ----------
    numerical_features : list of int, optional
        Positional indices of the numerical columns. Defaults to the first
        30 columns (pre-treatment indicator scores).
    categorical_features : list of int, optional
        Positional indices of the categorical columns. Defaults to ``[30]``
        (the 31st column: the medication taken).

    Returns
    -------
    ColumnTransformer
        The configured, not yet fitted, preprocessor.
    """
    # Resolve defaults here rather than in the signature to avoid sharing a
    # mutable default list between calls.
    if numerical_features is None:
        numerical_features = list(range(30))
    if categorical_features is None:
        categorical_features = [30]

    # handle_unknown='ignore': a category that was absent when the encoder was
    # fitted (e.g. missing from one cross-validation training fold, or new in
    # the test sheet) would otherwise raise at transform time. With 'ignore'
    # it is encoded as all zeros, i.e. like the dropped first level, and
    # scikit-learn emits a warning about that.
    categorical_encoder = OneHotEncoder(
        drop='first',
        handle_unknown='ignore',
        sparse_output=False,
    )

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_features),
            ('cat', categorical_encoder, categorical_features),
        ]
    )
    return preprocessor

## 使用随机森林作为集成学习模型

In [None]:
# Assemble the two-step pipeline: preprocessing followed by a random forest.
preprocessor = create_preprocessor()
rf = RandomForestRegressor(n_jobs=-1, random_state=42)
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('randomforestregressor', rf),
])

# Random-forest hyper-parameter grid. Keys use the "<step name>__<parameter>"
# form so that each value is routed to the forest step of the pipeline.
# NOTE(review): the recorded output of this cell (48 candidates, max_features=0.5,
# min_samples_split=20) cannot come from this grid (3*4*3*3*4 = 432 candidates);
# the cell was probably edited after its last run — re-run to refresh the output.
param_grid = dict([
    ('randomforestregressor__n_estimators', [100, 200, 300]),
    ('randomforestregressor__max_depth', [5, 10, 15, None]),
    ('randomforestregressor__min_samples_split', [5, 10, 15]),
    ('randomforestregressor__min_samples_leaf', [2, 5, 10]),
    ('randomforestregressor__max_features', ['sqrt', 'log2', 0.8, None]),
])

# Exhaustive 5-fold cross-validated search for the best hyper-parameters.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# Fit on the raw ndarray view of the training features.
X_train_array = X_train.values
grid_search.fit(X_train_array, y_train)

# The scorer is the negated MSE, so the sign is flipped to report a plain MSE.
print(f"随机森林最佳参数: {grid_search.best_params_}")
print(f"随机森林最佳CV分数: {-grid_search.best_score_:.4f}")

# Keep the pipeline refitted on the full training set with the best parameters.
model = grid_search.best_estimator_

Fitting 5 folds for each of 48 candidates, totalling 240 fits
随机森林最佳参数: {'randomforestregressor__max_depth': 10, 'randomforestregressor__max_features': 0.5, 'randomforestregressor__min_samples_leaf': 10, 'randomforestregressor__min_samples_split': 20, 'randomforestregressor__n_estimators': 200}
随机森林最佳CV分数: 321.8774


## 评估模型

In [None]:
# Predict on the test features with the tuned pipeline.
X_test_array = X_test.values
y_pred = model.predict(X_test_array)

# Per-sample errors. epsilon is added to the denominator so the relative
# error stays finite when a target value is 0.
epsilon = 1e-8
residuals = y_test - y_pred
squared_errors = residuals ** 2
relative_errors = residuals.abs() / (y_test.abs() + epsilon)

# Mean and population variance (ddof=0, matching np.var's default) of each error.
mean_squared_error_val = squared_errors.mean()
var_squared_error = squared_errors.var(ddof=0)
mean_relative_error = relative_errors.mean()
var_relative_error = relative_errors.var(ddof=0)

print("测试集性能评估结果:")
print(f"平方误差:  均值: {mean_squared_error_val:.6f}, 方差: {var_squared_error:.6f}")
print(f"相对误差:  均值: {mean_relative_error:.6f}, 方差: {var_relative_error:.6f}")

测试集性能评估结果:
平方误差:  均值: 273.397171, 方差: 274442.729314
相对误差:  均值: 0.219137, 方差: 0.034177
