# In [None]:
import os

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler

# Make sure the results directory exists before anything is written to it.
# NOTE(review): '../results' is resolved against the current working
# directory, not against this file's location — confirm the launch directory.
os.makedirs('../results', exist_ok=True)

def load_and_preprocess():
    """Load the dataset, split it into train/test sets and standardize it.

    Reads ``../data/data.xls`` relative to this file's directory. When
    ``__file__`` is not defined (e.g. in a notebook or interactive session)
    the current working directory is used as the base instead.

    The ``id`` and ``diagnosis`` columns are dropped from the features, and
    ``diagnosis`` is mapped to the target with ``'B' -> 0`` and ``'M' -> 1``.
    The data is split 70/30 with ``random_state=42``. The scaler is fitted on
    the training rows only and then applied to the test rows, so no test-set
    statistics leak into training.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test, feature_names)`` where
        ``X_train``/``X_test`` are the standardized feature arrays,
        ``y_train``/``y_test`` are the mapped targets, and ``feature_names``
        holds the feature column labels.
    """
    try:
        base_dir = os.path.dirname(__file__)
    except NameError:
        # __file__ does not exist in notebooks / interactive interpreters.
        base_dir = os.getcwd()
    data_path = os.path.join(base_dir, '..', 'data', 'data.xls')
    data = pd.read_excel(data_path)

    features = data.drop(['id', 'diagnosis'], axis=1)
    feature_names = features.columns  # keep the column labels for callers
    y = data['diagnosis'].map({'B': 0, 'M': 1})

    # Split BEFORE scaling so the scaler never sees the test rows.
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        features, y, test_size=0.3, random_state=42
    )

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train_raw)
    X_test = scaler.transform(X_test_raw)

    return X_train, X_test, y_train, y_test, feature_names

def train_random_forest(X_train, y_train):
    """Tune a random forest with a grid search and return the best estimator.

    Runs a 5-fold cross-validated ``GridSearchCV`` scored by accuracy over
    ``n_estimators`` in ``[50, 100, 200]`` and ``max_depth`` in
    ``[5, 10, None]``, using ``RandomForestClassifier(random_state=42)``.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.

    Returns:
        The ``best_estimator_`` found by the grid search.
    """
    # Parameter grid (adjust to the experiment's requirements)
    param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [5, 10, None]
    }

    # Grid search
    rf = RandomForestClassifier(random_state=42)
    grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)

    return grid_search.best_estimator_


# Preprocess the data
X_train, X_test, y_train, y_test, feature_names = load_and_preprocess()

# Train the model. This call must come after the definition above; calling
# train_random_forest before its `def` has executed raises NameError.
model = train_random_forest(X_train, y_train)

def plot_results(model, X_test, y_test, feature_names, save_path):
    """Plot the ROC curve of a fitted classifier and save it as a PNG.

    Args:
        model: Fitted classifier exposing ``predict_proba``; column 1 of its
            output is used as the positive-class score.
        X_test: Feature matrix passed to ``model.predict_proba``.
        y_test: True labels corresponding to ``X_test``.
        feature_names: Not used by this function; kept so existing callers
            keep working.
        save_path: Directory in which ``roc_curve.png`` is written. It is
            created if it does not exist.
    """
    # Score the test set once and reuse it for both the curve and the AUC.
    positive_scores = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, positive_scores)
    auc = roc_auc_score(y_test, positive_scores)

    os.makedirs(save_path, exist_ok=True)

    # ROC curve
    fig = plt.figure(figsize=(10, 5))
    try:
        plt.plot(fpr, tpr, label=f'AUC = {auc:.2f}')
        plt.plot([0, 1], [0, 1], 'k--')  # dashed y = x reference line
        plt.xlabel("False Positive Rate")
        plt.ylabel("True Positive Rate")
        plt.legend()
        plt.savefig(os.path.join(save_path, 'roc_curve.png'))
    finally:
        # Always release the figure, even if plotting or saving fails.
        plt.close(fig)

# Visualize the results: writes roc_curve.png into '../results'
plot_results(model, X_test, y_test, feature_names, '../results')
# Prints (in Chinese): "Experiment finished! Results saved to the results folder"
print("实验完成！结果已保存到 results 文件夹")