In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, validation_curve, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.feature_selection import SelectFromModel
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Seed NumPy's global (legacy) random generator so results are reproducible.
# The scikit-learn estimators and the train/test split in this notebook also
# receive an explicit random_state=42.
np.random.seed(42)

In [3]:
def load_data(file_path):
    """
    Read a CSV file into a DataFrame and report its dimensions on stdout.

    Args:
        file_path: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        The loaded ``pandas.DataFrame``. No column validation is done here.
    """
    frame = pd.read_csv(file_path)
    n_rows, n_cols = frame.shape
    print(f"数据加载成功，共{n_rows}行，{n_cols}列")
    return frame

In [4]:
def prepare_data(data):
    """
    Build standardised train/test feature matrices and matching label arrays.

    Args:
        data: DataFrame containing the feature columns 'g1.ASI', 'h', 'l',
            'o', 'c' and the label column 'judge'.

    Returns:
        Tuple ``(X_train_scaled, X_test_scaled, y_train, y_test)``.
    """
    feature_columns = ['g1.ASI', 'h', 'l', 'o', 'c']
    features = data[feature_columns].values
    labels = data['judge'].values

    # Hold out 20% of the rows for testing, stratified on the label.
    split = train_test_split(
        features, labels, test_size=0.2, random_state=42, stratify=labels
    )
    features_train, features_test, labels_train, labels_test = split

    # Fit the scaler on the training split only, then apply it to both splits,
    # so no test-set statistics leak into the scaling.
    normalizer = StandardScaler().fit(features_train)

    return (
        normalizer.transform(features_train),
        normalizer.transform(features_test),
        labels_train,
        labels_test,
    )

In [5]:
def find_best_params(X_train, y_train):
    """
    Grid-search a regularised random forest and save a validation curve.

    A 5-fold, accuracy-scored ``GridSearchCV`` is run over 162 parameter
    combinations (tree count, depth, split/leaf sizes and cost-complexity
    pruning strength). Afterwards a validation curve over ``n_estimators`` is
    computed for the *selected* configuration and written to
    'early_stopping_curve.png'.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.

    Returns:
        The best estimator found by the grid search (``best_estimator_``).
    """
    # Search space; the size limits and ccp_alpha are there to curb overfitting.
    param_grid = {
        'n_estimators': [50, 100, 150],
        'max_depth': [3, 5, 7],
        'min_samples_split': [5, 10, 15],
        'min_samples_leaf': [2, 4, 6],
        'max_features': ['sqrt'],
        'bootstrap': [True],
        'ccp_alpha': [0.001, 0.01]
    }

    # Base estimator that the grid search clones for every candidate.
    rf = RandomForestClassifier(random_state=42)

    grid_search = GridSearchCV(
        estimator=rf,
        param_grid=param_grid,
        cv=5,
        n_jobs=-1,
        scoring='accuracy',
        verbose=1
    )

    print("开始网格搜索...")
    grid_search.fit(X_train, y_train)

    print(f"最佳参数: {grid_search.best_params_}")
    print(f"最佳交叉验证准确率: {grid_search.best_score_:.4f}")

    # Validation curve over the number of trees. It is computed on an estimator
    # carrying the tuned parameters so the plot describes the model that is
    # actually returned; the untuned default forest would show a different
    # (much more overfitted) picture. validation_curve overrides n_estimators
    # with each value of param_range.
    tuned_rf = RandomForestClassifier(random_state=42, **grid_search.best_params_)
    param_range = [50, 100, 150, 200, 250]
    train_scores, test_scores = validation_curve(
        tuned_rf, X_train, y_train,
        param_name="n_estimators",
        param_range=param_range,
        cv=5,
        scoring="accuracy",
        n_jobs=-1
    )

    # Plot fold-averaged train vs. validation accuracy and save to disk.
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(param_range, np.mean(train_scores, axis=1), 'o-', color="r", label="训练集准确率")
    ax.plot(param_range, np.mean(test_scores, axis=1), 'o-', color="g", label="验证集准确率")
    ax.set_title("不同决策树数量的验证曲线")
    ax.set_xlabel("决策树数量")
    ax.set_ylabel("准确率")
    ax.legend()
    fig.tight_layout()
    fig.savefig('early_stopping_curve.png')
    plt.close(fig)

    return grid_search.best_estimator_

In [6]:
def train_ensemble_model(X_train, y_train):
    """
    Fit a soft-voting ensemble of a random forest, logistic regression and SVC.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.

    Returns:
        The fitted ``VotingClassifier``.
    """
    # Member models, all seeded with random_state=42. The SVC is built with
    # probability=True so it can supply class probabilities for soft voting.
    members = [
        (
            'rf',
            RandomForestClassifier(
                n_estimators=100,
                max_depth=5,
                min_samples_split=10,
                min_samples_leaf=4,
                random_state=42,
            ),
        ),
        ('lr', LogisticRegression(random_state=42)),
        ('svc', SVC(probability=True, random_state=42)),
    ]

    voter = VotingClassifier(estimators=members, voting='soft')

    # fit() returns the estimator itself, so the fitted ensemble is returned.
    return voter.fit(X_train, y_train)

In [7]:
def evaluate_model(model, X_train, X_test, y_train, y_test, feature_names=None):
    """
    Report accuracy, the train/test gap, a classification report and CV score.

    Side effects: prints metrics to stdout and writes 'confusion_matrix.png'.
    If the model exposes ``feature_importances_`` it also writes
    'feature_importance.png'.

    Args:
        model: A fitted classifier with ``predict``.
        X_train: Training feature matrix.
        X_test: Test feature matrix.
        y_train: Training labels.
        y_test: Test labels.
        feature_names: Optional labels for the feature-importance bars.
            Defaults to the five columns used by ``prepare_data``; if the
            number of names does not match the number of importances,
            generic names are used instead.

    Returns:
        Dict with keys 'train_accuracy', 'test_accuracy', 'cv_accuracy' and
        'overfit_diff' (train accuracy minus test accuracy).
    """
    # Predictions on both splits.
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)

    print(f"训练集准确率: {train_accuracy:.4f}")
    print(f"测试集准确率: {test_accuracy:.4f}")

    # The train-minus-test gap is used as a simple overfitting indicator.
    overfit_diff = train_accuracy - test_accuracy
    print(f"过拟合程度(训练-测试): {overfit_diff:.4f}")

    print("分类报告:")
    print(classification_report(y_test, y_test_pred))

    # Confusion matrix. The class labels are passed explicitly so the heatmap
    # axes show the real classes (e.g. -1 / 1) rather than positional 0 / 1.
    class_labels = np.unique(
        np.concatenate([np.asarray(y_test), np.asarray(y_test_pred)])
    )
    cm = confusion_matrix(y_test, y_test_pred, labels=class_labels)
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(
        cm, annot=True, fmt='d', cmap='Blues',
        xticklabels=class_labels, yticklabels=class_labels, ax=ax
    )
    ax.set_xlabel('预测标签')
    ax.set_ylabel('真实标签')
    ax.set_title('混淆矩阵')
    fig.tight_layout()
    fig.savefig('confusion_matrix.png')
    plt.close(fig)

    # 5-fold cross-validation on the training split (the model is re-fitted
    # on each fold by cross_val_score).
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    print(f"交叉验证准确率: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}")

    # Feature importances, only for models that expose them.
    if hasattr(model, 'feature_importances_'):
        importances = np.asarray(model.feature_importances_)
        if feature_names is None:
            # Columns selected in prepare_data, in the same order.
            feature_names = ['g1.ASI', 'h', 'l', 'o', 'c']
        if len(feature_names) != len(importances):
            # One bar label is required per importance value; a mismatch
            # would make the bar plot raise, so fall back to generic names.
            feature_names = [f"特征{i + 1}" for i in range(len(importances))]

        fig, ax = plt.subplots(figsize=(8, 6))
        ax.bar(list(feature_names), importances)
        ax.set_title('特征重要性')
        fig.tight_layout()
        fig.savefig('feature_importance.png')
        plt.close(fig)

    return {
        'train_accuracy': train_accuracy,
        'test_accuracy': test_accuracy,
        'cv_accuracy': np.mean(cv_scores),
        'overfit_diff': overfit_diff
    }

In [10]:
def main():
    """
    Run the full workflow: load the CSV, prepare the data, train and evaluate.

    Reads './ASIHdata.csv', splits and scales it, trains the voting ensemble,
    evaluates it, then prints a metric summary and an overfitting verdict
    based on the train/test accuracy gap.
    """
    # Matplotlib setup: use the SimHei font so Chinese labels render, and keep
    # the minus sign displayable with that font.
    plt.rcParams["font.family"] = ["SimHei"]
    plt.rcParams["axes.unicode_minus"] = False

    # Input file path; replace with the path to your own data file.
    file_path = './ASIHdata.csv'

    # Load the data.
    print("正在加载数据...")
    data = load_data(file_path)

    # Split and scale.
    print("\n正在准备数据...")
    X_train, X_test, y_train, y_test = prepare_data(data)

    # Model selection. The grid-search alternative is kept here for reference;
    # uncomment it (and comment out the ensemble line) to tune a random forest.
    print("\n正在寻找最优参数...")
    # best_model = find_best_params(X_train, y_train)

    # Default path: train the voting ensemble directly.
    best_model = train_ensemble_model(X_train, y_train)

    # Evaluate.
    print("\n正在评估模型...")
    metrics = evaluate_model(best_model, X_train, X_test, y_train, y_test)

    # Summary.
    print("\n模型性能汇总:")
    print(f"训练集准确率: {metrics['train_accuracy']:.4f}")
    print(f"测试集准确率: {metrics['test_accuracy']:.4f}")
    print(f"交叉验证准确率: {metrics['cv_accuracy']:.4f}")
    print(f"过拟合程度: {metrics['overfit_diff']:.4f}")

    # Overfitting verdict. NOTE: this looks only at the train/test gap; a small
    # gap does not by itself mean the absolute accuracy is good.
    if metrics['overfit_diff'] > 0.1:
        print("\n警告: 模型存在过拟合现象! 建议:")
        print("1. 增加训练数据")
        print("2. 进一步调整参数，减小模型复杂度")
        print("3. 使用集成学习方法")
    else:
        print("\n模型表现良好，过拟合风险较低")
# Entry point: run the pipeline when this code executes as the main module
# (a direct script run or a notebook cell), but not when it is imported.
if __name__ == "__main__":
    main()

正在加载数据...
数据加载成功，共3430行，8列

正在准备数据...

正在寻找最优参数...

正在评估模型...
训练集准确率: 0.5718
测试集准确率: 0.5029
过拟合程度(训练-测试): 0.0689
分类报告:
              precision    recall  f1-score   support

          -1       0.51      0.64      0.57       348
           1       0.49      0.36      0.42       338

    accuracy                           0.50       686
   macro avg       0.50      0.50      0.49       686
weighted avg       0.50      0.50      0.49       686

交叉验证准确率: 0.5127 ± 0.0173

模型性能汇总:
训练集准确率: 0.5718
测试集准确率: 0.5029
交叉验证准确率: 0.5127
过拟合程度: 0.0689

模型表现良好，过拟合风险较低
