In [ ]:
import numpy as np
import pandas as pd

In [ ]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt

In [ ]:
# Model evaluation helper
def evaluate(X_test, y_test, classifier):
    """Print classification metrics and plot the confusion matrix and ROC curve.

    Intended for binary classification: recall, precision and F1 are computed
    with ``average='binary'``, and the ROC curve is built from column 1 of
    ``classifier.predict_proba`` (treated as the positive-class probability).

    Args:
        X_test: Features passed to ``classifier.predict`` and
            ``classifier.predict_proba``.
        y_test: True labels corresponding to ``X_test``.
        classifier: A fitted estimator exposing ``predict`` and
            ``predict_proba``.

    Returns:
        None. Metrics are printed to stdout and two figures are shown.
    """
    # Imported locally so this function does not depend on
    # ``plot_confusion_matrix``, which scikit-learn deprecated in 1.0 and
    # removed in 1.2.
    from sklearn.metrics import ConfusionMatrixDisplay
    from sklearn.utils.multiclass import unique_labels

    y_pred = classifier.predict(X_test)

    # 1. Basic performance metrics
    accuracy = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred, average='binary')  # binary-classification setting
    precision = precision_score(y_test, y_pred, average='binary')
    f1 = f1_score(y_test, y_pred, average='binary')

    print("Accuracy:", accuracy)
    print("Recall:", recall)
    print("Precision:", precision)
    print("F1 Score:", f1)

    # 2. Confusion matrix
    # Passing the label set explicitly keeps the matrix rows/columns and the
    # plot tick labels in agreement; it is the same set confusion_matrix
    # derives on its own when ``labels`` is omitted.
    labels = unique_labels(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred, labels=labels)
    print("Confusion Matrix:\n", conf_matrix)

    # Plot from the matrix computed above instead of predicting a second time.
    ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=labels).plot()
    plt.show()

    # 3. ROC curve and AUC
    # Note: this ROC/AUC computation only applies to binary classification.
    y_prob = classifier.predict_proba(X_test)[:, 1]  # probability of the positive class
    fpr, tpr, thresholds = roc_curve(y_test, y_prob)
    auc = roc_auc_score(y_test, y_prob)

    plt.figure()
    plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % auc)
    plt.plot([0, 1], [0, 1], 'k--')  # diagonal = chance-level classifier
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC)')
    plt.legend(loc="lower right")
    plt.show()

    # Print the AUC score
    print("AUC:", auc)


# 数据导入

In [ ]:
# Directory holding the downloaded dataset files.
data_path = "/mnt/workspace/downloads/74387"

# Training table.
data = pd.read_csv(f"{data_path}/train_set.csv")

# Tab-separated data-description table, indexed by its 'NO' column.
# The file name is a percent-encoded string and is kept verbatim.
info = pd.read_csv(
    f"{data_path}/%E6%95%B0%E6%8D%AE%E8%AF%B4%E6%98%8E.txt",
    sep='\t',
    index_col='NO',
)

In [ ]:
# Display the data-description table.
info

In [ ]:
# Display the training table as loaded.
data

In [ ]:
# TODO: set the ID column as the index of `data`.
data =  ...
# Display the table again after re-indexing.
data

# 数据处理与划分
- 如何处理string类型的类别特征

In [ ]:
# TODO: split the data into a training set and a test set.
train = ...
test = ...
# Report how many rows ended up in each split.
print("Size of training set: {}\nSize of test set: {}".format(len(train), len(test)))

In [ ]:
# TODO: separate the feature columns from the target column for each split.
train_x, train_y =  ...
test_x, test_y = ...

In [ ]:
# Names of the columns whose dtype is `object` (string-like columns in pandas).
categorical_features = data.select_dtypes(include=['object']).columns
categorical_features

In [ ]:
# Print the column dtypes and non-null counts.
data.info()

In [ ]:
# TODO: preprocess the categorical features.
...

# 决策树

In [ ]:
from sklearn import tree

In [ ]:
# TODO: initialise the decision tree classifier.
classifier = ...

In [ ]:
# TODO: fit the decision tree.
...

In [ ]:
# TODO: visualise the fitted decision tree.
tree.plot_tree(...)

In [ ]:
# TODO: evaluate the decision tree with `evaluate`.
evaluate(...)

# 网格搜索

In [ ]:
from sklearn.model_selection import GridSearchCV

In [ ]:
# TODO: define the hyperparameter grid to search over.
param_grid = {
    'criterion': ['gini', 'entropy'],
    # Add the other hyperparameters you consider important.
    ...
}

In [ ]:
# Base decision tree for the search; the fixed random_state makes runs repeatable.
dt = tree.DecisionTreeClassifier(random_state=2024)

In [ ]:
# Grid search over `param_grid` with 5-fold cross-validation, scored by accuracy.
grid_search = GridSearchCV(estimator=dt, param_grid=param_grid, cv=5, verbose=2, scoring='accuracy')

In [ ]:
# TODO:
# Call fit to train the model under every hyperparameter combination.
...

In [ ]:
# TODO:
# Print the best hyperparameters found by the search.
grid_search...

In [ ]:
# TODO:
# Retrieve the best model from the search.
best_model = grid_search...

In [ ]:
# TODO: evaluate the best model with `evaluate`.
evaluate(...)

# 集成学习

In [ ]:
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier

In [ ]:
# TODO
# Build the AdaBoost classifier.
adaboost_classifier = ...

In [ ]:
# TODO
# Fit the AdaBoost model.
adaboost_classifier...

In [ ]:
# TODO:
# Evaluate the AdaBoost model with `evaluate`.
evaluate(...)

In [ ]:
# TODO
# Build the random forest classifier.
randomforest_classifier = ...

In [ ]:
# TODO
# Fit the random forest model.
randomforest_classifier...

In [ ]:
# TODO
# Evaluate the random forest model with `evaluate`.
evaluate(...)