导入所需的库

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.metrics import accuracy_score, recall_score
import pydotplus
import os
# Explicitly add the Graphviz binary directory to PATH so the Graphviz
# executables can be located from this notebook.
# The directory can be overridden through the GRAPHVIZ_BIN environment
# variable; the default is the Homebrew location this notebook was written for.
GRAPHVIZ_BIN = os.environ.get("GRAPHVIZ_BIN", '/opt/homebrew/Cellar/graphviz/12.1.2/bin')
# Append only when missing, so re-running this cell does not keep growing PATH.
# Reading PATH with .get() also avoids a KeyError when PATH is not set at all.
if GRAPHVIZ_BIN not in os.environ.get("PATH", "").split(os.pathsep):
    os.environ["PATH"] = os.pathsep.join(
        part for part in (os.environ.get("PATH", ""), GRAPHVIZ_BIN) if part
    )

In [4]:
# Read the student data set from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer='student_data_DT2.csv')

划分数据集

In [8]:
# Target vector: the 'isPassExam' column as a NumPy array.
y = data['isPassExam'].values
# Feature matrix: every remaining column as a NumPy array.
# NOTE(review): the column list printed later in this notebook contains
# 'Unnamed: 0' and 'UserId', so both end up in X. They look like a saved row
# index and an identifier -- confirm they are really meant to be features.
X = data.drop(columns=['isPassExam']).values
# Hold out 20% of the rows for testing. random_state is fixed so the split
# (and every score computed from it) is the same on each run of the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

构建决策树模型并尝试不同参数：网格搜索

In [11]:
# Try different maximum depths and print train/test accuracy for each one.
# random_state is fixed so the fitted trees, and therefore the printed scores,
# are reproducible from run to run.
# NOTE(review): depths are compared on the held-out test split here; if a depth
# is chosen from this table, the test score no longer estimates unseen-data
# performance -- consider cross-validation on the training split instead.
# NOTE(review): the recorded output shows 1.0 for every depth, which may mean a
# feature encodes the label directly -- TODO confirm.
for max_depth in range(2, 15):
    clf = DecisionTreeClassifier(max_depth=max_depth, random_state=42)
    clf.fit(X_train, y_train)
    train_score = clf.score(X_train, y_train)
    test_score = clf.score(X_test, y_test)
    print(f'max_depth: {max_depth}, train score: {train_score}, test score: {test_score}')

max_depth: 2, train score: 1.0, test score: 1.0
max_depth: 3, train score: 1.0, test score: 1.0
max_depth: 4, train score: 1.0, test score: 1.0
max_depth: 5, train score: 1.0, test score: 1.0
max_depth: 6, train score: 1.0, test score: 1.0
max_depth: 7, train score: 1.0, test score: 1.0
max_depth: 8, train score: 1.0, test score: 1.0
max_depth: 9, train score: 1.0, test score: 1.0
max_depth: 10, train score: 1.0, test score: 1.0
max_depth: 11, train score: 1.0, test score: 1.0
max_depth: 12, train score: 1.0, test score: 1.0
max_depth: 13, train score: 1.0, test score: 1.0
max_depth: 14, train score: 1.0, test score: 1.0


In [13]:
# Two arrays of candidate min_impurity_decrease thresholds (50 evenly spaced
# values each): one used with the entropy criterion, one with the Gini criterion.
entropy_thresholds = np.linspace(0, 0.01, 50)
gini_thresholds = np.linspace(0, 0.005, 50)

# The parameter grid is a list of dictionaries; each dictionary describes one
# independent sub-grid to search.
param_grid = [
    {'criterion': ['entropy'], 'min_impurity_decrease': entropy_thresholds},
    {'criterion': ['gini'], 'min_impurity_decrease': gini_thresholds},
    {'max_depth': range(2, 10)},
    {'min_samples_split': range(2, 30, 2)},
]

# GridSearchCV arguments:
#   - the estimator to tune: a DecisionTreeClassifier whose random_state is
#     fixed so that the search result is reproducible across runs;
#   - param_grid: the parameter space to search;
#   - cv=5: five-fold cross-validation;
#   - return_train_score=True: also record the training-fold scores.
clf = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    param_grid,
    cv=5,
    return_train_score=True,
)
# NOTE(review): the search is fitted on the full X, y rather than on the
# training split only -- confirm this is intended.
clf.fit(X, y)
print(f"best param: {clf.best_params_}\nbest score: {clf.best_score_}")

best param: {'criterion': 'entropy', 'min_impurity_decrease': 0.0}
best score: 0.9001514104209868


In [14]:
# Fit one decision tree with manually chosen hyper-parameters, report its
# accuracy, and export the fitted tree as a DOT file and a PDF.
# DecisionTreeClassifier, export_graphviz and pydotplus come from the import
# cell at the top of the notebook, so they are not re-imported here.
# random_state is fixed so the fitted tree and the exported files are
# reproducible.
clf = DecisionTreeClassifier(max_depth=5, min_samples_split=2, random_state=42)

clf.fit(X_train, y_train)
train_score = clf.score(X_train, y_train)
test_score = clf.score(X_test, y_test)
print('train score: {0}; test score: {1}'.format(train_score, test_score))

# Generate the DOT description once (out_file=None returns it as a string)
# and reuse the same text for the .dot file and for the PDF rendering.
dot_data = export_graphviz(clf, out_file=None)
with open("students_dt_3.dot", 'w') as f:
    f.write(dot_data)

graph = pydotplus.graph_from_dot_data(dot_data)
# Kept as the last expression so the cell still displays write_pdf's result.
graph.write_pdf("students_dt_3.pdf")

train score: 1.0; test score: 1.0


True

In [16]:
# Gather every column name of the loaded table into a plain Python list
# and print it.
labels = list(data.columns)
print(labels)

['Unnamed: 0', 'UserId', 'term', 'UserClass', 'grade', 'gender', 'majorClass', 'totalLearningTime', 'TimeClassNumber', 'LocationClassNumber', 'isPassExam']
