In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn import preprocessing
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.feature_selection import SelectFromModel
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import AdaBoostRegressor, AdaBoostClassifier
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import precision_recall_curve  
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, accuracy_score, roc_auc_score

#### 下面这个框不需要改动也不需要了解，是建立了"进度条"，为了显示后续"FOR循环"的进度

In [12]:
import sys
class ProgressBar():

    def __init__(self, max_steps):
        self.max_steps = max_steps
        self.current_step = 0
        self.progress_width = 50

    def update(self, step=None):
        self.current_step = step

        num_pass = int(self.current_step * self.progress_width / self.max_steps) + 1
        num_rest = self.progress_width - num_pass 
        percent = (self.current_step+1) * 100.0 / self.max_steps 
        progress_bar = '[' + '■' * (num_pass-1) + '▶' + '-' * num_rest + ']'
        progress_bar += '%.2f' % percent + '%' 
        if self.current_step < self.max_steps - 1:
            progress_bar += '\r' 
        else:
            progress_bar += '\n' 
        sys.stdout.write(progress_bar) 
        sys.stdout.flush()
        if self.current_step >= self.max_steps:
            self.current_step = 0
            print

#### 导入数据
#### 根据自己的路径

In [13]:
input_data = "C:\\Users\\iii\\Desktop\\data-y.xls"

####  第一行是读取出来
#### 第二行是插补“线性插补”缺失值 👉 存在缺失值是无法建模
#### 第三行是输出“列名”，即变量名称

In [14]:
data = pd.read_excel(input_data)
data = data.interpolate(method='linear')
print(data.columns)

Index(['性别', '男', '年龄', '年龄段', '政治面貌', '党员', '教育年限', '教育程度', '职业九类', '家庭平均年收入',
       '地区', '分区', '文化', '制度', '强监督媒体使用', '弱监督媒体使用', '强监督媒体信任', '弱监督媒体信任',
       '意识形态', '左', '右', 'Y1', 'Y2', 'Y3', 'Y4', 'Ya', 'Yb', 'Yc', 'Yd'],
      dtype='object')


#### 下面是构建新变量：①②乘法构建新变量 JH1和JH2  ③ 加法构建新变量👉二分类的Y

In [15]:
data["JH1"] = data['意识形态'] * data['强监督媒体使用']
data["JH2"] = data['意识形态'] * data['强监督媒体信任']
data['Y'] = data['Y1'] + data['Y2'] + data['Y3'] + data['Y4']
# print(data["JH2"])

#### independ_feature 👉 解释变量 X和D
#### depend_feature 👉 被解释变量 Y
#### 你可以根据自己的需要👉在里面添加、更换、删除变量

In [16]:
independ_feature = [ '男',  '年龄段', '政治面貌',  '教育程度', '职业九类', '家庭平均年收入',
       '地区', '分区', '文化', '制度', '强监督媒体使用',  '强监督媒体信任', '意识形态', 'JH1','JH2']

depend_feature = ['Yd']

#### 上面的步骤选定了所用变量，下面两行是把data根据解释变量、被解释变量分成两个部分

In [17]:
x = data[independ_feature]
y = data[depend_feature]

#### 各个模型，请仔细看注释

In [18]:
# 决策回归
# clf0 = DecisionTreeRegressor(criterion='mse',max_depth=4)

# 决策分类 👉 ①entropy是信息熵;可以换成gini②叶子节点最少的样本数=350,试值后发现这个大小精度好
clf_tree = DecisionTreeClassifier(criterion='entropy', min_samples_leaf=350)


# 随机森林分类👉①同上②同上③相当于一个随机森林有"300"个树
clf_rf = RandomForestClassifier(criterion='entropy', min_samples_leaf=350, n_estimators=300)


# 支持向量机分类👉①核函数：linear 线性; rbf 径向基核函数
clf_svc = svm.SVC(kernel='linear')


# 神经网络👉基于多层感知机①solver有lbfgs,sgd,adam用来优化权重,论文可直接使用英文名字,无需详细论述 
#                          ②alpha是正则化参数(防止过拟合),1e-5即0.00001
#                          ③hidden_   =(10,15,5)的意思是,有“3”个隐藏层，每层神经元“10，15，5”个👉看着调吧,我论文也瞎试的...
#                          ④max_iter 迭代计算次数，一般几百就够了，太多了也不一定高。
clf_mlp = MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(10,15, 5),random_state=1, max_iter=300)


# KNN👉①n+neighbors每“450”个算一个“邻近”范围，可调，我试了试，450左右还算可以，后期时间充裕再调
#       ②weights='distance'即 数据差距越大，越遥远。
#       ③leaf_size=30 防止过拟合，默认30 可加大
#       ④ p=2, metric='minkowski'即闵可夫斯基距离，常用，不建议改
clf_knn = KNeighborsClassifier(n_neighbors=450, weights='distance',
                           leaf_size=66, p=2, metric='minkowski')
                          
# 线性判别
# solver:👉svd：奇异值分解求解，无需计算协方差矩阵，适用于特征数量大情形；【其他】lsqr：最小平方QR分解；eigen：特征值分解；
clf_lda = LinearDiscriminantAnalysis(solver='svd')

### 为了方便多次试验：
#### 建立for循环：每次循环随即切割训练集和测试集；对每个模型进行回归，计算每一次误差并输出excel

In [19]:
# 这里建立误差列表👉方便后面输出到excel
re_list = list()
accuracy_list = list()
f1_list = list()
precision_list = list()
recall_list = list()
roc_auc_list = list()

# 循环多次试验
test_count = 100  # 👈 修改数字得到多次实验结果
progress_bar = ProgressBar(test_count)  # 进度条👉展示循环【总】进度，不需要了解


for test_num in range(0,test_count):
    progress_bar.update(test_num)  # 进度条【总】进度，不需要了解 
    
    # 划分👉0.2是测试集占比，可调
    x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2)  # 👈 占比0.2 可调
    # 调整格式，便于计算，由原先的数据框→数列
    y_train = np.array(y_train).ravel()
    y_test = np.array(y_test).ravel()
    #各个模型：进行拟合
    clf_tree.fit(x_train,y_train)
    clf_rf.fit(x_train,y_train)
    clf_svc.fit(x_train,y_train)
    clf_mlp.fit(x_train,y_train)
    clf_knn.fit(x_train,y_train) 
    clf_lda.fit(x_train,y_train)
    
    #各个模型：预测集の预测结果
    predict_tree = clf_tree.predict(x_test)  #各个模型：预测结果
    predict_rf = clf_rf.predict(x_test)  #各个模型：预测结果        
    predict_svc = clf_svc.predict(x_test)  #各个模型：预测结果
    predict_mlp = clf_mlp.predict(x_test)  #各个模型：预测结果
    predict_knn = clf_knn.predict(x_test)  #各个模型：预测结果
    predict_lda = clf_lda.predict(x_test)  #各个模型：预测结果
    
    # 误差の RE 相对误差👉①（）是预测和真实的差②差不是0的总数③除以测试集样本量→RE
    re_tree = ((predict_tree - np.array(y_test).ravel()) != 0).astype(int).sum()/len(y_test)
    re_rf = ((predict_rf - np.array(y_test).ravel()) != 0).astype(int).sum()/len(y_test)
    re_svc = ((predict_svc - np.array(y_test).ravel()) != 0).astype(int).sum()/len(y_test) 
    re_mlp = ((predict_mlp - np.array(y_test).ravel()) != 0).astype(int).sum()/len(y_test)  
    re_knn = ((predict_knn - np.array(y_test).ravel()) != 0).astype(int).sum()/len(y_test)  
    re_lda = ((predict_lda - np.array(y_test).ravel()) != 0).astype(int).sum()/len(y_test)  
    """
    print("决策树误差:%.3f" % re_tree, "随机森林误差:%.3f" % re_rf, "支持向量机误差:%.3f" % re_svc,
          "神经网络误差:%.3f" % re_mlp, "KNN误差:%.3f" % re_knn, "线性判别误差:%.3f" % re_lda, sep="\n")
    """
    # 准确度 accuracy_score 
    accuracy_tree = accuracy_score(y_test, predict_tree)
    accuracy_rf = accuracy_score(y_test, predict_rf)
    accuracy_svc = accuracy_score(y_test, predict_svc)
    accuracy_mlp = accuracy_score(y_test, predict_mlp)
    accuracy_knn = accuracy_score(y_test, predict_knn)
    accuracy_lda = accuracy_score(y_test, predict_lda)
    # f1_score
    f1_tree = f1_score(y_test, predict_tree)
    f1_rf = f1_score(y_test, predict_rf)
    f1_svc = f1_score(y_test, predict_svc)
    f1_mlp = f1_score(y_test, predict_mlp)
    f1_knn = f1_score(y_test, predict_knn)
    f1_lda = f1_score(y_test, predict_lda)
    # 精确率👉会报“Warning”不要紧
    precision_tree = precision_score(y_test, predict_tree)
    precision_rf = precision_score(y_test, predict_rf)
    precision_svc = precision_score(y_test, predict_svc)
    precision_mlp = precision_score(y_test, predict_mlp)
    precision_knn = precision_score(y_test, predict_knn)
    precision_lda = precision_score(y_test, predict_lda)
    # 召回率
    recall_tree = recall_score(y_test, predict_tree)
    recall_rf = recall_score(y_test, predict_rf)
    recall_svc = recall_score(y_test, predict_svc)
    recall_mlp = recall_score(y_test, predict_mlp)
    recall_knn = recall_score(y_test, predict_knn)
    recall_lda = recall_score(y_test, predict_lda)
    
    # 计算ROC曲线的AUC值
    roc_auc_tree = roc_auc_score(y_test, predict_tree)
    roc_auc_rf = roc_auc_score(y_test, predict_rf)
    roc_auc_svc = roc_auc_score(y_test, predict_svc)
    roc_auc_mlp = roc_auc_score(y_test, predict_mlp)
    roc_auc_knn = roc_auc_score(y_test, predict_knn)
    roc_auc_lda = roc_auc_score(y_test, predict_lda)

    
    # 将结果存入“列表”
    re_list.append((re_tree, re_rf, re_svc, re_mlp, re_knn, re_lda))
    accuracy_list.append((accuracy_tree, accuracy_rf, accuracy_svc, accuracy_mlp, accuracy_knn, accuracy_lda))
    f1_list.append((f1_tree, f1_rf, f1_svc, f1_mlp, f1_knn, f1_lda))
    precision_list.append((precision_tree, precision_rf, precision_svc, precision_mlp, precision_knn, precision_lda))
    recall_list.append((recall_tree, recall_rf, recall_svc, recall_mlp, recall_knn, recall_lda))
    roc_auc_list.append((roc_auc_tree, roc_auc_rf, roc_auc_svc, roc_auc_mlp, roc_auc_knn, roc_auc_lda))

[■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■▶]100.00%


#### 上一步完成了多次循环实验，下面把存储好的误差输出到Excel

In [24]:
# 为了保存需要先转换格式
re_xls = pd.DataFrame(re_list)  # 相对误差
accuracy_xls = pd.DataFrame(accuracy_list)  # 准确率
f1_xls = pd.DataFrame(f1_list)  # F1
precision_xls = pd.DataFrame(precision_list)  # 精确率
recall_xls = pd.DataFrame(recall_list)  # 召回率
roc_auc_xls = pd.DataFrame(roc_auc_list)  # ROC曲线的AUC


# 修改列名，符合输出的误差定义
new_columns = ['决策树', "随机森林", "支持向量机", "神经网络", "KNN", "线性判别"] 
re_xls.columns = new_columns
accuracy_xls.columns = new_columns 
f1_xls.columns = new_columns
precision_xls.columns = new_columns
recall_xls.columns = new_columns 
roc_auc_xls.columns = new_columns

# 输出前三行，看看而已
print(roc_auc_xls.head(3))

        决策树  随机森林  支持向量机      神经网络       KNN      线性判别
0  0.546090   0.5    0.5  0.590215  0.510586  0.580588
1  0.550986   0.5    0.5  0.583497  0.503386  0.587550
2  0.552831   0.5    0.5  0.578821  0.503423  0.568552


## 最后一步:输出excel到指定位置，路径选择你需要的
#### 文件名不需要改(最后的"双斜杠\\"后の内容)👉已经按照“模型结果+被解释变量”的形式设置自动命名了

In [25]:
# 输出位置
output_file_path = 'C:\\Users\\iii\\Desktop\\模型预测结果%s.xls' % depend_feature[0]

In [26]:
# 合并不同方法为一个文件

sheet_name = ["相对误差", "准确率", "F1", "精确率", "召回率", "ROC的AUC"]
sheet_name_count = 0
writer = pd.ExcelWriter(output_file_path)
for xls_output in [re_xls, accuracy_xls, f1_xls, precision_xls, recall_xls, roc_auc_xls]:
    xls_output.to_excel(writer, sheet_name=sheet_name[sheet_name_count])
    sheet_name_count += 1
writer.save()

# 结束廖！