In [51]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd, numpy as np
import xgboost as xgb
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

# `sklearn.externals.joblib` was removed in scikit-learn 0.23. Prefer the
# standalone joblib package and fall back to the vendored copy on old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib



In [3]:
# Font setup so that CJK text renders in figures. Matplotlib tries the faces in
# this list in order and uses the first one that is installed, so listing both
# keeps the notebook portable: 'Arial Unicode MS' is typically present on
# macOS, 'SimHei' on Windows.
plt.rcParams['font.sans-serif'] = ['Arial Unicode MS', 'SimHei']
# SimHei lacks the Unicode minus glyph (U+2212); draw minus signs as ASCII
# hyphens so negative tick labels do not show up as empty boxes.
plt.rcParams['axes.unicode_minus'] = False


In [64]:
def select_features(X_train, y_train, X_test, max_features=20,
                    n_estimators=100, random_state=None):
    """Select features with a random-forest-driven ``SelectFromModel``.

    The selector is fitted on the training split only and then applied to both
    splits, so no information from the test split influences the selection.

    Parameters
    ----------
    X_train, y_train
        Training inputs and targets used to fit the selector.
    X_test
        Test inputs; transformed with the already-fitted selector.
    max_features : int, default 20
        Upper bound on the number of features kept, passed to
        ``SelectFromModel``. NOTE(review): the selector's default importance
        threshold is left in place, so fewer features may be returned; confirm
        against the scikit-learn documentation if an exact count is required.
    n_estimators : int, default 100
        Number of trees in the underlying ``RandomForestClassifier``.
    random_state : optional, default None
        Seed forwarded to the forest. ``None`` reproduces the previous
        (unseeded) behaviour; pass an int for a reproducible selection.

    Returns
    -------
    tuple
        ``(X_train_fs, X_test_fs, fs)``: the transformed training inputs, the
        transformed test inputs, and the fitted selector.
    """
    # Configure the selector around a fresh random forest.
    forest = RandomForestClassifier(n_estimators=n_estimators,
                                    random_state=random_state)
    fs = SelectFromModel(forest, max_features=max_features)
    # Learn the feature/target relationship from the training data only.
    fs.fit(X_train, y_train)
    # Apply the same fitted selection to both splits.
    X_train_fs = fs.transform(X_train)
    X_test_fs = fs.transform(X_test)
    return X_train_fs, X_test_fs, fs

In [None]:
# Input data. Exactly one `fpath` is active; the commented alternatives point
# at the other prepared datasets. NOTE: these are machine-specific absolute paths.
# fpath = r"/Users/wangxiaoyan/Documents/wy/论文/git/scientific-worker-mental-health/data/IterativeImputer-科技人员抑郁量表目标值分组.csv"
# fpath = r"/Users/wangxiaoyan/Documents/wy/论文/git/scientific-worker-mental-health/data/随机森林-科技人员抑郁量表目标值分组.csv"
# fpath = r"/Users/wangxiaoyan/Documents/wy/论文/git/scientific-worker-mental-health/data/IterativeImputer抑郁量表处理后数据.csv"
fpath = r"/Users/wangxiaoyan/Documents/wy/论文/git/scientific-worker-mental-health/data/IterativeImputer焦虑量表处理后数据.csv"

# Number of top-ranked features kept further down.
number = 30

Dataset = pd.read_csv(fpath)

# Predictors are the contiguous column range below; the target is one column.
# x = Dataset.loc[:,  "睡眠":"个人创新行为"]
x = Dataset.loc[:, "STPQpost1":"WORKTIME4"]
y = Dataset["anxiety"]
# y = Dataset["depression"]

feature_names = np.array(x.columns)

# Position -> column-name lookup, kept both as a dict and as a one-row frame
# whose column labels are the positions.
names = x.columns.tolist()
key = list(range(len(names)))
names_dict = dict(enumerate(names))
names_dicts = pd.DataFrame([names_dict])

# Fixed seed so the 67/33 split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.33, random_state=7
)

"""
random forest model
max_depth:树的最大深度
"""
# Baseline random forest with default depth and a fixed seed.
rf_model_without_selection = RandomForestClassifier(n_estimators=100, random_state=7)
# The baseline must be fitted explicitly: GridSearchCV only ever fits clones of
# the estimator it is given, so without this call the .score() below and the
# later .feature_importances_ lookup raise NotFittedError.
rf_model_without_selection.fit(x_train, y_train)

# Grid-search tuning over forest size and maximum tree depth.
param_dist = {
    'n_estimators': range(80, 200, 4),
    'max_depth': range(2, 15, 1),
}
# 30 x 13 candidates x 5 folds = 1950 fits; n_jobs=-1 spreads them over all
# cores. Results are unchanged because the estimator's random_state is fixed.
rfCV = GridSearchCV(rf_model_without_selection, param_dist, cv=5, n_jobs=-1)
rfCV.fit(x_train, y_train)

# Persist the whole fitted search object (NOTE: machine-specific absolute path).
filename = "/Users/wangxiaoyan/Documents/wy/论文/git/scientific-worker-mental-health/model/anxiety_model.joblib"
joblib.dump(rfCV, filename)

print(rfCV.best_params_)
# Both figures are estimator .score() values on the held-out test split.
print("网格RF 精确度：",rfCV.score(x_test,y_test))
print("random forest 精确度：",rf_model_without_selection.score(x_test,y_test))

# sfm = SelectFromModel(rf_model_without_selection, max_features=number).fit(x_train, y_train)
# print("Features selected by SelectFromModel: "
#       f"{feature_names[sfm.get_support()]}")
# # feature selection
# X_train_fs, X_test_fs, fs = select_features(x_train, y_train, x_test)
# rf_model_selection = RandomForestClassifier(n_estimators = 100, random_state=7)
# rf_model_selection.fit(X_train_fs,y_train)
# yhat = rf_model_selection.predict(X_test_fs)
# # evaluate predictions
# accuracy = accuracy_score(y_test, yhat)
# print('Accuracy: %.2f' % (accuracy*100))

# Rank every feature by the baseline forest's importance, highest first.
rf_feature_important = rf_model_without_selection.feature_importances_
rf_rank_idx = np.argsort(rf_feature_important)[::-1]
# Take the scores through the same permutation as the indices so that
# rf_x_score[i] always belongs to the feature at rf_rank_idx[i].
rf_x_score = rf_feature_important[rf_rank_idx]
rf_selected_rank_idx = rf_rank_idx[:number]
# names_dicts is a one-row frame whose column labels are feature positions, so
# selecting columns by position yields the matching feature names.
rf_selected_rank_names = names_dicts.loc[:, rf_selected_rank_idx]
rf_label = rf_selected_rank_names.values[0, :]
label = rf_label

# Export the selected predictors together with the target column. .assign()
# returns a new frame, which avoids the SettingWithCopyWarning raised when a
# column is set directly on the result of Dataset[label].
# Datasetnew = Dataset[label].assign(depression=Dataset['depression'])
# Datasetnew.to_csv("/Users/wangxiaoyan/Documents/wy/论文/git/scientific-worker-mental-health/data/科技人员抑郁量表-重要特征数据.csv",sep=',',index=False)
Datasetnew = Dataset[label].assign(anxiety=Dataset['anxiety'])
Datasetnew.to_csv("/Users/wangxiaoyan/Documents/wy/论文/git/scientific-worker-mental-health/data/科技人员焦虑表-重要特征数据.csv",sep=',',index=False)

print("Random Forest选择的重要特征:",rf_label)

# Random forest visualisation: horizontal bars for the top `number` features.
# Both arrays are reversed so the highest-ranked feature is drawn at the top.
fig, ax = plt.subplots(figsize=(8, 8))
ax.barh(rf_label[::-1], rf_x_score[:number][::-1], height=0.6, align='center')
ax.grid(ls=':', color='gray', alpha=0.4)
ax.set_title("Random Forest Feature Importance")
# No file extension is given, so matplotlib's default output format is used.
fig.savefig("前"+str(number)+"名特征")
plt.show()

# XGBoost model (disabled; the commented-out code below is the alternative run).
# max_depth: maximum depth of each tree.
# NOTE: this header is a comment rather than a bare string literal because a
# string that is the last expression of a cell is echoed by Jupyter as the
# cell's output.
# model = xgb.XGBRFClassifier(max_depth=10, learning_rate=0.16, n_estimators=100, min_child_weight=4)
# model.fit(x_train, y_train)
# print("XGB 精确度：",model.score(x_test,y_test))
# feature_important = model.feature_importances_
# rank_idx  = np.argsort(feature_important)[::-1]
# selected_rank_idx = rank_idx[:number]
# selected_rank_names = names_dicts.loc[:, selected_rank_idx]
# label = selected_rank_names.values[0, :]
# Datasetnew = Dataset[label]
# # Datasetnew['depression'] = Dataset['depression']
# Datasetnew['anxiety'] = Dataset['anxiety']
# # Datasetnew.to_csv("/Users/wangxiaoyan/Documents/wy/论文/git/scientific-worker-mental-health/data/科技人员抑郁量表-重要特征数据.csv",sep=',',index=False)
# Datasetnew.to_csv("/Users/wangxiaoyan/Documents/wy/论文/git/scientific-worker-mental-health/data/科技人员焦虑表-重要特征数据.csv",sep=',',index=False)

# print("XGB 选择的重要特征:",label)
# path1 = r"Xgboost排名前" + str(number) + "的特征.csv"
# pd.DataFrame(label).to_csv(path1, index=False)
# x_score = np.sort(feature_important)[::-1]
# path = r"Xgboost排名前" + str(number) + "的得分.csv"
# pd.DataFrame(x_score[:number]).to_csv(path, index=False)
# #网格搜索调参
# # param_dist = {
# #         'n_estimators':range(80,200,4),
# #         'max_depth':range(2,15,1),
# #         'learning_rate':[0.01,0.02,0.1]
# #         }
# # gsCv = GridSearchCV(model,param_dist,cv=5)

# # gsCv.fit(x_train, y_train)
# # print(gsCv.best_params_)
# # print("网格XGB 精确度：",gsCv.score(x_test,y_test))

# # cv_results = pd.DataFrame(gsCv.cv_results_)
# # path = r"paramRank.csv"
# # cv_results.to_csv(path, index=False)

# # # 可视化
# # plt.bar(range(len(model.feature_importances_)), model.feature_importances_)
# # plt.xlabel("Feature")
# # plt.ylabel("Feature Score")
# # plt.title("Feature Importance")
# # plt.show()

# # 可视化
# plt.figure(figsize=(8, 8))
# plt.barh(label[::-1], x_score[:number][::-1], 0.6, align='center')
# plt.grid(ls=':', color='gray', alpha=0.4)
# plt.title("Xgboost Feature Importance")
# plt.savefig("前"+str(number)+"名特征")
# plt.show()

