# 二分类模型

In [None]:
from sklearn.metrics import r2_score,average_precision_score,precision_recall_curve
from sklearn.metrics import precision_score,recall_score,f1_score,roc_auc_score,accuracy_score

import xgboost

# XGBoost binary classifier fitted on the tran_x_sm / tran_y_sm training data.
# NOTE(review): both `learning_rate` and `eta` are supplied. XGBoost documents
# `eta` as an alias of `learning_rate`, so it is unclear which of 0.018 / 0.1
# is actually applied -- confirm and drop one of them.
xgb_params = dict(
    max_depth=5,
    learning_rate=0.018,
    n_estimators=500,
    min_child_weight=0.6,
    eta=0.1,
    gamma=0.5,
    reg_lambda=5,
    subsample=0.8,
    colsample_bytree=0.6,
    nthread=4,
    scale_pos_weight=1,
    random_state=3,
)
xgb_model = xgboost.XGBClassifier(**xgb_params)
xgb_model.fit(tran_x_sm, tran_y_sm)

# Predicted class labels for the held-out test features.
xgb_predictions = xgb_model.predict(test_x)


import lightgbm

# LightGBM binary classifier.
# The previous call used CatBoost/XGBoost-style keyword names (`iterations`,
# `gamma`, `loss_function`). LightGBM does not know these names, so they were
# forwarded through **kwargs and ignored with an "Unknown parameter" warning;
# the model silently trained with the default 100 trees and no minimum split
# gain. They are replaced by the LightGBM equivalents below.
lgbm_model = lightgbm.LGBMClassifier(
    n_estimators=300,        # was `iterations=300`
    max_depth=8,
    min_child_weight=0.9,
    min_split_gain=0.5,      # was `gamma=0.5`
    reg_lambda=5,
    # NOTE(review): LightGBM only applies row subsampling when
    # `subsample_freq` > 0 -- confirm whether bagging was intended.
    subsample=0.4,
    learning_rate=0.2,
    objective='binary',      # was `loss_function='CrossEntropy'` (binary log loss)
    random_state=3,
)
lgbm_model.fit(tran_x_sm, tran_y_sm)

# Predicted class labels for the held-out test features.
lgbm_predictions = lgbm_model.predict(test_x)


import catboost

# CatBoost binary classifier: 300 boosting iterations of depth-6 trees,
# optimising the CrossEntropy loss.
cat_params = dict(
    iterations=300,
    learning_rate=0.2,
    depth=6,
    l2_leaf_reg=2,
    subsample=1,
    loss_function='CrossEntropy',
    random_state=3,
)
cat_model = catboost.CatBoostClassifier(**cat_params)
cat_model.fit(tran_x_sm, tran_y_sm)

# Predicted class labels for the held-out test features.
cat_predictions = cat_model.predict(test_x)


# Random forest classifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

# Candidate forest sizes for the grid search.
tree_grid_parameter = {'n_estimators': [10, 50, 100, 150, 200]}

# Tune `n_estimators` on the SAME configuration as the final model
# (max_depth=8, random_state=3). Previously the search ran on an unseeded,
# unlimited-depth forest, so the selected size was neither reproducible nor
# tuned for the model that was actually trained afterwards.
grid = GridSearchCV(
    RandomForestClassifier(max_depth=8, random_state=3),
    param_grid=tree_grid_parameter,
    cv=3,
)
grid.fit(tran_x_sm, tran_y_sm)

# With the default refit=True, GridSearchCV has already retrained the best
# configuration on the full training data, so a second manual fit is redundant.
rf_model = grid.best_estimator_

# Predicted class labels for the held-out test features.
rf_predictions = rf_model.predict(test_x)


# GBDT (scikit-learn gradient boosting) classifier.
gbdt_params = dict(
    n_estimators=300,
    learning_rate=0.1,
    max_depth=8,
    subsample=0.4,
    random_state=3,
)
gbdt_model = GradientBoostingClassifier(**gbdt_params)
gbdt_model.fit(tran_x_sm, tran_y_sm)

# Predicted class labels for the held-out test features.
gbdt_predictions = gbdt_model.predict(test_x)


# Support vector machine. Despite the `svr_` prefix this section fits the
# classifier (SVC); the regressor SVR is imported but not used here.
from sklearn.svm import SVR,SVC

# probability=True makes predict_proba available on the fitted model, which
# the evaluation cell calls for every model.
svc_params = dict(
    kernel='rbf',
    C=50,
    cache_size=200,
    probability=True,
    random_state=3,
)
svr_model = SVC(**svc_params)
svr_model.fit(tran_x_sm, tran_y_sm)

# Predicted class labels for the held-out test features.
svr_predictions = svr_model.predict(test_x)


# Linear-model family: linear, Lasso, Ridge, ElasticNet and logistic
# regression are imported; the classification task uses logistic regression.
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet,LogisticRegression

# L2-penalised logistic regression solved with L-BFGS.
logistic_params = dict(
    penalty='l2',
    C=5,
    solver='lbfgs',
    max_iter=100,
    random_state=3,
)
lcv_model = LogisticRegression(**logistic_params)
lcv_model.fit(tran_x_sm, tran_y_sm)

# Predicted class labels for the held-out test features.
lcv_predictions = lcv_model.predict(test_x)

# ANN: multi-layer perceptron classifier with one hidden layer of 100 units.
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report,confusion_matrix

mlp_params = dict(
    alpha=0.1,
    hidden_layer_sizes=[100,],
    solver='adam',
    activation='relu',
    random_state=3,
)
ANN_model = MLPClassifier(**mlp_params)
ANN_model.fit(tran_x_sm, tran_y_sm)

# Predicted class labels for the held-out test features.
ANN_predictions = ANN_model.predict(test_x)


# TabNet classifier
from pytorch_tabnet.tab_model import TabNetClassifier, TabNetRegressor

TabNet_model = TabNetClassifier(
    n_d=8,
    n_a=8,
    n_steps=3,  # number of steps in the architecture (usually between 3 and 10)
    gamma=1.5,
    n_independent=2,
)

# Hold out 12.5% of the training data as the evaluation set used during fitting.
tran_x_x, tran_x_valid, tran_y_y, tran_y_valid = train_test_split(
    tran_x_sm, tran_y_sm, test_size=0.125, random_state=3)

# NOTE(review): the evaluation set is the held-out split above, yet it is
# labelled 'train' -- confirm the intended name.
# NOTE(review): the pytorch-tabnet docs recommend that virtual_batch_size
# divide batch_size; 14 does not divide 128 -- confirm.
tabnet_fit_options = dict(
    eval_name=['train'],
    eval_metric=['auc'],
    max_epochs=200,
    patience=50,
    batch_size=128,
    virtual_batch_size=14,
    num_workers=0,
    drop_last=False,
)
# TabNet is fed numpy arrays, hence the .to_numpy() conversions.
TabNet_model.fit(
    X_train=tran_x_x.to_numpy(),
    y_train=tran_y_y.to_numpy(),
    eval_set=[(tran_x_valid.to_numpy(), tran_y_valid.to_numpy())],
    **tabnet_fit_options)

TabNet_predictions = TabNet_model.predict(test_x.to_numpy())

In [None]:
# Collect the test-set metrics of every classifier into one table.
# For each model the table holds one summary row (model name, accuracy, AUC,
# sensitivity, specificity) followed by two per-class rows (precision, recall,
# f1-score, support) taken from classification_report.
result_columns = ['model','index','precision','recall','f1-score','support',
                  'accuracy','AUC','sensitivity','specificity']

model_list=[xgb_model,lgbm_model,cat_model,rf_model,gbdt_model,svr_model,lcv_model,ANN_model,TabNet_model]
model_name_list=['XGBoost','LGBM','CatBoost','RF','GBDT','SVR','LR','ANN','TabNet']

# TabNet is fed a numpy array. The conversion is stored in its own variable
# instead of rebinding `test_x`: overwriting the global DataFrame with an
# ndarray broke every later `test_x.to_numpy()` call in the notebook.
tabnet_features = test_x.to_numpy() if hasattr(test_x, 'to_numpy') else test_x

result_frames = []
for model, name in zip(model_list, model_name_list):
    features = tabnet_features if name == 'TabNet' else test_x

    # Score of the last predict_proba column (used as the positive class)
    # and hard labels; each is computed once and reused below.
    test_y_score = model.predict_proba(features)[:, -1]
    predicted_labels = model.predict(features)

    auc = round(roc_auc_score(test_y, test_y_score), 4)
    accuracy = round(accuracy_score(test_y, predicted_labels), 4)

    # Sensitivity = TP / (TP + FN); specificity = TN / (FP + TN).
    # The four-way unpacking requires a 2x2 confusion matrix.
    tn, fp, fn, tp = confusion_matrix(test_y, predicted_labels).ravel()
    sensitivity = round(tp / (tp + fn), 4)
    specificity = round(tn / (fp + tn), 4)
    result_frames.append(pd.DataFrame([{
        'model': name,
        'accuracy': accuracy,
        'AUC': auc,
        'sensitivity': sensitivity,
        'specificity': specificity,
    }]))

    # Per-class precision / recall / f1 / support: the first two rows of the
    # transposed report are the two classes; `index` records their position.
    report = classification_report(test_y, predicted_labels, output_dict=True)
    df_report = pd.DataFrame(report).transpose().round(4).reset_index(drop=True)
    result_frames.append(df_report.loc[0:1, :].reset_index())

# Build the table once instead of growing it inside the loop.
df_model_result = pd.concat(result_frames, axis=0, ignore_index=True)
df_model_result = df_model_result.reindex(columns=result_columns)

In [None]:
# Blank out the 'model' header and expose the per-class row position as 'label'.
column_renames = {'model': '', 'index': 'label'}
df_model_result.rename(columns=column_renames, inplace=True)

# Save the model test results.
# NOTE(review): the regression section of this notebook saves to the same
# file name -- confirm that overwriting is intended.
output_path = project_path + '/data/df_模型测试效果.xlsx'
df_model_result.to_excel(output_path)

# 回归模型

In [None]:
from sklearn.metrics import r2_score,average_precision_score,precision_recall_curve
from sklearn.metrics import precision_score,recall_score,f1_score,roc_auc_score,accuracy_score

import xgboost

# XGBoost regressor fitted on the tran_x / tran_y training data.
# NOTE(review): both `learning_rate` and `eta` are supplied. XGBoost documents
# `eta` as an alias of `learning_rate`, so it is unclear which of 0.018 / 0.1
# is actually applied -- confirm and drop one of them.
# NOTE(review): `scale_pos_weight` is documented as a class-imbalance control;
# presumably it has no role in a regressor -- confirm.
xgb_params = dict(
    max_depth=5,
    learning_rate=0.018,
    n_estimators=500,
    min_child_weight=0.6,
    eta=0.1,
    gamma=0.5,
    reg_lambda=5,
    subsample=0.8,
    colsample_bytree=0.6,
    nthread=4,
    scale_pos_weight=1,
    random_state=3,
)
xgb_model = xgboost.XGBRegressor(**xgb_params)
xgb_model.fit(tran_x, tran_y)

# Predicted target values for the held-out test features.
xgb_predictions = xgb_model.predict(test_x)


import lightgbm

# LightGBM regressor.
# The previous call used CatBoost/XGBoost-style keyword names (`iterations`,
# `gamma`, `loss_function`). LightGBM does not know these names, so they were
# forwarded through **kwargs and ignored with an "Unknown parameter" warning;
# the model silently trained with the default 100 trees, no minimum split gain
# and the default L2 objective instead of the requested MAE.
lgbm_model = lightgbm.LGBMRegressor(
    n_estimators=300,            # was `iterations=300`
    max_depth=8,
    min_child_weight=0.9,
    min_split_gain=0.5,          # was `gamma=0.5`
    reg_lambda=5,
    # NOTE(review): LightGBM only applies row subsampling when
    # `subsample_freq` > 0 -- confirm whether bagging was intended.
    subsample=0.4,
    learning_rate=0.2,
    objective='regression_l1',   # was `loss_function='MAE'` (L1 / MAE loss)
    random_state=3,
)
lgbm_model.fit(tran_x, tran_y)

# Predicted target values for the held-out test features.
lgbm_predictions = lgbm_model.predict(test_x)

import catboost

# CatBoost regressor: 300 boosting iterations of depth-6 trees, optimising MAE.
cat_params = dict(
    iterations=300,
    learning_rate=0.2,
    depth=6,
    l2_leaf_reg=2,
    subsample=1,
    loss_function='MAE',
    random_state=3,
)
cat_model = catboost.CatBoostRegressor(**cat_params)
cat_model.fit(tran_x, tran_y)

# Predicted target values for the held-out test features.
cat_predictions = cat_model.predict(test_x)

# Random forest regressor
# RandomForestRegressor is used below but was missing from this import, which
# raised NameError unless an earlier cell happened to import it.
from sklearn.ensemble import (
    RandomForestClassifier,
    RandomForestRegressor,
    GradientBoostingClassifier,
    GradientBoostingRegressor,
)
from sklearn.model_selection import GridSearchCV

# Candidate forest sizes for the grid search.
tree_grid_parameter = {'n_estimators': [10, 50, 100, 150, 200]}

# Tune `n_estimators` on the SAME configuration as the final model
# (max_depth=8, random_state=3). Previously the search ran on an unseeded,
# unlimited-depth forest, so the selected size was neither reproducible nor
# tuned for the model that was actually trained afterwards.
grid = GridSearchCV(
    RandomForestRegressor(max_depth=8, random_state=3),
    param_grid=tree_grid_parameter,
    cv=3,
)
grid.fit(tran_x, tran_y)

# With the default refit=True, GridSearchCV has already retrained the best
# configuration on the full training data, so a second manual fit is redundant.
rf_model = grid.best_estimator_

# Predicted target values for the held-out test features.
rf_predictions = rf_model.predict(test_x)

# GBDT (scikit-learn gradient boosting) regressor.
gbdt_params = dict(
    n_estimators=300,
    learning_rate=0.1,
    max_depth=8,
    subsample=0.4,
    random_state=3,
)
gbdt_model = GradientBoostingRegressor(**gbdt_params)
gbdt_model.fit(tran_x, tran_y)

# Predicted target values for the held-out test features.
gbdt_predictions = gbdt_model.predict(test_x)

# Support vector regression with scikit-learn's default SVR settings.
# (A linear-kernel variant, SVR(kernel='linear', C=1.25), was tried earlier.)
from sklearn.svm import SVR,SVC

svr_model = SVR()
svr_model.fit(tran_x, tran_y)

# Predicted target values for the held-out test features.
svr_predictions = svr_model.predict(test_x)


# Linear-model family: linear, Lasso, Ridge, ElasticNet and logistic
# regression are imported; the regression task uses Lasso with its defaults.
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet,LogisticRegression

lcv_model = Lasso()
lcv_model.fit(tran_x, tran_y)

# Predicted target values for the held-out test features.
lcv_predictions = lcv_model.predict(test_x)


# ANN: multi-layer perceptron regressor with one hidden layer of 100 units.
from sklearn.neural_network import MLPClassifier,MLPRegressor
from sklearn.metrics import classification_report,confusion_matrix

mlp_params = dict(
    alpha=0.1,
    hidden_layer_sizes=[100,],
    solver='adam',
    activation='relu',
    random_state=3,
)
ANN_model = MLPRegressor(**mlp_params)
ANN_model.fit(tran_x, tran_y)

# Predicted target values for the held-out test features.
ANN_predictions = ANN_model.predict(test_x)

# TabNet regressor
from pytorch_tabnet.tab_model import TabNetClassifier, TabNetRegressor

TabNet_model = TabNetRegressor(
    n_d=8,
    n_a=8,
    n_steps=3,  # number of steps in the architecture (usually between 3 and 10)
    gamma=1.5,
    n_independent=2,
)

# Hold out 12.5% of the training data as the evaluation set used during fitting.
tran_x_x, tran_x_valid, tran_y_y, tran_y_valid = train_test_split(
    tran_x, tran_y, test_size=0.125, random_state=3)

# NOTE(review): the evaluation set is the held-out split above, yet it is
# labelled 'train' -- confirm the intended name.
# NOTE(review): the pytorch-tabnet docs recommend that virtual_batch_size
# divide batch_size; 14 does not divide 128 -- confirm.
tabnet_fit_options = dict(
    eval_name=['train'],
    eval_metric=['mae'],
    max_epochs=200,
    patience=50,
    batch_size=128,
    virtual_batch_size=14,
    num_workers=0,
    drop_last=False,
)
# Targets are reshaped to a single column (n, 1) before being handed to the
# regressor; features are converted to numpy arrays.
TabNet_model.fit(
    X_train=tran_x_x.to_numpy(),
    y_train=tran_y_y.to_numpy().reshape(-1,1),
    eval_set=[(tran_x_valid.to_numpy(), tran_y_valid.to_numpy().reshape(-1,1))],
    **tabnet_fit_options)

TabNet_predictions = TabNet_model.predict(test_x.to_numpy())

In [None]:
import numpy as np
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

# Collect the test-set metrics of every regressor into one table:
# R2, RMSE, MAE and the share of test samples whose relative error
# |actual - predicted| / |actual| lies within 10% / 20% / 30% / 40%.
tolerance_percents = (10, 20, 30, 40)
result_columns = ['model', 'R2', 'RMSE', 'MAE'] + [
    'Accuracy within ± %d%% range' % pct for pct in tolerance_percents]

model_list=[xgb_model,lgbm_model,cat_model,rf_model,gbdt_model,svr_model,lcv_model,ANN_model,TabNet_model]
model_name_list=['XGBoost','LGBM','CatBoost','RF','GBDT','SVR','LR','ANN','TabNet']

# Flat float copy of the targets, shared by all models.
actual = np.asarray(test_y, dtype=float).ravel()

result_rows = []
for model, name in zip(model_list, model_name_list):
    # TabNet is fed a numpy array; the other models receive test_x unchanged.
    if name == 'TabNet':
        predictions = model.predict(test_x.to_numpy())
    else:
        predictions = model.predict(test_x)

    r2 = round(r2_score(test_y, predictions), 4)
    mae = round(mean_absolute_error(test_y, predictions), 4)
    rmse = round(mean_squared_error(test_y, predictions) ** 0.5, 4)

    # Flatten the predictions so that column-shaped (n, 1) outputs line up
    # with the 1-D targets, then compute every relative error in one
    # vectorised step instead of four copy-pasted Python loops.
    predicted = np.asarray(predictions, dtype=float).ravel()
    # A zero target yields inf/nan, which never satisfies `<= tolerance`, so
    # such samples count as outside every band; only the warning is silenced.
    with np.errstate(divide='ignore', invalid='ignore'):
        relative_error = np.abs((actual - predicted) / actual)

    row = [name, r2, rmse, mae]
    for pct in tolerance_percents:
        within = np.count_nonzero(relative_error <= pct / 100)
        share = round(within / actual.size, 4)
        row.append("%.2f%%" % (share * 100))      # formatted as a percentage
    result_rows.append(row)

# Build the table once instead of growing it row by row.
df_model_result = pd.DataFrame(result_rows, columns=result_columns)

In [None]:
# Blank out the 'model' column header before exporting.
column_renames = {'model': ''}
df_model_result.rename(columns=column_renames, inplace=True)

# Save the model test results.
# NOTE(review): the classification section of this notebook saves to the same
# file name -- confirm that overwriting is intended.
output_path = project_path + '/data/df_模型测试效果.xlsx'
df_model_result.to_excel(output_path)