### 在所有特征下各模型的性能

In [1]:
import pandas as pd
import numpy as np
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from configs import COLUMNS, LABELS2IDS
from utils import *

# Set the random seed through the project helper from utils.
# NOTE(review): which RNGs set_seed covers is not visible here — confirm in utils.
set_seed(2022)

  from pandas import MultiIndex, Int64Index


### 1. 读取数据和数据处理

In [2]:
# Load the pre-processed annotation table; column names come from COLUMNS.
annotated_datas = pd.read_table('../annotated_datas.xls', names=COLUMNS)


def _column(name):
    """Return column `name` of annotated_datas as an (n, 1) array."""
    return annotated_datas[name].values.reshape(-1, 1)


# Identifier and raw text fields, kept as flat 1-D arrays.
p_ids = annotated_datas['p_id'].values
sec_titles = annotated_datas['sec_title'].values
sec_subtitles = annotated_datas['sec_subtitle'].values
sec_texts = annotated_datas['sec_text'].values

# Fill empty headers: a subtitle whose string form is 'nan' is replaced
# by the title of the same row.
sec_subtitles_new = [
    title if str(subtitle) == 'nan' else subtitle
    for subtitle, title in zip(sec_subtitles, sec_titles)
]
sec_subtitles = np.array(sec_subtitles_new)

# Remaining columns are reshaped to (n, 1) column vectors.
dataset_ids, jname_ids = (
    _column(name) for name in ('dataset_id', 'jname_id')
)

bib_nums, fn_nums, fig_nums, tab_nums, equ_nums = (
    _column(name)
    for name in ('bib_num', 'fn_num', 'fig_num', 'tab_num', 'equ_num')
)

para_nums, sen_nums, word_nums = (
    _column(name) for name in ('para_num', 'sen_num', 'word_num')
)

sec_locs = _column('sec_loc')

# Map every annotated label to its id through LABELS2IDS.
ys = np.array([LABELS2IDS[label_name] for label_name in annotated_datas['label']])


from scipy.sparse import hstack

# Turn the three text fields into feature matrices with the utils helpers.
# NOTE(review): presumably 'CHI' selects features by chi-square against ys,
# keeping the given percentile — confirm in utils.
sec_titles_vec = preprocessing_titles(sec_titles, save_name='sec_title_words')
sec_subtitles_vec = preprocessing_titles(
    sec_subtitles,
    ys,
    feature_selection_approach='CHI',
    percentile=20,
    save_name='sec_subtitle_words',
)
sec_texts_vec = preprocessing_text(
    sec_texts,
    ys,
    feature_selection_approach='CHI',
    percentile=10,
    save_name='sec_text_words',
)

# Report the shape of each text feature matrix.
for vec_name, vec in (
    ("sec_titles_vec:", sec_titles_vec),
    ("sec_subtitles_vec:", sec_subtitles_vec),
    ("sec_texts_vec:", sec_texts_vec),
):
    print(vec_name, vec.shape)

# Text matrices first, then the (n, 1) numeric columns, in a fixed order.
text_features = [sec_titles_vec, sec_subtitles_vec, sec_texts_vec]
numeric_features = [
    dataset_ids, jname_ids, bib_nums, fn_nums,
    fig_nums, tab_nums, equ_nums, para_nums,
    sen_nums, word_nums, sec_locs,
]
features = text_features + numeric_features

# Collects one [model name, p, r, f1] row per evaluated model.
save_datas = []

# Stack all features column-wise and convert to a dense array.
xs = hstack(features).toarray()
print(len(features), xs.shape, ys.shape)

sec_titles_vec: (7405, 1253)
sec_subtitles_vec: (7405, 1175)
sec_texts_vec: (7405, 1491)
14 (7405, 3930) (7405,)


### 2. 在所有特征下评估各模型的性能

In [3]:
# Estimators evaluated in this run. The numeric suffix is the algorithm's
# number in the full experiment list (disabled ones are listed below).
model1 = MultinomialNB()                                                     # Algorithm 1: naive Bayes
model4 = LogisticRegression(class_weight='balanced', random_state=2022)      # Algorithm 4: logistic regression
model5 = SVC(class_weight='balanced', random_state=2022)                     # Algorithm 5: support vector machine
model7 = RandomForestClassifier(class_weight='balanced', random_state=2022)  # Algorithm 7: random forest

# Evaluate each estimator on the full feature matrix and record its scores.
for tag, estimator in (("NB", model1), ("LR", model4), ("SVM", model5), ("RF", model7)):
    print("*" * 25, tag, "*" * 25)
    p, r, f1 = execute_model(
        estimator, xs, ys, dataset_ids,
        info=tag, is_output_dataset_results=True,
    )
    save_datas.append([tag, p, r, f1])

# Disabled experiments, kept for reference:
#
# # Algorithm 2: decision tree
# print("*"*25, "DT", "*"*25)
# model2 = DecisionTreeClassifier(class_weight="balanced", random_state=2022)
# p, r, f1 = execute_model(model2, xs,  ys, dataset_ids, info="DT", is_output_dataset_results=True)
# save_datas.append(["DT", p, r, f1])
#
# # Algorithm 3: k-nearest neighbours
# print("*"*25, "KNN", "*"*25)
# model3 = KNeighborsClassifier()
# p, r, f1 = execute_model(model3, xs,  ys, dataset_ids, info="KNN", is_output_dataset_results=True)
# save_datas.append(["KNN", p, r, f1])
#
# # Algorithm 6: multi-layer perceptron
# print("*"*25, "MLP", "*"*25)
# model6 = MLPClassifier(solver='adam', alpha=0.001, batch_size='auto',
# learning_rate='constant', learning_rate_init=0.001, power_t=0.5, max_iter=500, shuffle=True,
# random_state=2022, tol=0.0001, verbose=False, warm_start=False, momentum=0.9, nesterovs_momentum=True,
# early_stopping=True, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
# p, r, f1 = execute_model(model6, xs,  ys, dataset_ids, info="MLP", is_output_dataset_results=True)
# save_datas.append(["MLP", p, r, f1])
#
# # Algorithm 8: AdaBoost
# print("*"*25, "ADA", "*"*25)
# model8 = AdaBoostClassifier(random_state=2022)
# p, r, f1 = execute_model(model8, xs,  ys, dataset_ids, info="ADA")
# save_datas.append(["ADA", p, r, f1])
#
# # Algorithm 9: XGBoost
# print("*"*25, "XGB", "*"*25)
# model9 = XGBClassifier(seed=2022, use_label_encoder=False, eval_metric=['logloss','auc','error'])
# p, r, f1 = execute_model(model9, xs,  ys, dataset_ids, info="XGB")
# save_datas.append(["XGB", p, r, f1])

print("*" * 50)

************************* NB *************************
--------------------------------------------------
[[66.53 79.92 66.48]
 [90.61 98.59 93.2 ]
 [71.83 87.79 74.65]
 [85.98 98.98 89.15]
 [65.57 64.14 64.84]]
INFO: NB, PMC->P:76.1, R:85.88, F1:77.66
----------------------------------------
[[75.45 72.86 73.53]
 [72.78 68.93 70.4 ]
 [74.64 88.09 73.86]
 [74.33 70.57 71.91]
 [76.22 73.1  74.21]]
INFO: NB, LIS->P:74.68, R:74.71, F1:72.78
----------------------------------------
[[79.83 90.8  77.48]
 [78.49 74.82 76.55]
 [77.1  75.01 76.  ]
 [79.02 75.93 77.42]
 [80.   91.61 78.18]]
INFO: NB, IEEE->P:78.89, R:81.63, F1:77.13
----------------------------------------
[[77.87 86.13 76.98]
 [76.53 84.91 75.8 ]
 [76.37 85.4  76.2 ]
 [77.44 92.23 76.99]
 [78.17 83.71 77.11]]
INFO: NB, ALL->P:77.28, R:86.48, F1:76.62
----------------------------------------
[[98.53918472 97.57195219 98.05287886]
 [78.94901087 84.9581801  81.82528626]
 [93.59425599 82.10115589 87.45933116]
 [92.54443694 88.0566

In [4]:
# Display the [model name, p, r, f1] rows collected so far.
save_datas

[['NB', 77.28, 86.48, 76.62],
 ['LR', 85.73, 88.52, 85.95],
 ['SVM', 89.68, 80.04, 82.42],
 ['RF', 93.75, 87.54, 89.39]]

In [6]:
#  # Algorithm 5: support vector machine (disabled variant with kernel='linear')
# print("*"*25, "SVM", "*"*25)
# model5 = SVC(kernel='linear', class_weight='balanced', random_state=2022)
# p, r, f1 = execute_model(model5, xs,  ys, dataset_ids, info="SVM", is_output_dataset_results=True)
# save_datas.append(["SVM", p, r, f1])

In [4]:
def _run_and_record(tag, estimator):
    """Evaluate `estimator` on (xs, ys), append its scores to save_datas, and return them."""
    scores = execute_model(
        estimator, xs, ys, dataset_ids,
        info=tag, is_output_dataset_results=True,
    )
    precision, recall, f_score = scores
    save_datas.append([tag, precision, recall, f_score])
    return precision, recall, f_score


# Algorithm 7: random forest
print(f"{'*' * 25} RF {'*' * 25}")
model7 = RandomForestClassifier(class_weight='balanced', random_state=2022)
p, r, f1 = _run_and_record("RF", model7)


# Algorithm 9: XGBoost
print(f"{'*' * 25} XGB {'*' * 25}")
model9 = XGBClassifier(seed=2022, use_label_encoder=False, eval_metric=['logloss','auc','error'])
p, r, f1 = _run_and_record("XGB", model9)

************************* RF *************************
--------------------------------------------------
[[91.24 99.45 93.95]
 [82.96 83.15 83.05]
 [82.06 74.13 76.7 ]
 [99.65 99.69 99.67]
 [65.88 66.37 66.12]]
INFO: RF, PMC->P:84.36, R:84.56, F1:83.9
----------------------------------------
[[79.24 79.26 79.23]
 [77.67 77.64 77.62]
 [79.2  79.31 79.25]
 [94.01 93.28 93.53]
 [94.01 93.86 93.9 ]]
INFO: RF, LIS->P:84.83, R:84.67, F1:84.71
----------------------------------------
[[81.12 79.84 80.34]
 [98.18 96.04 96.94]
 [96.63 95.6  96.02]
 [96.89 95.03 95.83]
 [81.53 80.88 81.16]]
INFO: RF, IEEE->P:90.87, R:89.48, F1:90.06
----------------------------------------
[[97.39 85.76 88.77]
 [96.85 90.47 93.06]
 [96.9  85.4  88.35]
 [97.04 96.07 96.5 ]
 [80.62 80.12 80.34]]
INFO: RF, ALL->P:93.76, R:87.56, F1:89.4
----------------------------------------
[[99.60415729 98.82442708 99.21144105]
 [95.63989543 89.2767044  92.34296569]
 [92.20928304 97.58759297 94.82082562]
 [96.2616519  95.16418