### 1、导入本次实验需要的包

In [1]:
import jieba
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report, roc_curve, roc_auc_score, f1_score, make_scorer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression  
from sklearn.naive_bayes import GaussianNB
import matplotlib.pyplot as plt
import logging
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from scipy.stats import expon, reciprocal
import sys


logging.basicConfig(level=logging.DEBUG, 
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger('training')
file_handler = logging.FileHandler('../../Result/ML/train.log') 
file_handler.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)

### 2、使用TF-IDF进行数据预处理

In [2]:
def data_process(file_name):
    data = pd.read_csv(file_name, encoding='utf-8')
    data = data[['answer', 'label']]
    data['answer'] = data['answer'].apply(lambda x: ' '.join(jieba.cut(x)))
    return data['answer'], data['label']

X_train, y_train = data_process("../../data/zh_doc_train.csv")
X_test, y_test = data_process("../../data/zh_doc_test.csv")
X_sent_test, y_sent_test = data_process('../../data/shuffled_zh_sent_test.csv')
X_concat_test, y_concat_test = pd.concat([X_test, X_sent_test], axis=0), pd.concat([y_test, y_sent_test], axis=0)

tfidf_vectorizer = TfidfVectorizer()
X_train = tfidf_vectorizer.fit_transform(X_train)
X_test = tfidf_vectorizer.transform(X_test)
X_sent_test = tfidf_vectorizer.transform(X_sent_test)
X_concat_test = tfidf_vectorizer.transform(X_concat_test)

Building prefix dict from the default dictionary ...
2023-12-12 16:01:44,024 - jieba - DEBUG - Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
2023-12-12 16:01:44,026 - jieba - DEBUG - Loading model from cache /tmp/jieba.cache
Loading model cost 0.613 seconds.
2023-12-12 16:01:44,639 - jieba - DEBUG - Loading model cost 0.613 seconds.
Prefix dict has been built successfully.
2023-12-12 16:01:44,641 - jieba - DEBUG - Prefix dict has been built successfully.


### 3、使用RandomizedSearchCV逐个进行调参

In [3]:
classifiers = [
        LogisticRegression(),
        GaussianNB(),
        KNeighborsClassifier(),
        DecisionTreeClassifier(),
        SVC(),
        RandomForestClassifier(),
        GradientBoostingClassifier(),
        XGBClassifier(),
        LGBMClassifier(),
        CatBoostClassifier()
    ]
# 使用f1评分
scorer = make_scorer(f1_score)

#### 4.1、逻辑回归

In [None]:
import warnings
warnings.filterwarnings('ignore')

param_distributions = {
    'C': reciprocal(0.1, 100),
    'solver': ['liblinear', 'newton-cg', 'lbfgs'],
    'penalty': ['l1', 'l2'],
    'max_iter': [1000]
}
random_search = RandomizedSearchCV(classifiers[0], param_distributions, n_iter=100, cv=5, scoring=scorer, random_state=42, n_jobs=-1)
random_search.fit(X_train, y_train)
logger.info(classifiers[0].__class__.__name__)
logger.info("最佳参数: %s", random_search.best_params_)
logger.info("最佳模型分数（F1分数）: %s", random_search.best_score_)


In [15]:
model = LogisticRegression(C=62.10, penalty='l2',solver='liblinear',max_iter=1000)
model.fit(X_train, y_train)
eval_name = ['doc', 'sent', 'mix']

def eval(X_test, y_test, eval_name):
    y_pred = model.predict(X_test)
    y_pred = [1 if x > 0.5 else 0 for x in y_pred]
    report = classification_report(y_test, y_pred, output_dict=True)
    logger.info("dataset {} F1分数: {}".format(eval_name, report['weighted avg']['f1-score']))

eval(X_test=X_test, y_test=y_test, eval_name=eval_name[0])
eval(X_test=X_sent_test, y_test=y_sent_test, eval_name=eval_name[1])
eval(X_test=X_concat_test, y_test=y_concat_test, eval_name=eval_name[2])

2023-12-12 15:59:30,854 - training - INFO - dataset doc F1分数: 0.9516055788355426
2023-12-12 15:59:30,981 - training - INFO - dataset sent F1分数: 0.790188538643597
2023-12-12 15:59:31,081 - training - INFO - dataset mix F1分数: 0.8186839651235414


#### 4.2、朴素贝叶斯

In [None]:
model = GaussianNB()
model.fit(X_train.toarray(), y_train)
eval_name = ['doc', 'sent', 'mix']
logger.info(classifiers[1].__class__.__name__)

def eval(X_test, y_test, eval_name):
    y_pred = model.predict(X_test.toarray())
    y_pred = [1 if x > 0.5 else 0 for x in y_pred]
    report = classification_report(y_test, y_pred, output_dict=True)
    logger.info("dataset {} F1分数: {}".format(eval_name, report['weighted avg']['f1-score']))

eval(X_test=X_test, y_test=y_test, eval_name=eval_name[0])
eval(X_test=X_sent_test, y_test=y_sent_test, eval_name=eval_name[1])
eval(X_test=X_concat_test, y_test=y_concat_test, eval_name=eval_name[2])


#### KNN

In [10]:
parameters = {
    # "n_neighbors" : range(1, 21, 2),
    # "n_neighbors" : range(19, 50, 2),
    "n_neighbors" : range(50, 100, 2),
}
grid_search = GridSearchCV(classifiers[2], parameters, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)
logger.info(classifiers[0].__class__.__name__)
logger.info("最佳参数: %s", grid_search.best_params_)
logger.info("最佳模型分数（F1分数）: %s", grid_search.best_score_)

2023-12-12 15:55:26,483 - training - INFO - LogisticRegression
2023-12-12 15:55:26,484 - training - INFO - 最佳参数: {'n_neighbors': 96}
2023-12-12 15:55:26,486 - training - INFO - 最佳模型分数（F1分数）: 0.7478112144870892


In [12]:
model = KNeighborsClassifier(n_neighbors=96) 

model.fit(X_train, y_train)
eval_name = ['doc', 'sent', 'mix']
logger.info(classifiers[2].__class__.__name__)

def eval(X_test, y_test, eval_name):
    y_pred = model.predict(X_test)
    y_pred = [1 if x > 0.5 else 0 for x in y_pred]
    report = classification_report(y_test, y_pred, output_dict=True)
    logger.info("dataset {} F1分数: {}".format(eval_name, report['weighted avg']['f1-score']))

eval(X_test=X_test, y_test=y_test, eval_name=eval_name[0])
eval(X_test=X_sent_test, y_test=y_sent_test, eval_name=eval_name[1])
eval(X_test=X_concat_test, y_test=y_concat_test, eval_name=eval_name[2])

2023-12-12 15:57:49,338 - training - INFO - KNeighborsClassifier
2023-12-12 15:58:05,146 - training - INFO - dataset doc F1分数: 0.8313328358840915
2023-12-12 15:58:24,401 - training - INFO - dataset sent F1分数: 0.8110983500629481


#### 4.3、决策树

In [None]:
parameters = {
    'max_depth': np.arange(10, 27, 2),
    'min_samples_split': np.arange(2, 10),
    'min_samples_leaf': np.arange(1, 5),
    'criterion': ['gini', 'entropy']
}
grid_search = GridSearchCV(classifiers[3], parameters, cv=5, n_jobs=-1)

grid_search.fit(X_train, y_train)
logger.info(classifiers[3].__class__.__name__)
logger.info("最佳参数: %s", grid_search.best_params_)
logger.info("最佳模型分数（F1分数）: %s", grid_search.best_score_)

In [None]:
## 下面参数过拟合了，在OOV上还没有默认参数效果好
model = DecisionTreeClassifier(criterion='entropy', max_depth=26, min_samples_leaf=3, min_samples_split=9)
model.fit(X_train, y_train)
eval_name = ['doc', 'sent', 'mix']

def eval(X_test, y_test, eval_name):
    y_pred = model.predict(X_test)
    y_pred = [1 if x > 0.5 else 0 for x in y_pred]
    report = classification_report(y_test, y_pred, output_dict=True)
    logger.info("dataset {} F1分数: {}".format(eval_name, report['weighted avg']['f1-score']))

eval(X_test=X_test, y_test=y_test, eval_name=eval_name[0])
eval(X_test=X_sent_test, y_test=y_sent_test, eval_name=eval_name[1])
eval(X_test=X_concat_test, y_test=y_concat_test, eval_name=eval_name[2])

#### 4.4、SVM

##### 搜索kernel

In [None]:
parameters = {
    # 'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf', 'poly'],
    # 'gamma': [0.01, 0.1, 1, 10, 100]
}
grid_search = GridSearchCV(classifiers[4], parameters, cv=5, n_jobs=-1)

grid_search.fit(X_train, y_train)
logger.info(classifiers[4].__class__.__name__)
logger.info("最佳参数: %s", grid_search.best_params_)
logger.info("最佳模型分数（F1分数）: %s", grid_search.best_score_)

In [4]:
model = SVC(kernel='linear', C=1)
model.fit(X_train, y_train)
eval_name = ['doc', 'sent', 'mix']

def eval(X_test, y_test, eval_name):
    y_pred = model.predict(X_test)
    y_pred = [1 if x > 0.5 else 0 for x in y_pred]
    report = classification_report(y_test, y_pred, output_dict=True)
    logger.info("dataset {} F1分数: {}".format(eval_name, report['weighted avg']['f1-score']))

# eval(X_test=X_test, y_test=y_test, eval_name=eval_name[0])
eval(X_test=X_sent_test, y_test=y_sent_test, eval_name=eval_name[1])
# eval(X_test=X_concat_test, y_test=y_concat_test, eval_name=eval_name[2])

2023-12-12 16:05:04,015 - training - INFO - dataset sent F1分数: 0.7892086042699143


##### 搜索正则

In [None]:
param_distributions = {
    'C': reciprocal(0.1, 500),
}

random_search = RandomizedSearchCV(classifiers[5], param_distributions, n_iter=30, verbose=2, cv=5, scoring=scorer, random_state=42, n_jobs=-1)
random_search.fit(X_train, y_train)
logger.info(classifiers[5].__class__.__name__)
logger.info("最佳参数: %s", random_search.best_params_)
logger.info("最佳模型分数（F1分数）: %s", random_search.best_score_)

In [None]:
model = SVC(kernel='linear', )
model.fit(X_train, y_train)
eval_name = ['doc', 'sent', 'mix']

def eval(X_test, y_test, eval_name):
    y_pred = model.predict(X_test)
    y_pred = [1 if x > 0.5 else 0 for x in y_pred]
    report = classification_report(y_test, y_pred, output_dict=True)
    logger.info("dataset {} F1分数: {}".format(eval_name, report['weighted avg']['f1-score']))

eval(X_test=X_test, y_test=y_test, eval_name=eval_name[0])
eval(X_test=X_sent_test, y_test=y_sent_test, eval_name=eval_name[1])
eval(X_test=X_concat_test, y_test=y_concat_test, eval_name=eval_name[2])

#### 4.5、RandomForest

In [None]:
param_distributions = {
    'n_estimators': np.arange(100, 401, 50),
    'max_depth': np.arange(15, 40, 2),
    'min_samples_split': np.arange(2, 4),
    'min_samples_leaf': np.arange(1, 4)
}

random_search = RandomizedSearchCV(classifiers[5], param_distributions, n_iter=100, verbose=2, cv=5, scoring=scorer, random_state=42, n_jobs=-1)
random_search.fit(X_train, y_train)
logger.info(classifiers[5].__class__.__name__)
logger.info("最佳参数: %s", random_search.best_params_)
logger.info("最佳模型分数（F1分数）: %s", random_search.best_score_)

In [None]:
## 模型过拟合，没有默认参数好
model = RandomForestClassifier(n_estimators=500, min_samples_split=3, min_samples_leaf=1, max_depth=30)
model.fit(X_train, y_train)
eval_name = ['doc', 'sent', 'mix']

def eval(X_test, y_test, eval_name):
    y_pred = model.predict(X_test)
    y_pred = [1 if x > 0.5 else 0 for x in y_pred]
    report = classification_report(y_test, y_pred, output_dict=True)
    logger.info("dataset {} F1分数: {}".format(eval_name, report['weighted avg']['f1-score']))

eval(X_test=X_test, y_test=y_test, eval_name=eval_name[0])
eval(X_test=X_sent_test, y_test=y_sent_test, eval_name=eval_name[1])
eval(X_test=X_concat_test, y_test=y_concat_test, eval_name=eval_name[2])

#### 4.6、GBDT

In [None]:
param_distributions = {
    'n_estimators': np.arange(100, 301, 50),
    'max_depth': np.arange(6, 15),
    'learning_rate': np.linspace(0.01, 0.2, 10),
    'min_samples_split': np.arange(2, 5),
    'min_samples_leaf': np.arange(1, 4),
    'subsample': np.linspace(0.6, 1, 5)
}

random_search = RandomizedSearchCV(classifiers[6], param_distributions, n_iter=100, verbose=2, cv=5, scoring=scorer, random_state=42, n_jobs=-1)
random_search.fit(X_train, y_train)
logger.info(classifiers[6].__class__.__name__)
logger.info("最佳参数: %s", random_search.best_params_)
logger.info("最佳模型分数（F1分数）: %s", random_search.best_score_)


In [None]:
## 模型过拟合，没有默认参数好
model = GradientBoostingClassifier()
model.fit(X_train, y_train)
eval_name = ['doc', 'sent', 'mix']

def eval(X_test, y_test, eval_name):
    y_pred = model.predict(X_test)
    y_pred = [1 if x > 0.5 else 0 for x in y_pred]
    report = classification_report(y_test, y_pred, output_dict=True)
    logger.info("dataset {} F1分数: {}".format(eval_name, report['weighted avg']['f1-score']))

eval(X_test=X_test, y_test=y_test, eval_name=eval_name[0])
eval(X_test=X_sent_test, y_test=y_sent_test, eval_name=eval_name[1])
eval(X_test=X_concat_test, y_test=y_concat_test, eval_name=eval_name[2])

#### 4.7、XGBoost

In [None]:
param_distributions = {
    'n_estimators': np.arange(50, 400, 50),
    'learning_rate': np.linspace(0.001, 0.1, 10),
    'max_depth': np.arange(3, 10),
    'colsample_bytree': np.linspace(0.5, 1, 5),
    'subsample': np.linspace(0.6, 1, 5),
    'booster': ['gbtree', 'gblinear', 'dart'],
    'tree_method': ['auto', 'exact', 'approx', 'hist']
}

random_search = RandomizedSearchCV(XGBClassifier(), param_distributions, n_iter=100, verbose=2, cv=5, scoring=scorer, random_state=42, n_jobs=-1)
random_search.fit(X_train, y_train)
logger.info(classifiers[7].__class__.__name__)
logger.info("最佳参数: %s", random_search.best_params_)
logger.info("最佳模型分数（F1分数）: %s", random_search.best_score_)

In [None]:
import xgboost as xgb

model = XGBClassifier()
model.fit(X_train, y_train)
eval_name = ['doc', 'sent', 'mix']

def eval(X_test, y_test, eval_name):
    y_pred = model.predict(X_test)
    y_pred = [1 if x > 0.5 else 0 for x in y_pred]
    report = classification_report(y_test, y_pred, output_dict=True)
    logger.info("dataset {} F1分数: {}".format(eval_name, report['weighted avg']['f1-score']))

eval(X_test=X_test, y_test=y_test, eval_name=eval_name[0])
eval(X_test=X_sent_test, y_test=y_sent_test, eval_name=eval_name[1])
eval(X_test=X_concat_test, y_test=y_concat_test, eval_name=eval_name[2])

#### 4.8、LightGBM

In [None]:
param_distributions = {
    'n_estimators': np.arange(50, 400, 50),
    'learning_rate': np.linspace(0.001, 0.1, 10),
    'max_depth': np.arange(3, 10),
    'colsample_bytree': np.linspace(0.5, 1, 5),
    'subsample': np.linspace(0.6, 1, 5),
    'boosting_type': ['gbtree', 'rf', 'dart', 'goss'],
    'tree_method': ['auto', 'exact', 'approx', 'hist']
}

random_search = RandomizedSearchCV(LGBMClassifier(), param_distributions, n_iter=4, verbose=2, cv=5, scoring=scorer, random_state=42, n_jobs=-1)
random_search.fit(X_train, y_train)
logger.info(classifiers[8].__class__.__name__)
logger.info("最佳参数: %s", random_search.best_params_)
logger.info("最佳模型分数（F1分数）: %s", random_search.best_score_)

In [None]:
model = LGBMClassifier()
model.fit(X_train, y_train)
eval_name = ['doc', 'sent', 'mix']

def eval(X_test, y_test, eval_name):
    y_pred = model.predict(X_test)
    y_pred = [1 if x > 0.5 else 0 for x in y_pred]
    report = classification_report(y_test, y_pred, output_dict=True)
    logger.info("dataset {} F1分数: {}".format(eval_name, report['weighted avg']['f1-score']))

eval(X_test=X_test, y_test=y_test, eval_name=eval_name[0])
eval(X_test=X_sent_test, y_test=y_sent_test, eval_name=eval_name[1])
eval(X_test=X_concat_test, y_test=y_concat_test, eval_name=eval_name[2])

#### 4.9、CatBoost

In [None]:
model = CatBoostClassifier(n_estimators=400)
model.fit(X_train, y_train)
eval_name = ['doc', 'sent', 'mix']

def eval(X_test, y_test, eval_name):
    y_pred = model.predict(X_test)
    y_pred = [1 if x > 0.5 else 0 for x in y_pred]
    report = classification_report(y_test, y_pred, output_dict=True)
    logger.info("dataset {} F1分数: {}".format(eval_name, report['weighted avg']['f1-score']))

eval(X_test=X_test, y_test=y_test, eval_name=eval_name[0])
eval(X_test=X_sent_test, y_test=y_sent_test, eval_name=eval_name[1])
eval(X_test=X_concat_test, y_test=y_concat_test, eval_name=eval_name[2])