# Model Building and Training

## 0 预处理

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
from matplotlib import pyplot as plt
from sklearn.model_selection import cross_val_score
import joblib
from sklearn import metrics
from collections import Counter

In [2]:
rf_df_train = pd.read_csv('../data/train.csv')
rf_df_test = pd.read_csv('../data/val.csv')

X_train = rf_df_train.drop(['Class'], axis=1)
y_train = rf_df_train['Class']

X_test = rf_df_test.drop(['Class'], axis=1)
y_test = rf_df_test['Class']

特征缩放

In [3]:
from sklearn.preprocessing import StandardScaler


# 创建一个函数来进行特征缩放
def Standard_Scaler(df, col_names):
    features = df[col_names]
    scaler = StandardScaler().fit(features.values)
    features = scaler.transform(features.values)
    df[col_names] = features
    return df


col_names = ['Amount']
X_train = Standard_Scaler(X_train, col_names)
X_test = Standard_Scaler(X_test, col_names)

创建5折分层交叉验证器

In [4]:
from sklearn.model_selection import StratifiedKFold

#确保每次数据划分都是相同的。
#通过创建一个KFold对象kf，并将cv=kf传递进去，而不是常用的cv=5，来实现这一点。

kf = StratifiedKFold(n_splits=5, shuffle=False)

这是交叉验证（Cross Validation）中召回率（Recall）的分数。
每个数字表示在一次交叉验证划分中模型的召回率，最后的平均值是所有折次召回率的平均分数。召回率衡量的是模型对正类样本的识别能力，分数越高，说明模型对正类（如欺诈、异常等）的检测能力越强。

## 1 Random Forest

### 1.1 基准（无过采样）

In [6]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100, random_state=13)
#cross_val_score(rf, X_train, y_train, cv=kf, scoring='recall')

In [7]:
score = cross_val_score(rf, X_train, y_train, cv=kf, scoring='recall', n_jobs=-1)
print("交叉验证召回率分数为: {}".format(score))
print("平均交叉验证召回率分数为: {}".format(score.mean()))

交叉验证召回率分数为: [0.84057971 0.72463768 0.76811594 0.79710145 0.77941176]
平均交叉验证召回率分数为: 0.7819693094629155


使用 GridSearchCV 进行超参数调优

In [8]:
from sklearn.model_selection import GridSearchCV

rf = RandomForestClassifier(random_state=13)
params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [4, 6, 10, 12]
}
grid_rf = GridSearchCV(rf, param_grid=params, cv=kf,
                       scoring='recall', n_jobs=-1).fit(X_train, y_train)

print('最佳参数:', grid_rf.best_params_)
print('最佳分数:', grid_rf.best_score_)

# 保存模型到文件
joblib.dump(grid_rf, '../models/random_forest_model.pkl')
# 加载模型
# loaded_model = joblib.load('random_forest_model.pkl')

最佳参数: {'max_depth': 12, 'n_estimators': 200}
最佳分数: 0.7907075873827791


['../models/random_forest_model.pkl']

In [9]:
y_pred = grid_rf.predict(X_test)

In [10]:
from sklearn.metrics import confusion_matrix, recall_score, precision_score, f1_score, accuracy_score

cm = confusion_matrix(y_test, y_pred)

rf_Recall = recall_score(y_test, y_pred)
rf_Precision = precision_score(y_test, y_pred)
rf_f1 = f1_score(y_test, y_pred)
rf_accuracy = accuracy_score(y_test, y_pred)

print(cm)

[[56862     1]
 [   26    73]]


In [11]:
ndf = [(rf_Recall, rf_Precision, rf_f1, rf_accuracy)]

rf_score = pd.DataFrame(data=ndf, columns=['Recall', 'Precision', 'F1 Score', 'Accuracy'])
rf_score.insert(0, 'Random Forest with', 'No Under/Oversampling')
rf_score

Unnamed: 0,Random Forest with,Recall,Precision,F1 Score,Accuracy
0,No Under/Oversampling,0.737374,0.986486,0.843931,0.999526


### 1.2 随机重采样

#### 1.2.1 Random Oversampling随机过采样

In [12]:
import warnings

warnings.filterwarnings("ignore", category=FutureWarning)

from imblearn.over_sampling import RandomOverSampler

# 定义过采样策略
ros = RandomOverSampler(random_state=42)
# 拟合并应用变换
X_over, y_over = ros.fit_resample(X_train, y_train)

print('正常样本:', y_over.value_counts()[0], '/', round(y_over.value_counts()[0] / len(y_over) * 100, 2), '% 的数据集')
print('欺诈样本:', y_over.value_counts()[1], '/', round(y_over.value_counts()[1] / len(y_over) * 100, 2), '% 的数据集')

正常样本: 199020 / 50.0 % 的数据集
欺诈样本: 199020 / 50.0 % 的数据集


In [13]:
from imblearn.pipeline import Pipeline, make_pipeline

random_overs_pipeline = make_pipeline(RandomOverSampler(random_state=42),
                                      RandomForestClassifier(n_estimators=100, random_state=13))

score2 = cross_val_score(random_overs_pipeline, X_train, y_train, scoring='recall', n_jobs=-1, cv=kf)
print("交叉验证召回率分数为: {}".format(score2))
print("平均交叉验证召回率分数为: {}".format(score2.mean()))

#cross_val_score(random_overs_pipeline, X_train, y_train, scoring='recall', cv=kf)

交叉验证召回率分数为: [0.84057971 0.71014493 0.76811594 0.8115942  0.77941176]
平均交叉验证召回率分数为: 0.7819693094629157


In [14]:
new_params = {'randomforestclassifier__' + key: params[key] for key in params}
grid_over_rf = GridSearchCV(random_overs_pipeline, param_grid=new_params, cv=kf, scoring='recall', n_jobs=-1,
                            return_train_score=True)
grid_over_rf.fit(X_train, y_train)
print('最佳参数:', grid_over_rf.best_params_)
print('最佳分数:', grid_over_rf.best_score_)

# 保存模型到文件
joblib.dump(grid_over_rf, '../models/random_oversampling_rf_model.pkl')

最佳参数: {'randomforestclassifier__max_depth': 4, 'randomforestclassifier__n_estimators': 50}
最佳分数: 0.8806479113384483


['../models/random_oversampling_rf_model.pkl']

In [15]:
y_pred = grid_over_rf.best_estimator_.named_steps['randomforestclassifier'].predict(X_test)

In [16]:
cm = confusion_matrix(y_test, y_pred)

over_rf_Recall = recall_score(y_test, y_pred)
over_rf_Precision = precision_score(y_test, y_pred)
over_rf_f1 = f1_score(y_test, y_pred)
over_rf_accuracy = accuracy_score(y_test, y_pred)

print(cm)

[[56605   258]
 [   16    83]]


In [17]:
ndf = [(over_rf_Recall, over_rf_Precision, over_rf_f1, over_rf_accuracy)]

over_rf_score = pd.DataFrame(data=ndf, columns=['Recall', 'Precision', 'F1 Score', 'Accuracy'])
over_rf_score.insert(0, 'Random Forest with', 'Random Oversampling')
over_rf_score

Unnamed: 0,Random Forest with,Recall,Precision,F1 Score,Accuracy
0,Random Oversampling,0.838384,0.243402,0.377273,0.99519


#### 1.2.2 Random Undersampling随机欠采样

In [18]:
from imblearn.under_sampling import RandomUnderSampler

# 定义欠采样策略
rus = RandomUnderSampler(random_state=42)

# 拟合并应用变换
X_under, y_under = rus.fit_resample(X_train, y_train)

print('正常样本:', y_under.value_counts()[0], '/', round(y_under.value_counts()[0] / len(y_under) * 100, 2),
      '% 的数据集')
print('欺诈样本:', y_under.value_counts()[1], '/', round(y_under.value_counts()[1] / len(y_under) * 100, 2),
      '% 的数据集')

正常样本: 344 / 50.0 % 的数据集
欺诈样本: 344 / 50.0 % 的数据集


对于欠采样后的数据集，我们只有 662 条记录，因此利用该技术并不是最好的主意。

### 1.3 SMOTE

In [19]:
from imblearn.over_sampling import SMOTE

smote_pipeline = make_pipeline(SMOTE(random_state=42),
                               RandomForestClassifier(n_estimators=100, random_state=13))
#cross_val_score(smote_pipeline, X_train, y_train, scoring='recall', cv=kf)

score3 = cross_val_score(smote_pipeline, X_train, y_train, scoring='recall', n_jobs=-1, cv=kf)
print("交叉验证召回率分数为: {}".format(score3))
print("平均交叉验证召回率分数为: {}".format(score3.mean()))

交叉验证召回率分数为: [0.86956522 0.8115942  0.84057971 0.85507246 0.79411765]
平均交叉验证召回率分数为: 0.8341858482523443


In [20]:
new_params = {'randomforestclassifier__' + key: params[key] for key in params}
smote_rf = GridSearchCV(smote_pipeline, param_grid=new_params, cv=kf, scoring='recall', n_jobs=-1,
                        return_train_score=True)
smote_rf.fit(X_train, y_train)
print('最佳参数:', smote_rf.best_params_)
print('最佳分数:', smote_rf.best_score_)

# 保存模型到文件
joblib.dump(smote_rf, '../models/smote_rf_model.pkl')


最佳参数: {'randomforestclassifier__max_depth': 6, 'randomforestclassifier__n_estimators': 100}
最佳分数: 0.8777493606138107


['../models/smote_rf_model.pkl']

In [21]:
y_pred = smote_rf.best_estimator_.named_steps['randomforestclassifier'].predict(X_test)

混淆矩阵

In [22]:
cm = confusion_matrix(y_test, y_pred)

smote_rf_Recall = recall_score(y_test, y_pred)
smote_rf_Precision = precision_score(y_test, y_pred)
smote_rf_f1 = f1_score(y_test, y_pred)
smote_rf_accuracy = accuracy_score(y_test, y_pred)

print(cm)

[[56637   226]
 [   17    82]]


In [23]:
ndf = [(smote_rf_Recall, smote_rf_Precision, smote_rf_f1, smote_rf_accuracy)]

smote_rf_score = pd.DataFrame(data=ndf, columns=['Recall', 'Precision', 'F1 Score', 'Accuracy'])
smote_rf_score.insert(0, 'Random Forest with', 'SMOTE Oversampling')
smote_rf_score

Unnamed: 0,Random Forest with,Recall,Precision,F1 Score,Accuracy
0,SMOTE Oversampling,0.828283,0.266234,0.402948,0.995734


### 1.4 Tomek Links欠采样

In [24]:
from imblearn.under_sampling import TomekLinks

# 定义欠采样方法
# tomekU = TomekLinks(sampling_strategy='auto', n_jobs=-1)
tomekU = TomekLinks()

# 拟合并应用变换
X_underT, y_underT = tomekU.fit_resample(X_train, y_train)

In [25]:
print('正常样本:', y_underT.value_counts()[0], '/', round(y_underT.value_counts()[0] / len(y_underT) * 100, 2),
      '% 的数据集')
print('欺诈样本:', y_underT.value_counts()[1], '/', round(y_underT.value_counts()[1] / len(y_underT) * 100, 2),
      '% 的数据集')

正常样本: 199011 / 99.83 % 的数据集
欺诈样本: 344 / 0.17 % 的数据集


### 1.5 结合 SMOTE 和 Tomek 链接

In [26]:
from imblearn.combine import SMOTETomek

SMOTETomek_pipeline = make_pipeline(SMOTETomek(tomek=TomekLinks(sampling_strategy='majority')),
                                    RandomForestClassifier(n_estimators=100, random_state=13))
#cross_val_score(smote_pipeline, X_train, y_train, scoring='recall', cv=kf)

GridSearchCV超参数调优

In [27]:
# # SMOTETomek_rf = SMOTETomek_pipeline
# # SMOTETomek_rf.fit(X_train, y_train)
# # 构建参数网格，注意参数名前缀
# new_params = {'randomforestclassifier__' + key: params[key] for key in params}
#
# # 网格搜索调优
# SMOTETomek_rf = GridSearchCV(
#     SMOTETomek_pipeline,
#     param_grid=new_params,
#     cv=kf,
#     scoring='recall',
#     n_jobs=-1,
#     return_train_score=True,
#     verbose=2           # 显示详细进度
# )
# SMOTETomek_rf.fit(X_train, y_train)
#
# print('最佳参数:', SMOTETomek_rf.best_params_)
# print('最佳分数:', SMOTETomek_rf.best_score_)
#
# # 保存模型到文件
# joblib.dump(SMOTETomek_rf, '../models/SMOTETomek_rf_model.pkl')

In [28]:
from sklearn.experimental import enable_halving_search_cv  # 启用实验功能
from sklearn.model_selection import HalvingGridSearchCV

SH_SMOTETomek_rf = HalvingGridSearchCV(
    SMOTETomek_pipeline,
    param_grid=new_params,
    cv=kf,
    scoring='recall',
    n_jobs=-1,
    verbose=2,          # 显示进度
    factor=2            # 每轮淘汰一半的参数组合
)
SH_SMOTETomek_rf.fit(X_train, y_train)

print('最佳参数:', SH_SMOTETomek_rf.best_params_)
print('最佳分数:', SH_SMOTETomek_rf.best_score_)

# 保存模型到文件
joblib.dump(SH_SMOTETomek_rf, '../models/SMOTETomek_rf_model.pkl')

n_iterations: 4
n_required_iterations: 4
n_possible_iterations: 4
min_resources_: 24920
max_resources_: 199364
aggressive_elimination: False
factor: 2
----------
iter: 0
n_candidates: 12
n_resources: 24920
Fitting 5 folds for each of 12 candidates, totalling 60 fits
----------
iter: 1
n_candidates: 6
n_resources: 49840
Fitting 5 folds for each of 6 candidates, totalling 30 fits
----------
iter: 2
n_candidates: 3
n_resources: 99680
Fitting 5 folds for each of 3 candidates, totalling 15 fits
----------
iter: 3
n_candidates: 2
n_resources: 199360
Fitting 5 folds for each of 2 candidates, totalling 10 fits
最佳参数: {'randomforestclassifier__max_depth': 4, 'randomforestclassifier__n_estimators': 100}
最佳分数: 0.8690537084398976


['../models/SMOTETomek_rf_model.pkl']

In [29]:
y_pred = SH_SMOTETomek_rf.predict(X_test)

In [30]:
cm = confusion_matrix(y_test, y_pred)

SMOTETomek_rf_Recall = recall_score(y_test, y_pred)
SMOTETomek_rf_Precision = precision_score(y_test, y_pred)
SMOTETomek_rf_f1 = f1_score(y_test, y_pred)
SMOTETomek_rf_accuracy = accuracy_score(y_test, y_pred)

print(cm)

[[56531   332]
 [   17    82]]


In [31]:
ndf = [(SMOTETomek_rf_Recall, SMOTETomek_rf_Precision, SMOTETomek_rf_f1, SMOTETomek_rf_accuracy)]

SMOTETomek_rf_score = pd.DataFrame(data=ndf, columns=['Recall', 'Precision', 'F1 Score', 'Accuracy'])
SMOTETomek_rf_score.insert(0, 'Random Forest with', 'SMOTE + Tomek')
SMOTETomek_rf_score

Unnamed: 0,Random Forest with,Recall,Precision,F1 Score,Accuracy
0,SMOTE + Tomek,0.828283,0.198068,0.319688,0.993873


### 1.6 Class weights in the models

In [32]:
# 如果选择 class_weight = "balanced"，
# 各类别的权重会根据其在数据中出现的频率自动调整，类别越少权重越大。

rfb = RandomForestClassifier(n_estimators=100, random_state=13, class_weight="balanced")

In [33]:
score5 = cross_val_score(rfb, X_train, y_train, cv=kf, scoring='recall', n_jobs=-1)
print("交叉验证召回率分数为: {}".format(score5))
print("平均交叉验证召回率分数为: {}".format(score5.mean()))

交叉验证召回率分数为: [0.8115942  0.66666667 0.75362319 0.8115942  0.75      ]
平均交叉验证召回率分数为: 0.7586956521739131


In [34]:
grid_rfb = GridSearchCV(rfb, param_grid=params, cv=kf,
                        scoring='recall', n_jobs=-1).fit(X_train, y_train)
# 保存模型到文件
joblib.dump(grid_rfb, '../models/Classweights_rf_model.pkl')

['../models/Classweights_rf_model.pkl']

In [35]:
y_pred = grid_rfb.predict(X_test)

In [36]:
cm = confusion_matrix(y_test, y_pred)

grid_rfb_Recall = recall_score(y_test, y_pred)
grid_rfb_Precision = precision_score(y_test, y_pred)
grid_rfb_f1 = f1_score(y_test, y_pred)
grid_rfb_accuracy = accuracy_score(y_test, y_pred)

print(cm)

[[56649   214]
 [   17    82]]


In [37]:
ndf = [(grid_rfb_Recall, grid_rfb_Precision, grid_rfb_f1, grid_rfb_accuracy)]

grid_rfb_score = pd.DataFrame(data=ndf, columns=['Recall', 'Precision', 'F1 Score', 'Accuracy'])
grid_rfb_score.insert(0, 'Random Forest with', 'Class weights')
grid_rfb_score

Unnamed: 0,Random Forest with,Recall,Precision,F1 Score,Accuracy
0,Class weights,0.828283,0.277027,0.41519,0.995945


性能对比

In [38]:
predictions = pd.concat([rf_score, over_rf_score, smote_rf_score, SMOTETomek_rf_score, grid_rfb_score],
                        ignore_index=True, sort=False)
predictions.sort_values(by=['Recall'], ascending=False)

Unnamed: 0,Random Forest with,Recall,Precision,F1 Score,Accuracy
1,Random Oversampling,0.838384,0.243402,0.377273,0.99519
3,SMOTE + Tomek,0.828283,0.198068,0.319688,0.993873
2,SMOTE Oversampling,0.828283,0.266234,0.402948,0.995734
4,Class weights,0.828283,0.277027,0.41519,0.995945
0,No Under/Oversampling,0.737374,0.986486,0.843931,0.999526


## 2 XGBoost

In [33]:
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, recall_score, precision_score, f1_score, accuracy_score

# 1. 创建XGBoost分类器
scale_pos_weight = len(y_train[y_train == 0]) / len(y_train[y_train == 1])
xgb_model = XGBClassifier(random_state=13, eval_metric='logloss', scale_pos_weight=scale_pos_weight)
# xgb = XGBClassifier(n_estimators=100, max_depth=6, learning_rate=0.1, random_state=13, eval_metric='logloss', scale_pos_weight=scale_pos_weight)

score_xgb_model = cross_val_score(xgb_model, X_train, y_train, cv=kf, scoring='recall', n_jobs=-1)
print("交叉验证召回率分数为: {}".format(score_xgb_model))
print("平均交叉验证召回率分数为: {}".format(score_xgb_model.mean()))

交叉验证召回率分数为: [0.8115942  0.8115942  0.82608696 0.82608696 0.79411765]
平均交叉验证召回率分数为: 0.8138959931798807


In [34]:
# 2. 训练模型
xgb_model.fit(X_train, y_train)
# 保存模型到文件
joblib.dump(xgb_model, '../models/xgb_model.pkl')

['../models/xgb_model.pkl']

In [35]:
y_pred = xgb_model.predict(X_test)

In [36]:
cm = confusion_matrix(y_test, y_pred)

xgb_recall = recall_score(y_test, y_pred)
xgb_precision = precision_score(y_test, y_pred)
xgb_f1 = f1_score(y_test, y_pred)
xgb_accuracy = accuracy_score(y_test, y_pred)

print(cm)

[[56852    11]
 [   22    77]]


In [37]:
ndf = [(xgb_recall, xgb_precision, xgb_f1, xgb_accuracy)]

xgb_score = pd.DataFrame(data=ndf, columns=['Recall', 'Precision', 'F1 Score', 'Accuracy'])
xgb_score.insert(0, 'Models', 'XGBoost')
xgb_score

Unnamed: 0,Models,Recall,Precision,F1 Score,Accuracy
0,XGBoost,0.777778,0.875,0.823529,0.999421


超参数调优

In [39]:
from sklearn.model_selection import GridSearchCV
# # 定义模型
# scale_pos_weight = len(y_train[y_train==0]) / len(y_train[y_train==1])
# xgb = XGBClassifier(random_state=13, eval_metric='logloss', scale_pos_weight=scale_pos_weight)
grid_xgb_model = XGBClassifier(random_state=13, eval_metric='logloss')

# 定义参数网格
params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [4, 6, 10],
    'learning_rate': [0.01, 0.1, 0.2]
}

# 创建GridSearchCV对象
grid_xgb = GridSearchCV(estimator=grid_xgb_model, param_grid=params, scoring='recall', cv=5, n_jobs=-1)

# 拟合模型
grid_xgb.fit(X_train, y_train)

# 输出最佳参数和分数
print('最佳参数:', grid_xgb.best_params_)
print('最佳分数:', grid_xgb.best_score_)

# 保存模型到文件
joblib.dump(grid_xgb, '../models/grid_xgb_model.pkl')

最佳参数: {'learning_rate': 0.2, 'max_depth': 6, 'n_estimators': 200}
最佳分数: 0.8080988917306053


['../models/grid_xgb_model.pkl']

In [40]:
y_pred = grid_xgb.predict(X_test)

In [41]:
cm = confusion_matrix(y_test, y_pred)

grid_xgb_recall = recall_score(y_test, y_pred)
grid_xgb_precision = precision_score(y_test, y_pred)
grid_xgb_f1 = f1_score(y_test, y_pred)
grid_xgb_accuracy = accuracy_score(y_test, y_pred)

print(cm)

[[56860     3]
 [   24    75]]


In [42]:
ndf = [(grid_xgb_recall, grid_xgb_precision, grid_xgb_f1, grid_xgb_accuracy)]

grid_xgb_score = pd.DataFrame(data=ndf, columns=['Recall', 'Precision', 'F1 Score', 'Accuracy'])
grid_xgb_score.insert(0, 'Models', 'XGBoost_GridSearchCV')
grid_xgb_score

Unnamed: 0,Models,Recall,Precision,F1 Score,Accuracy
0,XGBoost_GridSearchCV,0.757576,0.961538,0.847458,0.999526


### SMOTE and Tomek Links

In [53]:
# 忽略FutureWarning警告
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
from imblearn.over_sampling import SMOTE
# 使用SMOTE进行过采样，平衡训练集
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

from imblearn.under_sampling import TomekLinks

# 使用TomekLinks进行欠采样（调整顺序后先过采样再欠采样）
tomek = TomekLinks()
X_train_resampled, y_train_resampled = tomek.fit_resample(X_train_resampled, y_train_resampled)

In [54]:
st_xgb_model = XGBClassifier(scale_pos_weight=scale_pos_weight)  # 适应不平衡数据
st_xgb_model.fit(X_train_resampled, y_train_resampled)

score_st_xgb = cross_val_score(st_xgb_model, X_train, y_train, cv=kf, scoring='recall', n_jobs=-1)
print("交叉验证召回率分数为: {}".format(score_xgb))
print("平均交叉验证召回率分数为: {}".format(score_xgb.mean()))

交叉验证召回率分数为: [0.8115942  0.8115942  0.82608696 0.82608696 0.79411765]
平均交叉验证召回率分数为: 0.8138959931798807


In [55]:
params = {'n_estimators': [100, 200], 'max_depth': [3, 6, 9]}
st_xgb_model = GridSearchCV(xgb.XGBClassifier(), param_grid=params, cv=5)
st_xgb_model.fit(X_train_resampled, y_train_resampled)

# 输出最佳参数和分数
print('最佳参数:', st_xgb_model.best_params_)
print('最佳分数:', st_xgb_model.best_score_)

joblib.dump(st_xgb_model, '../models/st_xgb_model.pkl')

最佳参数: {'max_depth': 6, 'n_estimators': 200}
最佳分数: 0.9998718099731893


['../models/st_xgb_model.pkl']

In [56]:
y_pred = st_xgb_model.predict(X_test)

In [57]:
cm = confusion_matrix(y_test, y_pred)

st_xgb_recall = recall_score(y_test, y_pred)
st_xgb_precision = precision_score(y_test, y_pred)
st_xgb_f1 = f1_score(y_test, y_pred)
st_xgb_accuracy = accuracy_score(y_test, y_pred)

print(cm)

[[56848    15]
 [   21    78]]


In [58]:
ndf = [(st_xgb_recall, st_xgb_precision, st_xgb_f1, st_xgb_accuracy)]

st_xgb_score = pd.DataFrame(data=ndf, columns=['Recall', 'Precision', 'F1 Score', 'Accuracy'])
st_xgb_score.insert(0, 'Models', 'XGBoost_st')
st_xgb_score

Unnamed: 0,Models,Recall,Precision,F1 Score,Accuracy
0,XGBoost_st,0.787879,0.83871,0.8125,0.999368


## LightGBM

In [48]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_score

# 1. 创建 LightGBM 分类器
lgbm = LGBMClassifier(n_estimators=100, random_state=13)

# 2. 交叉验证
score_lgbm = cross_val_score(lgbm, X_train, y_train, cv=kf, scoring='recall', n_jobs=-1)
print("交叉验证召回率分数为: {}".format(score_lgbm))
print("平均交叉验证召回率分数为: {}".format(score_lgbm.mean()))

# 3. 拟合模型
lgbm.fit(X_train, y_train)

# 4. 保存模型
joblib.dump(lgbm, '../models/lgbm_model.pkl')

交叉验证召回率分数为: [0.60869565 0.50724638 0.52173913 0.37681159 0.5       ]
平均交叉验证召回率分数为: 0.5028985507246377
[LightGBM] [Info] Number of positive: 344, number of negative: 199020
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009685 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7650
[LightGBM] [Info] Number of data points in the train set: 199364, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.001725 -> initscore=-6.360519
[LightGBM] [Info] Start training from score -6.360519


['../models/lgbm_model.pkl']

In [49]:
y_pred = lgbm.predict(X_test)

In [50]:
cm = confusion_matrix(y_test, y_pred)

lgbm_recall = recall_score(y_test, y_pred)
lgbm_precision = precision_score(y_test, y_pred)
lgbm_f1 = f1_score(y_test, y_pred)
lgbm_accuracy = accuracy_score(y_test, y_pred)

print(cm)

[[56667   196]
 [   57    42]]


In [51]:
ndf = [(lgbm_recall, lgbm_precision, lgbm_f1, lgbm_accuracy)]

lgbm_score = pd.DataFrame(data=ndf, columns=['Recall', 'Precision', 'F1 Score', 'Accuracy'])
lgbm_score.insert(0, 'Models', 'LightGBM')
lgbm_score

Unnamed: 0,Models,Recall,Precision,F1 Score,Accuracy
0,LightGBM,0.424242,0.176471,0.249258,0.995558


In [52]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV

params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [4, 6, 10],
    'learning_rate': [0.01, 0.1, 0.2]
}

lgbm = LGBMClassifier(random_state=13)
grid_lgbm = GridSearchCV(lgbm, param_grid=params, cv=kf, scoring='recall', n_jobs=-1)
grid_lgbm.fit(X_train, y_train)

print('最佳参数:', grid_lgbm.best_params_)
print('最佳分数:', grid_lgbm.best_score_)

# 保存模型
joblib.dump(grid_lgbm, '../models/grid_lgbm_model.pkl')

[LightGBM] [Info] Number of positive: 344, number of negative: 199020
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008002 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7650
[LightGBM] [Info] Number of data points in the train set: 199364, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.001725 -> initscore=-6.360519
[LightGBM] [Info] Start training from score -6.360519
最佳参数: {'learning_rate': 0.01, 'max_depth': 4, 'n_estimators': 200}
最佳分数: 0.764535379369139


['../models/grid_lgbm_model.pkl']

In [53]:
y_pred= grid_lgbm.predict(X_test)

In [54]:
cm = confusion_matrix(y_test, y_pred)

grid_lgbm_recall = recall_score(y_test, y_pred)
grid_lgbm_precision = precision_score(y_test, y_pred)
grid_lgbm_f1 = f1_score(y_test, y_pred)
grid_lgbm_accuracy = accuracy_score(y_test, y_pred)

print(cm)

[[56854     9]
 [   28    71]]


In [55]:
ndf = [(grid_lgbm_recall, grid_lgbm_precision, grid_lgbm_f1, grid_lgbm_accuracy)]

grid_lgbm_score = pd.DataFrame(data=ndf, columns=['Recall', 'Precision', 'F1 Score', 'Accuracy'])
grid_lgbm_score.insert(0, 'Models', 'LightGBM_GridSearchCV')
grid_lgbm_score

Unnamed: 0,Models,Recall,Precision,F1 Score,Accuracy
0,LightGBM_GridSearchCV,0.717172,0.8875,0.793296,0.99935
