# 导入库

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

In [2]:
# Configure matplotlib so Chinese text renders correctly:
# use the SimHei font for sans-serif text and keep the minus sign drawable with that font.
plt.rcParams.update({
    'font.sans-serif': ['simHei'],
    'axes.unicode_minus': False,
})

In [3]:
# Jupyter/IPython magic (not Python code): selects how matplotlib figures are shown.
# `inline` embeds figures in the notebook output (normally the default);
# `tk` would open them in a separate window instead.
%matplotlib inline

# 导入数据

In [4]:
# Load the prepared feature table (path is relative to the notebook's working directory).
dataset = pd.read_csv('./datas/data.csv',encoding='utf8')
# Show the first row as a quick sanity check of the columns.
dataset.head(1)

Unnamed: 0,label,from_163,from_tsinghua,from_126,from_yahoo,from_12,from_21,from_tom,from_cernet,from_sohu,...,20,21,22,23,24,25,26,27,28,29
0,1,0,0,0,0,0,0,0,0,0,...,-0.011931,0.040437,-0.004632,0.01383,0.051182,-0.002499,0.014599,-0.024562,0.017097,-0.102966


# 模型选择

## 划分数据集

In [5]:
# Feature matrix: every column except the target.
# 'Unnamed: 0' is the name pandas gives to a header-less first CSV column; here it looks
# like a saved row index (first row = 0), which carries no signal and must not be used as a
# feature. errors='ignore' keeps this cell working if the CSV is re-exported without it.
data = dataset.drop(columns=['label']).drop(columns=['Unnamed: 0'], errors='ignore')
# Target vector.
ydata = dataset['label']

In [6]:
# Hold out 20% of the rows for testing; random_state is fixed so the split is repeatable.
train_x,test_x,train_y,test_y = train_test_split(data, ydata, test_size=0.2, random_state=10)

In [7]:
# Report the train/test shapes: features first, then labels.
for train_part, test_part in ((train_x, test_x), (train_y, test_y)):
    print(train_part.shape, test_part.shape)

(51696, 42) (12924, 42)
(51696,) (12924,)


## 导入库

In [8]:
from sklearn.metrics import precision_score,recall_score,f1_score,confusion_matrix # 精确率、召回率、f1、混淆矩阵

## KNN模型

In [9]:
from sklearn.neighbors import KNeighborsClassifier

In [10]:
%%time
# K-nearest-neighbours classifier with scikit-learn's default hyper-parameters.
knn = KNeighborsClassifier()
# fit() returns the estimator itself, so `model` refers to the same fitted object as `knn`.
model = knn.fit(train_x,train_y)

Wall time: 1.33 s


In [11]:
# Predict test-set labels with the model fitted in the previous cell (KNN).
y_hat = model.predict(test_x)

In [12]:
# Evaluate the KNN predictions: precision, recall and F1, each printed to 4 decimals.
for template, scorer in (
    ("精确率：%.4f", precision_score),
    ("召回率：%.4f", recall_score),
    ("f1值：%.4f", f1_score),
):
    print(template % scorer(test_y, y_hat))

精确率：0.9772
召回率：0.9866
f1值：0.9819


## LR模型

In [13]:
from sklearn.linear_model import LogisticRegression

In [14]:
%%time
# Logistic regression using the liblinear solver; all other hyper-parameters are defaults.
lr = LogisticRegression(solver='liblinear')
# `model` is rebound here: from now on it refers to the fitted logistic regression.
model = lr.fit(train_x,train_y)

Wall time: 993 ms


In [15]:
# Predict test-set labels with the model fitted in the previous cell (logistic regression).
y_hat = model.predict(test_x)

In [16]:
# Evaluate the logistic-regression predictions: precision, recall and F1 (4 decimals).
for template, scorer in (
    ("精确率：%.4f", precision_score),
    ("召回率：%.4f", recall_score),
    ("f1值：%.4f", f1_score),
):
    print(template % scorer(test_y, y_hat))

精确率：0.9721
召回率：0.9848
f1值：0.9784


## 朴素贝叶斯（Naive Bayes）模型

In [17]:
from sklearn.naive_bayes import BernoulliNB

In [18]:
%%time
# Bernoulli naive Bayes with default hyper-parameters.
# NOTE(review): BernoulliNB thresholds every feature at its default `binarize=0.0`, so the
# continuous columns are reduced to their sign — confirm this is intended.
nb = BernoulliNB()
# `model` is rebound here: from now on it refers to the fitted naive Bayes classifier.
model = nb.fit(train_x,train_y)

Wall time: 93.8 ms


In [19]:
# Predict test-set labels with the model fitted in the previous cell (naive Bayes).
y_hat = model.predict(test_x)

In [20]:
# Evaluate the naive-Bayes predictions: precision, recall and F1 (4 decimals).
for template, scorer in (
    ("精确率：%.4f", precision_score),
    ("召回率：%.4f", recall_score),
    ("f1值：%.4f", f1_score),
):
    print(template % scorer(test_y, y_hat))

精确率：0.9555
召回率：0.9871
f1值：0.9710


## SVM模型

In [21]:
from sklearn.svm import SVC

In [22]:
%%time
# Support-vector classifier; gamma='scale' is set explicitly, everything else is default.
# This is by far the slowest fit in the notebook (see the wall time below).
svc = SVC(gamma='scale')
# `model` is rebound here: from now on it refers to the fitted SVC.
model = svc.fit(train_x,train_y)

Wall time: 51.9 s


In [23]:
# Predict test-set labels with the model fitted in the previous cell (SVC).
y_hat = model.predict(test_x)

In [24]:
# Evaluate the SVC predictions: precision, recall and F1 (4 decimals).
for template, scorer in (
    ("精确率：%.4f", precision_score),
    ("召回率：%.4f", recall_score),
    ("f1值：%.4f", f1_score),
):
    print(template % scorer(test_y, y_hat))

精确率：0.9603
召回率：0.9896
f1值：0.9747


## gbdt模型

In [25]:
from sklearn.ensemble import GradientBoostingClassifier

In [26]:
%%time
gbdt = GradientBoostingClassifier()
gbdt.fit(train_x,train_y)

Wall time: 13.8 s


GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [27]:
# Predict test-set labels with the fitted gradient-boosting model.
y_hat = gbdt.predict(test_x)

In [28]:
# Evaluate the GBDT predictions: precision, recall and F1 (4 decimals).
for template, scorer in (
    ("精确率：%.4f", precision_score),
    ("召回率：%.4f", recall_score),
    ("f1值：%.4f", f1_score),
):
    print(template % scorer(test_y, y_hat))

精确率：0.9835
召回率：0.9923
f1值：0.9879


In [29]:
# Confusion matrix of the GBDT predictions (rows: true labels, columns: predicted labels).
gbdt_confusion = confusion_matrix(test_y, y_hat)
print('混淆矩阵：\n', gbdt_confusion)

混淆矩阵：
 [[4209  143]
 [  66 8506]]


# 模型调参

# 模型保存

In [30]:
import joblib
import os

In [31]:
# Make sure the output directory exists. exist_ok=True replaces the separate
# exists()/mkdir() pair, so there is no window between the check and the creation.
os.makedirs('./model', exist_ok=True)

# Persist the fitted GBDT model; joblib.dump returns the list of files written,
# which is displayed as the cell output.
joblib.dump(gbdt,'./model/gbdt.pkl')

['./model/gbdt.pkl']