In [1]:
# 根据观察，LR、SVM、XGBoost效果较好。选用这三种模型训练并调参

In [1]:
import xgboost as xgb
import pickle
import pandas as pd
import warnings

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from sklearn.cross_validation import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV

warnings.filterwarnings("ignore")



In [34]:
data = pickle.load(open('temp_file//data_processing.pkl','rb'))
train_data = data.drop(labels='content',axis=1)

features = train_data.drop(labels='label',axis=1)
labels = train_data['label']

# 采样

In [6]:
# 有放回采样，选8000个样本
def sampling_with_return(data): 
    seq = np.random.choice(range(len(data)), size=8000, replace=True)
    return data.iloc[seq,:]

# 分层，从数据中选取正负标签各8000个样本，其中70%用作训练，30%用作测试
def layer_sampling(data):
    data_0, data_1 = data[data['label'] == 0], data[data['label'] == 1]
    data = pd.concat((sampling_with_return(data_0), sampling_with_return(data_1)))
    features = data.drop('label',axis=1)
    labels = data['label']
    return train_test_split(features, labels, test_size=0.3)

# LR

In [6]:
C_param_range = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
f1_table = pd.DataFrame(columns=['c_parameter', 'f1_score'])
f1_table['c_parameter'] = C_param_range

In [7]:
train_data = data.drop(labels = 'content', axis=1)

X_train, X_test, y_train, y_test = layer_sampling(train_data)
j = 0
for i in C_param_range:
    lr = LogisticRegression(C=i)
    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_test)
    f1_table.iloc[j,1] = f1_score(y_test, y_pred)
    j += 1

In [8]:
f1_table

Unnamed: 0,c_parameter,f1_score
0,0.001,0.887848
1,0.01,0.942943
2,0.1,0.959037
3,1.0,0.965691
4,10.0,0.968908
5,100.0,0.969379
6,1000.0,0.97065


## 选取最优参数

In [76]:
parameters = {'C' : [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

In [77]:
model_lr = LogisticRegression(penalty='l2')
lr_cv = GridSearchCV(model_lr, param_grid=parameters, cv=3)
lr_cv.fit(X_train, y_train)
lr_best_parameters = lr_cv.best_estimator_.get_params()

In [16]:
y_pred = lr_cv.predict(X_test)
f1_score(y_test, y_pred)

0.9706498951781971

## 使用有放回采样创建多个分类器

In [80]:
lr_best_parameters

{'C': 100,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'max_iter': 100,
 'multi_class': 'ovr',
 'n_jobs': 1,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'liblinear',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [78]:
model_lr = LogisticRegression(penalty='l2',C=lr_best_parameters['C'])
lr_estimators = {}
for i in range(3):
    X_train, X_test, y_train, y_test = layer_sampling(train_data)
    model_lr.fit(X_train, y_train)
    lr_estimators[i] = model_lr
    y_pred = lr_estimators[i].predict(X_test)
    print(f1_score(y_test, y_pred))

0.9670283806343906
0.9694007649808755
0.9717562115098747


In [84]:
pred_labels = []
for i in lr_estimators:
    pred_labels.append(lr_estimators[i].predict(features))
y_hat = list(map(lambda x:1 if x>1 else 0,sum(pred_labels)))

In [85]:
f1_score(labels, y_hat)

0.9787572301030794

# SVM

## 选取最优参数

In [48]:
model_svc = SVC()
parameters = {'C': [1e-3, 1e-2, 1e-1, 1, 10, 100, 1000], 'gamma': [0.001, 0.0001]}
X_train, X_test, y_train, y_test = layer_sampling(train_data)
svc_cv = GridSearchCV(model_svc,param_grid=parameters, scoring='f1', cv=3, n_jobs=4)
svc_cv.fit(X_train, y_train)
y_pred = svc_cv.predict(X_test)
print(f1_score(y_test, y_pred))

0.9848421052631579


In [49]:
best_parameters = svc_cv.best_estimator_.get_params()

In [50]:
best_parameters

{'C': 1000,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 0.001,
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

## 通过有放回采样训练多个模型

In [87]:
model_svc = SVC(C=best_parameters['C'], gamma=best_parameters['gamma'])
svc_estimators = {}
for i in range(3):
    X_train, X_test, y_train, y_test = layer_sampling(train_data)
    model_svc.fit(X_train, y_train)
    svc_estimators[i] = model_svc
    y_pred = svc_estimators[i].predict(X_test)
    print(f1_score(y_test, y_pred))

0.9823751573646664
0.9812126387702818
0.9828618624819327


In [88]:
pred_labels = []
for i in svc_estimators:
    pred_labels.append(svc_estimators[i].predict(features))
y_hat = list(map(lambda x:1 if x>1 else 0,sum(pred_labels)))
print(f1_score(labels, y_hat))

0.9877510709796578


# XGBoost

## 最佳迭代次数

In [9]:
parameters = {'n_estimators': [400, 500, 600, 700, 800]}
other_params = {
    'learning_rate': 0.1,
    'n_estimators': 500,
    'max_depth': 5,
    'min_child_weight': 1,
    'seed': 0,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'gamma': 0,
    'reg_alpha': 0,
    'reg_lambda': 1
}
model_xgb = xgb.XGBClassifier(**other_params)

In [10]:
X_train, X_test, y_train, y_test = layer_sampling(train_data)
xgb_cv = GridSearchCV(model_xgb, param_grid=parameters, scoring='f1', cv=3, n_jobs=4)
xgb_cv.fit(X_train, y_train)
y_pred = xgb_cv.predict(X_test)
print(f1_score(y_test, y_pred))

0.9786789297658862


In [11]:
xgb_cv.best_estimator_.get_params()

{'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': 1,
 'colsample_bytree': 0.8,
 'gamma': 0,
 'learning_rate': 0.1,
 'max_delta_step': 0,
 'max_depth': 5,
 'min_child_weight': 1,
 'missing': None,
 'n_estimators': 700,
 'n_jobs': 1,
 'nthread': None,
 'objective': 'binary:logistic',
 'random_state': 0,
 'reg_alpha': 0,
 'reg_lambda': 1,
 'scale_pos_weight': 1,
 'seed': 0,
 'silent': True,
 'subsample': 0.8}

最优迭代次数为：700

## min_child_weight和max_depth

In [12]:
parameters = {'max_depth': [3, 4, 5, 6, 7, 8, 9, 10], 'min_child_weight': [1, 2, 3, 4, 5, 6]}
other_params = {
    'learning_rate': 0.1,
    'n_estimators': 700,
    'max_depth': 5,
    'min_child_weight': 1,
    'seed': 0,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'gamma': 0,
    'reg_alpha': 0,
    'reg_lambda': 1
}
model_xgb = xgb.XGBClassifier(**other_params)

In [13]:
xgb_cv = GridSearchCV(model_xgb, param_grid=parameters, scoring='f1', cv=3, n_jobs=4)
xgb_cv.fit(X_train, y_train)
y_pred = xgb_cv.predict(X_test)
print(f1_score(y_test, y_pred))

0.9809207797594359


In [14]:
xgb_cv.best_estimator_.get_params()

{'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': 1,
 'colsample_bytree': 0.8,
 'gamma': 0,
 'learning_rate': 0.1,
 'max_delta_step': 0,
 'max_depth': 4,
 'min_child_weight': 1,
 'missing': None,
 'n_estimators': 700,
 'n_jobs': 1,
 'nthread': None,
 'objective': 'binary:logistic',
 'random_state': 0,
 'reg_alpha': 0,
 'reg_lambda': 1,
 'scale_pos_weight': 1,
 'seed': 0,
 'silent': True,
 'subsample': 0.8}

最优min_child_weight为1，max_depth为4

## gamma

In [18]:
parameters = {'gamma': [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6]}
other_params = {
    'learning_rate': 0.1,
    'n_estimators': 700,
    'max_depth': 4,
    'min_child_weight': 1,
    'seed': 0,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'gamma': 0,
    'reg_alpha': 0,
    'reg_lambda': 1
}
model_xgb = xgb.XGBClassifier(**other_params)

In [19]:
xgb_cv = GridSearchCV(model_xgb, param_grid=parameters, scoring='f1', cv=3, n_jobs=4)
xgb_cv.fit(X_train, y_train)
y_pred = xgb_cv.predict(X_test)
print(f1_score(y_test, y_pred))

0.980261790982755


In [20]:
xgb_cv.best_estimator_.get_params()

{'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': 1,
 'colsample_bytree': 0.8,
 'gamma': 0.1,
 'learning_rate': 0.1,
 'max_delta_step': 0,
 'max_depth': 4,
 'min_child_weight': 1,
 'missing': None,
 'n_estimators': 700,
 'n_jobs': 1,
 'nthread': None,
 'objective': 'binary:logistic',
 'random_state': 0,
 'reg_alpha': 0,
 'reg_lambda': 1,
 'scale_pos_weight': 1,
 'seed': 0,
 'silent': True,
 'subsample': 0.8}

最优gamma为0.1

## subsample和colsample_bytree

In [21]:
parameters = {'subsample': [0.6, 0.7, 0.8, 0.9], 'colsample_bytree': [0.6, 0.7, 0.8, 0.9]}
other_params = {
    'learning_rate': 0.1,
    'n_estimators': 700,
    'max_depth': 4,
    'min_child_weight': 1,
    'seed': 0,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'gamma': 0.1,
    'reg_alpha': 0,
    'reg_lambda': 1
}
model_xgb = xgb.XGBClassifier(**other_params)
xgb_cv = GridSearchCV(model_xgb, param_grid=parameters, scoring='f1', cv=3, n_jobs=4)
xgb_cv.fit(X_train, y_train)
y_pred = xgb_cv.predict(X_test)
print(f1_score(y_test, y_pred))

0.9817427385892116


In [22]:
xgb_cv.best_estimator_.get_params()

{'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': 1,
 'colsample_bytree': 0.8,
 'gamma': 0.1,
 'learning_rate': 0.1,
 'max_delta_step': 0,
 'max_depth': 4,
 'min_child_weight': 1,
 'missing': None,
 'n_estimators': 700,
 'n_jobs': 1,
 'nthread': None,
 'objective': 'binary:logistic',
 'random_state': 0,
 'reg_alpha': 0,
 'reg_lambda': 1,
 'scale_pos_weight': 1,
 'seed': 0,
 'silent': True,
 'subsample': 0.6}

最优subsample为0.6，colsample_bytree为0.8

## reg_alpha和reg_lambda

In [23]:
parameters = {'reg_alpha': [0.05, 0.1, 1, 2, 3], 'reg_lambda': [0.05, 0.1, 1, 2, 3]}
other_params = {
    'learning_rate': 0.1,
    'n_estimators': 700,
    'max_depth': 4,
    'min_child_weight': 1,
    'seed': 0,
    'subsample': 0.6,
    'colsample_bytree': 0.8,
    'gamma': 0.1,
    'reg_alpha': 0,
    'reg_lambda': 1
}
model_xgb = xgb.XGBClassifier(**other_params)
xgb_cv = GridSearchCV(model_xgb, param_grid=parameters, scoring='f1', cv=3, n_jobs=4)
xgb_cv.fit(X_train, y_train)
y_pred = xgb_cv.predict(X_test)
print(f1_score(y_test, y_pred))

0.9827836548433935


In [24]:
xgb_cv.best_estimator_.get_params()

{'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': 1,
 'colsample_bytree': 0.8,
 'gamma': 0.1,
 'learning_rate': 0.1,
 'max_delta_step': 0,
 'max_depth': 4,
 'min_child_weight': 1,
 'missing': None,
 'n_estimators': 700,
 'n_jobs': 1,
 'nthread': None,
 'objective': 'binary:logistic',
 'random_state': 0,
 'reg_alpha': 0.1,
 'reg_lambda': 0.05,
 'scale_pos_weight': 1,
 'seed': 0,
 'silent': True,
 'subsample': 0.6}

最优reg_alpha为0.1，reg_lambda为0.05

## learning_rate

In [27]:
parameters = {'learning_rate': [0.1, 0.2, 0.3, 0.5]}
other_params = {
    'learning_rate': 0.1,
    'n_estimators': 700,
    'max_depth': 4,
    'min_child_weight': 1,
    'seed': 0,
    'subsample': 0.6,
    'colsample_bytree': 0.8,
    'gamma': 0.1,
    'reg_alpha': 0.1,
    'reg_lambda': 0.05
}
model_xgb = xgb.XGBClassifier(**other_params)
xgb_cv = GridSearchCV(model_xgb, param_grid=parameters, scoring='f1', cv=3, n_jobs=4)
xgb_cv.fit(X_train, y_train)
y_pred = xgb_cv.predict(X_test)
print(f1_score(y_test, y_pred))

0.9821798590965602


In [28]:
xgb_cv.best_estimator_.get_params()

{'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': 1,
 'colsample_bytree': 0.8,
 'gamma': 0.1,
 'learning_rate': 0.2,
 'max_delta_step': 0,
 'max_depth': 4,
 'min_child_weight': 1,
 'missing': None,
 'n_estimators': 700,
 'n_jobs': 1,
 'nthread': None,
 'objective': 'binary:logistic',
 'random_state': 0,
 'reg_alpha': 0.1,
 'reg_lambda': 0.05,
 'scale_pos_weight': 1,
 'seed': 0,
 'silent': True,
 'subsample': 0.6}

最优的learning_rate为0.2

In [29]:
other_params = {'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': 1,
 'colsample_bytree': 0.8,
 'gamma': 0.1,
 'learning_rate': 0.2,
 'max_delta_step': 0,
 'max_depth': 4,
 'min_child_weight': 1,
 'missing': None,
 'n_estimators': 700,
 'n_jobs': 1,
 'nthread': None,
 'objective': 'binary:logistic',
 'random_state': 0,
 'reg_alpha': 0.1,
 'reg_lambda': 0.05,
 'scale_pos_weight': 1,
 'seed': 0,
 'silent': True,
 'subsample': 0.6}

## 创建多个分类器

In [32]:
model_xgb = xgb.XGBClassifier(**other_params)
xgb_estimators = {}
for i in range(3):
    X_train, X_test, y_train, y_test = layer_sampling(train_data)
    model_xgb.fit(X_train, y_train)
    xgb_estimators[i] = model_xgb
    y_pred = xgb_estimators[i].predict(X_test)
    print(f1_score(y_test, y_pred))

0.9754575707154742
0.9787690328114946
0.976792374637381


In [35]:
pred_labels = []
for i in xgb_estimators:
    pred_labels.append(xgb_estimators[i].predict(features))
y_hat = list(map(lambda x:1 if x>1 else 0,sum(pred_labels)))
print(f1_score(labels, y_hat))

0.9814279445548644


# 最终模型

观察可知SVM分类器效果最好

In [36]:
model_svc = SVC()
parameters = {'C': [1e-3, 1e-2, 1e-1, 1, 10, 100, 1000], 'gamma': [0.001, 0.0001]}
X_train, X_test, y_train, y_test = layer_sampling(train_data)
svc_cv = GridSearchCV(model_svc,param_grid=parameters, cv=3)
svc_cv.fit(X_train, y_train)
y_pred = svc_cv.predict(X_test)
print(f1_score(y_test, y_pred))

0.9780839073262366


In [37]:
svc_cv.best_estimator_.get_params()

{'C': 1000,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 0.001,
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [40]:
model_svc = SVC(C=1000, gamma=0.001)
svc_estimators = {}
for i in range(3):
    X_train, X_test, y_train, y_test = layer_sampling(train_data)
    model_svc.fit(X_train, y_train)
    svc_estimators[i] = model_svc
    y_pred = svc_estimators[i].predict(X_test)
    print(f1_score(y_test, y_pred))

0.9853372434017595
0.9838065194532071
0.9851681637768958


In [41]:
pred_labels = []
for i in svc_estimators:
    pred_labels.append(svc_estimators[i].predict(features))
y_hat = list(map(lambda x:1 if x>1 else 0,sum(pred_labels)))
print(f1_score(labels, y_hat))

0.9870525025195305


In [58]:
pickle.dump(y_hat, open('temp_file//predict_label.pkl', 'wb'))
predict_label = pickle.load(open('temp_file//predict_label.pkl', 'rb'))

In [136]:
train_data['error'] = abs(train_data['label'] - predict_label)
suspect_news = train_data[(train_data['label']==0)&(train_data['error']==1)]

以下新闻可能抄袭了新华社：

In [138]:
data.loc[suspect_news.index]

Unnamed: 0,label,content,0,1,2,3,4,5,6,7,...,40,41,42,43,44,45,46,47,48,49
138,0,今天是父亲节\r\n你的欢乐悲喜\r\n你的一点点变化\r\n都躲不过老爸的眼睛\r\n但是...,0.416698,-1.195107,2.022957,1.661896,-1.445882,1.335812,0.327305,0.853628,...,1.617664,-0.118533,-0.899242,-0.201349,1.230456,1.628274,1.179396,-0.435994,-0.011901,-0.16552
422,0,2017年6月15日，以“奥林匹克，瞬间的永恒”为主题的“2017北京奥林匹克博览会”在...,0.074884,-0.752767,0.947974,0.323615,-0.291771,1.681335,0.069319,0.695903,...,1.768988,-0.173342,1.028629,4.4e-05,1.328848,0.865434,0.853318,2.018956,1.233326,-0.003847
520,0,网易轻松一刻（公众号：qingsong_163）出品\r\n今日之声：用声音传递最有价值的新...,0.516681,-1.239842,1.982434,0.614176,-0.734965,1.25413,-0.243069,1.493086,...,1.856146,0.077371,-1.053699,0.036318,0.436944,0.756814,1.372594,0.714141,0.407368,-0.394126
522,0,吐槽不停，欢乐不止，新浪NBA神吐槽栏目继续登场！威少尬舞，网友吐槽：就这水平，赵四mv...,0.693178,-0.61536,1.314384,0.616504,-0.796433,1.178611,-0.0509,1.061597,...,1.584725,-0.429285,-0.510083,0.601283,1.273728,0.798503,1.654151,-0.082907,0.449503,0.171663
672,0,原标题：哈尔滨机动车互联网选号系统受到黑客攻击暂停运行\r\n 记者从哈尔滨市公安交通...,1.515952,-1.026377,2.329614,0.236298,-1.963753,2.274494,-1.10752,0.492177,...,2.588162,-0.258863,-0.094241,-0.085798,-0.094244,1.816471,0.831,0.703482,0.121927,-0.799001
710,0,（原标题：哈尔滨机动车互联网选号系统受到黑客攻击暂停运行）\r\n央视网消息?记者从哈尔滨市...,1.530186,-1.218463,2.466769,0.220918,-2.108215,2.367331,-1.195861,0.460887,...,2.658412,-0.177465,-0.063191,-0.00933,-0.100314,1.929026,0.732328,0.688947,0.068276,-0.801674
934,0,香港历史博物馆开箱布置展品“铜镀金双龙钮云龙纹编钟”。\r\n重点展品《崇庆皇太后万寿图》卷...,0.313116,-1.603517,0.822654,0.853443,-0.818287,0.919799,-0.279729,0.576061,...,1.300115,0.639066,-0.578598,0.691448,0.200547,0.774392,0.567106,2.717251,1.606267,-1.185139
1411,0,大家好！今天是周二啦~最近的天气都好好呀！阳光明媚的，但还是有点热！大家出门上班要注意防...,1.129159,-0.87489,1.624436,0.752461,-0.978493,1.470849,-0.098398,1.2969,...,1.48131,-0.173709,-0.52002,0.627766,0.94867,0.930126,1.612254,0.423194,0.64666,-0.240219
1473,0,中新社大连6月22日电 (记者 杨毅)世界经济论坛22日在辽宁大连宣布评选出2017年度的5...,0.859517,-0.382534,1.775565,0.782104,0.223612,1.803856,-0.895786,0.818092,...,1.657832,-1.296922,1.855019,-0.16844,1.490869,1.58156,1.154031,2.116088,0.41866,0.750889
1589,0,杨幂，总是时尚的风向标，热搜榜上飘。\r\n有的时候小妹表示，想不写她都不行啊~杨幂的脸，杨...,0.456036,-1.170252,1.81175,0.932882,-1.263144,1.671015,-0.029541,0.783142,...,1.465464,0.499034,-0.961705,-0.047656,1.438386,1.392866,1.974298,-0.788995,-0.060261,0.093081
