In [10]:
import xgboost as xgb
import pickle
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
import warnings

from sklearn.cross_validation import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

warnings.filterwarnings("ignore")

# 抽样

In [2]:
# 有放回采样，选8000个样本
def sampling_with_return(data): 
    seq = np.random.choice(range(len(data)), size=5000, replace=True)
    return data.iloc[seq,:]

# 分层，从数据中选取正负标签各8000个样本，其中70%用作训练，30%用作测试
def layer_sampling(data):
    data_0, data_1 = data[data['label'] == 0], data[data['label'] == 1]
    data = pd.concat((sampling_with_return(data_0), sampling_with_return(data_1)))
    features = data.drop('label',axis=1)
    labels = data['label']
    return train_test_split(features, labels, test_size=0.3)

# 训练模型、进行评估

In [12]:
def model_metrics(clf, data, n=10):
    accuracy = 0
    f1 = 0
    auc = 0
    for i in range(n):
        X_train, X_test, y_train, y_test =  layer_sampling(data)
        model = clf.fit(X_train, y_train)
        y_hat = model.predict(X_test)
        accuracy += 1 - sum(abs(y_test-y_hat))/len(y_test)
        f1 += f1_score(y_test, y_hat)
        auc += roc_auc_score(y_test, y_hat)
    return {'accuracy':accuracy/n, 'f1-score':f1/n, 'auc':auc/n}

In [3]:
model_LR = LogisticRegression()
model_SVC = LinearSVC()
model_NB = GaussianNB()
model_KNN = KNeighborsClassifier()
model_XGB = xgb.XGBClassifier()
clfs = {'model_LR':model_LR,'model_SVC': model_SVC,'model_NB' :model_NB, 'model_KNN':model_KNN, 'model_XGB':model_XGB}

In [8]:
data = pickle.load(open( 'temp_file//data_processing.pkl','rb'))
train_data = data.drop(labels='content',axis=1)

In [13]:
for clf in clfs:
    print('{}:{}'.format(clf, model_metrics(clfs[clf] ,train_data)))

model_LR:{'accuracy': 0.9667000000000001, 'f1-score': 0.9660815651856387, 'auc': 0.9666126968717157}
model_SVC:{'accuracy': 0.9667666666666669, 'f1-score': 0.9664617642109746, 'auc': 0.9668393291348242}
model_NB:{'accuracy': 0.8293333333333333, 'f1-score': 0.8105969651705417, 'auc': 0.8295971367318904}
model_KNN:{'accuracy': 0.9349000000000002, 'f1-score': 0.9328347023309451, 'auc': 0.9348996315903808}
model_XGB:{'accuracy': 0.9492333333333333, 'f1-score': 0.9471691976384763, 'auc': 0.9488688165584991}
