In [1]:
!pip -q install matplotlib

In [2]:
import sys
import numpy as np
import pandas as pd
import lightgbm as lgb
import matplotlib.pyplot as plt

In [3]:
from time import time
from functools import partial
from optparse import OptionParser
from scipy.sparse import csr_matrix

In [4]:
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, ComplementNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier

from sklearn.feature_selection import SelectFromModel

In [5]:
from sklearn.pipeline import Pipeline
from sklearn import metrics
# from sklearn.utils.extmath import densityfeature_names

In [6]:
%matplotlib inline

In [7]:
pd.set_option('display.max_colwidth', 512)

In [8]:
%run embedding.ipynb

Importing Jupyter notebook from tokenizer.ipynb


2021-04-06 14:03:17 DEBUG Starting new HTTPS connection (1): huggingface.co:443
2021-04-06 14:03:18 DEBUG https://huggingface.co:443 "HEAD /xlm-roberta-base/resolve/main/config.json HTTP/1.1" 200 0
2021-04-06 14:03:18 DEBUG Starting new HTTPS connection (1): huggingface.co:443
2021-04-06 14:03:18 DEBUG https://huggingface.co:443 "HEAD /xlm-roberta-base/resolve/main/sentencepiece.bpe.model HTTP/1.1" 200 0
2021-04-06 14:03:18 DEBUG Starting new HTTPS connection (1): huggingface.co:443
2021-04-06 14:03:19 DEBUG https://huggingface.co:443 "HEAD /xlm-roberta-base/resolve/main/tokenizer.json HTTP/1.1" 200 0
Building model [5m[33m...[0m[0m2021-04-06 14:03:20 DEBUG Starting new HTTPS connection (1): huggingface.co:443
2021-04-06 14:03:21 DEBUG https://huggingface.co:443 "HEAD /xlm-roberta-base/resolve/main/config.json HTTP/1.1" 200 0
2021-04-06 14:03:21 DEBUG Starting new HTTPS connection (1): huggingface.co:443
2021-04-06 14:03:22 DEBUG https://huggingface.co:443 "HEAD /xlm-roberta-base/r

(20000, 2)
(14686, 2)
34686 documents - 1.595MB (training set)
1017 documents - 0.061MB (test set)

Stored 'data_train' (DataFrame)
Stored 'X_train' (Series)
Stored 'X_test' (Series)
Stored 'y_train' (list)
Stored 'y_test' (list)
Stored 'data_train_size_mb' (float)


2021-04-06 14:03:38 DEBUG vocab size: 1


Extracting features from the training data using a sparse vectorizer
done in 2207.605160s at 0.001MB/s
n_samples: 34686, n_features: 973

Extracting features from the training data using a sparse vectorizer
done in 2215.233769s at 0.001MB/s
n_samples: 34686, n_features: 974

Extracting features from the test data using the same vectorizer
done in 75.037724s at 0.001MB/s
n_samples: 1017, n_features: 974

974
Stored 'X_train' (csr_matrix)
Stored 'X_test' (csr_matrix)
Stored 'feature_names' (ndarray)


In [9]:
def is_interactive():
    return not hasattr(sys.modules['__main__'], '__file__')

In [10]:
op = OptionParser()

op.add_option("--report",
              action="store_true", dest="print_report", default=True,
              help="Print a detailed classification report.")

op.add_option("--chi2_select",
              action="store", type="int", dest="select_chi2",
              help="Select some number of features using a chi-squared test")

op.add_option("--confusion_matrix",
              action="store_true", dest="print_cm",
              help="Print the confusion matrix.")

op.add_option("--top10",
              action="store_true", dest="print_top10",
              help="Print ten most discriminative terms per class for every classifier.")

op.add_option("--use_hashing",
              action="store_true",
              help="Use a hashing vectorizer.")

op.add_option("--n_features",
              action="store", type=int, default=2 ** 16,
              help="n_features when using the hashing vectorizer.")

<Option at 0x7f7856408fd0: --n_features>

In [11]:
# work-around for Jupyter notebook and IPython console

# opts, an object containing values for all of your options
#     e.g. if --file takes a single string argument, then options.file will be the filename supplied by the user, 
#     or None if the user did not supply that option
# args, the list of positional arguments leftover after parsing options

argv = [] if is_interactive() else sys.argv[1:]
(opts, args) = op.parse_args(argv)
opts

<Values at 0x7f7963facb90: {'print_report': True, 'select_chi2': None, 'print_cm': None, 'print_top10': None, 'use_hashing': None, 'n_features': 65536}>

In [12]:
if len(args) > 0:
    op.error("this script takes no arguments.")
    sys.exit(1)

print(__doc__)
op.print_help()
print()

Automatically created module for IPython interactive environment
Usage: ipykernel_launcher.py [options]

Options:
  -h, --help            show this help message and exit
  --report              Print a detailed classification report.
  --chi2_select=SELECT_CHI2
                        Select some number of features using a chi-squared
                        test
  --confusion_matrix    Print the confusion matrix.
  --top10               Print ten most discriminative terms per class for
                        every classifier.
  --use_hashing         Use a hashing vectorizer.
  --n_features=N_FEATURES
                        n_features when using the hashing vectorizer.



## Load

In [13]:
%store -r X_train
%store -r X_test
%store -r y_train
%store -r y_test

%store -r feature_names

## Classify

In [14]:
def trim(s):
    """Trim string to fit on terminal (assuming 80-column display)"""
    return s if len(s) <= 80 else s[:77] + "..."

In [15]:
def fit(clf, X_train, y_train):
    
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)
    
    return train_time

In [16]:
def predict(clf, X_test):
    
    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time:  %0.3fs" % test_time)
    
    return pred, test_time

In [17]:
def accuracy(y_test, pred):
    score = metrics.accuracy_score(y_test, pred)
    print("accuracy:   %0.3f" % score)
    
    return score

In [18]:
def print_coef(clf, opts, feature_names):
    # TODO:
    
    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        # print("density: %f" % density(clf.coef_))

        if opts.print_top10 and feature_names is not None:
            print("top 10 keywords per class:")
            
            #for i, label in enumerate(target_names):
              #  top10 = np.argsort(clf.coef_[i])[-10:]
                #print(trim("%s: %s" % (label, " ".join(feature_names[top10]))))
        print()

In [19]:
def print_report(opts, y_test, pred):
    
    if opts.print_report:
        print("classification report:")
        print(metrics.classification_report(y_test, pred, digits=3))

In [20]:
def print_cm(opts, y_test, pred):
    
    if opts.print_cm:
        print("confusion matrix:")
        print(metrics.confusion_matrix(y_test, pred))

In [21]:
def get_desc(clf):
    return str(clf).split('(')[0]

In [22]:
def __benchmark(X_train, y_train, X_test, y_test, opts, feature_names, clf):
    
    train_time = fit(clf, X_train, y_train)
    pred, test_time = predict(clf, X_test)
    score = accuracy(y_test, pred)
    clf_descr = get_desc(clf)

    print_coef(clf, opts, feature_names)
    print_report(opts, y_test, pred)
    print_cm(opts, y_test, pred)
    print()
    
    return pred, clf_descr, score, train_time, test_time

In [23]:
benchmark = partial(__benchmark, X_train, y_train, X_test, y_test, opts, feature_names)

## score

In [24]:
results = []

In [25]:
for clf, name in (
        (RidgeClassifier(tol=1e-2, solver="sag"), "Ridge Classifier"),
        (Perceptron(max_iter=50), "Perceptron"),
        (PassiveAggressiveClassifier(max_iter=50), "Passive-Aggressive"),
        (KNeighborsClassifier(n_neighbors=10), "kNN"),
        (RandomForestClassifier(), "Random forest")):
    print('=' * 80)
    print(name)
    results.append(benchmark(clf))

Ridge Classifier
________________________________________________________________________________
Training: 
RidgeClassifier(solver='sag', tol=0.01)
train time: 0.113s
test time:  0.001s
accuracy:   0.913
dimensionality: 974

classification report:
              precision    recall  f1-score   support

          -1      0.939     0.761     0.841       305
           1      0.905     0.979     0.941       712

    accuracy                          0.913      1017
   macro avg      0.922     0.870     0.891      1017
weighted avg      0.915     0.913     0.911      1017


Perceptron
________________________________________________________________________________
Training: 
Perceptron(max_iter=50)
train time: 0.023s
test time:  0.000s
accuracy:   0.901
dimensionality: 974

classification report:
              precision    recall  f1-score   support

          -1      0.898     0.754     0.820       305
           1      0.901     0.963     0.931       712

    accuracy                    

  '"sag" solver requires many iterations to fit '


train time: 0.006s
test time:  1.348s
accuracy:   0.905
classification report:
              precision    recall  f1-score   support

          -1      0.864     0.810     0.836       305
           1      0.921     0.945     0.933       712

    accuracy                          0.905      1017
   macro avg      0.892     0.878     0.884      1017
weighted avg      0.904     0.905     0.904      1017


Random forest
________________________________________________________________________________
Training: 
RandomForestClassifier()
train time: 12.756s
test time:  0.059s
accuracy:   0.926
classification report:
              precision    recall  f1-score   support

          -1      0.899     0.849     0.874       305
           1      0.937     0.959     0.948       712

    accuracy                          0.926      1017
   macro avg      0.918     0.904     0.911      1017
weighted avg      0.926     0.926     0.926      1017




In [26]:
for penalty in ["l2", "l1"]:
    print('=' * 80)
    print("%s penalty" % penalty.upper())
    # Train Liblinear model
    results.append(benchmark(LinearSVC(penalty=penalty, dual=False, tol=1e-3)))

    # Train SGD model
    results.append(benchmark(SGDClassifier(alpha=.0001, max_iter=50, penalty=penalty)))

L2 penalty
________________________________________________________________________________
Training: 
LinearSVC(dual=False, tol=0.001)
train time: 0.072s
test time:  0.001s
accuracy:   0.916
dimensionality: 974

classification report:
              precision    recall  f1-score   support

          -1      0.926     0.784     0.849       305
           1      0.913     0.973     0.942       712

    accuracy                          0.916      1017
   macro avg      0.920     0.878     0.896      1017
weighted avg      0.917     0.916     0.914      1017


________________________________________________________________________________
Training: 
SGDClassifier(max_iter=50)
train time: 0.028s
test time:  0.000s
accuracy:   0.921
dimensionality: 974

classification report:
              precision    recall  f1-score   support

          -1      0.945     0.784     0.857       305
           1      0.914     0.980     0.946       712

    accuracy                          0.921      1017

In [27]:
# Train SGD with Elastic Net penalty
print('=' * 80)
print("Elastic-Net penalty")
results.append(benchmark(SGDClassifier(alpha=.0001, max_iter=50, penalty="elasticnet")))

Elastic-Net penalty
________________________________________________________________________________
Training: 
SGDClassifier(max_iter=50, penalty='elasticnet')
train time: 0.045s
test time:  0.001s
accuracy:   0.920
dimensionality: 974

classification report:
              precision    recall  f1-score   support

          -1      0.944     0.780     0.855       305
           1      0.912     0.980     0.945       712

    accuracy                          0.920      1017
   macro avg      0.928     0.880     0.900      1017
weighted avg      0.922     0.920     0.918      1017




In [28]:
# Train NearestCentroid without threshold
print('=' * 80)
print("NearestCentroid (aka Rocchio classifier)")
results.append(benchmark(NearestCentroid()))

NearestCentroid (aka Rocchio classifier)
________________________________________________________________________________
Training: 
NearestCentroid()
train time: 0.010s
test time:  0.001s
accuracy:   0.858
classification report:
              precision    recall  f1-score   support

          -1      0.840     0.652     0.734       305
           1      0.864     0.947     0.903       712

    accuracy                          0.858      1017
   macro avg      0.852     0.800     0.819      1017
weighted avg      0.857     0.858     0.853      1017




In [29]:
# Train sparse Naive Bayes classifiers
print('=' * 80)
print("Naive Bayes")
results.append(benchmark(MultinomialNB(alpha=.01)))
results.append(benchmark(BernoulliNB(alpha=.01)))
results.append(benchmark(ComplementNB(alpha=.1)))

Naive Bayes
________________________________________________________________________________
Training: 
MultinomialNB(alpha=0.01)
train time: 0.008s
test time:  0.000s
accuracy:   0.903
dimensionality: 974

classification report:
              precision    recall  f1-score   support

          -1      0.919     0.741     0.820       305
           1      0.898     0.972     0.933       712

    accuracy                          0.903      1017
   macro avg      0.908     0.856     0.877      1017
weighted avg      0.904     0.903     0.899      1017


________________________________________________________________________________
Training: 
BernoulliNB(alpha=0.01)
train time: 0.009s
test time:  0.001s
accuracy:   0.907
dimensionality: 974

classification report:
              precision    recall  f1-score   support

          -1      0.957     0.721     0.822       305
           1      0.892     0.986     0.937       712

    accuracy                          0.907      1017
   macro



In [30]:
print('=' * 80)
print("LinearSVC with L1-based feature selection")
# The smaller C, the stronger the regularization.
# The more regularization, the more sparsity.
results.append(benchmark(Pipeline([
  ('feature_selection', SelectFromModel(LinearSVC(penalty="l1", dual=False, tol=1e-3))),
  ('classification', LinearSVC(penalty="l2"))])))

LinearSVC with L1-based feature selection
________________________________________________________________________________
Training: 
Pipeline(steps=[('feature_selection',
                 SelectFromModel(estimator=LinearSVC(dual=False, penalty='l1',
                                                     tol=0.001))),
                ('classification', LinearSVC())])
train time: 0.601s
test time:  0.001s
accuracy:   0.919
classification report:
              precision    recall  f1-score   support

          -1      0.931     0.790     0.855       305
           1      0.916     0.975     0.944       712

    accuracy                          0.919      1017
   macro avg      0.923     0.882     0.899      1017
weighted avg      0.920     0.919     0.917      1017




In [31]:
param = {'num_leaves': 2**5-1, 'objective': 'binary'}
param['metric'] = 'auc'

num_round = 1000

In [32]:
X_train.shape

(34686, 974)

In [33]:
length = np.expand_dims(data_train['content'].apply(len).to_numpy(), axis=1)
print(X_train.shape)
print(length.shape)
X_train = csr_matrix(np.concatenate((X_train.toarray(), length), axis=1))
X_train.shape

(34686, 974)
(34686, 1)


(34686, 975)

In [34]:
length = np.expand_dims(data_test['content'].apply(len).to_numpy(), axis=1)
print(X_test.shape)
print(length.shape)
X_test = csr_matrix(np.concatenate((X_test.toarray(), length), axis=1))
X_test.shape

(1017, 974)
(1017, 1)


(1017, 975)

In [35]:
train_data = lgb.Dataset(X_train, label=y_train)

In [36]:
%time bst = lgb.train(param, train_data, num_round)

[LightGBM] [Info] Number of positive: 27908, number of negative: 6778
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 46052
[LightGBM] [Info] Number of data points in the train set: 34686, number of used features: 975
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.804590 -> initscore=1.415231
[LightGBM] [Info] Start training from score 1.415231
CPU times: user 1min 14s, sys: 200 ms, total: 1min 14s
Wall time: 9.43 s


In [37]:
y_pred = bst.predict(X_test)
y_pred = np.where(y_pred > 0.5, 1, -1)

In [38]:
print(metrics.classification_report(y_test, y_pred, digits=3))

              precision    recall  f1-score   support

          -1      0.916     0.856     0.885       305
           1      0.940     0.966     0.953       712

    accuracy                          0.933      1017
   macro avg      0.928     0.911     0.919      1017
weighted avg      0.933     0.933     0.932      1017



In [39]:
# bst.save_model('./model/ads-detect-1-20210125.mdl')
# bst.save_model('./model/ads-detect-1-20210312.mdl')
bst.save_model('./model/ads-detect-1-20210401.mdl')

<lightgbm.basic.Booster at 0x7f7855bfaf90>

## efficiency

## tune

In [40]:
data_test['pred'] = y_pred

In [41]:
df = data_test[['label', 'pred', 'content']]
df.loc[:, 'tokens'] = df['content'].apply(split)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


In [42]:
# 广告，漏召回

df1 = df[(~(df.label==df.pred)) & (df.label==-1)]
print(df1.shape)
df1

(44, 4)


Unnamed: 0,label,pred,content,tokens
41,-1,1,出号！至尊卡v8一红马，,"[[UNK], [UNK], [VIP], 一, 红, 马, ,]"
73,-1,1,Mai号,[[UNK]]
122,-1,1,兄弟来新游戏试玩不？人多**好,"[兄弟, [UNK], 游戏, [UNK], 不?, 人, 多, 好]"
244,-1,1,你打公告我打资源号位置,"[打, [UNK], 打, 资源, 号, 位置]"
246,-1,1,嗯，你让他加微信,"[嗯, ,, [UNK], 加, 微信]"
264,-1,1,不是，我说的让你们看看要不要资源,"[不是, ,, [UNK], 让, 你们, 看看, 要, 不要, 资源]"
314,-1,1,招手游代理或兼职，拥有个人代理后台，周结算，0加盟费，合作即可签合同，公司扶持，想了解的可以咨询我。,"[招, [UNK], [UNK], 或, [UNK], ,, [UNK], 个人, [UNK], [UNK], ,, [UNK], [UNK], ,, [NUM], [UNK], ,, [UNK], [UNK], [UNK], [UNK], ,, [UNK], [UNK], ,, 想, [UNK], 可以, [UNK], 。]"
317,-1,1,招手游代理兼职，无需加盟费，**后台，合同模式，公司扶持政策，想了解的可以私聊我,"[招, [UNK], [UNK], [UNK], ,, [UNK], [UNK], ,, [UNK], ,, [UNK], [UNK], [UNK], [UNK], [UNK], ,, 想, [UNK], 可以, 私, 聊我]"
323,-1,1,"开！仓！出！货 ，加！ {localization:189-393},88965","[[UNK], 货, ,, [UNK], [LOC], ,, [NUM]]"
325,-1,1,出号！一红，三的卢,"[[UNK], [UNK]]"


In [43]:
# 广告，误召回

df2 = df[(~(df.label==df.pred)) & (df.label==1)]
print(df2.shape)
df2

(24, 4)


Unnamed: 0,label,pred,content,tokens
16,1,-1,玩新区+v.x13288669123有大佬带队,"[玩, [UNK], ., [CTA], [UNK], 带队]"
36,1,-1,级匪的接龙1. 落2. 上将潘凤3. 戴渊4. 春晓5. 阿邦6. 刹那芳华7. 在家数豆豆8. 小K9. 闽星辰10. 郝星辰11. 仪星12. 真可笑13. 洋哥14. 拖吧15. 血鹰刑天16. 祥博17. 米飞18耙耙.,"[级匪, [UNK], [NUM], ., [UNK], [NUM], ., [UNK], [UNK], [NUM], ., [UNK], [NUM], ., [UNK], [NUM], ., [UNK], [NUM], ., [UNK], [UNK], [NUM], ., [UNK], 数, [UNK], [NUM], ., [UNK], [NUM], ., [UNK], [UNK], [NUM], ., [UNK], [NUM], ., [UNK], [NUM], ., 真, [UNK], [NUM], ., [UNK], 哥, [NUM], ., [UNK], [NUM], ., [UNK], [UNK], 天, [NUM], ., [UNK], [NUM], ., [UNK], [NUM], [UNK], .]"
75,1,-1,sg666,[[CTA]]
85,1,-1,sg888,[[CTA]]
128,1,-1,加微信好友。,"[加, 微信, 好友, 。]"
147,1,-1,霸服军团收人！来的私聊1979574312,"[霸服, 军团, [UNK], 来的, 私聊, [CTA]]"
197,1,-1,刷勋，兵多，可调战力，私聊,"[[UNK], ,, 兵, 多, ,, 可, 调, 战力, ,, 私聊]"
211,1,-1,能告诉我 开区啥价格 现在啥价格么,"[能, [UNK], [UNK], 啥, 价格, 现在, 啥, 价格]"
258,1,-1,战力9-12万v1想刷军功的私聊我,"[战力, [NUM], -, [NUM], 万, [VIP], 想, 刷, [UNK], 私聊我]"
282,1,-1,******zhi2017,[[CTA]]
