In [1]:
!pip -q install matplotlib

In [2]:
import sys
import numpy as np
import lightgbm as lgb
import matplotlib.pyplot as plt

In [3]:
from time import time
from functools import partial
from optparse import OptionParser
from scipy.sparse import csr_matrix

In [4]:
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, ComplementNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier

from sklearn.feature_selection import SelectFromModel

In [5]:
from sklearn.pipeline import Pipeline
from sklearn import metrics
# from sklearn.utils.extmath import densityfeature_names

In [6]:
%matplotlib inline

In [7]:
%run embedding.ipynb

Importing Jupyter notebook from tokenizer.ipynb
(20000, 2)
20000 documents - 0.837MB (training set)
1017 documents - 0.061MB (test set)

Stored 'data_train' (DataFrame)
Stored 'X_train' (Series)
Stored 'X_test' (Series)
Stored 'y_train' (list)
Stored 'y_test' (list)
Stored 'data_train_size_mb' (float)
Extracting features from the training data using a sparse vectorizer
done in 1.714275s at 0.488MB/s
n_samples: 20000, n_features: 980

Extracting features from the training data using a sparse vectorizer
done in 3.125427s at 0.268MB/s
n_samples: 20000, n_features: 981

Extracting features from the test data using the same vectorizer
done in 0.196661s at 0.311MB/s
n_samples: 1017, n_features: 981

Stored 'X_train' (csr_matrix)
Stored 'X_test' (csr_matrix)
Stored 'feature_names' (ndarray)


In [8]:
def is_interactive():
    return not hasattr(sys.modules['__main__'], '__file__')

In [9]:
op = OptionParser()

op.add_option("--report",
              action="store_true", dest="print_report", default=True,
              help="Print a detailed classification report.")

op.add_option("--chi2_select",
              action="store", type="int", dest="select_chi2",
              help="Select some number of features using a chi-squared test")

op.add_option("--confusion_matrix",
              action="store_true", dest="print_cm",
              help="Print the confusion matrix.")

op.add_option("--top10",
              action="store_true", dest="print_top10",
              help="Print ten most discriminative terms per class for every classifier.")

op.add_option("--use_hashing",
              action="store_true",
              help="Use a hashing vectorizer.")

op.add_option("--n_features",
              action="store", type=int, default=2 ** 16,
              help="n_features when using the hashing vectorizer.")

<Option at 0x7f5d9f52de90: --n_features>

In [10]:
# work-around for Jupyter notebook and IPython console

# opts, an object containing values for all of your options
#     e.g. if --file takes a single string argument, then options.file will be the filename supplied by the user, 
#     or None if the user did not supply that option
# args, the list of positional arguments leftover after parsing options

argv = [] if is_interactive() else sys.argv[1:]
(opts, args) = op.parse_args(argv)
opts

<Values at 0x7f5d9f582450: {'print_report': True, 'select_chi2': None, 'print_cm': None, 'print_top10': None, 'use_hashing': None, 'n_features': 65536}>

In [11]:
if len(args) > 0:
    op.error("this script takes no arguments.")
    sys.exit(1)

print(__doc__)
op.print_help()
print()

Automatically created module for IPython interactive environment
Usage: ipykernel_launcher.py [options]

Options:
  -h, --help            show this help message and exit
  --report              Print a detailed classification report.
  --chi2_select=SELECT_CHI2
                        Select some number of features using a chi-squared
                        test
  --confusion_matrix    Print the confusion matrix.
  --top10               Print ten most discriminative terms per class for
                        every classifier.
  --use_hashing         Use a hashing vectorizer.
  --n_features=N_FEATURES
                        n_features when using the hashing vectorizer.



## Load

In [12]:
%store -r X_train
%store -r X_test
%store -r y_train
%store -r y_test

%store -r feature_names

## Classify

In [13]:
def trim(s):
    """Trim string to fit on terminal (assuming 80-column display)"""
    return s if len(s) <= 80 else s[:77] + "..."

In [14]:
def fit(clf, X_train, y_train):
    
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)
    
    return train_time

In [15]:
def predict(clf, X_test):
    
    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time:  %0.3fs" % test_time)
    
    return pred, test_time

In [16]:
def accuracy(y_test, pred):
    score = metrics.accuracy_score(y_test, pred)
    print("accuracy:   %0.3f" % score)
    
    return score

In [17]:
def print_coef(clf, opts, feature_names):
    # TODO:
    
    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        # print("density: %f" % density(clf.coef_))

        if opts.print_top10 and feature_names is not None:
            print("top 10 keywords per class:")
            
            #for i, label in enumerate(target_names):
              #  top10 = np.argsort(clf.coef_[i])[-10:]
                #print(trim("%s: %s" % (label, " ".join(feature_names[top10]))))
        print()

In [18]:
def print_report(opts, y_test, pred):
    
    if opts.print_report:
        print("classification report:")
        print(metrics.classification_report(y_test, pred))

In [19]:
def print_cm(opts, y_test, pred):
    
    if opts.print_cm:
        print("confusion matrix:")
        print(metrics.confusion_matrix(y_test, pred))

In [20]:
def get_desc(clf):
    return str(clf).split('(')[0]

In [21]:
def __benchmark(X_train, y_train, X_test, y_test, opts, feature_names, clf):
    
    train_time = fit(clf, X_train, y_train)
    pred, test_time = predict(clf, X_test)
    score = accuracy(y_test, pred)
    clf_descr = get_desc(clf)

    print_coef(clf, opts, feature_names)
    print_report(opts, y_test, pred)
    print_cm(opts, y_test, pred)
    print()
    
    return pred, clf_descr, score, train_time, test_time

In [22]:
benchmark = partial(__benchmark, X_train, y_train, X_test, y_test, opts, feature_names)

## score

In [23]:
results = []

In [24]:
for clf, name in (
        (RidgeClassifier(tol=1e-2, solver="sag"), "Ridge Classifier"),
        (Perceptron(max_iter=50), "Perceptron"),
        (PassiveAggressiveClassifier(max_iter=50), "Passive-Aggressive"),
        (KNeighborsClassifier(n_neighbors=10), "kNN"),
        (RandomForestClassifier(), "Random forest")):
    print('=' * 80)
    print(name)
    results.append(benchmark(clf))

Ridge Classifier
________________________________________________________________________________
Training: 
RidgeClassifier(solver='sag', tol=0.01)
train time: 0.080s
test time:  0.001s
accuracy:   0.925
dimensionality: 981

classification report:
              precision    recall  f1-score   support

          -1       0.95      0.80      0.87       313
           1       0.92      0.98      0.95       704

    accuracy                           0.93      1017
   macro avg       0.93      0.89      0.91      1017
weighted avg       0.93      0.93      0.92      1017


Perceptron
________________________________________________________________________________
Training: 
Perceptron(max_iter=50)
train time: 0.017s
test time:  0.001s
accuracy:   0.918
dimensionality: 981

classification report:
              precision    recall  f1-score   support

          -1       0.86      0.88      0.87       313
           1       0.95      0.94      0.94       704

    accuracy                    

  '"sag" solver requires many iterations to fit '


test time:  0.696s
accuracy:   0.899
classification report:
              precision    recall  f1-score   support

          -1       0.89      0.77      0.82       313
           1       0.90      0.96      0.93       704

    accuracy                           0.90      1017
   macro avg       0.89      0.86      0.88      1017
weighted avg       0.90      0.90      0.90      1017


Random forest
________________________________________________________________________________
Training: 
RandomForestClassifier()
train time: 6.230s
test time:  0.046s
accuracy:   0.944
classification report:
              precision    recall  f1-score   support

          -1       0.94      0.87      0.91       313
           1       0.94      0.98      0.96       704

    accuracy                           0.94      1017
   macro avg       0.94      0.92      0.93      1017
weighted avg       0.94      0.94      0.94      1017




In [25]:
for penalty in ["l2", "l1"]:
    print('=' * 80)
    print("%s penalty" % penalty.upper())
    # Train Liblinear model
    results.append(benchmark(LinearSVC(penalty=penalty, dual=False, tol=1e-3)))

    # Train SGD model
    results.append(benchmark(SGDClassifier(alpha=.0001, max_iter=50, penalty=penalty)))

L2 penalty
________________________________________________________________________________
Training: 
LinearSVC(dual=False, tol=0.001)
train time: 0.039s
test time:  0.001s
accuracy:   0.945
dimensionality: 981

classification report:
              precision    recall  f1-score   support

          -1       0.94      0.87      0.91       313
           1       0.95      0.98      0.96       704

    accuracy                           0.94      1017
   macro avg       0.94      0.92      0.93      1017
weighted avg       0.94      0.94      0.94      1017


________________________________________________________________________________
Training: 
SGDClassifier(max_iter=50)
train time: 0.020s
test time:  0.001s
accuracy:   0.941
dimensionality: 981

classification report:
              precision    recall  f1-score   support

          -1       0.96      0.84      0.90       313
           1       0.93      0.98      0.96       704

    accuracy                           0.94      1017

In [26]:
# Train SGD with Elastic Net penalty
print('=' * 80)
print("Elastic-Net penalty")
results.append(benchmark(SGDClassifier(alpha=.0001, max_iter=50, penalty="elasticnet")))

Elastic-Net penalty
________________________________________________________________________________
Training: 
SGDClassifier(max_iter=50, penalty='elasticnet')
train time: 0.040s
test time:  0.001s
accuracy:   0.939
dimensionality: 981

classification report:
              precision    recall  f1-score   support

          -1       0.96      0.84      0.89       313
           1       0.93      0.98      0.96       704

    accuracy                           0.94      1017
   macro avg       0.94      0.91      0.93      1017
weighted avg       0.94      0.94      0.94      1017




In [27]:
# Train NearestCentroid without threshold
print('=' * 80)
print("NearestCentroid (aka Rocchio classifier)")
results.append(benchmark(NearestCentroid()))

NearestCentroid (aka Rocchio classifier)
________________________________________________________________________________
Training: 
NearestCentroid()
train time: 0.008s
test time:  0.001s
accuracy:   0.857
classification report:
              precision    recall  f1-score   support

          -1       0.79      0.73      0.76       313
           1       0.89      0.91      0.90       704

    accuracy                           0.86      1017
   macro avg       0.84      0.82      0.83      1017
weighted avg       0.86      0.86      0.86      1017




In [28]:
# Train sparse Naive Bayes classifiers
print('=' * 80)
print("Naive Bayes")
results.append(benchmark(MultinomialNB(alpha=.01)))
results.append(benchmark(BernoulliNB(alpha=.01)))
results.append(benchmark(ComplementNB(alpha=.1)))

Naive Bayes
________________________________________________________________________________
Training: 
MultinomialNB(alpha=0.01)
train time: 0.010s
test time:  0.001s
accuracy:   0.864
dimensionality: 981

classification report:
              precision    recall  f1-score   support

          -1       0.85      0.68      0.76       313
           1       0.87      0.94      0.91       704

    accuracy                           0.86      1017
   macro avg       0.86      0.81      0.83      1017
weighted avg       0.86      0.86      0.86      1017


________________________________________________________________________________
Training: 
BernoulliNB(alpha=0.01)
train time: 0.010s
test time:  0.001s
accuracy:   0.869
dimensionality: 981

classification report:
              precision    recall  f1-score   support

          -1       0.85      0.69      0.77       313
           1       0.87      0.95      0.91       704

    accuracy                           0.87      1017
   macro



In [29]:
print('=' * 80)
print("LinearSVC with L1-based feature selection")
# The smaller C, the stronger the regularization.
# The more regularization, the more sparsity.
results.append(benchmark(Pipeline([
  ('feature_selection', SelectFromModel(LinearSVC(penalty="l1", dual=False, tol=1e-3))),
  ('classification', LinearSVC(penalty="l2"))])))

LinearSVC with L1-based feature selection
________________________________________________________________________________
Training: 
Pipeline(steps=[('feature_selection',
                 SelectFromModel(estimator=LinearSVC(dual=False, penalty='l1',
                                                     tol=0.001))),
                ('classification', LinearSVC())])
train time: 0.379s
test time:  0.001s
accuracy:   0.948
classification report:
              precision    recall  f1-score   support

          -1       0.95      0.88      0.91       313
           1       0.95      0.98      0.96       704

    accuracy                           0.95      1017
   macro avg       0.95      0.93      0.94      1017
weighted avg       0.95      0.95      0.95      1017




In [30]:
param = {'num_leaves': 2**5-1, 'objective': 'binary'}
param['metric'] = 'auc'

num_round = 1000

In [31]:
X_train.shape

(20000, 981)

In [32]:
length = np.expand_dims(data_train['content'].apply(len).to_numpy(), axis=1)
print(X_train.shape)
print(length.shape)
X_train = csr_matrix(np.concatenate((X_train.toarray(), length), axis=1))
X_train.shape

(20000, 981)
(20000, 1)


(20000, 982)

In [33]:
length = np.expand_dims(data_test['content'].apply(len).to_numpy(), axis=1)
print(X_test.shape)
print(length.shape)
X_test = csr_matrix(np.concatenate((X_test.toarray(), length), axis=1))
X_test.shape

(1017, 981)
(1017, 1)


(1017, 982)

In [34]:
train_data = lgb.Dataset(X_train, label=y_train)
bst = lgb.train(param, train_data, num_round)

y_pred = bst.predict(X_test)
y_pred = np.where(y_pred > 0.5, 1, -1)

[LightGBM] [Info] Number of positive: 16100, number of negative: 3900
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 50997
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 982
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.805000 -> initscore=1.417843
[LightGBM] [Info] Start training from score 1.417843


In [35]:
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          -1       0.97      0.89      0.93       313
           1       0.95      0.99      0.97       704

    accuracy                           0.96      1017
   macro avg       0.96      0.94      0.95      1017
weighted avg       0.96      0.96      0.96      1017



In [36]:
bst.save_model('./model/ads-detect-1-20200125.mdl')

<lightgbm.basic.Booster at 0x7f5d9d4fe050>

## efficiency

## tune

In [37]:
data_test['pred'] = y_pred

In [38]:
df = data_test[['label', 'pred', 'content']]
df['tokens'] = df['content'].apply(split2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [39]:
df1 = df[(~(df.label==df.pred)) & (df.label==-1)]
print(df1.shape)
df1

(33, 4)


Unnamed: 0,label,pred,content,tokens
61,-1,1,收个功勋号！,"[收, 个, 功, 勋, 号, <special-char>]"
122,-1,1,兄弟来新游戏试玩不？人多**好,"[兄, 弟, 来, 新, 游, 戏, 试, 玩, 不, 人, 多, 好, <special-..."
128,-1,1,加微信好友。,"[+, 微, 好, 友, <special-char>]"
237,-1,1,嫁一下**徽信xsw767拉你进群，领取礼包大家一起玩,"[+, 微, <contact>, 拉, 进, 群, ,, 领, 取, 礼, 包, 大, 家..."
241,-1,1,你那还有礼包码吗,"[还, 有, 礼, 包, 码]"
244,-1,1,你打公告我打资源号位置,"[打, 公, 告, 打, 资, 源, 号, 位, 置]"
250,-1,1,我下装备你过来，不行再下将,"[下, 装, 备, 过, 来, ,, 不, 行, 再, 下, 将]"
258,-1,1,战力9-12万v1想刷军功的私聊我,"[战, 力, <num-3>, 万, 微, <contact>, 想, 刷, 军, 功, 私..."
264,-1,1,不是，我说的让你们看看要不要资源,"[不, 是, ,, 说, 让, 看, 看, 要, 不, 要, 资, 源]"
314,-1,1,招手游代理或兼职，拥有个人代理后台，周结算，0加盟费，合作即可签合同，公司扶持，想了解的可以...,"[招, 手, 游, 代, 理, 或, <unk>, <unk>, ,, <unk>, 有, ..."


In [40]:
df2 = df[(~(df.label==df.pred)) & (df.label==1)]
print(df2.shape)
df2

(8, 4)


Unnamed: 0,label,pred,content,tokens
99,1,-1,霸服军团收吴，魏，蜀国活人，来的加 1979574312私聊,"[霸, 服, <corpus>, 收, 吴, ,, 魏, ,, 蜀, 国, 人, ,, 来,..."
100,1,-1,新区195收人。，最强吴国军团。加weixin=13288669123,"[新, 区, <num-3>, <recruit>, ,, 最, 强, 吴, 国, <cor..."
211,1,-1,能告诉我 开区啥价格 现在啥价格么,"[能, 告, 诉, 开, 区, 啥, 价, 格, 现, 在, 啥, 价, 格]"
454,1,-1,军团要整顿，加我微信，我拉你进核心群：571773352,"[<corpus>, 要, 整, <unk>, ,, +, 微, ,, 拉, 进, <unk..."
513,1,-1,兄弟在吗？我是灰长小号，十下我徽KLDY0855，进交流裙，没十的一会当机器人T了,"[兄, 弟, 在, 是, <unk>, 长, 小, 号, ,, +, 微, <contact..."
519,1,-1,没有微信号了，你那有吗,"[有, 微, 号, ,, 有]"
689,1,-1,酒馆收活人，来的私聊,"[酒, <unk>, <recruit>, ,, 来, 私, 聊]"
848,1,-1,资源号,"[资, 源, 号]"
