In [1]:
import re
import sys
import jieba
import pickle
import logging
import unicodedata

import numpy as np
import pandas as pd
import lightgbm as lgb
import matplotlib.pyplot as plt

from time import time
from functools import partial
from itertools import chain
from itertools import zip_longest
from scipy.sparse import csr_matrix
from scipy.sparse import hstack

from sklearn.datasets import fetch_20newsgroups

from optparse import OptionParser
from sklearn.pipeline import Pipeline

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest, chi2

from sklearn.linear_model import RidgeClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, ComplementNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier

from sklearn import metrics
from sklearn.utils.extmath import density

In [2]:
%matplotlib inline

In [3]:
# Show (practically) every row and the full text of long cells when a frame is displayed.
# The fully-qualified option name is used on purpose: a bare 'max_rows' is treated as a
# pattern and is rejected as ambiguous on pandas versions that also define
# 'styler.render.max_rows'.
pd.set_option('display.max_rows', 10**5)
pd.set_option('display.max_colwidth', 10**5)

In [4]:
# Configure logging at INFO level; each record is prefixed with its timestamp and level name.
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s')

In [None]:
def is_interactive():
    """Tell whether the ``__main__`` module lacks a ``__file__`` attribute.

    Returns:
        bool: True when ``__main__`` has no ``__file__`` (used below as the
        signal for a notebook / IPython console session), False otherwise.
    """
    main_module = sys.modules['__main__']
    has_script_file = hasattr(main_module, '__file__')
    return not has_script_file

In [6]:
# Command-line options for the benchmark. Every option has a `dest`/name that later
# cells read from `opts` (e.g. opts.print_report, opts.select_chi2).
op = OptionParser()

# default=True: the classification report is printed even when --report is not passed.
op.add_option("--report",
              action="store_true", dest="print_report", default=True,
              help="Print a detailed classification report.")

# Integer k for SelectKBest; left as None (selection skipped) when not supplied.
op.add_option("--chi2_select",
              action="store", type="int", dest="select_chi2",
              help="Select some number of features using a chi-squared test")

op.add_option("--confusion_matrix",
              action="store_true", dest="print_cm",
              help="Print the confusion matrix.")

op.add_option("--top10",
              action="store_true", dest="print_top10",
              help="Print ten most discriminative terms per class for every classifier.")

# NOTE(review): no later cell in this notebook appears to read use_hashing or n_features.
op.add_option("--use_hashing",
              action="store_true",
              help="Use a hashing vectorizer.")

op.add_option("--n_features",
              action="store", type=int, default=2 ** 16,
              help="n_features when using the hashing vectorizer.")

<Option at 0x7f584f6843d0: --n_features>

In [7]:
# Work-around for Jupyter notebook and IPython console: when running interactively
# the kernel's own sys.argv must not be parsed, so an empty argument list is used
# and every option keeps its default.
#
# opts: object holding one attribute per declared option (None when the option
#       has no default and was not supplied).
# args: list of positional arguments left over after option parsing.

argv = [] if is_interactive() else sys.argv[1:]
(opts, args) = op.parse_args(argv)
opts

<Values at 0x7f587c602670: {'print_report': True, 'select_chi2': None, 'print_cm': None, 'print_top10': None, 'use_hashing': None, 'n_features': 65536}>

In [8]:
# Reject stray positional arguments.
if len(args) > 0:
    # op.error() prints the usage message plus this text and terminates the
    # process itself, so no explicit sys.exit() is needed after it.
    op.error("this script takes no arguments.")

# Echo the module docstring and the option summary for reference.
print(__doc__)
op.print_help()
print()

Automatically created module for IPython interactive environment
Usage: ipykernel_launcher.py [options]

Options:
  -h, --help            show this help message and exit
  --report              Print a detailed classification report.
  --chi2_select=SELECT_CHI2
                        Select some number of features using a chi-squared
                        test
  --confusion_matrix    Print the confusion matrix.
  --top10               Print ten most discriminative terms per class for
                        every classifier.
  --use_hashing         Use a hashing vectorizer.
  --n_features=N_FEATURES
                        n_features when using the hashing vectorizer.



# Data

## load

In [9]:
def size_mb(docs):
    """Return the combined UTF-8 size of the given strings in megabytes (1 MB = 1e6 bytes).

    Args:
        docs: iterable of str.

    Returns:
        float: total encoded byte count divided by 1e6.
    """
    total_bytes = 0
    for doc in docs:
        total_bytes += len(doc.encode('utf-8'))
    return total_bytes / 1e6

In [10]:
# Load the labelled training sheet, keep only the label and text columns,
# and drop rows where either of them is missing.
# NOTE(review): absolute, machine-specific path — the notebook is not portable as written.
data_train = pd.read_excel('/home/wangyh/project/document_cluster/data/dataset_ads-20210113-1-labeled.xlsx')
data_train = data_train[['label', 'content']]
data_train = data_train.dropna()

print(data_train.shape)
data_train.head(3)

2021-01-25 11:18:49,283 INFO NumExpr defaulting to 8 threads.


(20000, 2)


Unnamed: 0,label,content
0,1.0,岀赀縁 masonghe86
1,11.0,九，勾引团里有夫之妇，满脸zhi疮
2,1.0,53=10w元宝+VIP13+武将+各资源1亿+Q348857676


In [11]:
# Load the labelled test sheet and keep only the label and text columns.
# NOTE(review): unlike the training frame, no dropna() is applied here — confirm the
# test sheet cannot contain missing labels/content.
data_test = pd.read_excel('/home/wangyh/project/document_cluster/data/dataset_ads-20210120-1-labeled.xlsx')
data_test = data_test[['label', 'content']]
data_test.head(3)

Unnamed: 0,label,content
0,2,为了军团更好地发展，资源更好地利用，大家一起作战，兄弟加微信：svip12126，组织打城，军团群里有攻略和内测老人带队，就差你一个人没加了，没加的当机器人踢了
1,2,无兄弟，不三国195新区即将来至大佬带队，要去的可私聊我霸服了团长送老婆十对
2,1,全/脫/小/妹/针.人视频1对1/Q Q：2332287921


In [12]:
# Collapse the original labels into two classes, in place:
# labels 1, 9 and 24 become -1 (ads); every other label becomes 1.
# The order matters: original label 1 must be moved to -1 before the second line runs.
data_train.loc[data_train.label.isin([1, 9, 24]), 'label'] = -1   # ads
data_train.loc[~(data_train.label==-1), 'label'] = 1

# Class balance after relabelling.
data_train.label.value_counts(normalize=True)

 1.0    0.805
-1.0    0.195
Name: label, dtype: float64

In [13]:
# Same two-class relabelling as for the training frame:
# labels 1, 9 and 24 become -1, every other label becomes 1 (in place).
data_test.loc[data_test.label.isin([1, 9, 24]), 'label'] = -1
data_test.loc[~(data_test.label==-1), 'label'] = 1
data_test.label.value_counts(normalize=True)

 1    0.692232
-1    0.307768
Name: label, dtype: float64

In [14]:
# Raw text columns (pandas Series) and label lists used by the vectorizers and classifiers.
# X_train / X_test are rebound to feature matrices by later cells.
X_train, X_test = data_train.content, data_test.content
y_train, y_test = data_train.label.tolist(), data_test.label.tolist()

In [15]:
# Corpus sizes in MB; reused later to report vectorizer throughput (MB/s).
# Must run while X_train / X_test still hold the raw text.
data_train_size_mb = size_mb(X_train)
data_test_size_mb = size_mb(X_test)

print("%d documents - %0.3fMB (training set)" % (len(X_train), data_train_size_mb))
print("%d documents - %0.3fMB (test set)" % (len(X_test), data_test_size_mb))
print()

20000 documents - 0.837MB (training set)
1017 documents - 0.061MB (test set)



## preprocess

In [16]:
def is_special(i):
    """Tell whether a token is one of the placeholder tags produced by this tokenizer.

    A token is special when it is one of the fixed tags
    (<unk>, <loc>, <contact>, <recruit>, <corpus>, <colonel>) or a numeric
    placeholder of the form ``<num-N>`` with N made of digits.

    Args:
        i: token string.

    Returns:
        bool
    """
    if i in ('<unk>', '<loc>', '<contact>', '<recruit>', '<corpus>', '<colonel>'):
        return True
    return re.match(r'^<num-[0-9]+>$', i) is not None

In [17]:
def not_special(i):
    """Negation of is_special: True for ordinary tokens, False for placeholder tags."""
    special = is_special(i)
    return not special

### replace

In [18]:
def __replace(s):
    """Normalise look-alike / evasive spellings inside one token.

    Placeholder tags (see is_special) are returned untouched. For any other
    token the substitutions below are applied one after another; the order is
    significant because later rules consume the output of earlier ones
    (e.g. '微新' -> '微信' -> '微', and every '加' variant ends up as '+').

    Args:
        s: token string.

    Returns:
        str: the token after all substitutions.
    """
    if is_special(s):
        return s

    substitutions = (
        # WeChat ("微信") variants, finally shortened to '微'
        ('威', '微'),
        ('徽', '微'),
        ('徵', '微'),
        ('亻言', '信'),
        ('微新', '微信'),
        ('微信', '微'),
        # "add" ("加") variants, finally mapped to '+'
        ('咖', '加'),
        ('架', '加'),
        ('嫁', '加'),
        ('十', '加'),
        ('茄', '加'),
        ('迦', '加'),
        ('加下', '加'),
        ('加一下', '加'),
        ('加', '+'),
        # recruiting ("收人"): '活人' is reduced to '人'; tagging is done later in split1
        ('活人', '人'),
        # corps leader ("团长")
        ('圕', '团'),
        # top-up ("充")
        ('冲', '充'),
        ('直充', '充'),
        # "出"
        ('础', '出'),
        # sell ("卖"): '麦' is mapped to '出' as well
        ('麦', '出'),
    )

    for old, new in substitutions:
        s = s.replace(old, new)
    return s

In [19]:
def replace(x):
    """Apply the per-token spelling normalisation (__replace) to every token of x.

    Args:
        x: iterable of token strings.

    Returns:
        list of str, same length and order as x.
    """
    return list(map(__replace, x))

### split util

In [20]:
def split_regex(s, reg, flag):
    """Split s on the pattern reg and put flag where each match used to be.

    Args:
        s: input string.
        reg: regular expression passed to re.split.
        flag: replacement token inserted once per match.

    Returns:
        list: when the pattern does not match, the raw re.split result
        (a one-element list, possibly ['']); otherwise the pieces interleaved
        with flag, with empty pieces removed.
    """
    pieces = re.split(reg, s)
    if len(pieces) < 2:
        return pieces

    out = []
    for piece in pieces[:-1]:
        if piece:
            out.append(piece)
        if flag:
            out.append(flag)
    if pieces[-1]:
        out.append(pieces[-1])
    return out

#### split location

In [21]:
def split_location(s):
    """Replace every ``{localization:<digits>-<digits>}`` marker in s with a '<loc>' token.

    Returns:
        list of str: the text pieces around the markers interleaved with '<loc>'.
    """
    location_marker = r'{localization:[0-9]+\-[0-9]+}'
    return split_regex(s, location_marker, '<loc>')

#### split terminology

In [22]:
def __split_terminology(s, term, flag):
    """Split one token on term, inserting flag for each occurrence.

    Placeholder tags are passed through as a single-element list.
    term is used as a regular expression, not as a literal string.
    """
    if not is_special(s):
        return split_regex(s, r'%s' % term, flag)
    return [s]

In [23]:
def split_terminology(x, term, flag):
    """Split every token of x on term and flatten the result into one list.

    Args:
        x: iterable of token strings.
        term: pattern to split on.
        flag: tag inserted for each occurrence of term.

    Returns:
        list of str.
    """
    out = []
    for token in x:
        out.extend(__split_terminology(token, term, flag))
    return out

#### split coordinates

In [24]:
# TODO:

#### split num + char

In [25]:
def __split_charnum(s):
    """Cut one token into alternating runs of [0-9a-z] and everything else.

    Placeholder tags are returned as a single-element list; empty pieces
    produced by the split are discarded.
    """
    if is_special(s):
        return [s]
    pieces = re.split(r'([0-9a-z]+)', s)
    return list(filter(None, pieces))

In [26]:
def split_charnum(x):
    """Apply __split_charnum to every token of x and flatten the pieces into one list."""
    out = []
    for token in x:
        out.extend(__split_charnum(token))
    return out

#### convert num

In [27]:
def is_numeric(s):
    """Return True when s is non-empty and consists solely of the digits 0-9."""
    return re.fullmatch(r'[0-9]+', s) is not None

In [28]:
def convert_num(x):
    """Replace every all-digit token by a '<num-N>' tag, N being its digit count.

    Placeholder tags and non-numeric tokens are kept unchanged.

    Args:
        x: iterable of token strings.

    Returns:
        list of str, same length and order as x.
    """
    out = []
    for token in x:
        if not_special(token) and is_numeric(token):
            out.append('<num-%s>' % len(token))
        else:
            out.append(token)
    return out

#### convert num + char

In [29]:
def is_charnum(s):
    """Return True when s mixes lowercase letters and digits and contains nothing else.

    That is: at least one of 0-9, at least one of a-z, and no character
    outside [a-z0-9].
    """
    if re.search(r'[^a-z0-9]+', s):
        return False
    contains_digit = re.search(r'[0-9]+', s) is not None
    contains_letter = re.search(r'[a-z]+', s) is not None
    return contains_digit and contains_letter

In [30]:
def is_v_num(s):
    """Return True when s starts with 'v' immediately followed by at least one digit."""
    return re.match(r'v[0-9]+', s) is not None

In [31]:
def is_vx_num(s):
    """Return True when s starts with 'vx' immediately followed by at least one digit."""
    return re.match(r'vx[0-9]+', s) is not None

In [32]:
def is_qq_num(s):
    """Return True when s starts with 'qq' immediately followed by at least one digit."""
    return re.match(r'qq[0-9]+', s) is not None

In [33]:
def __convert_chars_numbers(s):
    """Turn contact-handle-like tokens into placeholder tokens.

    Rules, checked in this order:
      * placeholder tags are returned unchanged;
      * tokens starting with 'v', 'vx' or 'qq' followed by digits become
        ['微', '<contact>'];
      * tokens made only of lowercase letters and digits (both present)
        become ['<contact>'];
      * anything else is returned unchanged.

    Returns:
        list of str (one or two elements).
    """
    if is_special(s):
        return [s]

    if is_v_num(s) or is_vx_num(s) or is_qq_num(s):
        return ['微', '<contact>']

    if is_charnum(s):
        return ['<contact>']

    return [s]
    

In [34]:
def convert_chars_numbers(x):
    """Apply __convert_chars_numbers to every token of x and flatten the results."""
    out = []
    for token in x:
        out.extend(__convert_chars_numbers(token))
    return out

#### split char, num, chinese + special

In [35]:
def __split_normal_special(s):
    """Separate one token into its "normal" and "special" characters.

    Normal characters are ',', '+', a-z, 0-9 and the CJK range U+4E00-U+9FA5;
    everything else counts as special. Placeholder tags are treated as
    entirely normal.

    Returns:
        tuple (normal, special): two strings, each the concatenation of the
        matching character runs in their original order.
    """
    if is_special(s):
        return s, ''

    # TODO: whether , . ，。 and blanks should count as valid (normal) characters
    normal_runs = re.findall(r'[,\+a-z0-9\u4e00-\u9fa5]+', s)
    special_runs = re.findall(r'[^,\+a-z0-9\u4e00-\u9fa5]+', s)
    return ''.join(normal_runs), ''.join(special_runs)

In [36]:
def split_normal_special(x):
    """Split every token into normal and special characters and collect both kinds.

    Each token is passed through __split_normal_special; the non-empty normal
    parts and the non-empty special parts are gathered into two separate
    lists, each in input order.

    An empty input yields two empty lists (the previous zip-based unpacking
    raised ValueError in that case).

    Args:
        x: iterable of token strings.

    Returns:
        tuple (normal_tokens, special_tokens): two lists of str.
    """
    normal_tokens, special_tokens = [], []
    for token in x:
        normal, special = __split_normal_special(token)
        if normal:
            normal_tokens.append(normal)
        if special:
            special_tokens.append(special)
    return normal_tokens, special_tokens

#### split naive

In [37]:
def split_naive(s):
    """Tokenise s into single characters.

    jieba-based segmentation (lcut, lcut with cut_all=True, lcut_for_search,
    alone or combined with the character list) was tried here as an
    alternative; the plain per-character split is the one in use.

    Returns:
        list: one element per character of s.
    """
    return [ch for ch in s]

#### split once

In [38]:
def __split_once(s):
    """Tokenise one token: placeholder tags stay whole, anything else goes through split_naive."""
    return [s] if is_special(s) else split_naive(s)

In [39]:
def split_once(x):
    """Apply __split_once to every token of x and flatten the results into one list."""
    out = []
    for token in x:
        out.extend(__split_once(token))
    return out

#### stop words

In [40]:
# Single-character stop words (pronouns, particles and other common filler characters).
# This list is aliased to `stopwords` below and consumed by filter_stopwords.
stopwords1_usual = ['你', '我', '他', '她', '它', '们',
                    '吧', '吗', '嘛', '啊', '阿', '呢', '呀',
                    '的', '地', 
                    '怎', '么',
                    '那', '哪',
                    '就', '没', '了', '谢', '配', '合']

In [41]:
# Alias (same list object, not a copy) of the usual stop words.
stopwords1 = stopwords1_usual

In [42]:
# The stop-word collection actually read by filter_stopwords.
stopwords = stopwords1

In [43]:
def filter_stopwords(x):
    """Drop every token that appears in the module-level `stopwords` collection.

    Args:
        x: iterable of token strings.

    Returns:
        list of str: the remaining tokens, order preserved.
    """
    kept = []
    for token in x:
        if token in stopwords:
            continue
        kept.append(token)
    return kept

### split 1

In [44]:
def split1(s):
    """Full tokenizer: turn one raw message into a list of tokens and placeholder tags.

    Steps, in order:
      1. remove ASCII spaces, lowercase, NFKC-normalise;
      2. replace location markers with '<loc>';
      3. separate normal characters from special ones;
      4. cut letter/digit runs apart, tag pure numbers as '<num-N>' and
         handle-like runs as '<contact>';
      5. normalise evasive spellings;
      6. tag the terms 收人 / 军团 / 团长, then split the rest per character;
      7. remove stop words;
      8. append one '<special-char>' token per special character found in step 3.

    Args:
        s: raw message text.

    Returns:
        list of str.
    """
    # preprocess
    # TODO: maybe strip every kind of blank, not only ' '
    s = unicodedata.normalize('NFKC', s.replace(' ', '').lower())

    # formatted markers, then normal vs special characters
    tokens, tokens_special = split_normal_special(split_location(s))

    # user-defined letter/digit handling
    tokens = convert_chars_numbers(convert_num(split_charnum(tokens)))

    # spelling normalisation
    tokens = replace(tokens)

    # terminology tags, applied in this order, then per-character split
    for term, flag in (('收人', '<recruit>'), ('军团', '<corpus>'), ('团长', '<colonel>')):
        tokens = split_terminology(tokens, term, flag)
    tokens = filter_stopwords(split_once(tokens))

    # special characters are only counted, not kept verbatim
    special_count = len(''.join(tokens_special))
    return tokens + ['<special-char>'] * special_count

### high freq

### low freq

In [45]:
# First pass over the training text with the split1 tokenizer. This fit is used to
# derive the vocabulary (see feature_names below); min_df=1e-3 removes rare tokens.
print("Extracting features from the training data using a sparse vectorizer")

t0 = time()

vectorizer = TfidfVectorizer(sublinear_tf=True, min_df=1e-3, tokenizer=split1)
tfidf = vectorizer.fit_transform(X_train)
        
duration = time() - t0

print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration))
print("n_samples: %d, n_features: %d" % tfidf.shape)
print()

Extracting features from the training data using a sparse vectorizer
done in 1.640201s at 0.510MB/s
n_samples: 20000, n_features: 980



In [46]:
# Vocabulary kept by the first vectorizer, as a plain list of str.
# get_feature_names() was removed in newer scikit-learn releases in favour of
# get_feature_names_out(), which returns an ndarray; .tolist() keeps the list
# type that the following cells (pickling, truthiness checks) rely on.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()

In [47]:
# Persist the vocabulary list next to the model artefacts.
# The ./model directory must already exist; open() does not create it.
with open('./model/ads-detect-1-20200125.vocab', 'wb') as f:
    pickle.dump(feature_names, f)

In [48]:
# Frozen snapshot of the vocabulary at the time this cell runs. low_freq must not
# follow later rebinding of the global `feature_names` (it is narrowed by the chi2
# cell and converted to an ndarray further down), and a set gives O(1) lookups
# instead of scanning the whole list for every token.
known_tokens = frozenset(feature_names)


def low_freq(x):
    """Replace every out-of-vocabulary token by '<unk>'.

    Args:
        x: iterable of token strings.

    Returns:
        list of str, same length and order as x; tokens found in the
        vocabulary snapshot are kept, all others become '<unk>'.
    """
    return [i if i in known_tokens else '<unk>' for i in x]

### split 2

In [49]:
def split2(s):
    """Tokenise s with split1, then map out-of-vocabulary tokens to '<unk>' via low_freq.

    Args:
        s: raw message text.

    Returns:
        list of str.
    """
    return low_freq(split1(s))

### split test

In [50]:
# Keep the final token list of every training message in the frame for inspection.
data_train['tokens'] = data_train['content'].apply(split2)

## feature

In [51]:
# Second, final vectorizer: tokenises with split2, so rare tokens are already folded
# into '<unk>' and no min_df is needed.
# NOTE: `vectorizer` and `X_train` are rebound here (X_train goes from raw text to the
# TF-IDF matrix), so this cell cannot be re-run without re-running the cells above.
print("Extracting features from the training data using a sparse vectorizer")

t0 = time()

vectorizer = TfidfVectorizer(sublinear_tf=True, tokenizer=split2)
X_train = vectorizer.fit_transform(X_train)
    
    
duration = time() - t0

print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration))
print("n_samples: %d, n_features: %d" % X_train.shape)
print()

Extracting features from the training data using a sparse vectorizer
done in 3.236342s at 0.259MB/s
n_samples: 20000, n_features: 981



In [52]:
# Project the test text onto the vocabulary learned from the training set.
# X_test is rebound from raw text to the TF-IDF matrix.
print("Extracting features from the test data using the same vectorizer")

t0 = time()
X_test = vectorizer.transform(X_test)
duration = time() - t0

print("done in %fs at %0.3fMB/s" % (duration, data_test_size_mb / duration))
print("n_samples: %d, n_features: %d" % X_test.shape)
print()

Extracting features from the test data using the same vectorizer
done in 0.208551s at 0.293MB/s
n_samples: 1017, n_features: 981



In [53]:
# Persist the fitted vectorizer.
# NOTE(review): its tokenizer is the notebook-level function split2; presumably the
# loading side must provide the same function (and vocabulary) — verify before deploying.
with open('./model/ads-detect-1-20200125.emb', 'wb') as f:
    pickle.dump(vectorizer, f)

In [54]:
# Optional chi-squared feature selection; runs only when --chi2_select was given.
#
# SelectKBest.get_support(indices=True) returns the integer column indices of the
# selected features (a boolean mask is returned when indices=False).

if opts.select_chi2:
    print("Extracting %d best features by a chi-squared test" % opts.select_chi2)
    
    t0 = time()
    
    ch2 = SelectKBest(chi2, k=opts.select_chi2)
    X_train = ch2.fit_transform(X_train, y_train)
    X_test = ch2.transform(X_test)
    
    if feature_names:
        # The selected indices address columns of the matrix produced by the current
        # `vectorizer` (the split2 one, whose vocabulary also holds '<unk>'). The
        # names must therefore come from that vectorizer, not from the earlier
        # vocabulary list, which is one entry shorter and shifted.
        if hasattr(vectorizer, 'get_feature_names_out'):
            column_names = vectorizer.get_feature_names_out().tolist()
        else:
            column_names = vectorizer.get_feature_names()
        feature_names = [column_names[i] for i in ch2.get_support(indices=True)]
        
    print("done in %fs" % (time() - t0))
    print()

In [55]:
# Turn the names into an array so they can be indexed with index arrays.
# The explicit None/len check works for both a list and an ndarray, so running
# this cell a second time no longer raises on the ambiguous truth value of an array.
if feature_names is not None and len(feature_names) > 0:
    feature_names = np.asarray(feature_names)

# Classify

In [56]:
def trim(s):
    """Shorten s to at most 80 characters for terminal display.

    Strings longer than 80 characters are cut to their first 77 characters
    followed by "..."; shorter ones are returned unchanged.
    """
    if len(s) > 80:
        return s[:77] + "..."
    return s

In [57]:
def fit(clf, X_train, y_train):
    """Train clf on the given data, printing a header and the elapsed time.

    Args:
        clf: estimator exposing fit(X, y).
        X_train: training features.
        y_train: training labels.

    Returns:
        float: wall-clock training time in seconds.
    """
    print('_' * 80, "Training: ", clf, sep='\n')

    started = time()
    clf.fit(X_train, y_train)
    elapsed = time() - started

    print("train time: %0.3fs" % elapsed)
    return elapsed

In [58]:
def predict(clf, X_test):
    """Run clf.predict on X_test and report how long it took.

    Args:
        clf: fitted estimator exposing predict(X).
        X_test: features to predict on.

    Returns:
        tuple (pred, test_time): the predictions and the elapsed seconds.
    """
    started = time()
    predictions = clf.predict(X_test)
    elapsed = time() - started

    print("test time:  %0.3fs" % elapsed)
    return predictions, elapsed

In [59]:
def accuracy(y_test, pred):
    """Compute, print and return the accuracy of pred against y_test.

    Args:
        y_test: true labels.
        pred: predicted labels.

    Returns:
        The value of sklearn.metrics.accuracy_score(y_test, pred).
    """
    acc = metrics.accuracy_score(y_test, pred)
    print("accuracy:   %0.3f" % acc)
    return acc

In [60]:
def print_coef(clf, opts, feature_names):
    """Print size and density of a linear model's coefficient matrix.

    Estimators without a ``coef_`` attribute are skipped silently.

    Args:
        clf: fitted estimator.
        opts: parsed options; opts.print_top10 enables the keyword section.
        feature_names: feature names, or None.

    Returns:
        None
    """
    if not hasattr(clf, 'coef_'):
        return

    print("dimensionality: %d" % clf.coef_.shape[1])
    print("density: %f" % density(clf.coef_))

    if opts.print_top10 and feature_names is not None:
        print("top 10 keywords per class:")
        # TODO: the per-class listing is not implemented; only the header is printed.
    print()

In [61]:
def print_report(opts, y_test, pred):
    """Print sklearn's classification report when opts.print_report is truthy; otherwise do nothing."""
    if not opts.print_report:
        return
    print("classification report:")
    print(metrics.classification_report(y_test, pred))

In [62]:
def print_cm(opts, y_test, pred):
    """Print the confusion matrix when opts.print_cm is truthy; otherwise do nothing."""
    if not opts.print_cm:
        return
    print("confusion matrix:")
    print(metrics.confusion_matrix(y_test, pred))

In [63]:
def get_desc(clf):
    """Return the text of str(clf) up to (not including) its first '('.

    For an estimator whose string form is ``Name(params...)`` this is ``Name``.
    """
    head, _, _ = str(clf).partition('(')
    return head

In [64]:
def __benchmark(X_train, y_train, X_test, y_test, opts, feature_names, clf):
    """Train and evaluate one classifier, printing timings and reports along the way.

    Args:
        X_train, y_train: training features and labels.
        X_test, y_test: test features and labels.
        opts: parsed options controlling which reports are printed.
        feature_names: feature names forwarded to print_coef.
        clf: the estimator to benchmark.

    Returns:
        tuple (pred, clf_descr, score, train_time, test_time).
    """
    train_time = fit(clf, X_train, y_train)
    pred, test_time = predict(clf, X_test)
    score = accuracy(y_test, pred)

    print_coef(clf, opts, feature_names)
    for report in (print_report, print_cm):
        report(opts, y_test, pred)
    print()

    return pred, get_desc(clf), score, train_time, test_time

In [65]:
# benchmark(clf) with data, options and feature names pre-bound. The objects are
# captured as they are right now, so later rebinding of X_train / X_test (e.g. when
# the length column is appended for LightGBM) does not affect this callable.
benchmark = partial(__benchmark, X_train, y_train, X_test, y_test, opts, feature_names)

## score

In [66]:
# One (pred, clf_descr, score, train_time, test_time) tuple per benchmarked classifier.
results = []

In [67]:
# Benchmark a first batch of classifiers; `name` is only used for the printed banner.
for clf, name in (
        (RidgeClassifier(tol=1e-2, solver="sag"), "Ridge Classifier"),
        (Perceptron(max_iter=50), "Perceptron"),
        (PassiveAggressiveClassifier(max_iter=50), "Passive-Aggressive"),
        (KNeighborsClassifier(n_neighbors=10), "kNN"),
        (RandomForestClassifier(), "Random forest")):
    print('=' * 80)
    print(name)
    results.append(benchmark(clf))

Ridge Classifier
________________________________________________________________________________
Training: 
RidgeClassifier(solver='sag', tol=0.01)
train time: 0.080s
test time:  0.001s
accuracy:   0.925
dimensionality: 981
density: 1.000000

classification report:
              precision    recall  f1-score   support

          -1       0.95      0.80      0.87       313
           1       0.92      0.98      0.95       704

    accuracy                           0.93      1017
   macro avg       0.93      0.89      0.91      1017
weighted avg       0.93      0.93      0.92      1017


Perceptron
________________________________________________________________________________
Training: 
Perceptron(max_iter=50)
train time: 0.017s
test time:  0.001s
accuracy:   0.918
dimensionality: 981
density: 0.963303

classification report:
              precision    recall  f1-score   support

          -1       0.86      0.88      0.87       313
           1       0.95      0.94      0.94       7



test time:  0.666s
accuracy:   0.899
classification report:
              precision    recall  f1-score   support

          -1       0.89      0.77      0.82       313
           1       0.90      0.96      0.93       704

    accuracy                           0.90      1017
   macro avg       0.89      0.86      0.88      1017
weighted avg       0.90      0.90      0.90      1017


Random forest
________________________________________________________________________________
Training: 
RandomForestClassifier()
train time: 5.950s
test time:  0.039s
accuracy:   0.939
classification report:
              precision    recall  f1-score   support

          -1       0.95      0.85      0.90       313
           1       0.94      0.98      0.96       704

    accuracy                           0.94      1017
   macro avg       0.94      0.91      0.93      1017
weighted avg       0.94      0.94      0.94      1017




In [68]:
# Compare L2 and L1 regularisation for a linear SVM and an SGD-trained linear model.
for penalty in ["l2", "l1"]:
    print('=' * 80)
    print("%s penalty" % penalty.upper())
    # Train Liblinear model
    results.append(benchmark(LinearSVC(penalty=penalty, dual=False, tol=1e-3)))

    # Train SGD model
    results.append(benchmark(SGDClassifier(alpha=.0001, max_iter=50, penalty=penalty)))

L2 penalty
________________________________________________________________________________
Training: 
LinearSVC(dual=False, tol=0.001)
train time: 0.044s
test time:  0.001s
accuracy:   0.945
dimensionality: 981
density: 1.000000

classification report:
              precision    recall  f1-score   support

          -1       0.94      0.87      0.91       313
           1       0.95      0.98      0.96       704

    accuracy                           0.94      1017
   macro avg       0.94      0.92      0.93      1017
weighted avg       0.94      0.94      0.94      1017


________________________________________________________________________________
Training: 
SGDClassifier(max_iter=50)
train time: 0.019s
test time:  0.001s
accuracy:   0.942
dimensionality: 981
density: 0.985729

classification report:
              precision    recall  f1-score   support

          -1       0.96      0.85      0.90       313
           1       0.94      0.98      0.96       704

    accuracy     

In [69]:
# Train SGD with Elastic Net penalty and record the result
print('=' * 80)
print("Elastic-Net penalty")
results.append(benchmark(SGDClassifier(alpha=.0001, max_iter=50, penalty="elasticnet")))

Elastic-Net penalty
________________________________________________________________________________
Training: 
SGDClassifier(max_iter=50, penalty='elasticnet')
train time: 0.034s
test time:  0.000s
accuracy:   0.939
dimensionality: 981
density: 0.661570

classification report:
              precision    recall  f1-score   support

          -1       0.96      0.84      0.89       313
           1       0.93      0.98      0.96       704

    accuracy                           0.94      1017
   macro avg       0.94      0.91      0.93      1017
weighted avg       0.94      0.94      0.94      1017




In [70]:
# Train NearestCentroid without threshold and record the result
print('=' * 80)
print("NearestCentroid (aka Rocchio classifier)")
results.append(benchmark(NearestCentroid()))

NearestCentroid (aka Rocchio classifier)
________________________________________________________________________________
Training: 
NearestCentroid()
train time: 0.006s
test time:  0.001s
accuracy:   0.857
classification report:
              precision    recall  f1-score   support

          -1       0.79      0.73      0.76       313
           1       0.89      0.91      0.90       704

    accuracy                           0.86      1017
   macro avg       0.84      0.82      0.83      1017
weighted avg       0.86      0.86      0.86      1017




In [71]:
# Train sparse Naive Bayes classifiers (multinomial, Bernoulli, complement) and record each result
print('=' * 80)
print("Naive Bayes")
results.append(benchmark(MultinomialNB(alpha=.01)))
results.append(benchmark(BernoulliNB(alpha=.01)))
results.append(benchmark(ComplementNB(alpha=.1)))

Naive Bayes
________________________________________________________________________________
Training: 
MultinomialNB(alpha=0.01)
train time: 0.006s
test time:  0.000s
accuracy:   0.864
dimensionality: 981
density: 1.000000

classification report:
              precision    recall  f1-score   support

          -1       0.85      0.68      0.76       313
           1       0.87      0.94      0.91       704

    accuracy                           0.86      1017
   macro avg       0.86      0.81      0.83      1017
weighted avg       0.86      0.86      0.86      1017


________________________________________________________________________________
Training: 
BernoulliNB(alpha=0.01)
train time: 0.006s
test time:  0.001s
accuracy:   0.869
dimensionality: 981
density: 1.000000

classification report:
              precision    recall  f1-score   support

          -1       0.85      0.69      0.77       313
           1       0.87      0.95      0.91       704

    accuracy              

In [72]:
# Two-stage pipeline: an L1-penalised LinearSVC picks the features,
# an L2-penalised LinearSVC does the classification on the selected ones.
print('=' * 80)
print("LinearSVC with L1-based feature selection")
# The smaller C, the stronger the regularization.
# The more regularization, the more sparsity.
results.append(benchmark(Pipeline([
  ('feature_selection', SelectFromModel(LinearSVC(penalty="l1", dual=False, tol=1e-3))),
  ('classification', LinearSVC(penalty="l2"))])))

LinearSVC with L1-based feature selection
________________________________________________________________________________
Training: 
Pipeline(steps=[('feature_selection',
                 SelectFromModel(estimator=LinearSVC(dual=False, penalty='l1',
                                                     tol=0.001))),
                ('classification', LinearSVC())])
train time: 0.374s
test time:  0.001s
accuracy:   0.948
classification report:
              precision    recall  f1-score   support

          -1       0.95      0.88      0.91       313
           1       0.95      0.98      0.96       704

    accuracy                           0.95      1017
   macro avg       0.95      0.93      0.94      1017
weighted avg       0.95      0.95      0.95      1017




In [73]:
# LightGBM settings: binary objective, 31 leaves per tree, AUC as the reported metric.
param = {'num_leaves': 2**5-1, 'objective': 'binary'}
param['metric'] = 'auc'

# Number of boosting rounds passed to lgb.train.
num_round = 1000

In [74]:
# Shape of the TF-IDF training matrix before the length column is appended.
X_train.shape

(20000, 981)

In [75]:
# Append the raw character length of every training message as one extra feature column.
# hstack keeps the TF-IDF matrix sparse; the previous toarray() + np.concatenate
# materialised the full dense matrix just to add a single column.
# NOTE: X_train is rebound, so re-running this cell appends another length column.
length = np.expand_dims(data_train['content'].apply(len).to_numpy(), axis=1)
print(X_train.shape)
print(length.shape)
X_train = hstack([X_train, csr_matrix(length)], format='csr')
X_train.shape

(20000, 981)
(20000, 1)


(20000, 982)

In [76]:
# Append the raw character length of every test message as one extra feature column,
# mirroring what is done for the training matrix.
# hstack keeps the TF-IDF matrix sparse instead of densifying it with toarray().
# NOTE: X_test is rebound, so re-running this cell appends another length column.
length = np.expand_dims(data_test['content'].apply(len).to_numpy(), axis=1)
print(X_test.shape)
print(length.shape)
X_test = hstack([X_test, csr_matrix(length)], format='csr')
X_test.shape

(1017, 981)
(1017, 1)


(1017, 982)

In [77]:
# Train the LightGBM booster on TF-IDF + length features.
# Labels are -1 / 1 here; the training log below reports the 1 class as "positive".
train_data = lgb.Dataset(X_train, label=y_train)
bst = lgb.train(param, train_data, num_round)

# Threshold the raw prediction scores at 0.5 to get back -1 / 1 labels.
y_pred = bst.predict(X_test)
y_pred = np.where(y_pred > 0.5, 1, -1)

[LightGBM] [Info] Number of positive: 16100, number of negative: 3900
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 50997
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 982
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.805000 -> initscore=1.417843
[LightGBM] [Info] Start training from score 1.417843


In [78]:
# Per-class precision / recall / F1 of the LightGBM predictions on the test set.
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          -1       0.97      0.89      0.93       313
           1       0.95      0.99      0.97       704

    accuracy                           0.96      1017
   macro avg       0.96      0.94      0.95      1017
weighted avg       0.96      0.96      0.96      1017



In [79]:
# Persist the trained booster next to the vocabulary and vectorizer files.
bst.save_model('./model/ads-detect-1-20200125.mdl')

<lightgbm.basic.Booster at 0x7f58492bc5e0>

## efficiency

## tune

In [80]:
# Attach the LightGBM predictions to the test frame for error analysis.
data_test['pred'] = y_pred

In [81]:
# Error-analysis frame: label, prediction, raw text and the tokens the model saw.
# .copy() makes df an independent frame, so adding the 'tokens' column no longer
# triggers pandas' SettingWithCopyWarning.
df = data_test[['label', 'pred', 'content']].copy()
df['tokens'] = df['content'].apply(split2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tokens'] = df['content'].apply(split2)


In [82]:
# Misclassified rows whose true label is -1 (ads predicted as non-ads).
df1 = df[(~(df.label==df.pred)) & (df.label==-1)]
print(df1.shape)
df1

(33, 4)


Unnamed: 0,label,pred,content,tokens
61,-1,1,收个功勋号！,"[收, 个, 功, 勋, 号, <special-char>]"
122,-1,1,兄弟来新游戏试玩不？人多**好,"[兄, 弟, 来, 新, 游, 戏, 试, 玩, 不, 人, 多, 好, <special-char>, <special-char>, <special-char>]"
128,-1,1,加微信好友。,"[+, 微, 好, 友, <special-char>]"
237,-1,1,嫁一下**徽信xsw767拉你进群，领取礼包大家一起玩,"[+, 微, <contact>, 拉, 进, 群, ,, 领, 取, 礼, 包, 大, 家, 一, 起, 玩, <special-char>, <special-char>]"
241,-1,1,你那还有礼包码吗,"[还, 有, 礼, 包, 码]"
244,-1,1,你打公告我打资源号位置,"[打, 公, 告, 打, 资, 源, 号, 位, 置]"
250,-1,1,我下装备你过来，不行再下将,"[下, 装, 备, 过, 来, ,, 不, 行, 再, 下, 将]"
258,-1,1,战力9-12万v1想刷军功的私聊我,"[战, 力, <num-3>, 万, 微, <contact>, 想, 刷, 军, 功, 私, 聊, <special-char>]"
264,-1,1,不是，我说的让你们看看要不要资源,"[不, 是, ,, 说, 让, 看, 看, 要, 不, 要, 资, 源]"
314,-1,1,招手游代理或兼职，拥有个人代理后台，周结算，0加盟费，合作即可签合同，公司扶持，想了解的可以咨询我。,"[招, 手, 游, 代, 理, 或, <unk>, <unk>, ,, <unk>, 有, 个, 人, 代, 理, 后, 台, ,, <unk>, 结, 算, ,, <num-1>, +, 盟, 费, ,, 作, 即, 可, <unk>, 同, ,, 公, 司, <unk>, 持, ,, 想, 解, 可, 以, <unk>, <unk>, <special-char>]"


In [83]:
# Misclassified rows whose true label is 1 (non-ads predicted as ads).
df2 = df[(~(df.label==df.pred)) & (df.label==1)]
print(df2.shape)
df2

(8, 4)


Unnamed: 0,label,pred,content,tokens
99,1,-1,霸服军团收吴，魏，蜀国活人，来的加 1979574312私聊,"[霸, 服, <corpus>, 收, 吴, ,, 魏, ,, 蜀, 国, 人, ,, 来, +, <num-10>, 私, 聊]"
100,1,-1,新区195收人。，最强吴国军团。加weixin=13288669123,"[新, 区, <num-3>, <recruit>, ,, 最, 强, 吴, 国, <corpus>, +, <contact>, <special-char>, <special-char>, <special-char>]"
211,1,-1,能告诉我 开区啥价格 现在啥价格么,"[能, 告, 诉, 开, 区, 啥, 价, 格, 现, 在, 啥, 价, 格]"
454,1,-1,军团要整顿，加我微信，我拉你进核心群：571773352,"[<corpus>, 要, 整, <unk>, ,, +, 微, ,, 拉, 进, <unk>, 心, 群, <num-9>, <special-char>]"
513,1,-1,兄弟在吗？我是灰长小号，十下我徽KLDY0855，进交流裙，没十的一会当机器人T了,"[兄, 弟, 在, 是, <unk>, 长, 小, 号, ,, +, 微, <contact>, ,, 进, 交, 流, 裙, ,, +, 一, 会, 当, 机, 器, 人, t, <special-char>]"
519,1,-1,没有微信号了，你那有吗,"[有, 微, 号, ,, 有]"
689,1,-1,酒馆收活人，来的私聊,"[酒, <unk>, <recruit>, ,, 来, 私, 聊]"
848,1,-1,资源号,"[资, 源, 号]"
