In [None]:
import re
import pickle
import unicodedata

In [None]:
from time import time
from itertools import chain
from itertools import zip_longest
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
%run dataset.ipynb

## Load

In [None]:
%store -r data_train
%store -r X_train
%store -r data_train_size_mb

## Preprocess

In [None]:
def is_special(i):
    """Tell whether ``i`` is one of the placeholder tokens used by this tokenizer.

    A token is special when it is one of the fixed markers
    (``<unk>``, ``<loc>``, ``<contact>``, ``<recruit>``, ``<corpus>``,
    ``<colonel>``) or a number-length marker of the form ``<num-N>``.

    Returns:
        bool: True for a placeholder token, False otherwise.
    """
    if i in ('<unk>', '<loc>', '<contact>', '<recruit>', '<corpus>', '<colonel>'):
        return True

    # Number placeholders carry the digit count, e.g. '<num-11>'.
    return re.match(r'^<num-[0-9]+>$', i) is not None

In [None]:
def not_special(i):
    """Return True when ``i`` is an ordinary token, i.e. ``is_special(i)`` is false."""
    special = is_special(i)
    return not special

### replace

In [None]:
def __replace(s):
    """Normalize look-alike / sound-alike characters in a single token.

    Special placeholder tokens are returned untouched. For every other token
    the substitution rules below are applied one after another; the order is
    significant because later rules consume the output of earlier ones.

    Args:
        s: one token (string).

    Returns:
        The token with all substitutions applied.
    """
    if is_special(s):
        return s

    # (old, new) pairs, applied strictly in this order.
    rules = (
        # WeChat: fold variants into '微信', then shorten '微信' to '微'.
        ('威', '微'),
        ('徽', '微'),
        ('徵', '微'),
        ('亻言', '信'),
        ('微新', '微信'),
        ('微信', '微'),

        # "add": fold variants into '加', drop filler suffixes, then use '+'.
        ('咖', '加'),
        ('架', '加'),
        ('嫁', '加'),
        ('十', '加'),
        ('茄', '加'),
        ('迦', '加'),
        ('加下', '加'),
        ('加一下', '加'),
        ('加', '+'),

        # "recruit people": only the '活人' variant is rewritten here;
        # '收人' itself is deliberately left alone by this function.
        ('活人', '人'),

        # "regiment leader" variant character.
        ('圕', '团'),

        # "top up".
        ('冲', '充'),
        ('直充', '充'),

        # "sell / put out".
        ('础', '出'),

        # "sell" written with a sound-alike character; also mapped to '出'.
        ('麦', '出'),
    )

    for old, new in rules:
        s = s.replace(old, new)

    return s

In [None]:
def replace(x):
    """Apply ``__replace`` to every token of ``x`` and return the results as a new list."""
    normalized = []
    for token in x:
        normalized.append(__replace(token))
    return normalized

### split util

In [None]:
def split_regex(s, reg, flag):
    """Split ``s`` on ``reg`` and put ``flag`` where each match used to be.

    Args:
        s: text to split.
        reg: regular expression passed to ``re.split``.
        flag: marker inserted in place of every match.

    Returns:
        list: when nothing matched, the raw ``re.split`` result (a one-element
        list, possibly ``['']``); otherwise the pieces interleaved with
        ``flag``, with falsy entries removed.
    """
    pieces = re.split(reg, s)
    if len(pieces) <= 1:
        return pieces

    # Every piece except the last one is followed by a match, hence by a flag.
    interleaved = []
    for piece in pieces[:-1]:
        interleaved.append(piece)
        interleaved.append(flag)
    interleaved.extend(pieces[-1:])

    return [item for item in interleaved if item]

#### split location

In [None]:
def split_location(s):
    """Split ``s`` around ``{localization:<digits>-<digits>}`` tags, replacing each tag with ``<loc>``."""
    location_tag = r'{localization:[0-9]+\-[0-9]+}'
    return split_regex(s, location_tag, '<loc>')

#### split terminology

In [None]:
def __split_terminology(s, term, flag):
    """Split one token around ``term``, substituting ``flag`` for each occurrence.

    Special placeholder tokens are returned as a one-element list. ``term`` is
    handed to ``split_regex`` as a regular expression (it is not escaped).
    """
    if not is_special(s):
        return split_regex(s, r'%s' % term, flag)
    return [s]

In [None]:
def split_terminology(x, term, flag):
    """Run ``__split_terminology`` over every token of ``x`` and flatten the results into one list."""
    flattened = []
    for token in x:
        flattened.extend(__split_terminology(token, term, flag))
    return flattened

#### split coordinates

In [None]:
# TODO:

#### split num + char

In [None]:
def __split_charnum(s):
    """Cut one token into runs of ``[0-9a-z]`` and the text between those runs.

    Special placeholder tokens come back as a one-element list. Empty pieces
    produced by ``re.split`` are discarded.
    """
    if is_special(s):
        return [s]

    # The capturing group keeps the matched alphanumeric runs in the result.
    parts = re.split(r'([0-9a-z]+)', s)
    return list(filter(None, parts))

In [None]:
def split_charnum(x):
    """Apply ``__split_charnum`` to every token of ``x`` and flatten the pieces into one list."""
    flattened = []
    for token in x:
        flattened.extend(__split_charnum(token))
    return flattened

#### convert num

In [None]:
def is_numeric(s):
    """Return True when ``s`` is non-empty and made only of the characters ``0``-``9``."""
    # Any character outside 0-9 disqualifies the token.
    if re.search(r'[^0-9]+', s):
        return False

    # What remains is either empty or all digits; require at least one digit.
    return re.search(r'[0-9]+', s) is not None

In [None]:
def convert_num(x):
    """Replace purely numeric tokens by a ``<num-N>`` marker, N being the digit count.

    Tokens that are special placeholders or not numeric are passed through
    unchanged. Returns a new list.
    """
    converted = []
    for token in x:
        if not_special(token) and is_numeric(token):
            converted.append('<num-%s>' % len(token))
        else:
            converted.append(token)
    return converted

#### convert num + char

In [None]:
def is_charnum(s):
    """Return True when ``s`` mixes digits and lowercase ASCII letters and contains nothing else."""
    # Reject anything containing a character outside a-z / 0-9.
    if re.search(r'[^a-z0-9]+', s):
        return False

    contains_digit = re.search(r'[0-9]+', s) is not None
    contains_letter = re.search(r'[a-z]+', s) is not None
    return contains_digit and contains_letter

In [None]:
def is_v_num(s):
    """Return True when ``s`` begins with ``v`` immediately followed by at least one digit."""
    # re.match anchors at the start only; trailing characters are allowed.
    return re.match(r'v[0-9]+', s) is not None

In [None]:
def is_vx_num(s):
    """Return True when ``s`` begins with ``vx`` immediately followed by at least one digit."""
    # re.match anchors at the start only; trailing characters are allowed.
    return re.match(r'vx[0-9]+', s) is not None

In [None]:
def is_qq_num(s):
    """Return True when ``s`` begins with ``qq`` immediately followed by at least one digit."""
    # re.match anchors at the start only; trailing characters are allowed.
    return re.match(r'qq[0-9]+', s) is not None

In [None]:
def __convert_chars_numbers(s):
    """Map one token that looks like a contact handle to placeholder tokens.

    Returns a list:
      * ``[s]`` for special placeholder tokens;
      * ``['微', '<contact>']`` when ``s`` starts with ``v``, ``vx`` or ``qq``
        followed by digits;
      * ``['<contact>']`` when ``s`` is a letter/digit mix (``is_charnum``);
      * ``[s]`` otherwise.
    """
    if is_special(s):
        return [s]

    # NOTE(review): the 'qq' prefix yields the same '微' marker as 'v'/'vx' —
    # confirm that this is intended rather than a copy-paste leftover.
    if is_v_num(s) or is_vx_num(s) or is_qq_num(s):
        return ['微', '<contact>']

    return ['<contact>'] if is_charnum(s) else [s]

In [None]:
def convert_chars_numbers(x):
    """Apply ``__convert_chars_numbers`` to every token of ``x`` and flatten the results into one list."""
    flattened = []
    for token in x:
        flattened.extend(__convert_chars_numbers(token))
    return flattened

#### split char, num, chinese + special

In [None]:
def __split_normal_special(s):
    """Separate the "normal" characters of a token from everything else.

    Normal characters are ``,``, ``+``, ``a-z``, ``0-9`` and the CJK range
    U+4E00..U+9FA5. Special placeholder tokens are returned whole as the
    normal part.

    Returns:
        tuple: ``(normal, other)`` — two strings, each the concatenation of
        the matching runs in their original order.
    """
    if is_special(s):
        return s, ''

    # TODO: decide whether ',', '.', '，', '。' and blanks should count as normal.
    normal_runs = re.findall(r'[,\+a-z0-9\u4e00-\u9fa5]+', s)
    other_runs = re.findall(r'[^,\+a-z0-9\u4e00-\u9fa5]+', s)
    return ''.join(normal_runs), ''.join(other_runs)

In [None]:
def split_normal_special(x):
    """Split every token of ``x`` into its normal and special-character parts.

    Each token is passed to ``__split_normal_special``; the non-empty normal
    parts and the non-empty special parts are collected separately, keeping
    the input order.

    Args:
        x: iterable of tokens (strings).

    Returns:
        tuple: ``(normal_parts, special_parts)``, two lists of strings.
        An empty input yields ``([], [])`` (the previous ``zip(*r)`` unpacking
        raised ``ValueError`` in that case).
    """
    normal_parts, special_parts = [], []

    for token in x:
        normal, special = __split_normal_special(token)
        if normal:
            normal_parts.append(normal)
        if special:
            special_parts.append(special)

    return normal_parts, special_parts

#### split naive

In [None]:
def split_naive(s):
    """Split ``s`` into a list of its individual characters.

    Alternatives that were tried and left disabled: ``jieba.lcut(s)``,
    ``jieba.lcut(s, cut_all=True)``, ``jieba.lcut_for_search(s)``, and the
    latter two concatenated with ``list(s)``.
    """
    return [char for char in s]

#### split once

In [None]:
def __split_once(s):
    """Break one token into final tokens.

    Special placeholder tokens are kept whole (as a one-element list); any
    other token is handed to ``split_naive``.

    Earlier experiments — splitting the normal and special parts separately,
    merging the naive split with the raw characters, and running ``replace``
    first — are disabled in favour of the plain ``split_naive`` call.
    """
    if is_special(s):
        return [s]
    return split_naive(s)

In [None]:
def split_once(x):
    """Apply ``__split_once`` to every token of ``x`` and flatten the results into one list."""
    flattened = []
    for token in x:
        flattened.extend(__split_once(token))
    return flattened

#### stop words

In [None]:
# Single-character stop words. Adjacent string literals are concatenated and
# list() then yields one list element per character, in this order.
stopwords1_usual = list(
    '你我他她它们'      # pronouns and the plural suffix
    '吧吗嘛啊阿呢呀'    # sentence-final particles
    '的地'              # structural particles
    '怎么'              # the two characters of "how"
    '那哪'              # "that" / "which"
    '就没了谢配合'      # miscellaneous frequent characters
)

In [None]:
# Level-1 stop words: currently just an alias of (the same list object as) stopwords1_usual.
stopwords1 = stopwords1_usual

In [None]:
# Active stop-word list read by filter_stopwords; an alias of stopwords1 (no copy is made).
stopwords = stopwords1

In [None]:
def filter_stopwords(x):
    """Return the tokens of ``x`` that are not in the module-level ``stopwords`` list, order preserved."""
    kept = []
    for token in x:
        if token in stopwords:
            continue
        kept.append(token)
    return kept

### split 1

In [None]:
def split1(s):
    """Tokenize one raw message into a list of tokens (first pass).

    Pipeline:
      1. NFKC-normalize, drop ASCII spaces, lowercase.
      2. Replace ``{localization:a-b}`` tags by ``<loc>``.
      3. Separate normal characters from special characters.
      4. Split letter/digit runs, turn pure numbers into ``<num-N>`` and
         contact-like runs into ``<contact>`` (optionally preceded by '微').
      5. Normalize look-alike characters (``replace``).
      6. Replace the terms '收人', '军团', '团长' by ``<recruit>``,
         ``<corpus>``, ``<colonel>``; split everything else per character.
      7. Remove stop words.
      8. Append one ``<special-char>`` token per special character found.

    Args:
        s: raw message text (str).

    Returns:
        list: the tokens, in order.
    """
    # preprocess
    # NFKC must run first: it turns full-width / no-break spaces into ' ' and
    # may emit uppercase ASCII (e.g. from compatibility symbols), so the space
    # removal and lowercasing below have to see the normalized text. Doing it
    # last left those characters in and they were counted as special chars.
    s = unicodedata.normalize('NFKC', s)
    s = s.replace(' ', '')    # TODO: maybe all blank
    s = s.lower()

    # formatted tags
    tokens = split_location(s)

    # normal and special-char
    tokens, tokens_special = split_normal_special(tokens)

    # user defined
    tokens = split_charnum(tokens)
    tokens = convert_num(tokens)
    tokens = convert_chars_numbers(tokens)

    # link: fold character variants together
    tokens = replace(tokens)

    # split
    tokens = split_terminology(tokens, '收人', '<recruit>')
    tokens = split_terminology(tokens, '军团', '<corpus>')
    tokens = split_terminology(tokens, '团长', '<colonel>')
    tokens = split_once(tokens)

    # filter
    tokens = filter_stopwords(tokens)

    # merge tokens_special: one generic marker per special character
    # (the characters themselves are not kept).
    tokens += ['<special-char>'] * len(''.join(tokens_special))

    return tokens

### high freq

### low freq

In [None]:
# Fit a TF-IDF vectorizer on the training texts, using split1 as tokenizer,
# and report how long it took. X_train and data_train_size_mb are the values
# restored with %store -r in the "Load" section.
print("Extracting features from the training data using a sparse vectorizer")

t0 = time()

# min_df is a float, so it is a document-frequency proportion: tokens that
# appear in fewer than 0.1% of the documents are dropped from the vocabulary.
# sublinear_tf=True replaces tf by 1 + log(tf).
vectorizer = TfidfVectorizer(sublinear_tf=True, min_df=1e-3, tokenizer=split1)
tfidf = vectorizer.fit_transform(X_train)
        
duration = time() - t0

# Throughput in MB/s, then the shape of the document-term matrix.
print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration))
print("n_samples: %d, n_features: %d" % tfidf.shape)
print()

In [None]:
# Vocabulary learned by the vectorizer, as a plain list of token strings.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() replaces it but returns an ndarray, so convert it
# back to a list to keep the pickled vocabulary and low_freq() unchanged.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()

In [None]:
# Persist the learned vocabulary (feature_names) with pickle.
# The ./model directory must already exist; open() does not create it.
with open('./model/ads-detect-1-20200125.vocab', 'wb') as f:
    pickle.dump(feature_names, f)

In [None]:
def low_freq(x):
    """Replace every token of ``x`` that is absent from ``feature_names`` by ``<unk>``.

    ``feature_names`` is the module-level vocabulary taken from the fitted
    vectorizer. Returns a new list of the same length as ``x``.
    """
    mapped = []
    for token in x:
        if token in feature_names:
            mapped.append(token)
        else:
            mapped.append('<unk>')
    return mapped

### split 2

In [None]:
def split2(s):
    """Tokenize ``s`` with ``split1`` and map out-of-vocabulary tokens to ``<unk>``.

    A high-frequency filtering step (``high_freq``) is planned between the
    two stages but is currently disabled.
    """
    return low_freq(split1(s))

### split test

In [None]:
# Smoke test: tokenize every row's 'content' with split2 and store the result
# in a new 'tokens' column (this modifies data_train in place).
data_train['tokens'] = data_train['content'].apply(split2)

In [None]:
# Show the first 100 rows to eyeball the resulting tokens.
data_train.head(100)