In [15]:
import re
import json
import os
import csv
import sys

from multiprocessing import Pool

from nltk.tokenize import wordpunct_tokenize
import numpy as np
import random
from gensim.models.wrappers.fasttext import FastText as FT_wrapper

import pickle
from pathlib import Path

from pymystem3 import Mystem
import pymorphy2

morph = pymorphy2.MorphAnalyzer()
m = Mystem()


def mystem_analyze(str):
    global m
    try:
        return m.analyze(str)
    except BrokenPipeError as ex:
        m = Mystem()
        return mystem_analyze(str)

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline

csv.field_size_limit(sys.maxsize)

DATASETS = Path('~/data/taskdialog').expanduser()
MODELS = Path('~/data/taskdialog/models').expanduser()

In [51]:
negatives = []

In [57]:
def preprocess_forum(f):
    i = 0
    for line in f:
        i += 1
        t = '\n'.join(' '.join(wordpunct_tokenize(l)) for l in json.loads(line)['text'].strip().splitlines() if l.strip())
        if t and len(t) < 4000:
            yield t
        if i % 1000000 == 0:
            print(i)


In [53]:
%%time
with (DATASETS / 'forummoskva.jsonl').open() as f:
    texts = preprocess_forum(f)    
    negatives.extend(texts)

1000000
CPU times: user 29.8 s, sys: 1.2 s, total: 31 s
Wall time: 31 s


In [58]:
%%time
with (DATASETS / 'dromru.jsonl').open() as f:
    texts = preprocess_forum(f)    
    negatives.extend(texts)

1000000
2000000
3000000
4000000
5000000
6000000
7000000
8000000
9000000
10000000
11000000
12000000
13000000
14000000
15000000
16000000
CPU times: user 7min 16s, sys: 20.7 s, total: 7min 37s
Wall time: 7min 37s


In [54]:
%%time
with (DATASETS / 'vashdomru.jsonl').open() as f:
    texts = preprocess_forum(f)    
    negatives.extend(texts)  
#     print(len(texts))  # 300k

CPU times: user 4.67 s, sys: 384 ms, total: 5.05 s
Wall time: 5.05 s


In [55]:
%%time
with (DATASETS / 'antiwomenru.jsonl').open() as f:
    texts = preprocess_forum(f)    
    negatives.extend(texts)  
#     print(len(texts))  # 1.8m

CPU times: user 5.96 s, sys: 286 ms, total: 6.25 s
Wall time: 6.25 s


In [56]:
%%time
with (DATASETS / 'womenru.jsonl').open() as f:
    texts = preprocess_forum(f)    
    negatives.extend(texts)  
#     print(len(texts))

1000000
2000000
3000000
4000000
5000000
6000000
7000000
8000000
9000000
10000000
11000000
12000000
13000000
CPU times: user 6min 29s, sys: 14.7 s, total: 6min 43s
Wall time: 6min 43s


In [59]:
print(len(negatives))
random.shuffle(negatives)

30147889


In [16]:
def to_imperative(word):
    try:
        p = [pm for pm in morph.parse(word) if 'INFN' in pm.tag][0]
    except IndexError as ex:
        return
    try:
        sing = p.inflect({'VERB', 'perf', 'impr', 'excl', 'plur'}).word
        plur = p.inflect({'VERB', 'perf', 'impr', 'excl'}).word
        return (sing, plur)
    except AttributeError as ex:
        return
    
assert to_imperative('Совершить') == ('совершите', 'соверши')
    
words_of_need = {'необходимо', 'нужно', 'требуется'}

def get_imperative_variants(text):
    try:
        words = []
        isinf = []
        for tok in mystem_analyze(text.lower()):
            if 'analysis' in tok:
                if len(tok['analysis']) >= 1:
                    gram = tok['analysis'][0]['gr']
                    infinitive = 'инф' in gram and 'V' in gram
                    w = tok['text']
                    words.append(w)
                    isinf.append(infinitive)
        if max(isinf) == False:
            return text,
        if isinf[0]:
            variants = []
            for w, v in zip(words, isinf):
                if v:
                    imp = to_imperative(w)
                    variants.append(imp or (w, w))
                else:
                    variants.append((w, w))
            sing, plur = zip(*variants)
            return text, ' '.join(sing), ' '.join(plur)
        elif words_of_need.intersection(words):
            prev_word = ''
            variants = []
            used_imperative = False
            for w, v in zip(words, isinf):
                if v and prev_word in words_of_need:
                    imp = to_imperative(w)
                    if imp:
                        variants.pop()
                        variants.append(imp)
                        used_imperative = True
                    else:
                        variants.append((w, w))

                else:
                    variants.append((w, w))
                prev_word = w

            if used_imperative:
                sing, plur = zip(*variants)
                return text, ' '.join(sing), ' '.join(plur)
    except ValueError as ex:
        pass
    return text,
        
assert get_imperative_variants('нужно сделать хорошо')[1] == 'сделайте хорошо'
assert get_imperative_variants('повертеть попой')[2] == 'поверти попой'
assert len(get_imperative_variants('хорошо сделать')) == 1

In [11]:
%%time

positives = []
with (DATASETS / 'youdo.txt').open() as f:
    csvr = csv.reader(f, delimiter=',')
    for row in csvr:
        t = '\n'.join(' '.join(wordpunct_tokenize(l)) for l in row[1].strip().splitlines() if l)
        positives.append((row[0], t))
        
with (DATASETS / 'fl.csv').open() as f:
    c = csv.reader(f)
    positives.extend(c)  
    
random.shuffle(positives)

train_tasks_n = int(len(positives) * 0.8)

positives_train = positives[:train_tasks_n]
positives_test = positives[train_tasks_n:]

CPU times: user 1min 14s, sys: 3.6 s, total: 1min 18s
Wall time: 1min 18s


In [60]:
len(positives)

4162153

In [18]:
%%time
with Pool(processes=40) as pool:
    augmented_body = pool.map(get_imperative_variants, (body for title, body in positives_train))

CPU times: user 24.6 s, sys: 24.6 s, total: 49.3 s
Wall time: 4min 53s


In [21]:
%%time

with (DATASETS / 'tasks_train.csv').open('w') as f:
    c = csv.writer(f)
    for (title, body), augs in zip(positives_train, augmented_body):
        for a in augs:
            c.writerow([title, a])

with (DATASETS / 'tasks_test.csv').open('w') as f:
    c = csv.writer(f)
    for title, body in positives_test:
        c.writerow([title, body])

CPU times: user 59.2 s, sys: 6.5 s, total: 1min 5s
Wall time: 1min 6s


In [69]:
augmented_body[2][0]

'Пропала кошка в Красногорске . Район Медучилище . Срочно нужно пару человек на поиски . Подробности по телефону . За нахождение отдельное вознаграждение . Поиск будет вестись в окрестностях указанного адреса .\nБолее подробно условия задания обсудим с исполнителем .\nВ предложениях указывайте сроки , когда сможете выполнить задание и цену за работу .\nЖду Ваших предложений !'

In [70]:
%%time

negatives_train_n = int(len(negatives) * 0.8)
train = [(0, body) for body in negatives[:negatives_train_n]] + [(1, body[0]) for body in augmented_body]
random.shuffle(train)

with (DATASETS / 'train3.tsv').open('w') as f:
    csvw = csv.writer(f, delimiter='\t')
    for label, body in train:
        csvw.writerow([label, body])

        
test = [(0, body) for body in negatives[negatives_train_n:]] + [(1, body) for title, body in positives_test]
random.shuffle(test)
with (DATASETS / 'test3.tsv').open('w') as f:
    csvw = csv.writer(f, delimiter='\t')
    for label, body in test:
        csvw.writerow([label, body])


CPU times: user 3min 38s, sys: 26.7 s, total: 4min 4s
Wall time: 4min 5s


In [78]:
train_text = []
train_y = []
with (DATASETS / 'train3.tsv').open() as f:
    c = csv.reader(f, delimiter='\t')
    for label, body in c:
        train_text.append(body[:500])
        train_y.append(int(label))

test_text = []
test_y = []
with (DATASETS / 'test3.tsv').open() as f:
    c = csv.reader(f, delimiter='\t')
    for label, body in c:
        test_text.append(body[:500])
        test_y.append(int(label))

In [31]:
# with (DATASETS / 'dataset.txt').open('w') as f:
#     for label, text in data:
#         print(text, file=f)

In [None]:
# train_n = int(len(data) * 0.8)
# count = 0
# with (DATASETS / 'train2.tsv').open('w') as f:
#     csvw = csv.writer(f, delimiter='\t')
#     for label, text in data[:train_n]:
#         count += 1
#         if count % 20000 == 0:
#             print('{:.2f}%'.format(count/train_n*100))
#         if label == '1':
#             for augmented_text in get_imperative_variants(text):
#                 csvw.writerow([label, augmented_text])
#         else:
#             csvw.writerow([label, text])

# with (DATASETS / 'test.tsv').open('w') as f:
#     csvw = csv.writer(f, delimiter='\t')
#     for label, text in data[train_n:]:
#         csvw.writerow([label, text])



In [3]:
# train_X = []
# train_y = []
# with (DATASETS / 'train2.tsv').open() as f:
#     csvr = csv.reader(f, delimiter='\t')
#     for label, text in csvr:
#         train_X.append(text)
#         train_y.append(int(label))
        
# train_y = np.array(train_y)

In [123]:
stopwords = '''форумчане
форумчане
simplexman
saitov
дрома
drom
quote
дром
професcиональный
рекомендую
советую
взаимосвязями
тебе
загляни
youdo
фрилансеры'''.split('\n')

In [73]:
%%time
tfidf_path = MODELS / 'tfidf4.pickle'
if tfidf_path.exists():
    with tfidf_path.open('rb') as f:
        tfidf = pickle.load(f)
else:
    tfidf = TfidfVectorizer(min_df=5, stop_words=stopwords)
    tfidf.fit(train_text)
    with tfidf_path.open('wb') as f:
        pickle.dump(tfidf, f)

train_X = tfidf.transform(train_text)
test_X = tfidf.transform(test_text)

CPU times: user 12min, sys: 51.8 s, total: 12min 52s
Wall time: 12min 52s


In [74]:
%%time

classifier = LinearSVC()
classifier.fit(train_X, train_y)

CPU times: user 3min 23s, sys: 31 s, total: 3min 54s
Wall time: 3min 54s


In [75]:
pipe = Pipeline([('vectorizer', tfidf),
                 ('classifier', classifier)])
with (MODELS / 'classifier3.pickle').open('wb') as f:
    pickle.dump(pipe, f)

In [81]:
print('Accuracy  ', (classifier.predict(test_X) == test_y).mean(), (classifier.predict(train_X) == train_y).mean(), sep='\t')
test_pos = test_y == 1
train_pos = train_y == 1
print('Pos Accuracy', (classifier.predict(train_X[train_pos]) == train_y[train_pos]).mean(), (classifier.predict(test_X[test_pos]) == test_y[test_pos]).mean(), sep='\t')
print('Neg Accuracy', (classifier.predict(train_X[~train_pos]) == train_y[~train_pos]).mean(), (classifier.predict(test_X[~test_pos]) == test_y[~test_pos]).mean(), sep='\t')

# 1
#   Accuracy	0.9966766041670294	0.9979463617801813
# Pos Accuracy	0.974006711709939	0.9588633251535406
# Neg Accuracy	0.9993740346497515	0.9989338185724046

Accuracy  	0.993786222081609	0.9956866854539267
Pos Accuracy	1.0	1.0
Neg Accuracy	1.0	1.0


In [114]:
id2words = {i: w for w, i in tfidf.vocabulary_.items()}
words = []
for i in range(classifier.coef_.shape[1]):
    words.append((id2words[i], classifier.coef_[0][i]))

In [117]:
importances = sorted(words, key=lambda x: x[1], reverse=True)

In [122]:
importances[:1000]

[('youdo', 8.941281726674383),
 ('исполнителем', 6.597834953856674),
 ('предложений', 6.162757613149846),
 ('доставить', 5.366664548092651),
 ('лендинг', 5.205851255482394),
 ('фрилансеры', 5.13254119809091),
 ('отвезти', 4.965206489235887),
 ('исполнителю', 4.91407563823814),
 ('необходимо', 4.894193876437587),
 ('логотип', 4.744787408945254),
 ('отрисовать', 4.743880283574469),
 ('тз', 4.632125242699361),
 ('тематика', 4.590889820758714),
 ('сверстать', 4.561210322145058),
 ('спарсить', 4.538011061704241),
 ('yakooobin', 4.391137763061858),
 ('разработать', 4.358539638293091),
 ('dle', 4.3530010165252495),
 ('аттаче', 4.319615148632556),
 ('скрипт', 4.256167282179173),
 ('задание', 4.2483603203128535),
 ('joomla', 4.204054256900099),
 ('парсер', 4.141631169310876),
 ('opencart', 4.136287353513765),
 ('одностраничник', 4.121287481554456),
 ('админке', 4.1155830380705964),
 ('юду', 4.105792659608459),
 ('modx', 4.100047233960297),
 ('сбр', 4.04687668328223),
 ('логотипе', 4.04687021272

In [33]:
model_wrapper = FT_wrapper.train('/home/marat/fastText-0.1.0/fasttext', corpus_file=str(DATASETS / 'dataset.txt'))

In [36]:
model_wrapper.save('/home/marat/data/taskdialog/models/fasttext.bin')

In [18]:
model_wrapper.most_similar('=(')

[('Юлус', 0.93408203125),
 ('=(,', 0.8703312277793884),
 ('=("', 0.869441032409668),
 (';(', 0.8526694178581238),
 (':(:(:("', 0.8471227288246155),
 (':(', 0.8459991216659546),
 ('дальше:(', 0.8446086049079895),
 (':("', 0.8394126296043396),
 ('((', 0.838272213935852),
 (':(:(:(', 0.8340241312980652)]

In [47]:
train_n = int(len(data) * 0.8)

with (DATASETS / 'train.csv').open('w') as f:
    csvw = csv.writer(f)
    csvw.writerow(['istask', 'request'])
    for label, text in data[:train_n]:
        csvw.writerow([label, text])

with (DATASETS / 'test.csv').open('w') as f:
    csvw = csv.writer(f)
    csvw.writerow(['istask', 'request'])
    for label, text in data[train_n:]:
        csvw.writerow([label, text])

In [46]:
!nvidia-smi

Tue Feb 13 01:23:16 2018       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 390.12                 Driver Version: 390.12                    |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla P100-SXM2...  On   | 00000000:06:00.0 Off |                    0 |
| N/A   28C    P0    42W / 300W |      2MiB / 16280MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  Tesla P100-SXM2...  On   | 00000000:07:00.0 Off |                    0 |
| N/A   34C    P0    42W / 300W |   2011MiB / 16280MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   2  Tesla P100-SXM2...  On   | 00000000:0A:00.0 Off |                    0 |
| N/A   