In [32]:
import re
import json
import os
import csv
import sys

from nltk.tokenize import wordpunct_tokenize
import numpy as np
import random
from gensim.models.wrappers.fasttext import FastText as FT_wrapper

import pickle
from pathlib import Path

from pymystem3 import Mystem
import pymorphy2

morph = pymorphy2.MorphAnalyzer()
m = Mystem()


def mystem_analyze(str):
    global m
    try:
        return m.analyze(str)
    except BrokenPipeError as ex:
        m = Mystem()
        return mystem_analyze(str)

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline

csv.field_size_limit(sys.maxsize)

DATASETS = Path('~/data/taskdialog').expanduser()
MODELS = Path('~/data/taskdialog/models').expanduser()

In [2]:
negatives = []

In [3]:
def preprocess_forum(f):
    lines = []
    for line in f:        
        t = '\n'.join(' '.join(wordpunct_tokenize(l)) for l in json.loads(line)['text'].strip().splitlines() if l.strip())
        if t:
            lines.append(t)
    return lines
    

In [5]:
with (DATASETS / 'forummoskva.jsonl').open() as f:
    texts = preprocess_forum(f)    
    negatives.extend(texts)
    print(len(texts))

1238816


In [6]:
with (DATASETS / 'dromru.jsonl').open() as f:
    texts = preprocess_forum(f)    
    negatives.extend(texts)
    print(len(texts))    

15722973


In [7]:
with (DATASETS / 'vashdomru.jsonl').open() as f:
    texts = preprocess_forum(f)    
    negatives.extend(texts)  
    print(len(texts))  # 300k

65593


In [8]:
with (DATASETS / 'antiwomenru.jsonl').open() as f:
    texts = preprocess_forum(f)    
    negatives.extend(texts)  
    print(len(texts))  # 1.8m

111866


In [9]:
with (DATASETS / 'womenru.jsonl').open() as f:
    texts = preprocess_forum(f)    
    negatives.extend(texts)  
    print(len(texts))

13025440


In [10]:
with (DATASETS / 'dataset.tsv').open('w') as f:
    csvw = csv.writer(f, delimiter='\t')    
    for line in negatives:
        csvw.writerow([0, line])   

del negatives

In [17]:
positives = []
with (DATASETS / 'youdo.txt').open() as f:
    csvr = csv.reader(f, delimiter=',')
    for row in csvr:
        t = '\n'.join(' '.join(wordpunct_tokenize(l)) for l in row[1].strip().splitlines() if l)
        positives.append(t)

In [13]:
with (DATASETS / 'dataset.tsv').open('a') as f:
    csvw = csv.writer(f, delimiter='\t')  
    for line in positives:
        csvw.writerow([1, line])      
        
del positives

In [4]:
data = []
with (DATASETS / 'dataset.tsv').open() as f:
    csvr = csv.reader(f, delimiter='\t')
    for label, text in csvr:
        data.append((label, text))
        
random.shuffle(data)

In [31]:
# with (DATASETS / 'dataset.txt').open('w') as f:
#     for label, text in data:
#         print(text, file=f)

In [33]:
def to_imperative(word):
    try:
        p = [pm for pm in morph.parse(word) if 'INFN' in pm.tag][0]
    except IndexError as ex:
        return
    try:
        sing = p.inflect({'VERB', 'perf', 'impr', 'excl', 'plur'}).word
        plur = p.inflect({'VERB', 'perf', 'impr', 'excl'}).word
        return (sing, plur)
    except AttributeError as ex:
        return
    
assert to_imperative('Совершить') == ('совершите', 'соверши')
    
words_of_need = {'необходимо', 'нужно'}

def get_imperative_variants(text):
    try:
        words = []
        isinf = []
        for tok in mystem_analyze(text.lower()):
            if 'analysis' in tok:
                if len(tok['analysis']) >= 1:
                    gram = tok['analysis'][0]['gr']
                    infinitive = 'инф' in gram and 'V' in gram
                    w = tok['text']
                    words.append(w)
                    isinf.append(infinitive)
        if max(isinf) == False:
            return text,
        if isinf[0]:
            variants = []
            for w, v in zip(words, isinf):
                if v:
                    imp = to_imperative(w)
                    variants.append(imp or (w, w))
                else:
                    variants.append((w, w))
            sing, plur = zip(*variants)
            return text, ' '.join(sing), ' '.join(plur)
        elif words_of_need.intersection(words):
            prev_word = ''
            variants = []
            used_imperative = False
            for w, v in zip(words, isinf):
                if v and prev_word in words_of_need:
                    imp = to_imperative(w)
                    if imp:
                        variants.pop()
                        variants.append(imp)
                        used_imperative = True
                    else:
                        variants.append((w, w))

                else:
                    variants.append((w, w))
                prev_word = w

            if used_imperative:
                sing, plur = zip(*variants)
                return text, ' '.join(sing), ' '.join(plur)
    except ValueError as ex:
        pass
    return text,
        
assert get_imperative_variants('нужно сделать хорошо')[1] == 'сделайте хорошо'
assert get_imperative_variants('повертеть попой')[2] == 'поверти попой'
assert len(get_imperative_variants('хорошо сделать')) == 1

In [None]:
train_n = int(len(data) * 0.8)
count = 0
with (DATASETS / 'train2.tsv').open('w') as f:
    csvw = csv.writer(f, delimiter='\t')
    for label, text in data[:train_n]:
        count += 1
        if count % 20000 == 0:
            print(count)
        if label == '1':
            for augmented_text in get_imperative_variants(text):
                csvw.writerow([label, augmented_text])
        else:
            csvw.writerow([label, augmented_text])

# with (DATASETS / 'test.tsv').open('w') as f:
#     csvw = csv.writer(f, delimiter='\t')
#     for label, text in data[train_n:]:
#         csvw.writerow([label, text])



20000
40000
60000
80000
100000
120000
140000
160000
180000
200000
220000
240000
260000
280000
300000
320000
340000
360000
380000
400000
420000
440000
460000
480000
500000
520000
540000
560000
580000
600000
620000
640000
660000
680000
700000
720000
740000
760000
780000
800000
820000
840000
860000
880000
900000
920000
940000
960000
980000
1000000
1020000
1040000
1060000
1080000
1100000
1120000
1140000
1160000
1180000
1200000
1220000
1240000
1260000
1280000
1300000
1320000
1340000
1360000
1380000
1400000
1420000
1440000
1460000
1480000
1500000
1520000
1540000
1560000
1580000
1600000
1620000
1640000
1660000
1680000
1700000
1720000
1740000
1760000
1780000
1800000
1820000


In [28]:
text

'Есть ложь , есть большая ложь , есть статистика , а есть реклама .'

In [5]:
train_X = []
train_y = []
with open('train.tsv') as f:
    csvr = csv.reader(f, delimiter='\t')
    for label, text in csvr:
        train_X.append(text)
        train_y.append(int(label))
        
train_y = np.array(train_y)

In [7]:
tfidf_path = MODELS / 'tfidf'
if tfidf_path.exists():
    with tfidf_path.open('rb') as f:
        tfidf = pickle.load(f)
else:
    tfidf = TfidfVectorizer()
    train_X = tfidf.fit(train_X)
    with tfidf_path.open('wb') as f:
        pickle.dump(tfidf, f)

train_X = tfidf.transform(train_X)

In [9]:
%%time

m = LinearSVC()
m.fit(train_X, train_y)

CPU times: user 2min 41s, sys: 22.8 s, total: 3min 3s
Wall time: 3min 3s


In [11]:
pipe = Pipeline([('vectorizer', tfidf),
                 ('classifier', m)])
with (MODELS / 'classifier.pickle').open('wb') as f:
    pickle.dump(pipe, f)

In [13]:
test_X = []
test_y = []
with (DATASETS / 'test.tsv').open() as f:
    csvr = csv.reader(f, delimiter=',')
    csv.read
    for label, text in csvr:
        test_X.append(text)
        test_y.append(int(label))
        
test_y = np.array(test_y)
test_X = tfidf.transform(test_X)

ValueError: invalid literal for int() with base 10: 'istask'

In [None]:
print('Accuracy  ', (m.predict(test_X) == test_y).mean(), (m.predict(train_X) == train_y).mean(), sep='\t')
test_pos = test_y == 1
train_pos = train_y == 1
print('Pos Accuracy', (m.predict(train_X[train_pos]) == train_y[train_pos]).mean(), (m.predict(test_X[test_pos]) == test_y[test_pos]).mean(), sep='\t')
print('Neg Accuracy', (m.predict(train_X[~train_pos]) == train_y[~train_pos]).mean(), (m.predict(test_X[~test_pos]) == test_y[~test_pos]).mean(), sep='\t')

# 0.1


# 1
#   Accuracy	0.9966766041670294	0.9979463617801813
# Pos Accuracy	0.974006711709939	0.9588633251535406
# Neg Accuracy	0.9993740346497515	0.9989338185724046

# 10
#   Accuracy  	0.9963909735848226	0.9986286294275131
# Pos Accuracy	0.9831874787121008	0.9592863243807536
# Neg Accuracy	0.9995494829858005	0.9986058872573433

In [33]:
model_wrapper = FT_wrapper.train('/home/marat/fastText-0.1.0/fasttext', corpus_file=str(DATASETS / 'dataset.txt'))

In [36]:
model_wrapper.save('/home/marat/data/taskdialog/models/fasttext.bin')

In [18]:
model_wrapper.most_similar('=(')

[('Юлус', 0.93408203125),
 ('=(,', 0.8703312277793884),
 ('=("', 0.869441032409668),
 (';(', 0.8526694178581238),
 (':(:(:("', 0.8471227288246155),
 (':(', 0.8459991216659546),
 ('дальше:(', 0.8446086049079895),
 (':("', 0.8394126296043396),
 ('((', 0.838272213935852),
 (':(:(:(', 0.8340241312980652)]

In [47]:
train_n = int(len(data) * 0.8)

with (DATASETS / 'train.csv').open('w') as f:
    csvw = csv.writer(f)
    csvw.writerow(['istask', 'request'])
    for label, text in data[:train_n]:
        csvw.writerow([label, text])

with (DATASETS / 'test.csv').open('w') as f:
    csvw = csv.writer(f)
    csvw.writerow(['istask', 'request'])
    for label, text in data[train_n:]:
        csvw.writerow([label, text])

In [46]:
!nvidia-smi

Tue Feb 13 01:23:16 2018       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 390.12                 Driver Version: 390.12                    |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla P100-SXM2...  On   | 00000000:06:00.0 Off |                    0 |
| N/A   28C    P0    42W / 300W |      2MiB / 16280MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  Tesla P100-SXM2...  On   | 00000000:07:00.0 Off |                    0 |
| N/A   34C    P0    42W / 300W |   2011MiB / 16280MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   2  Tesla P100-SXM2...  On   | 00000000:0A:00.0 Off |                    0 |
| N/A   