In [1]:
import re
import json
import os
import csv
import sys

from nltk.tokenize import wordpunct_tokenize
import numpy as np
import random
from gensim.models.wrappers.fasttext import FastText as FT_wrapper

import pickle
from pathlib import Path

from pymystem3 import Mystem
import pymorphy2

# Shared morphological analysers: `morph` (pymorphy2) is used by to_imperative,
# `m` (Mystem) is used by mystem_analyze.
morph = pymorphy2.MorphAnalyzer()
# NOTE(review): the classifier-training cell further down rebinds the global `m`
# to a LinearSVC, after which mystem_analyze can no longer work in the same kernel.
m = Mystem()


def mystem_analyze(str, max_restarts=3):
    """Analyse text with the shared Mystem instance, restarting it if its pipe breaks.

    Args:
        str: text to analyse (the name shadows the builtin but is kept for
            backward compatibility with keyword callers).
        max_restarts: how many times a fresh ``Mystem()`` may be created after a
            ``BrokenPipeError`` before giving up.

    Returns:
        Whatever ``m.analyze(str)`` returns.

    Raises:
        BrokenPipeError: if the analyser still fails after ``max_restarts`` restarts.
            (Previously the function recursed without bound in that situation.)
    """
    global m
    for attempt in range(max_restarts + 1):
        try:
            return m.analyze(str)
        except BrokenPipeError:
            if attempt == max_restarts:
                raise
            # Replace the module-level analyser with a fresh one and retry.
            m = Mystem()

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline

# Raise the csv module's per-field size limit to sys.maxsize so long texts can be read back.
csv.field_size_limit(sys.maxsize)

# Root directory of the raw/derived datasets and directory for trained models
# (both under the current user's home).
DATASETS = Path('~/data/taskdialog').expanduser()
MODELS = Path('~/data/taskdialog/models').expanduser()

In [2]:
# Accumulator for forum texts; they are later written to dataset.tsv with label 0.
negatives = []

In [3]:
def preprocess_forum(f):
    """Tokenise the ``text`` field of every JSON line in ``f``.

    Each input line is parsed as JSON. Its ``text`` value is stripped and split
    into lines; blank lines are dropped and every remaining line is re-joined
    from its ``wordpunct_tokenize`` tokens with single spaces. Posts that end up
    empty are skipped.

    Returns:
        list of str, one newline-joined tokenised text per non-empty post.
    """
    def _tokenize_post(raw_text):
        non_blank_rows = (row for row in raw_text.strip().splitlines() if row.strip())
        return '\n'.join(' '.join(wordpunct_tokenize(row)) for row in non_blank_rows)

    tokenized_posts = (_tokenize_post(json.loads(record)['text']) for record in f)
    return [post for post in tokenized_posts if post]
    

In [5]:
# forummoskva dump: tokenise every post and add the texts to the negative pool.
with (DATASETS / 'forummoskva.jsonl').open() as f:
    texts = preprocess_forum(f)
negatives.extend(texts)
print(len(texts))

1238816


In [6]:
# dromru dump: tokenise every post and add the texts to the negative pool.
with (DATASETS / 'dromru.jsonl').open() as f:
    texts = preprocess_forum(f)
negatives.extend(texts)
print(len(texts))

15722973


In [7]:
# vashdomru dump: tokenise every post and add the texts to the negative pool.
with (DATASETS / 'vashdomru.jsonl').open() as f:
    texts = preprocess_forum(f)
negatives.extend(texts)
# NOTE(review): an earlier annotation here said "300k", but the saved output of this
# cell shows 65593 - confirm which figure is current.
print(len(texts))

65593


In [8]:
# antiwomenru dump: tokenise every post and add the texts to the negative pool.
with (DATASETS / 'antiwomenru.jsonl').open() as f:
    texts = preprocess_forum(f)
negatives.extend(texts)
# NOTE(review): an earlier annotation here said "1.8m", but the saved output of this
# cell shows 111866 - confirm which figure is current.
print(len(texts))

111866


In [9]:
# womenru dump: tokenise every post and add the texts to the negative pool.
with (DATASETS / 'womenru.jsonl').open() as f:
    texts = preprocess_forum(f)
negatives.extend(texts)
print(len(texts))

13025440


In [10]:
# Start dataset.tsv from scratch with every negative text as a (0, text) row,
# then drop the in-memory list.
with (DATASETS / 'dataset.tsv').open('w') as f:
    csv.writer(f, delimiter='\t').writerows([0, line] for line in negatives)

del negatives

In [17]:
# Load the positive examples: the second CSV column of youdo.txt, tokenised the same
# way preprocess_forum tokenises forum posts.
positives = []
# newline='' lets the csv module handle newlines inside quoted fields itself,
# as the csv documentation recommends.
with (DATASETS / 'youdo.txt').open(newline='') as f:
    csvr = csv.reader(f, delimiter=',')
    for row in csvr:
        if len(row) < 2:
            # Blank or truncated row: there is no text column to read.
            continue
        # Drop whitespace-only lines (not just empty ones) so the result matches
        # what preprocess_forum produces for the negative class.
        t = '\n'.join(' '.join(wordpunct_tokenize(l)) for l in row[1].strip().splitlines() if l.strip())
        if t:
            # An empty text would otherwise become a label-1 training example.
            positives.append(t)

In [13]:
# Append every positive text to dataset.tsv as a (1, text) row, then drop the list.
# The file is opened in append mode, so running this cell twice duplicates the rows.
with (DATASETS / 'dataset.tsv').open('a') as f:
    csv.writer(f, delimiter='\t').writerows([1, line] for line in positives)

del positives

In [2]:
# Fixed seed so the shuffle, and therefore the 80/20 train/test split derived from
# `data`, is the same after every kernel restart. Without it, cached artefacts fitted
# on an earlier split can end up evaluated against a different one.
SHUFFLE_SEED = 42

# Read the combined dataset back as (label, text) pairs; labels stay strings here.
data = []
# newline='' lets the csv module handle newlines inside quoted fields itself.
with (DATASETS / 'dataset.tsv').open(newline='') as f:
    csvr = csv.reader(f, delimiter='\t')
    for label, text in csvr:
        data.append((label, text))

# A private Random instance keeps the global `random` state untouched.
random.Random(SHUFFLE_SEED).shuffle(data)

In [31]:
# with (DATASETS / 'dataset.txt').open('w') as f:
#     for label, text in data:
#         print(text, file=f)

In [5]:
def to_imperative(word):
    """Build imperative forms of an infinitive using the shared pymorphy2 analyser.

    Returns:
        A ``(plural, singular)`` tuple of imperative word forms, or ``None`` when
        ``word`` has no parse tagged ``INFN`` or when either inflection yields an
        object without a ``.word`` attribute.
    """
    imperative_grammemes = {'VERB', 'perf', 'impr', 'excl'}
    try:
        infinitive_parses = [parse for parse in morph.parse(word) if 'INFN' in parse.tag]
        infinitive = infinitive_parses[0]
    except IndexError:
        # No infinitive reading for this word.
        return None
    try:
        plural_form = infinitive.inflect(imperative_grammemes | {'plur'}).word
        singular_form = infinitive.inflect(set(imperative_grammemes)).word
    except AttributeError:
        # inflect() gave back something without `.word` (presumably None when the
        # requested form does not exist).
        return None
    return (plural_form, singular_form)
    
# Regression check; note the returned order is (plural form, singular form).
assert to_imperative('Совершить') == ('совершите', 'соверши')
    
# Modal "need" words; an infinitive directly after one of them can be turned into an imperative.
words_of_need = {'необходимо', 'нужно'}


def get_imperative_variants(text):
    """Return ``text`` plus, when possible, two imperative paraphrases of it.

    The lower-cased text is analysed with ``mystem_analyze``; only tokens that have
    at least one analysis take part, so the paraphrases contain just those words
    joined by single spaces. Two patterns are rewritten:

    * the first analysed word is an infinitive: every infinitive is replaced by
      its imperative (left as is when ``to_imperative`` gives nothing);
    * a word of need is directly followed by an infinitive: the pair is replaced
      by the imperative alone.

    Returns:
        ``(text,)`` when nothing applies, otherwise ``(text, variant_a, variant_b)``
        where the two variants follow the element order of ``to_imperative``.
    """
    try:
        analysed = []
        for token in mystem_analyze(text.lower()):
            if 'analysis' not in token or len(token['analysis']) < 1:
                continue
            grammar = token['analysis'][0]['gr']
            analysed.append((token['text'], 'инф' in grammar and 'V' in grammar))

        words = [word for word, _ in analysed]
        isinf = [flag for _, flag in analysed]

        # No infinitive at all (this also covers an empty token list).
        if not any(isinf):
            return (text,)

        if isinf[0]:
            pairs = [
                (to_imperative(word) or (word, word)) if is_infinitive else (word, word)
                for word, is_infinitive in analysed
            ]
            first, second = zip(*pairs)
            return text, ' '.join(first), ' '.join(second)

        if words_of_need.intersection(words):
            pairs = []
            replaced_any = False
            previous = ''
            for word, is_infinitive in analysed:
                imperative = None
                if is_infinitive and previous in words_of_need:
                    imperative = to_imperative(word)
                if imperative:
                    # The last collected entry is the word of need: overwrite it.
                    pairs[-1] = imperative
                    replaced_any = True
                else:
                    pairs.append((word, word))
                previous = word

            if replaced_any:
                first, second = zip(*pairs)
                return text, ' '.join(first), ' '.join(second)
    except ValueError as ex:
        pass
    return (text,)
        
# Smoke tests: index 0 is the original text, index 1 the plural-imperative variant,
# index 2 the singular one. An infinitive that neither opens the phrase nor follows
# a word of need yields only the original text.
assert get_imperative_variants('нужно сделать хорошо')[1] == 'сделайте хорошо'
assert get_imperative_variants('повертеть попой')[2] == 'поверти попой'
assert len(get_imperative_variants('хорошо сделать')) == 1

In [6]:
# 80/20 split of the shuffled data. Positive training rows are augmented with their
# imperative paraphrases; the test rows are written unchanged.
train_n = int(len(data) * 0.8)
count = 0
with (DATASETS / 'train2.tsv').open('w') as f:
    csvw = csv.writer(f, delimiter='\t')
    for count, (label, text) in enumerate(data[:train_n], start=1):
        # Progress report every 20000 rows.
        if count % 20000 == 0:
            print('{:.2f}%'.format(count/train_n*100))
        variants = get_imperative_variants(text) if label == '1' else (text,)
        for variant in variants:
            csvw.writerow([label, variant])

with (DATASETS / 'test.tsv').open('w') as f:
    csvw = csv.writer(f, delimiter='\t')
    csvw.writerows([label, text] for label, text in data[train_n:])



0.08%
0.16%
0.23%
0.31%
0.39%
0.47%
0.55%
0.62%
0.70%
0.78%
0.86%
0.94%
1.02%
1.09%
1.17%
1.25%
1.33%
1.41%
1.48%
1.56%
1.64%
1.72%
1.80%
1.87%
1.95%
2.03%
2.11%
2.19%
2.27%
2.34%
2.42%
2.50%
2.58%
2.66%
2.73%
2.81%
2.89%
2.97%
3.05%
3.12%
3.20%
3.28%
3.36%
3.44%
3.51%
3.59%
3.67%
3.75%
3.83%
3.91%
3.98%
4.06%
4.14%
4.22%
4.30%
4.37%
4.45%
4.53%
4.61%
4.69%
4.76%
4.84%
4.92%
5.00%
5.08%
5.16%
5.23%
5.31%
5.39%
5.47%
5.55%
5.62%
5.70%
5.78%
5.86%
5.94%
6.01%
6.09%
6.17%
6.25%
6.33%
6.40%
6.48%
6.56%
6.64%
6.72%
6.80%
6.87%
6.95%
7.03%
7.11%
7.19%
7.26%
7.34%
7.42%
7.50%
7.58%
7.65%
7.73%
7.81%
7.89%
7.97%
8.05%
8.12%
8.20%
8.28%
8.36%
8.44%
8.51%
8.59%
8.67%
8.75%
8.83%
8.90%
8.98%
9.06%
9.14%
9.22%
9.29%
9.37%
9.45%
9.53%
9.61%
9.69%
9.76%
9.84%
9.92%
10.00%
10.08%
10.15%
10.23%
10.31%
10.39%
10.47%
10.54%
10.62%
10.70%
10.78%
10.86%
10.93%
11.01%
11.09%
11.17%
11.25%
11.33%
11.40%
11.48%
11.56%
11.64%
11.72%
11.79%
11.87%
11.95%
12.03%
12.11%
12.18%
12.26%
12.34%
12.42%
12.50%
12.58%


92.95%
93.03%
93.10%
93.18%
93.26%
93.34%
93.42%
93.49%
93.57%
93.65%
93.73%
93.81%
93.88%
93.96%
94.04%
94.12%
94.20%
94.28%
94.35%
94.43%
94.51%
94.59%
94.67%
94.74%
94.82%
94.90%
94.98%
95.06%
95.13%
95.21%
95.29%
95.37%
95.45%
95.53%
95.60%
95.68%
95.76%
95.84%
95.92%
95.99%
96.07%
96.15%
96.23%
96.31%
96.38%
96.46%
96.54%
96.62%
96.70%
96.77%
96.85%
96.93%
97.01%
97.09%
97.17%
97.24%
97.32%
97.40%
97.48%
97.56%
97.63%
97.71%
97.79%
97.87%
97.95%
98.02%
98.10%
98.18%
98.26%
98.34%
98.41%
98.49%
98.57%
98.65%
98.73%
98.81%
98.88%
98.96%
99.04%
99.12%
99.20%
99.27%
99.35%
99.43%
99.51%
99.59%
99.66%
99.74%
99.82%
99.90%
99.98%


In [12]:
# Read the augmented training set back: texts go to train_X, integer labels to train_y.
train_X, train_labels = [], []
with (DATASETS / 'train2.tsv').open() as f:
    for label, text in csv.reader(f, delimiter='\t'):
        train_X.append(text)
        train_labels.append(int(label))

train_y = np.array(train_labels)
del train_labels

In [None]:
# Fit the TF-IDF vectoriser on the training texts, or reuse a previously pickled one,
# then replace train_X with its TF-IDF matrix.
# NOTE(review): a cached vectoriser was fitted on whatever train split existed when it
# was saved; delete the cache file if the split has changed since.
tfidf_path = MODELS / 'tfidf2'
if tfidf_path.exists():
    # SECURITY: pickle.load executes arbitrary code from the file; only load caches
    # that this notebook produced.
    with tfidf_path.open('rb') as f:
        tfidf = pickle.load(f)
    train_X = tfidf.transform(train_X)
else:
    # Create the models directory up front so the dump cannot fail on a missing
    # directory after the expensive fit has already run.
    tfidf_path.parent.mkdir(parents=True, exist_ok=True)
    tfidf = TfidfVectorizer()
    # fit_transform is equivalent to fit followed by transform but passes over
    # the corpus only once.
    train_X = tfidf.fit_transform(train_X)
    with tfidf_path.open('wb') as f:
        pickle.dump(tfidf, f)

In [None]:
%%time

# Train a linear SVM on the TF-IDF matrix of the training texts.
# NOTE(review): this rebinds the global `m`, which the first cell set to a Mystem
# instance and which mystem_analyze calls `.analyze` on. After this cell runs, the
# augmentation helpers stop working until the first cell is re-run.
m = LinearSVC()
m.fit(train_X, train_y)

In [None]:
# Bundle the fitted vectoriser and classifier into one pipeline and pickle it.
pipeline_steps = [
    ('vectorizer', tfidf),
    ('classifier', m),
]
pipe = Pipeline(pipeline_steps)
with (MODELS / 'classifier2.pickle').open('wb') as f:
    pickle.dump(pipe, f)

In [None]:
# Read the held-out set and vectorise it with the fitted TF-IDF model.
test_X = []
test_y = []
# test.tsv is written with delimiter='\t', so it must be read with the same delimiter
# (a comma delimiter would not separate label from text). newline='' lets the csv
# module handle newlines inside quoted fields itself.
with (DATASETS / 'test.tsv').open(newline='') as f:
    csvr = csv.reader(f, delimiter='\t')
    for label, text in csvr:
        test_X.append(text)
        test_y.append(int(label))

test_y = np.array(test_y)
test_X = tfidf.transform(test_X)

In [None]:
# Overall and per-class accuracy of classifier `m`, always printed as (train, test).
# Predictions are computed once per split and then masked, instead of calling
# predict again for every row of the report.
train_correct = m.predict(train_X) == train_y
test_correct = m.predict(test_X) == test_y
test_pos = test_y == 1
train_pos = train_y == 1

print('Metric      ', 'train', 'test', sep='\t')
print('Accuracy    ', train_correct.mean(), test_correct.mean(), sep='\t')
print('Pos Accuracy', train_correct[train_pos].mean(), test_correct[test_pos].mean(), sep='\t')
print('Neg Accuracy', train_correct[~train_pos].mean(), test_correct[~test_pos].mean(), sep='\t')

# Results recorded from earlier runs. They predate the column-order fix above:
# in these notes the Accuracy row is (test, train) while the Pos/Neg rows are
# (train, test).
# NOTE(review): the headings 0.1 / 1 / 10 look like values of a tuned
# hyper-parameter (possibly LinearSVC's C) - confirm.

# 0.1


# 1
#   Accuracy	0.9966766041670294	0.9979463617801813
# Pos Accuracy	0.974006711709939	0.9588633251535406
# Neg Accuracy	0.9993740346497515	0.9989338185724046

# 10
#   Accuracy  	0.9963909735848226	0.9986286294275131
# Pos Accuracy	0.9831874787121008	0.9592863243807536
# Neg Accuracy	0.9995494829858005	0.9986058872573433

In [33]:
# Path to the fastText executable. It can be overridden with the FASTTEXT_BIN
# environment variable; the default is the location used when this notebook was written.
FASTTEXT_BIN = os.environ.get('FASTTEXT_BIN', '/home/marat/fastText-0.1.0/fasttext')

# Train fastText embeddings on the plain-text corpus.
# NOTE(review): dataset.txt is only produced by the commented-out cell that prints
# every text of `data` to that file - run it first if the file does not exist.
model_wrapper = FT_wrapper.train(FASTTEXT_BIN, corpus_file=str(DATASETS / 'dataset.txt'))

In [36]:
# Save next to the other models. MODELS expands to ~/data/taskdialog/models, which is
# the previously hard-coded location for the original user.
model_wrapper.save(str(MODELS / 'fasttext.bin'))

In [18]:
# Sanity-check the embeddings: list the tokens most similar to the emoticon '=('.
model_wrapper.most_similar('=(')

[('Юлус', 0.93408203125),
 ('=(,', 0.8703312277793884),
 ('=("', 0.869441032409668),
 (';(', 0.8526694178581238),
 (':(:(:("', 0.8471227288246155),
 (':(', 0.8459991216659546),
 ('дальше:(', 0.8446086049079895),
 (':("', 0.8394126296043396),
 ('((', 0.838272213935852),
 (':(:(:(', 0.8340241312980652)]

In [47]:
# Export the same 80/20 split (no augmentation) as comma-separated files with a header row.
train_n = int(len(data) * 0.8)

csv_splits = (
    ('train.csv', slice(None, train_n)),
    ('test.csv', slice(train_n, None)),
)
for split_filename, split_rows in csv_splits:
    with (DATASETS / split_filename).open('w') as f:
        csvw = csv.writer(f)
        csvw.writerow(['istask', 'request'])
        csvw.writerows([label, text] for label, text in data[split_rows])

In [46]:
!nvidia-smi

Tue Feb 13 01:23:16 2018       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 390.12                 Driver Version: 390.12                    |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla P100-SXM2...  On   | 00000000:06:00.0 Off |                    0 |
| N/A   28C    P0    42W / 300W |      2MiB / 16280MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  Tesla P100-SXM2...  On   | 00000000:07:00.0 Off |                    0 |
| N/A   34C    P0    42W / 300W |   2011MiB / 16280MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   2  Tesla P100-SXM2...  On   | 00000000:0A:00.0 Off |                    0 |
| N/A   