In [1]:
import re
import json
import os
import csv
import sys

from multiprocessing import Pool

from nltk.tokenize import wordpunct_tokenize
import numpy as np
import random
from gensim.models.wrappers.fasttext import FastText as FT_wrapper

import pickle
from pathlib import Path

from pymystem3 import Mystem
import pymorphy2

morph = pymorphy2.MorphAnalyzer()
m = Mystem()


def mystem_analyze(str):
    global m
    try:
        return m.analyze(str)
    except BrokenPipeError as ex:
        m = Mystem()
        return mystem_analyze(str)

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline

csv.field_size_limit(sys.maxsize)

DATASETS = Path('~/data/taskdialog').expanduser()
MODELS = Path('~/data/taskdialog/models').expanduser()

In [2]:
negatives = []

In [3]:
def preprocess_forum(f):
    try:
        i = 0
        for line in f:
            i += 1
            t = '\n'.join(' '.join(wordpunct_tokenize(l)) for l in json.loads(line)['text'].strip().splitlines() if l.strip())
            if t and len(t) < 4000:
                yield t
            if i % 1000000 == 0:
                print(i)
    except Exception as ex:
        print(ex)
        return


In [4]:
%%time
with (DATASETS / 'forummoskva.jsonl').open() as f:
    texts = preprocess_forum(f)    
    negatives.extend(texts)

1000000
CPU times: user 31.4 s, sys: 908 ms, total: 32.3 s
Wall time: 32.3 s


In [None]:
%%time

droms = []
with (DATASETS / 'dromru.jsonl').open() as f:
    texts = preprocess_forum(f)    
    droms.extend(texts)
    

In [14]:
random.shuffle(droms)
del droms[10*1000*1000:]
negatives.extend(droms)
del droms

In [6]:
%%time
with (DATASETS / 'vashdomru.jsonl').open() as f:
    texts = preprocess_forum(f)    
    negatives.extend(texts)  
#     print(len(texts))  # 300k

CPU times: user 5.28 s, sys: 360 ms, total: 5.64 s
Wall time: 5.64 s


In [7]:
%%time
with (DATASETS / 'antiwomenru.jsonl').open() as f:
    texts = preprocess_forum(f)    
    negatives.extend(texts)  
#     print(len(texts))  # 1.8m

CPU times: user 6.84 s, sys: 344 ms, total: 7.18 s
Wall time: 7.18 s


In [8]:
%%time
with (DATASETS / 'womenru.jsonl').open() as f:
    texts = preprocess_forum(f)    
    negatives.extend(texts)  
#     print(len(texts))

1000000
2000000
3000000
4000000
5000000
6000000
7000000
8000000
9000000
10000000
11000000
12000000
13000000
CPU times: user 7min 10s, sys: 16.3 s, total: 7min 27s
Wall time: 7min 26s


In [17]:
print(len(negatives))
random.shuffle(negatives)

24440886


In [19]:
def to_imperative(word):
    try:
        p = [pm for pm in morph.parse(word) if 'INFN' in pm.tag][0]
    except IndexError as ex:
        return
    try:
        sing = p.inflect({'VERB', 'perf', 'impr', 'excl', 'plur'}).word
        plur = p.inflect({'VERB', 'perf', 'impr', 'excl'}).word
        return (sing, plur)
    except AttributeError as ex:
        return
    
assert to_imperative('Совершить') == ('совершите', 'соверши')
    
words_of_need = {'необходимо', 'нужно', 'требуется'}

def get_imperative_variants(text):
    try:
        words = []
        isinf = []
        for tok in mystem_analyze(text.lower()):
            if 'analysis' in tok:
                if len(tok['analysis']) >= 1:
                    gram = tok['analysis'][0]['gr']
                    infinitive = 'инф' in gram and 'V' in gram
                    w = tok['text']
                    words.append(w)
                    isinf.append(infinitive)
        if max(isinf) == False:
            return text,
        if isinf[0]:
            variants = []
            for w, v in zip(words, isinf):
                if v:
                    imp = to_imperative(w)
                    variants.append(imp or (w, w))
                else:
                    variants.append((w, w))
            sing, plur = zip(*variants)
            return text, ' '.join(sing), ' '.join(plur)
        elif words_of_need.intersection(words):
            prev_word = ''
            variants = []
            used_imperative = False
            for w, v in zip(words, isinf):
                if v and prev_word in words_of_need:
                    imp = to_imperative(w)
                    if imp:
                        variants.pop()
                        variants.append(imp)
                        used_imperative = True
                    else:
                        variants.append((w, w))

                else:
                    variants.append((w, w))
                prev_word = w

            if used_imperative:
                sing, plur = zip(*variants)
                return text, ' '.join(sing), ' '.join(plur)
    except ValueError as ex:
        pass
    return text,
        
assert get_imperative_variants('нужно сделать хорошо')[1] == 'сделайте хорошо'
assert get_imperative_variants('повертеть попой')[2] == 'поверти попой'
assert len(get_imperative_variants('хорошо сделать')) == 1

In [None]:
%%time

positives = []
with (DATASETS / 'youdo.txt').open() as f:
    csvr = csv.reader(f, delimiter=',')
    for title, body in csvr:
        t = '\n'.join(' '.join(wordpunct_tokenize(l)) for l in body.strip().splitlines() if l)
        positives.append((title, t))
        
with (DATASETS / 'fl.csv').open() as f:
    c = csv.reader(f)
    positives.extend(c)  
    
random.shuffle(positives)

train_tasks_n = int(len(positives) * 0.8)

In [25]:
positives_train = [(title, body) for title, body in positives[:train_tasks_n] if title.strip() and body.strip()]
positives_test = [(title, body) for title, body in positives[train_tasks_n:] if title.strip() and body.strip()]

In [27]:
%%time

augmented_body = [get_imperative_variants(body) for title, body in positives_train]

KeyboardInterrupt: 

In [38]:
%%time
with Pool(processes=40) as pool:
    augmented_body = pool.map(get_imperative_variants, (body for title, body in positives_train[:]))

CPU times: user 21.4 s, sys: 53 s, total: 1min 14s
Wall time: 5min 48s


In [39]:
%%time

with (DATASETS / 'tasks_train.csv').open('w') as f:
    c = csv.writer(f)
    for (title, body), augs in zip(positives_train, augmented_body):
        for a in augs:
            c.writerow([title, a])

with (DATASETS / 'tasks_test.csv').open('w') as f:
    c = csv.writer(f)
    for title, body in positives_test:
        c.writerow([title, body])

CPU times: user 1min 9s, sys: 13 s, total: 1min 22s
Wall time: 1min 23s


In [69]:
augmented_body[2][0]

'Пропала кошка в Красногорске . Район Медучилище . Срочно нужно пару человек на поиски . Подробности по телефону . За нахождение отдельное вознаграждение . Поиск будет вестись в окрестностях указанного адреса .\nБолее подробно условия задания обсудим с исполнителем .\nВ предложениях указывайте сроки , когда сможете выполнить задание и цену за работу .\nЖду Ваших предложений !'

In [40]:
%%time

negatives_train_n = int(len(negatives) * 0.8)
train = [(0, body) for body in negatives[:negatives_train_n]] + [(1, body[0]) for body in augmented_body]
random.shuffle(train)

with (DATASETS / 'train4.tsv').open('w') as f:
    csvw = csv.writer(f, delimiter='\t')
    for label, body in train:
        csvw.writerow([label, body])

        
test = [(0, body) for body in negatives[negatives_train_n:]] + [(1, body) for title, body in positives_test]
random.shuffle(test)
with (DATASETS / 'test4.tsv').open('w') as f:
    csvw = csv.writer(f, delimiter='\t')
    for label, body in test:
        csvw.writerow([label, body])


CPU times: user 3min 22s, sys: 29.1 s, total: 3min 51s
Wall time: 3min 51s


In [12]:
%%time
def features(s):
    for g in mystem_analyze(s):
        if 'analysis' in g:
            if len(g['analysis']) > 0:
                for gr in g['analysis']:
                    if 'пов' in gr['gr']:
                        return 1.0
    return 0.0

for i in range(10000):
    features('Вася , реализовать функционал . Петя напишет план а ты функционал')

CPU times: user 348 ms, sys: 75.9 ms, total: 424 ms
Wall time: 2.79 s


In [42]:
train_text = []
train_y = []
train_features = []
with (DATASETS / 'train4.tsv').open() as f:
    c = csv.reader(f, delimiter='\t')
    for label, body in c:
        body = body[:200]
        if ' ' not in body:
            continue
        body = body[:body.rindex(' ')]
#         train_features.append(features(body))
        train_text.append(body)
        train_y.append(int(label))

test_text = []
test_y = []
test_features = []
with (DATASETS / 'test4.tsv').open() as f:
    c = csv.reader(f, delimiter='\t')
    for label, body in c:
        body = body[:200]
        if ' ' not in body:
            continue
        body = body[:body.rindex(' ')]
#         test_features.append(features(body))
        test_text.append(body)
        test_y.append(int(label))

In [31]:
# with (DATASETS / 'dataset.txt').open('w') as f:
#     for label, text in data:
#         print(text, file=f)

In [None]:
# train_n = int(len(data) * 0.8)
# count = 0
# with (DATASETS / 'train2.tsv').open('w') as f:
#     csvw = csv.writer(f, delimiter='\t')
#     for label, text in data[:train_n]:
#         count += 1
#         if count % 20000 == 0:
#             print('{:.2f}%'.format(count/train_n*100))
#         if label == '1':
#             for augmented_text in get_imperative_variants(text):
#                 csvw.writerow([label, augmented_text])
#         else:
#             csvw.writerow([label, text])

# with (DATASETS / 'test.tsv').open('w') as f:
#     csvw = csv.writer(f, delimiter='\t')
#     for label, text in data[train_n:]:
#         csvw.writerow([label, text])



In [43]:
stopwords = '''форумчане
форумчане
simplexman
saitov
дрома
drom
quote
дром
професcиональный
рекомендую
советую
взаимосвязями
тебе
загляни
youdo
фрилансеры'''.split('\n')

In [44]:
%%time
tfidf_path = MODELS / 'tfidf5.pickle'
if tfidf_path.exists():
    with tfidf_path.open('rb') as f:
        tfidf = pickle.load(f)
else:
    tfidf = TfidfVectorizer(min_df=5, stop_words=stopwords)
    tfidf.fit(train_text)
    with tfidf_path.open('wb') as f:
        pickle.dump(tfidf, f)

train_X = tfidf.transform(train_text)
test_X = tfidf.transform(test_text)

CPU times: user 19min 3s, sys: 1min 4s, total: 20min 8s
Wall time: 20min 8s


In [45]:
%%time

classifier = LinearSVC()
classifier.fit(train_X, train_y)

CPU times: user 2min 2s, sys: 11.2 s, total: 2min 13s
Wall time: 2min 13s


In [46]:
pipe = Pipeline([('vectorizer', tfidf),
                 ('classifier', classifier)])
with (MODELS / 'classifier5.pickle').open('wb') as f:
    pickle.dump(pipe, f)

In [47]:
print('Accuracy  ', (classifier.predict(test_X) == test_y).mean(), (classifier.predict(train_X) == train_y).mean(), sep='\t')
test_pos = test_y == 1
train_pos = train_y == 1
print('Pos Accuracy', (classifier.predict(train_X[train_pos]) == train_y[train_pos]).mean(), (classifier.predict(test_X[test_pos]) == test_y[test_pos]).mean(), sep='\t')
print('Neg Accuracy', (classifier.predict(train_X[~train_pos]) == train_y[~train_pos]).mean(), (classifier.predict(test_X[~test_pos]) == test_y[~test_pos]).mean(), sep='\t')

# 1
#   Accuracy	0.9966766041670294	0.9979463617801813
# Pos Accuracy	0.974006711709939	0.9588633251535406
# Neg Accuracy	0.9993740346497515	0.9989338185724046

Accuracy  	0.9923632064836054	0.9942928958053494
Pos Accuracy	1.0	1.0
Neg Accuracy	1.0	1.0


In [114]:
id2words = {i: w for w, i in tfidf.vocabulary_.items()}
words = []
for i in range(classifier.coef_.shape[1]):
    words.append((id2words[i], classifier.coef_[0][i]))

In [117]:
importances = sorted(words, key=lambda x: x[1], reverse=True)

importances[:1000]

In [33]:
model_wrapper = FT_wrapper.train('/home/marat/fastText-0.1.0/fasttext', corpus_file=str(DATASETS / 'dataset.txt'))

In [36]:
model_wrapper.save('/home/marat/data/taskdialog/models/fasttext.bin')

In [48]:
model_wrapper.most_similar('youdo')

NameError: name 'model_wrapper' is not defined