In [11]:
import re
import json
import os
import csv
import sys
from multiprocessing import Pool
from nltk.tokenize import wordpunct_tokenize
import numpy as np
import random
from gensim.models.wrappers.fasttext import FastText as FT_wrapper

import pickle
from pathlib import Path
from pymystem3 import Mystem
import pymorphy2

# Shared pymorphy2 analyzer; to_imperative() uses it to parse and inflect verbs.
morph = pymorphy2.MorphAnalyzer()
# Shared Mystem instance; mystem_analyze() rebinds this global after a BrokenPipeError.
m = Mystem()

def mystem_analyze(str, max_restarts=3):
    """Run ``m.analyze`` on a string, restarting Mystem if its pipe breaks.

    Args:
        str: text to analyse. The name shadows the builtin but is kept so
            existing callers are unaffected.
        max_restarts: how many times the global ``m`` may be replaced with a
            fresh ``Mystem()`` before giving up.

    Returns:
        Whatever ``m.analyze(str)`` returns.

    Raises:
        BrokenPipeError: if the call still fails after ``max_restarts``
            fresh instances.
    """
    global m
    # A bounded loop instead of unbounded recursion: a permanently broken
    # Mystem now surfaces the real error instead of a RecursionError.
    for attempt in range(max_restarts + 1):
        try:
            return m.analyze(str)
        except BrokenPipeError:
            if attempt == max_restarts:
                raise
            m = Mystem()
    

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline

from itertools import product

# Raise the csv module's per-field size limit to sys.maxsize.
csv.field_size_limit(sys.maxsize)

# Directory the dataset files (youdo.txt, train4.tsv, test4.tsv) are read from.
DATASETS = Path('~/data/taskdialog').expanduser()
# Directory the pickled vectorizer and classifier are stored in.
MODELS = Path('~/data/taskdialog/models').expanduser()

In [2]:
# Positive examples: the second column of youdo.txt, tokenized line by line
# with wordpunct_tokenize and re-joined with single spaces.
positives = []
# newline='' is what the csv module expects for file objects, so that line
# breaks inside quoted fields reach the reader untouched.
# NOTE(review): the encoding is left to the platform default, as before —
# confirm it matches the file.
with (DATASETS / 'youdo.txt').open(newline='') as f:
    for row in csv.reader(f, delimiter=','):
        if len(row) < 2:
            # Blank or short record: there is no text column to read.
            continue
        tokenized_lines = (
            ' '.join(wordpunct_tokenize(line))
            for line in row[1].strip().splitlines()
            if line
        )
        positives.append('\n'.join(tokenized_lines))

# with (DATASETS / 'test.tsv').open() as f:
#     c = csv.reader(f, delimiter='\t')
#     for label, text in c:
#         pass

In [5]:
def to_imperative(word):
    """Return the imperative forms of an infinitive as ``(plural, singular)``.

    The first pymorphy2 parse of ``word`` tagged ``INFN`` is inflected twice:
    once with the ``plur`` grammeme and once without it.

    Returns:
        A 2-tuple of strings, plural form first, or ``None`` when no
        infinitive parse exists or an inflection is unavailable.
    """
    infinitive = next(
        (parse for parse in morph.parse(word) if 'INFN' in parse.tag),
        None,
    )
    if infinitive is None:
        return None

    grammemes = {'VERB', 'perf', 'impr', 'excl'}
    try:
        plural = infinitive.inflect(grammemes | {'plur'}).word
        singular = infinitive.inflect(grammemes).word
    except AttributeError:
        # inflect() gave back an object without `.word` (presumably None).
        return None
    return (plural, singular)
    
# Sanity check: the plural imperative comes first in the pair, the singular second.
assert to_imperative('Совершить') == ('совершите', 'соверши')
    
# Words ("necessary", "need to") that get_imperative_variants() looks for
# directly before an infinitive.
words_of_need = {'необходимо', 'нужно'}

def get_imperative_variants(text):
    """Return ``text`` plus, when possible, two imperative rewrites of it.

    The lower-cased text is analysed with ``mystem_analyze``. Only tokens that
    carry at least one analysis are kept, so the rewrites consist of those
    tokens joined by single spaces.

    * If the first kept token is an infinitive, every infinitive is replaced
      by its imperative forms (left as is when ``to_imperative`` has none).
    * Otherwise, if a word from ``words_of_need`` occurs, an infinitive that
      directly follows such a word takes that word's place.

    Returns:
        ``(text, first_variant, second_variant)``, with the variants in the
        order of the pair returned by ``to_imperative``, or the 1-tuple
        ``(text,)`` when nothing was rewritten. A ``ValueError`` raised during
        processing also yields ``(text,)``.
    """
    def render(pairs):
        # Split the per-word pairs into the two parallel sentences.
        first, second = zip(*pairs)
        return text, ' '.join(first), ' '.join(second)

    try:
        tokens = []  # (surface form, is_infinitive) for every analysed token
        for tok in mystem_analyze(text.lower()):
            if 'analysis' in tok and len(tok['analysis']) >= 1:
                grammar = tok['analysis'][0]['gr']
                tokens.append((tok['text'], 'инф' in grammar and 'V' in grammar))

        if not any(is_inf for _, is_inf in tokens):
            return (text,)

        if tokens[0][1]:
            # Sentence opens with an infinitive: convert every infinitive.
            return render([
                (to_imperative(word) or (word, word)) if is_inf else (word, word)
                for word, is_inf in tokens
            ])

        if words_of_need.intersection(word for word, _ in tokens):
            pairs = []
            previous = ''
            replaced = False
            for word, is_inf in tokens:
                follows_need = is_inf and previous in words_of_need
                imperative = to_imperative(word) if follows_need else None
                if imperative:
                    # The imperative takes the place of the preceding need word.
                    pairs[-1] = imperative
                    replaced = True
                else:
                    pairs.append((word, word))
                previous = word
            if replaced:
                return render(pairs)
    except ValueError:
        pass
    return (text,)
        
# Smoke tests against the real analyzers: index 1 holds the plural imperative,
# index 2 the singular one, and a text whose first word is not an infinitive
# (and has no need word) comes back as a 1-tuple.
assert get_imperative_variants('нужно сделать хорошо')[1] == 'сделайте хорошо'
assert get_imperative_variants('повертеть попой')[2] == 'поверти попой'
assert len(get_imperative_variants('хорошо сделать')) == 1

In [22]:
%%time

# Run get_imperative_variants over the first 20000 positives in 20 worker processes.
# NOTE(review): the workers use the module-level Mystem instance `m`; if its
# subprocess was already started in the parent (the asserts above call it),
# forked workers may end up sharing that subprocess — confirm results are not mixed up.
with Pool(processes=20) as pool:
    res = pool.map(get_imperative_variants, positives[:20000])
        

CPU times: user 132 ms, sys: 498 ms, total: 630 ms
Wall time: 5.5 s


In [23]:
# Sanity check: pool.map returns one result tuple per input text.
len(res)

20000

In [None]:
# Manual probe on a sentence that starts with 'Есть'.
# NOTE(review): presumably checks whether 'есть' is treated as an infinitive — confirm intent.
get_imperative_variants('Есть ложь , есть большая ложь , есть статистика , а есть реклама .')

In [106]:
# Probe: pymorphy2 returns only an UNKN parse for this word (see the output below),
# so to_imperative() finds no 'INFN' parse for it and returns None.
morph.parse('запостить')

[Parse(word='запостить', tag=OpencorporaTag('UNKN'), normal_form='запостить', score=1.0, methods_stack=((<UnknAnalyzer>, 'ить'), (<KnownPrefixAnalyzer>, 'пост'), (<KnownPrefixAnalyzer>, 'за')))]

In [24]:
# Sample tokenized job posting kept for manual experiments; no cell visible in
# this notebook excerpt reads `a`.
a = '''Занимаюсь созданием сайта об ИКЕА .
Требуются копирайтеры для наполнения сайта об ИКЕА .
Статьи о категориях и товарах , 200р / 1000сим
Работы много , общее число страниц более 200 .
Объем 2000 на страницу .
Проверяю и оплачиваю быстро и регулярно .
За хорошую работу оставлю развернутый положительный отзыв , помещу в белый список .
Предложу работу в дальнейшем по другим проектам с возможностью повышения оплаты .'''

In [25]:
def load_labeled_tsv(path, max_chars=200):
    """Read a two-column ``label<TAB>text`` file into parallel lists.

    Each text is cut to its first ``max_chars`` characters and then trimmed
    back to the last space inside that prefix, so the final (possibly partial)
    word is dropped. Texts whose prefix contains no space are skipped.

    Args:
        path: ``pathlib.Path`` of the TSV file.
        max_chars: length of the prefix kept before trimming.

    Returns:
        ``(texts, labels)`` — a list of strings and a list of ints.

    Raises:
        ValueError: if a row does not have exactly two columns or its label
            is not an integer.
    """
    texts, labels = [], []
    # newline='' is what the csv module expects for file objects.
    with path.open(newline='') as f:
        for label, body in csv.reader(f, delimiter='\t'):
            prefix = body[:max_chars]
            if ' ' not in prefix:
                continue
            texts.append(prefix[:prefix.rindex(' ')])
            labels.append(int(label))
    return texts, labels


train_text, train_y = load_labeled_tsv(DATASETS / 'train4.tsv')
test_text, test_y = load_labeled_tsv(DATASETS / 'test4.tsv')

# Hand-crafted features are currently disabled; the (empty) lists are kept so
# the names still exist for any cell that refers to them.
train_features = []
test_features = []

415

In [None]:
# Tokens passed to TfidfVectorizer(stop_words=...) so they are left out of the vocabulary.
# The duplicated first entry of the original list has been removed.
# NOTE(review): 'професcиональный' is copied verbatim and is not the dictionary
# spelling (it may contain a Latin 'c') — confirm it mirrors the corpus on purpose.
stopwords = [
    'форумчане',
    'simplexman',
    'saitov',
    'дрома',
    'drom',
    'quote',
    'дром',
    'професcиональный',
    'рекомендую',
    'советую',
    'взаимосвязями',
    'тебе',
    'загляни',
    'youdo',
    'фрилансеры',
]

In [None]:
%%time
# Fit the TF-IDF vectorizer once and cache it on disk.
# The cache is keyed by file name only: changes to train_text, stopwords or
# min_df are NOT picked up until tfidf5.pickle is deleted.
tfidf_path = MODELS / 'tfidf5.pickle'
if tfidf_path.exists():
    # SECURITY: pickle.load can execute arbitrary code; only load cache files
    # produced by this notebook.
    with tfidf_path.open('rb') as f:
        tfidf = pickle.load(f)
else:
    tfidf = TfidfVectorizer(min_df=5, stop_words=stopwords)
    tfidf.fit(train_text)
    # The models directory may not exist yet on a fresh machine.
    tfidf_path.parent.mkdir(parents=True, exist_ok=True)
    with tfidf_path.open('wb') as f:
        pickle.dump(tfidf, f)

train_X = tfidf.transform(train_text)
test_X = tfidf.transform(test_text)

In [None]:
%%time

# Linear SVM on the TF-IDF features. random_state is fixed so that re-running
# the notebook gives the same model.
classifier = LinearSVC(random_state=0)
classifier.fit(train_X, train_y)

In [None]:
# Bundle the fitted vectorizer and classifier so that raw text can be
# classified with a single predict() call, then persist the bundle.
pipe = Pipeline([('vectorizer', tfidf),
                 ('classifier', classifier)])
# The models directory may not exist yet on a fresh machine.
MODELS.mkdir(parents=True, exist_ok=True)
with (MODELS / 'classifier5.pickle').open('wb') as f:
    pickle.dump(pipe, f)

In [None]:
def _accuracy(predicted, expected, mask=None):
    """Share of matching labels, optionally restricted to the rows selected by ``mask``."""
    if mask is not None:
        predicted, expected = predicted[mask], expected[mask]
    if len(expected) == 0:
        return float('nan')
    return (predicted == expected).mean()


# train_y / test_y are plain Python lists, for which `labels == 1` is the single
# value False rather than a per-row mask; convert them to arrays first.
train_labels = np.asarray(train_y)
test_labels = np.asarray(test_y)
train_pred = classifier.predict(train_X)
test_pred = classifier.predict(test_X)

train_pos = train_labels == 1
test_pos = test_labels == 1

# Every row reports the train score first and the test score second.
print('Metric', 'train', 'test', sep='\t')
print('Accuracy  ', _accuracy(train_pred, train_labels), _accuracy(test_pred, test_labels), sep='\t')
print('Pos Accuracy', _accuracy(train_pred, train_labels, train_pos), _accuracy(test_pred, test_labels, test_pos), sep='\t')
print('Neg Accuracy', _accuracy(train_pred, train_labels, ~train_pos), _accuracy(test_pred, test_labels, ~test_pos), sep='\t')


In [None]:
# Invert the vectorizer vocabulary (term -> column) into column -> term, then
# pair every term with its weight in the first row of the classifier coefficients.
id2words = {column: term for term, column in tfidf.vocabulary_.items()}
words = [
    (id2words[column], weight)
    for column, weight in enumerate(classifier.coef_[0])
]

In [None]:
# (term, weight) pairs ordered from the largest weight to the smallest.
importances = list(words)
importances.sort(key=lambda pair: pair[1], reverse=True)

importances[:1000]