In [1]:
import numpy as np
import pandas as pd
import nltk
import re
import json
from os.path import join
from collections import defaultdict
import pickle
import pymorphy2

In [2]:
# Folder the dialog CSV is read from (see the pd.read_csv cell below).
input_folder_path = 'csv_dialogs'
# NOTE(review): not referenced by any cell visible in this notebook section;
# outputs are written to 'csv_scored_dialogs/' instead — confirm it is still needed.
output_folder_path = 'csv_word_tokens'
# Identifier used to build the '<group_id>.csv' file names for input and output.
group_id = 145254340

In [3]:
def preprocessor(text):
    """Strip chat-service boilerplate from one raw message.

    Steps, in order:
      1. A missing value (None or float NaN) is treated as an empty message.
      2. Everything up to and including the first speaker prefix is dropped.
      3. The "sticker sent" notice and the bot's anonymity notice are removed.
      4. The two-sentence help prompt (poll / chat instructions) is removed.

    Args:
        text: the raw message; any non-string value is converted with str().

    Returns:
        The cleaned message as a str (possibly empty).
    """
    # str() would turn a missing value into the literal word 'None' / 'nan',
    # which would then survive a later "text is not empty" filter.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text)
    # Keep only what follows the first speaker prefix (optional space/newline after it).
    text = re.split(r'Собеседник: ?\n?', text, maxsplit=1)[-1]
    text = text.replace('Собеседник отправил стикер', '')
    text = text.replace('Бот: Мы заботимся о вашей анонимности и не позволяем пересылать сообщения', '')
    # The help prompt contains optional whitespace between and after its two
    # sentences, so it must be removed with a regex: str.replace() would look
    # for the characters backslash, 'n', '?' literally and never match.
    text = re.sub(
        r'Чтобы получить самый интересный опрос, напиши "опрос" или "!"\n? ?Чтобы начать анонимный чат, напиши "чат"\n?',
        '',
        text,
    )
    return text

In [8]:
# Load the dialog messages from '<input_folder_path>/<group_id>.csv'.
df_dialogs = pd.read_csv(join(input_folder_path, '{}.csv'.format(group_id)))

In [10]:
# Keep incoming messages only. .copy() makes df_incomes an independent frame,
# so the column assignments below cannot trigger SettingWithCopyWarning or
# write through to df_dialogs (the previous `.iloc[:, :]` did not guarantee that).
df_incomes = df_dialogs.query('is_income == 1').copy()
# Strip service boilerplate from every message.
df_incomes.loc[:, 'text'] = df_incomes['text'].apply(preprocessor)
# Decode before lowercasing.
# NOTE(review): .str.decode assumes the values are byte strings (Python 2) — confirm.
df_incomes.loc[:, 'text'] = df_incomes.loc[:, 'text'].str.decode('utf-8').str.lower()
df_incomes.head()

Unnamed: 0,user_id,dialog_number,message_number,text,is_income
0,402472964,0,0,приветствую,1
1,402472964,0,1,здрастееэээ?,1
2,402472964,0,2,эм...,1
3,402472964,0,3,мммм...,1
4,402472964,0,4,ты на связи?,1


In [11]:
# Drop the messages whose text is an empty string.
has_text = df_incomes['text'] != ""
df_incomes_filtered = df_incomes[has_text]

In [12]:
# (rows, columns) remaining after the empty-text filter.
df_incomes_filtered.shape

(352005, 5)

In [13]:
# Index messages by (user_id, dialog_number) so they can be joined with the
# per-dialog scores, which are indexed by the same pair further down.
df_indexed2 = df_incomes_filtered.set_index(['user_id', 'dialog_number'])

In [14]:
# Every dialog listed in the "good" file is labelled with score +1.
score_good = pd.read_csv('csv_score/good_dialogs.csv').assign(score=1)


In [15]:
# Every dialog listed in the "bad" file is labelled with score -1.
score_bad = pd.read_csv('csv_score/bad_dialogs.csv').assign(score=-1)


In [16]:
# Stack both label sets row-wise into a single table (row labels are kept as-is).
score = pd.concat([score_good, score_bad], axis=0)
score.head()

Unnamed: 0,user_id,dialog_number,score
0,386898798,71,1
1,377635143,48,1
2,228500854,23,1
3,229537964,12,1
4,237511060,101,1


In [17]:
# Index the scores by (user_id, dialog_number), the same key the messages use.
score_indexed = score.set_index(['user_id', 'dialog_number'])
score_indexed.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,score
user_id,dialog_number,Unnamed: 2_level_1
386898798,71,1
377635143,48,1
228500854,23,1
229537964,12,1
237511060,101,1


In [18]:
# Attach the dialog score to every message. join() is a left join by default,
# so messages of unscored dialogs get NaN; dropna() then removes every row
# that contains a NaN in any column, which includes those unscored rows.
df_scored_dialogs = (
    df_indexed2
    .join(score_indexed)
    .dropna()
    .reset_index()
    .drop(columns=['is_income'])
)

In [19]:
# Save the scored messages (before tokenization) as 'csv_scored_dialogs/<group_id>.csv'.
scored_csv_path = 'csv_scored_dialogs/{}.csv'.format(group_id)
df_scored_dialogs.to_csv(scored_csv_path, index=False, encoding='utf-8')

In [20]:
# (rows, columns) of the scored message table.
df_scored_dialogs.shape

(237800, 5)

In [21]:
# Preview the first scored messages.
df_scored_dialogs.head()

Unnamed: 0,user_id,dialog_number,message_number,text,score
0,2486613,7,1,тян?,-1.0
1,2486613,10,1,хуел,-1.0
2,2486613,11,1,привет,1.0
3,2486613,11,5,😂,1.0
4,2486613,11,6,м ж?,1.0


In [144]:
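# NOTE: superseded by Normalizer.text_normalizer (defined in a later cell);
# this earlier free-function draft is kept only for reference.
# def text_normalizer(text, ob_norm):
#     text = ob_norm.sub_pattern.sub(r"\1\1", text)
#     words = nltk.word_tokenize(text)
# #     words = TweetTokenizer(reduce_len=True).tokenize(text)
#     return ' '.join(words)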

In [38]:
class Normalizer:
    """Spell-correct and lemmatize Russian chat messages.

    `text_normalizer` is the entry point. For one message it:
      1. collapses any character repeated three or more times down to two,
      2. tokenizes the text with nltk.word_tokenize,
      3. replaces each token by its most frequent dictionary neighbour within
         one edit (`correct`),
      4. replaces each token by a normal form reported by the morphological
         analyzer (`normal_forms`), and
      5. joins the tokens back with single spaces.

    Attributes:
        morph: analyzer object; must provide `parse(word)` returning items
            with `.score` and `.normal_form`.
        vocab: word -> frequency; each stored count is increased by one
            because the underlying defaultdict starts every key at 1.
        word_stat: normal form -> number of processed tokens it was seen for.
    """

    # Text type of the running interpreter: `unicode` on Python 2, `str` on Python 3.
    try:
        _text_type = unicode  # noqa: F821 (Python 2 builtin)
    except NameError:
        _text_type = str

    # A parse is trusted only if the analyzer's score reaches this threshold.
    MIN_PARSE_SCORE = 0.5

    def __init__(self, morph, dictionary_path="extra/dictionary.p"):
        """Store the analyzer, compile the patterns and load the frequency dictionary.

        Args:
            morph: morphological analyzer used by `normal_forms`.
            dictionary_path: pickle file holding a word -> count mapping.
        """
        self.morph = morph
        # One character followed by two or more copies of itself.
        self.sub_pattern = re.compile(r"(.)\1{2,}")

        self.alphabet = u'абвгдеёжзийклмнопрстуфхцчшщъыьэюя'

        # A run of lowercase alphabet letters and/or hyphens.
        self.abc_pattern = re.compile(u'[{}-]+'.format(self.alphabet))

        self.word_stat = dict()

        # NOTE(security): pickle.load can execute arbitrary code contained in
        # the file — only load dictionaries from a trusted source.
        with open(dictionary_path, "rb") as dictionary_file:
            stored_dict = pickle.load(dictionary_file)
        self.vocab = defaultdict(lambda: 1)
        for key, value in stored_dict.items():
            self.vocab[key] += value

    def edits1(self, word):
        """Return all distinct strings exactly one edit away from `word`.

        An edit is a deletion, a transposition of two adjacent characters, a
        replacement by an alphabet letter, or an insertion of an alphabet letter.
        The order of the returned list is unspecified (it comes from a set).
        """
        splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
        deletes = [a + b[1:] for a, b in splits if b]
        transposes = [a + b[1] + b[0] + b[2:] for a, b in splits if len(b) > 1]
        replaces = [a + c + b[1:] for a, b in splits for c in self.alphabet if b]
        inserts = [a + c + b for a, b in splits for c in self.alphabet]
        return list(set(deletes + transposes + replaces + inserts))

    def check_in_vocab(self, words):
        """Return, as a list, those of `words` that are keys of the vocabulary.

        A single text string is treated as a one-word list. A real list is
        returned (not a lazy filter object) so that an empty result is falsy
        and the `or` chain in `correct` falls through on Python 3 as well.
        """
        if isinstance(words, self._text_type):
            words = [words]
        return [word for word in words if word in self.vocab]

    def correct(self, word):
        """Return the most frequent known word within one edit of `word`.

        Tokens that do not start with an alphabet letter or hyphen are returned
        unchanged (the pattern is only anchored at the start of the token).
        A known word is returned as-is; an unknown word with no known
        neighbour is returned unchanged. A distance-2 search is not performed.
        """
        if self.abc_pattern.match(word) is None:
            return word
        candidates = (
            self.check_in_vocab([word])
            or self.check_in_vocab(self.edits1(word))
            or [word]
        )
        # .get (not []) so that looking up the fallback word adds no vocab entry.
        return max(candidates, key=self.vocab.get)

    def normal_forms(self, words):
        """Map each word to a normal form and update `word_stat`.

        For every word the analyzer's parses with score >= MIN_PARSE_SCORE are
        collected; the first one's normal form is used, or the word itself if
        there is none. `word_stat` is incremented once per processed word for
        every distinct trusted normal form (or for the word itself when no
        parse is trusted), so no form is counted twice for the same token.

        Returns:
            list with one entry per input word, in the same order.
        """
        result = []
        for word in words:
            # Use the analyzer given to the constructor, not a notebook global.
            trusted_forms = [
                parse.normal_form
                for parse in self.morph.parse(word)
                if parse.score >= self.MIN_PARSE_SCORE
            ]
            if trusted_forms:
                result.append(trusted_forms[0])
                counted = set(trusted_forms)
            else:
                result.append(word)
                counted = {word}

            for form in counted:
                self.word_stat[form] = self.word_stat.get(form, 0) + 1

        return result

    def text_normalizer(self, text):
        """Normalize one message and return its tokens joined by single spaces."""
        # "приииивет" -> "приивет": squeeze runs of 3+ identical characters to 2.
        text = self.sub_pattern.sub(r"\1\1", text)
        words = nltk.word_tokenize(text)
        words = [self.correct(word) for word in words]
        words = self.normal_forms(words)
        return ' '.join(words)

In [23]:
# Morphological analyzer instance handed to Normalizer below.
morph = pymorphy2.MorphAnalyzer()

In [39]:
# Build the normalizer (its constructor loads the pickled frequency dictionary); timed.
%time normalizer = Normalizer(morph)

CPU times: user 1.09 s, sys: 112 ms, total: 1.2 s
Wall time: 2.05 s


In [40]:
# Normalize every message text. This is the slow step: the recorded run
# took about 8 minutes of wall time.
%time tokens = df_scored_dialogs.loc[:,'text'].apply(normalizer.text_normalizer)

CPU times: user 7min 49s, sys: 1.07 s, total: 7min 50s
Wall time: 7min 52s


In [42]:
# Persist the per-normal-form counters gathered while normalizing.
with open('stats/words.json', 'w') as stats_file:
    json.dump(normalizer.word_stat, stats_file)

In [43]:
# Add the normalized text as a new 'tokens' column (mutates df_scored_dialogs in place).
df_scored_dialogs['tokens'] = tokens

In [44]:
# Rewrite the same 'csv_scored_dialogs/<group_id>.csv' file as the earlier save,
# this time including the 'tokens' column.
df_scored_dialogs.to_csv('csv_scored_dialogs/{}.csv'.format(group_id), encoding='utf-8', index=False)