In [1]:
import re
import string
from string import punctuation, digits

import nltk
import numpy as np
import pandas as pd
from pymorphy2 import MorphAnalyzer
from pymystem3 import Mystem

In [2]:
%%time

# Column labels for the metadata table, in file order.
names = [
    'filename',
    'title',
    'year',
    'author',
    'years_of_life',
    'time_summary',
    'time_book',
    'name',
    'username',
    'tradition_country',
]
# `names=` is given without `header=`, so pandas treats the first line of the
# tab-separated file as data and labels the columns from the list above.
df = pd.read_csv('metatable.tsv', names=names, sep='\t')

CPU times: user 15 ms, sys: 21 ms, total: 36 ms
Wall time: 58.3 ms


In [3]:
%%time

# 'tradition_country' holds "tradition->country"; split it into its two parts.
split_tradition_country = df['tradition_country'].str.split('->')
# Remove soft hyphens (U+00AD) from the tradition name. The vectorised
# .str.replace passes missing values through as NaN, whereas the previous
# .apply(lambda x: x.replace(...)) raised AttributeError on rows with no value.
df['tradition'] = split_tradition_country.str.get(0).str.replace('\xad', '', regex=False)
df['country'] = split_tradition_country.str.get(1)

CPU times: user 11.9 ms, sys: 22 µs, total: 12 ms
Wall time: 11.9 ms


In [4]:
%%time

def _read_text(path):
    """Return the UTF-8 contents of ``path`` as a single line.

    Non-breaking spaces (U+00A0) and newlines are replaced by ordinary spaces.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return f.read().replace('\xa0', ' ').replace('\n', ' ')


# One pass over the 'filename' column instead of per-row `df.loc[i, 'text'] = ...`
# writes: no placeholder column is needed and the result does not depend on the
# index labels being unique.
df['text'] = df['filename'].map(_read_text)

CPU times: user 2.86 s, sys: 728 ms, total: 3.59 s
Wall time: 5.11 s


In [5]:
# Russian stop words from the NLTK stopwords corpus; the lemmatizers defined
# later in this notebook drop lemmas that appear in this collection.
STOPWORDS = nltk.corpus.stopwords.words('russian')

# Characters treated as separators: ASCII punctuation, typographic quotes and
# dashes, the ellipsis, newline/tab and the digits.
# Built from the `string` module attributes rather than from the imported name
# `punctuation`, because this statement rebinds `punctuation` to a set: the
# former `set(punctuation + ...)` raised TypeError when the cell was re-run.
punctuation = set(string.punctuation + '«»—–…“”\n\t' + string.digits)
# Translation table for str.translate: every separator character becomes a space.
TABLE = str.maketrans({ch: ' ' for ch in punctuation})

In [6]:
def tokenize_text(text, language='russian'):
    """Split ``text`` into lower-cased word tokens.

    The text is first split into sentences and each sentence into words with
    NLTK; tokens shorter than two characters are discarded.

    Args:
        text: the raw text to tokenize.
        language: name of the NLTK Punkt model used for sentence splitting.
            Defaults to ``'russian'`` because the rest of this notebook
            processes Russian text (Russian stop words, Mystem, pymorphy2);
            previously NLTK's English default was used implicitly.

    Returns:
        A list of lower-cased tokens in document order.
    """
    return [
        word.lower()
        for sent in nltk.sent_tokenize(text, language=language)
        for word in nltk.word_tokenize(sent, language=language)
        if len(word) >= 2
    ]

In [7]:
# Tokenize every document. Mapping over the 'text' column avoids the row-wise
# `df.apply(..., axis=1)`, which builds a Series for each row only to read one field.
df['text_tokenized'] = df['text'].map(tokenize_text)

In [8]:
# Single pymorphy2 analyzer instance, created once; intended to be passed as the
# `pymorphy` argument of pymorphy_lemmatize_text.
pymorphy = MorphAnalyzer()

def pymorphy_lemmatize_text(pymorphy, text_tokenized):
    """Lemmatize pre-tokenized text with a pymorphy2-style analyzer.

    Every token has the characters in ``TABLE`` replaced by spaces and is then
    split on whitespace, so punctuation-only tokens contribute nothing and a
    token such as ``'кто-то'`` is analysed as two words. Previously the
    space-padded string was passed to the analyzer as is, which produced
    whitespace "lemmas" for punctuation tokens.

    Args:
        pymorphy: object whose ``parse(word)`` returns a sequence of parses
            exposing ``normal_form`` and ``tag.POS``; the first parse is used.
        text_tokenized: iterable of token strings.

    Returns:
        A tuple ``(lemmas, lemmas_pos)``. ``lemmas`` holds the normal forms
        that are not in ``STOPWORDS``; ``lemmas_pos`` holds ``'lemma_POS'``
        strings for those of them whose part of speech is known.
    """
    # Build the lookup set once per call: membership tests on a list are linear.
    stopwords = set(STOPWORDS)
    lemmas = []
    lemmas_pos = []
    for token in text_tokenized:
        for piece in token.translate(TABLE).split():
            ana = pymorphy.parse(piece)[0]
            lemma = ana.normal_form
            if lemma in stopwords:
                continue
            # POS can be None (e.g. for unrecognised tokens): keep the lemma, skip the tagged form.
            if lemma and ana.tag.POS:
                lemmas_pos.append(lemma + '_' + ana.tag.POS)
            lemmas.append(lemma)

    return lemmas, lemmas_pos

In [9]:
# %%time

# res = df.apply(lambda r: pymorphy_lemmatize_text(pymorphy, r['text_tokenized']), axis=1).values

In [10]:
# %%time

# df['text_pymorphy_list'] = np.nan
# df['text_pymorphy_pos_list'] = np.nan

# for i, elem in enumerate(res):
#     df['text_pymorphy_list'][i] = elem[0]
#     df['text_pymorphy_pos_list'][i] = elem[1]

In [11]:
# df['text_pymorphy'] = df.apply(lambda r: ' '.join(r['text_pymorphy_list']), axis=1).values
# df['text_pymorphy_pos'] = df.apply(lambda r: ' '.join(r['text_pymorphy_pos_list']), axis=1).values

In [12]:
# Lookup from the part-of-speech tag reported by Mystem to the tag appended to
# lemmas as "lemma_TAG" (the target names look like Universal Dependencies POS
# labels; tags missing from this table are handled by the caller).
mapping = dict(
    COM='ADJ',
    APRO='DET',
    PART='PART',
    PR='ADP',
    ADV='ADV',
    INTJ='INTJ',
    S='NOUN',
    V='VERB',
    CONJ='SCONJ',
    UNKN='X',
    ANUM='ADJ',
    NUM='NUM',
    NONLEX='X',
    SPRO='PRON',
    ADVPRO='ADV',
    A='ADJ',
)
# Single Mystem instance, created once and passed as the `pymystem` argument of
# pymystem_lemmatize_text.
pymystem = Mystem()

def pymystem_lemmatize_text(pymystem, text, mapping, pos=False):
    """Lemmatize raw text with a Mystem-style analyzer.

    Characters listed in ``TABLE`` are replaced by spaces before analysis.

    Args:
        pymystem: object providing ``lemmatize(text)`` and ``analyze(text)``.
        text: the raw text.
        mapping: dict from the analyzer's part-of-speech tag to the tag used in
            the output; tags missing from it are emitted as ``'X'``.
        pos: when False, return plain lemmas; when True, return
            ``'lemma_TAG'`` strings.

    Returns:
        A list of strings.
        With ``pos=False``: the output of ``lemmatize`` without whitespace-only
        items; stop words are NOT removed in this mode.
        With ``pos=True``: ``'lemma_TAG'`` for every analysed word whose lemma
        is not in ``STOPWORDS``; items without an analysis are skipped.
    """
    cleaned = text.translate(TABLE)

    if not pos:
        # Collapse runs of spaces left behind by the translation.
        cleaned = re.sub(' +', ' ', cleaned)
        # Drop every whitespace-only separator token. The previous filter only
        # removed ' ' and ' \n', so e.g. a trailing '\n' leaked into the result.
        return [lemma for lemma in pymystem.lemmatize(cleaned) if lemma.strip()]

    stopwords = set(STOPWORDS)  # built once: O(1) membership tests in the loop
    lemmas = []
    for item in pymystem.analyze(cleaned):
        # Separator items may carry no 'analysis' key at all, and unknown words
        # may have an empty list; `item['analysis']` raised KeyError on the former.
        analyses = item.get('analysis')
        if not analyses:
            continue
        best = analyses[0]
        lemma = best['lex'].lower().strip()
        if lemma in stopwords:
            continue
        # The grammar string looks like "S,жен,од=им,ед": the POS tag is the part
        # before the first ',' or '='.
        tag = best['gr'].split(',')[0].split('=')[0].strip()
        # Convert the tag; fall back to 'X' for tags absent from the mapping.
        lemmas.append(lemma + '_' + mapping.get(tag, 'X'))
    return lemmas

In [13]:
%%time

# Lemmatize every document with Mystem (pos=False: plain lemmas, no POS suffix).
# Mapping over the 'text' column avoids the row-wise `df.apply(..., axis=1)`,
# which builds a Series for each row only to read one field.
df['text_pymystem_list'] = df['text'].map(
    lambda text: pymystem_lemmatize_text(pymystem, text, mapping, pos=False)
)

CPU times: user 24.6 s, sys: 718 ms, total: 25.3 s
Wall time: 2min 50s


In [14]:
# Space-joined lemma string per document, built directly from the list column
# instead of a row-wise `df.apply(..., axis=1)`.
df['text_pymystem'] = df['text_pymystem_list'].map(' '.join)

In [15]:
# Currently unused: this call fails with 'Cannot allocate memory'.

# df['text_pymystem_pos'] = df.apply(lambda r: pymystem_lemmatize_text(pymystem,
#                                                                  r['text'].translate(table), mapping,
#                                                                  pos=True), axis=1)

In [17]:
%%time

# Persist the preprocessed table, including the list-valued columns, as a pickle
# file in the working directory.
df.to_pickle('metatable_preprocessed.pkl')

CPU times: user 2.36 s, sys: 1.99 s, total: 4.35 s
Wall time: 13 s
