In [1]:
import nltk
import re
import os
import pandas as pd
import numpy as np
from pymystem3 import Mystem
from string import punctuation, digits

In [2]:
%%time

# Column labels for the metadata table, in file order. Because labels are
# supplied here, the first line of the TSV is read as data, not as a header.
names = [
    'filename', 'title', 'year', 'author', 'years_of_life',
    'time_summary', 'time_book', 'name', 'username', 'tradition_country',
]
df = pd.read_csv('metatable.tsv', sep='\t', header=None, names=names)

CPU times: user 30.1 ms, sys: 16.9 ms, total: 47 ms
Wall time: 100 ms


In [3]:
%%time

# 'tradition_country' holds '<tradition>-><country>'; split it into two columns.
split_tradition_country = df['tradition_country'].str.split('->')
# Strip soft hyphens (U+00AD) from the tradition label. The vectorized
# .str.replace passes missing values through as NaN, whereas calling
# x.replace(...) on each element raises AttributeError on a NaN float.
df['tradition'] = split_tradition_country.str.get(0).str.replace('\xad', '', regex=False)
df['country'] = split_tradition_country.str.get(1)

CPU times: user 10.6 ms, sys: 3.51 ms, total: 14.1 ms
Wall time: 14.2 ms


In [4]:
%%time

# 'years_of_life' is split on the en dash into [birth, death]; anything that
# does not parse as a number becomes NaN.
split_years_of_life = df['years_of_life'].str.split('–')
for part_idx, year_col in enumerate(('year_of_birth', 'year_of_death')):
    df[year_col] = pd.to_numeric(split_years_of_life.str.get(part_idx), errors='coerce')

# Epoch = midpoint of the known life years, floor-divided by 100.
df['epoch'] = df[['year_of_birth', 'year_of_death']].mean(axis=1).floordiv(100)

CPU times: user 27.4 ms, sys: 5.32 ms, total: 32.7 ms
Wall time: 40.2 ms


In [5]:
%%time

def _read_text(path):
    """Return the UTF-8 contents of `path` with non-breaking spaces and newlines turned into spaces."""
    with open(path, 'r', encoding='utf-8') as f:
        return f.read().replace('\xa0', ' ').replace('\n', ' ')


# Read every file once and assign the whole column in one go, instead of
# writing one cell at a time through df.loc inside a Python loop.
df['text'] = df['filename'].map(_read_text)

CPU times: user 3.1 s, sys: 752 ms, total: 3.85 s
Wall time: 8.93 s


In [6]:
# Russian stop-word list from NLTK; lemmas found in it are dropped during
# lemmatization.
STOPWORDS = nltk.corpus.stopwords.words('russian')

# Characters blanked out before morphological analysis: ASCII punctuation and
# digits plus typographic quotes, dashes, the ellipsis, newline and tab.
# This line rebinds the imported `punctuation` string to a set. ''.join(...)
# accepts both the original string and that set, so re-running the cell no
# longer fails with "TypeError: unsupported operand ... 'set' and 'str'".
punctuation = set(''.join(punctuation) + '«»—–…“”\n\t' + digits)
# Table for str.translate: each of those characters is replaced by a space.
TABLE = str.maketrans({ch: ' ' for ch in punctuation})

In [7]:
def tokenize_text(text):
    """Split `text` into lower-cased word tokens.

    The text is cut into sentences with NLTK, each sentence into words, and
    tokens shorter than two characters are discarded.

    Returns a flat list of tokens in reading order.
    """
    return [
        word.lower()
        for sentence in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sentence)
        if len(word) >= 2
    ]

In [8]:
# Tokenize each document. Mapping over the single 'text' column avoids
# building a full row Series per document, as DataFrame.apply(axis=1) does.
df['text_tokenized'] = df['text'].map(tokenize_text)

In [9]:
# Conversion table from the leading tag of a Mystem 'gr' string to the tag
# appended to lemmas (`lemma_TAG`). Tags missing from it fall back to 'X'
# inside pymystem_lemmatize_text.
mapping = {
    'COM': 'ADJ',
    'APRO': 'DET',
    'PART': 'PART',
    'PR': 'ADP',
    'ADV': 'ADV',
    'INTJ': 'INTJ',
    'S': 'NOUN',
    'V': 'VERB',
    'CONJ': 'SCONJ',
    'UNKN': 'X',
    'ANUM': 'ADJ',
    'NUM': 'NUM',
    'NONLEX': 'X',
    'SPRO': 'PRON',
    'ADVPRO': 'ADV',
    'A': 'ADJ',
}

# Single analyzer instance shared by all lemmatization calls.
pymystem = Mystem()

def pymystem_lemmatize_text(pymystem, text, mapping):
    """Lemmatize `text` and tag every kept lemma with a part of speech.

    Characters listed in the module-level TABLE are replaced by spaces before
    the text is handed to `pymystem.analyze`. For every item that carries a
    non-empty 'analysis' list, the first analysis is used: its 'lex' value
    (lower-cased, stripped) is the lemma, and the leading tag of its 'gr'
    string is converted through `mapping`. Lemmas in STOPWORDS are skipped.

    Parameters
    ----------
    pymystem : object exposing ``analyze(text)`` (a Mystem instance)
    text : str
    mapping : dict
        Analyzer tag -> output tag; unknown tags are rendered as 'X'.

    Returns
    -------
    (lemmas, lemmas_pos) : tuple of two parallel lists
        Plain lemmas and the matching ``lemma_TAG`` strings.
    """
    lemmas, lemmas_pos = [], []

    for item in pymystem.analyze(text.translate(TABLE)):
        analyses = item.get('analysis')
        if not analyses:
            # Nothing to lemmatize: the key is missing or the list is empty.
            continue

        first = analyses[0]
        lemma = first['lex'].lower().strip()
        if lemma in STOPWORDS:
            continue
        lemmas.append(lemma)

        # The tag is whatever precedes the first ',' and the first '=' in 'gr'.
        tag = first['gr'].split(',')[0].split('=')[0].strip()
        # Convert the tag; fall back to 'X' when it is not in the mapping.
        lemmas_pos.append(lemma + '_' + mapping.get(tag, 'X'))

    return lemmas, lemmas_pos

In [10]:
%%time

# Lemmatize every document; each result is a (lemmas, lemmas_pos) tuple.
res = df['text'].apply(lambda text: pymystem_lemmatize_text(pymystem, text, mapping)).values
# Reuse df's own index so the join lines up row-for-row even if df does not
# have a default RangeIndex.
df_res = pd.DataFrame(list(res),
                      columns=['text_pymystem_list', 'text_pymystem_pos_list'],
                      index=df.index)
# Drop results left over from a previous run of this cell; otherwise join()
# raises "columns overlap but no suffix specified".
df = df.drop(columns=df_res.columns, errors='ignore').join(df_res)

CPU times: user 39 s, sys: 533 ms, total: 39.5 s
Wall time: 3min 23s


In [11]:
%%time

# Persist the fully preprocessed table (metadata, raw text, tokens, lemmas).
df.to_pickle('metatable_preprocessed.pkl')

CPU times: user 2.99 s, sys: 1.03 s, total: 4.02 s
Wall time: 5.66 s


In [12]:
def get_grouped_dataframe(df, col_to_groupby, first_arg, *args):
    """Collapse `df` to one row per `col_to_groupby` value, merging the given columns.

    How a column is merged depends on the type of its first value:
    strings are joined with a single space, lists are concatenated in row
    order.

    Parameters
    ----------
    df : pandas.DataFrame
    col_to_groupby : str
        Column whose distinct values define the groups.
    first_arg, *args : str
        Names of the columns to merge (at least one).

    Returns
    -------
    pandas.DataFrame
        Columns ``[col_to_groupby, first_arg, *args]``, one row per group.
        An empty `df` yields an empty frame with those columns.

    Raises
    ------
    TypeError
        If a requested column holds neither strings nor lists.
    """
    columns = (first_arg,) + args
    if df.empty:
        return pd.DataFrame(columns=[col_to_groupby, *columns])

    def _flatten(lists):
        # Linear-time concatenation. Builtin `sum` over lists copies the
        # accumulator on every step and only works at all because pandas
        # substitutes np.sum for it.
        return [item for sub in lists for item in sub]

    def func(data):
        # Choose the merge function from the type of the column's first value.
        sample = data.iloc[0]
        if isinstance(sample, str):
            return ' '.join
        if isinstance(sample, list):
            return _flatten
        raise TypeError(
            'column {!r} must hold str or list values, got {}'.format(
                data.name, type(sample).__name__))

    grouped = df.groupby([col_to_groupby])

    df_res = grouped[first_arg].agg(func(df[first_arg])).reset_index()
    for arg in args:
        # Every aggregation runs over the same groups in the same order, so
        # the results line up positionally with df_res.
        df_res[arg] = grouped[arg].agg(func(df[arg])).reset_index(drop=True)

    return df_res

In [13]:
%%time

# The raw 'text' column is left out here (it caused a MemoryError).

# One row per tradition: token and lemma lists of all its texts concatenated.
traditions = get_grouped_dataframe(df, 'tradition',
                                   'text_tokenized',
                                   'text_pymystem_list',
                                   'text_pymystem_pos_list')
traditions.to_pickle('traditions.pkl')

CPU times: user 30.9 s, sys: 516 ms, total: 31.4 s
Wall time: 32.7 s


In [14]:
%%time

# One row per country: raw texts joined with spaces, token and lemma lists
# concatenated.
countries = get_grouped_dataframe(df, 'country', 'text',
                                  'text_tokenized', 
                                  'text_pymystem_list',
                                  'text_pymystem_pos_list')
countries.to_pickle('countries.pkl')

CPU times: user 11.5 s, sys: 693 ms, total: 12.2 s
Wall time: 14.5 s


In [15]:
%%time

# One row per tradition; the lemma column is a list of per-text lemma lists
# (documents are kept separate rather than concatenated).
traditions_topic_modeling = df.groupby(['tradition'])['text_pymystem_list'].apply(list).reset_index()
traditions_topic_modeling.to_pickle('traditions_topic_modeling.pkl')

CPU times: user 685 ms, sys: 148 ms, total: 833 ms
Wall time: 1.57 s


In [16]:
%%time

# Non-null value counts per author for every column.
grouped_by_authors = df.groupby(['author']).count()

# Keep only authors represented by more than 5 texts.
has_enough_texts = grouped_by_authors['text'] > 5
needed_authors_list = grouped_by_authors[has_enough_texts].index

# One row per selected author, holding the list of that author's per-text lemma lists.
is_needed_author = df['author'].isin(needed_authors_list)
needed_authors_grouped = df[is_needed_author].groupby(['author'])
needed_authors = needed_authors_grouped['text_pymystem_list'].apply(list).reset_index()

needed_authors.to_pickle('authors.pkl')

CPU times: user 359 ms, sys: 15.7 ms, total: 375 ms
Wall time: 503 ms


In [17]:
# Test data for stylo: values of 'name' with at least 20000 lemmas in total.

grouped = df.groupby(['name'])
# Count lemmas per row and add the counts up per name. This gives the same
# totals as concatenating every list first, without the quadratic list `sum`
# and without depending on how Series.agg(len) dispatches.
lemma_counts = df['text_pymystem_list'].str.len().groupby(df['name']).sum()
needed_names_list = lemma_counts.index[lemma_counts >= 20000]
needed_names_grouped = df[df['name'].isin(needed_names_list)].groupby(['name'])
# One row per selected name, with all of its lemma lists concatenated in row order.
test = needed_names_grouped['text_pymystem_list'].apply(
    lambda lists: [lemma for lemmas in lists for lemma in lemmas]
).reset_index()

In [18]:
# Total number of lemmas per selected name, counted without first
# concatenating the lists.
needed_names_grouped['text_pymystem_list'].apply(lambda lists: sum(len(lemmas) for lemmas in lists))

name
Д. А. Карельский    30229
М. Л. Гаспаров      28589
Name: text_pymystem_list, dtype: int64

In [19]:
# Write each selected name's lemmas to ./corpus as space-separated chunks of
# CHUNK_SIZE lemmas, one file per chunk. A trailing remainder shorter than
# CHUNK_SIZE is dropped, so every file has the same length.
CHUNK_SIZE = 10000
corpus_dir = os.path.join(os.getcwd(), 'corpus')
os.makedirs(corpus_dir, exist_ok=True)  # do not fail when the folder is missing

for j in test.index:
    name = test.loc[j, 'name']
    lemmas = test.loc[j, 'text_pymystem_list']
    full_length = len(lemmas) - (len(lemmas) % CHUNK_SIZE)

    for i, start in enumerate(range(0, full_length, CHUNK_SIZE)):
        # The chunk index has to be part of the file name. The previous
        # pattern '_{}.txt' had one placeholder for two arguments, so the
        # index was ignored and each chunk overwrote the one before it.
        chunk_path = os.path.join(corpus_dir, '{}_{}.txt'.format(name, i))
        with open(chunk_path, 'w', encoding='utf-8') as f:
            f.write(' '.join(lemmas[start:start + CHUNK_SIZE]))