In [1]:
import nltk
import re
import os
import pandas as pd
import numpy as np
from pymystem3 import Mystem
from string import punctuation, digits

In [2]:
%%time

names = ['filename', 'title', 'year', 'author', 'years_of_life',
         'time_summary', 'time_book', 'name', 'username', 'tradition_country']
df = pd.read_csv('metatable.tsv', sep='\t', names=names)

CPU times: user 28.6 ms, sys: 4.27 ms, total: 32.8 ms
Wall time: 44.6 ms


In [3]:
%%time

split_tradition_country = df['tradition_country'].str.split('->')
df['tradition']= split_tradition_country.str.get(0).apply(lambda x: x.replace('\xad', ''))
df['country']=split_tradition_country.str.get(1)

CPU times: user 10.6 ms, sys: 477 µs, total: 11.1 ms
Wall time: 11.1 ms


In [4]:
%%time

df['title'] = df['title'].str.replace('\xa0', ' ')
df['title'] = df['title'].str.split('<').str.get(0)

CPU times: user 11.1 ms, sys: 0 ns, total: 11.1 ms
Wall time: 10.9 ms


In [5]:
%%time

split_years_of_life = df['years_of_life'].str.split('–')
df['year_of_birth'] = pd.to_numeric(split_years_of_life.str.get(0), errors='coerce')
df['year_of_death'] = pd.to_numeric(split_years_of_life.str.get(1), errors='coerce')
df['epoch'] = df[['year_of_birth', 'year_of_death']].mean(axis=1) // 100

CPU times: user 27.4 ms, sys: 7.54 ms, total: 34.9 ms
Wall time: 42 ms


In [6]:
capital_letters = 'АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ'
capital_letters_quotes = capital_letters + '«»''""“”  '

def separate_heading(text, capital_letters):
    heading = list(text[:200])
    for i, letter in enumerate(heading):
        if letter in capital_letters and i-1 > 0 and heading[i-1] not in capital_letters_quotes:
            heading[i] = ' ' + letter
    return ''.join(heading) + text[200:]

In [7]:
%%time

df['text'] = ''
for i in df.index:
    with open(df.loc[i, 'filename'], 'r', encoding='utf-8') as f:
        text = f.read().replace('\xad', ' ').replace('\xa0', ' ').replace('\n', ' ')
        text = separate_heading(text , capital_letters)
        df.loc[i, 'text'] = text

CPU times: user 3.25 s, sys: 736 ms, total: 3.99 s
Wall time: 5.8 s


In [8]:
def tokenize_text(text):
    tokens = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            tokens.append(word.lower().translate(TABLE))
    tokens = list(filter(lambda a: a not in '  ', tokens))
    return tokens

In [9]:
STOPWORDS = nltk.corpus.stopwords.words('russian') + ['свой', 'который', 'весь', 'это']

punctuation = set(punctuation + '«»—–…“”\n\t' + digits)
TABLE = str.maketrans({ch: ' ' for ch in punctuation})

In [10]:
df['text_tokenized'] = df.apply(lambda r: tokenize_text(r['text']), axis=1).values

In [11]:
mapping = {'COM': 'ADJ', 'APRO': 'DET', 'PART': 'PART', 'PR': 'ADP', 'ADV': 'ADV', 'INTJ': 'INTJ',
           'S': 'NOUN', 'V': 'VERB', 'CONJ': 'SCONJ', 'UNKN': 'X', 'ANUM': 'ADJ', 'NUM': 'NUM',
           'NONLEX': 'X', 'SPRO': 'PRON', 'ADVPRO': 'ADV', 'A': 'ADJ'}
pymystem = Mystem()

def pymystem_lemmatize_text(pymystem, text, mapping, del_stopwords=True):
    lemmas = []
    lemmas_pos = []
    ana = pymystem.analyze(text.translate(TABLE))
    
    for word in ana:
        if word.get('analysis') and len(word.get('analysis')) > 0:
            lemma = word['analysis'][0]['lex'].lower().strip()
            
            if del_stopwords:
                if lemma in STOPWORDS:
                    continue
                    
            lemmas.append(lemma)
                
            pos = word['analysis'][0]['gr'].split(',')[0]
            pos = pos.split('=')[0].strip()
            if pos in mapping:
                lemmas_pos.append(lemma + '_' + mapping[pos]) # здесь мы конвертируем тэги
            else:
                lemmas_pos.append(lemma + '_X') # на случай, если попадется тэг, которого нет в маппинге  
    
    return lemmas, lemmas_pos

In [12]:
%%time

res = df.apply(lambda r: pymystem_lemmatize_text(pymystem, r['text'], mapping), axis=1).values
df_res = pd.DataFrame(list(res), columns = ['text_pymystem_list', 'text_pymystem_pos_list'])
df = df.join(df_res)
df.to_pickle('metatable_preprocessed.pkl')

CPU times: user 55.6 s, sys: 1.7 s, total: 57.3 s
Wall time: 4min 48s


In [13]:
# %%time

# res = df.apply(lambda r: pymystem_lemmatize_text(pymystem, r['text'], mapping, del_stopwords=False), axis=1).values
# df_res = pd.DataFrame(list(res), columns = ['text_pymystem_list_with_stopwords',
#                                             'text_pymystem_pos_list_with_stopwords'])
# df = df.join(df_res)

In [14]:
# %%time

# df_to_save = df.drop(['text_pymystem_list_with_stopwords', 'text_pymystem_pos_list_with_stopwords'], 1)
# df_to_save.to_pickle('metatable_preprocessed.pkl')

In [15]:
def get_grouped_dataframe(df, col_to_groupby, first_arg, *args):
    grouped = df.groupby([col_to_groupby])
    
    def func(data):
        if isinstance(data.iloc[0], str):
            return ' '.join
        elif isinstance(data.iloc[0], list):
            return sum
    
    df_res = grouped[first_arg].agg(func(df[first_arg])).reset_index()
    for arg in args:
        df_res[arg] = grouped[arg].agg(func(df[arg])).reset_index()[arg]
        
    return df_res

In [16]:
%%time

traditions = get_grouped_dataframe(df, 'tradition',
                                   'text_tokenized',
                                   'text_pymystem_list',
                                   'text_pymystem_pos_list')
traditions.to_pickle('traditions.pkl')

CPU times: user 44.5 s, sys: 1.01 s, total: 45.5 s
Wall time: 46.6 s


In [17]:
%%time

countries = get_grouped_dataframe(df, 'country',
                                  'text_tokenized', 
                                  'text_pymystem_list',
                                  'text_pymystem_pos_list')
countries.to_pickle('countries.pkl')

CPU times: user 17.3 s, sys: 923 ms, total: 18.2 s
Wall time: 20.5 s


In [18]:
%%time

traditions_topic_modeling = df.groupby(['tradition'])['text_pymystem_list'].apply(list).reset_index()
traditions_topic_modeling.to_pickle('traditions_topic_modeling.pkl')

CPU times: user 965 ms, sys: 228 ms, total: 1.19 s
Wall time: 1.86 s


In [19]:
%%time

traditions_topic_modeling = df.groupby(['country'])['text_pymystem_list'].apply(list).reset_index()
traditions_topic_modeling.to_pickle('countries_topic_modeling.pkl')

CPU times: user 868 ms, sys: 197 ms, total: 1.07 s
Wall time: 1.86 s


In [20]:
%%time

grouped_by_authors = df.groupby(['author']).count()
needed_authors_list = grouped_by_authors.iloc[np.where(grouped_by_authors['text'] > 5)].index
needed_authors_grouped = df[df['author'].isin(needed_authors_list)].groupby(['author'])
needed_authors = needed_authors_grouped['text_pymystem_list'].apply(list).reset_index()

needed_authors.to_pickle('authors.pkl')

CPU times: user 688 ms, sys: 4.55 ms, total: 693 ms
Wall time: 753 ms


In [31]:
def create_files_for_stylo(df, column_name, column_text, n_tokens, dir_name):
    grouped = df.groupby([column_name])
    needed_list = grouped.count().iloc[np.where(grouped[column_text].agg(sum).agg(len) >= n_tokens * 2)].index
    needed_grouped = df[df[column_name].isin(needed_list)].groupby([column_name])
    df_res = needed_grouped[column_text].apply(sum).reset_index()
    
    for user_id in df_res.index:
        l = df_res.loc[user_id, column_text]
        lst = [df_res.loc[user_id, column_text][i : i + n_tokens] for i in range(0, len(l) - (len(l) % n_tokens),
                                                                             n_tokens)]
        for data_id, data in enumerate(lst):
            print('{}: {} whole_len:{} part:{} len:{}'.format(column_name, df_res.loc[user_id, column_name],
                                                    needed_grouped[column_text].apply(sum).apply(len).iloc[user_id],
                                                    data_id, len(data)))
            with open(os.getcwd() + '/corpus/{}/{}_part{}.txt'.format(dir_name, 
                                                                      df_res.loc[user_id, column_name],
                                                                      data_id + 1), 'w', encoding='utf-8') as f:
                 f.write(' '.join(data))

In [21]:
create_files_for_stylo(df, 'username', 'text_tokenized', 5000, 'username_tokenized_5000')

username: 123 whole_len:21786 part:0 len:5000
username: 123 whole_len:21786 part:1 len:5000
username: 123 whole_len:21786 part:2 len:5000
username: 123 whole_len:21786 part:3 len:5000
username: 152 whole_len:12128 part:0 len:5000
username: 152 whole_len:12128 part:1 len:5000
username: 165 whole_len:10630 part:0 len:5000
username: 165 whole_len:10630 part:1 len:5000
username: 226 whole_len:18177 part:0 len:5000
username: 226 whole_len:18177 part:1 len:5000
username: 226 whole_len:18177 part:2 len:5000
username: 227 whole_len:13709 part:0 len:5000
username: 227 whole_len:13709 part:1 len:5000
username: 233 whole_len:13579 part:0 len:5000
username: 233 whole_len:13579 part:1 len:5000
username: belov whole_len:27915 part:0 len:5000
username: belov whole_len:27915 part:1 len:5000
username: belov whole_len:27915 part:2 len:5000
username: belov whole_len:27915 part:3 len:5000
username: belov whole_len:27915 part:4 len:5000
username: erkhov whole_len:24845 part:0 len:5000
username: erkhov whol

In [22]:
create_files_for_stylo(df, 'username', 'text_tokenized', 10000, 'username_tokenized_10000')

username: 123 whole_len:21786 part:0 len:10000
username: 123 whole_len:21786 part:1 len:10000
username: belov whole_len:27915 part:0 len:10000
username: belov whole_len:27915 part:1 len:10000
username: erkhov whole_len:24845 part:0 len:10000
username: erkhov whole_len:24845 part:1 len:10000
username: gasparov whole_len:45144 part:0 len:10000
username: gasparov whole_len:45144 part:1 len:10000
username: gasparov whole_len:45144 part:2 len:10000
username: gasparov whole_len:45144 part:3 len:10000
username: karelsky whole_len:46381 part:0 len:10000
username: karelsky whole_len:46381 part:1 len:10000
username: karelsky whole_len:46381 part:2 len:10000
username: karelsky whole_len:46381 part:3 len:10000
username: sanovich whole_len:23980 part:0 len:10000
username: sanovich whole_len:23980 part:1 len:10000
username: shishkin whole_len:27638 part:0 len:10000
username: shishkin whole_len:27638 part:1 len:10000
username: smirnov whole_len:24079 part:0 len:10000
username: smirnov whole_len:24079

In [23]:
create_files_for_stylo(df, 'username', 'text_pymystem_list_with_stopwords', 5000, 'username_lemmatized_5000')

username: 123 whole_len:21948 part:0 len:5000
username: 123 whole_len:21948 part:1 len:5000
username: 123 whole_len:21948 part:2 len:5000
username: 123 whole_len:21948 part:3 len:5000
username: 152 whole_len:12218 part:0 len:5000
username: 152 whole_len:12218 part:1 len:5000
username: 165 whole_len:10618 part:0 len:5000
username: 165 whole_len:10618 part:1 len:5000
username: 226 whole_len:18170 part:0 len:5000
username: 226 whole_len:18170 part:1 len:5000
username: 226 whole_len:18170 part:2 len:5000
username: 227 whole_len:13718 part:0 len:5000
username: 227 whole_len:13718 part:1 len:5000
username: 233 whole_len:13689 part:0 len:5000
username: 233 whole_len:13689 part:1 len:5000
username: belov whole_len:28119 part:0 len:5000
username: belov whole_len:28119 part:1 len:5000
username: belov whole_len:28119 part:2 len:5000
username: belov whole_len:28119 part:3 len:5000
username: belov whole_len:28119 part:4 len:5000
username: erkhov whole_len:24937 part:0 len:5000
username: erkhov whol

In [24]:
create_files_for_stylo(df, 'username', 'text_pymystem_list_with_stopwords', 10000, 'username_lemmatized_10000')

username: 123 whole_len:21948 part:0 len:10000
username: 123 whole_len:21948 part:1 len:10000
username: belov whole_len:28119 part:0 len:10000
username: belov whole_len:28119 part:1 len:10000
username: erkhov whole_len:24937 part:0 len:10000
username: erkhov whole_len:24937 part:1 len:10000
username: gasparov whole_len:45426 part:0 len:10000
username: gasparov whole_len:45426 part:1 len:10000
username: gasparov whole_len:45426 part:2 len:10000
username: gasparov whole_len:45426 part:3 len:10000
username: karelsky whole_len:46776 part:0 len:10000
username: karelsky whole_len:46776 part:1 len:10000
username: karelsky whole_len:46776 part:2 len:10000
username: karelsky whole_len:46776 part:3 len:10000
username: sanovich whole_len:24143 part:0 len:10000
username: sanovich whole_len:24143 part:1 len:10000
username: shishkin whole_len:27750 part:0 len:10000
username: shishkin whole_len:27750 part:1 len:10000
username: smirnov whole_len:24170 part:0 len:10000
username: smirnov whole_len:24170

In [25]:
column_name = 'author'
column_text = 'text_tokenized'
n_tokens = 5000

grouped = df.groupby([column_name])
needed_list = grouped.count().iloc[np.where(grouped[column_text].agg(sum).agg(len) >= n_tokens * 2)].index
needed_grouped = df[df[column_name].isin(needed_list)].groupby([column_name])
needed_grouped[column_text].apply(sum).apply(len)

author
Сайкаку    10190
Name: text_tokenized, dtype: int64

In [26]:
column_name = 'author'
column_text = 'text_pymystem_list_with_stopwords'
n_tokens = 5000

grouped = df.groupby([column_name])
needed_list = grouped.count().iloc[np.where(grouped[column_text].agg(sum).agg(len) >= n_tokens * 2)].index
needed_grouped = df[df[column_name].isin(needed_list)].groupby([column_name])
needed_grouped[column_text].apply(sum).apply(len)

author
Сайкаку    10335
Name: text_pymystem_list_with_stopwords, dtype: int64

In [35]:
create_files_for_stylo(df, 'author', 'text_tokenized', 5000, 'author_tokenized_5000')

author: Сайкаку whole_len:10190 part:0 len:5000
author: Сайкаку whole_len:10190 part:1 len:5000


In [36]:
create_files_for_stylo(df, 'author', 'text_pymystem_list_with_stopwords', 5000, 'author_lemmatized_5000')

author: Сайкаку whole_len:10335 part:0 len:5000
author: Сайкаку whole_len:10335 part:1 len:5000
