In [2]:
import os, spacy, pandas as pd

In [3]:
nlp = spacy.load("ru_core_news_lg")

In [16]:
nlp.max_length = 4000000

In [18]:
class Text():

    rows = []

    def __init__(self, text, genre, fname):
        self.text = text
        self.genre = genre
        self.fname = fname
        self.lemmas = self.get_lemmas()
    
    def get_lemmas(self):
        self.lemmas = []
        doc = nlp(self.text, disable = ['ner', 'parser'])
        for token in doc:
            if token.lemma_.isalpha():
                self.lemmas.append(token.lemma_)
        return self.lemmas

    def get_row(self):
        row = (self.fname, self.genre, len(self.lemmas), len(set(self.lemmas)))
        Text.rows.append(row)
        return row

In [19]:
genres = ['ballads', 'elegies', 'songs', 'novels']

for genre in genres:

    filenames = os.listdir('../datasets/' + genre)

    for file in filenames:
        if file != '.DS_Store':
            with open('../datasets/' + genre + '/' + file, 'r', encoding='utf-8') as f:
                lines = f.read()
        
            text_piece = Text(lines, genre, file) 
            text_piece.get_row()

In [20]:
df = pd.DataFrame(Text.rows, columns=['filename', 'genre', 'all lemmas', 'unique lemmas'])

In [21]:
print('Total number of texts by genres')
df.groupby(['genre'])['filename'].count()

Total number of texts by genres


genre
ballads    189
elegies    510
novels     506
songs      623
Name: filename, dtype: int64

In [22]:
print('Total number of tokens and unique lemmas by genres')
df.groupby(['genre']).sum()

Total number of tokens and unique lemmas by genres


Unnamed: 0_level_0,all lemmas,unique lemmas
genre,Unnamed: 1_level_1,Unnamed: 2_level_1
ballads,48510,31443
elegies,115313,79699
novels,29582914,3728290
songs,73276,52185


In [23]:
print('Mean number of tokens and unique lemmas by genres')
df.groupby(['genre']).mean()

Mean number of tokens and unique lemmas by genres


Unnamed: 0_level_0,all lemmas,unique lemmas
genre,Unnamed: 1_level_1,Unnamed: 2_level_1
ballads,256.666667,166.365079
elegies,226.103922,156.272549
novels,58464.256917,7368.162055
songs,117.617978,83.764045


In [24]:
print(df)

                                  filename    genre  all lemmas  unique lemmas
0                           175_1842_E.txt  ballads         253            131
1                     47_1842_Lyubimov.txt  ballads          50             35
2                 183_1844_Zhadovskaya.txt  ballads          63             53
3                    157_1847_Nekrasov.txt  ballads         279            186
4                     206_1841_Tolstoy.txt  ballads         257            141
...                                    ...      ...         ...            ...
1823      sukhonin.na_rubezhe_stoletiy.txt   novels      118794          12463
1824                boborykin.dolgo_li.txt   novels       35523           5805
1825               merder.vorotyntsevy.txt   novels       50284           7628
1826    dal.pavel_alekseyevich_igrivyy.txt   novels       26227           4777
1827  maklakova_nelidova.devochka_lida.txt   novels       42876           5588

[1828 rows x 4 columns]
