In [0]:
from IPython.display import clear_output

In [0]:
# Install the linting tools needed by the pycodestyle_magic extension.
!pip install pycodestyle flake8 pycodestyle_magic
# Wipe the installer log from the cell output.
clear_output()

In [0]:
# Load the pycodestyle_magic IPython extension.
%load_ext pycodestyle_magic

In [0]:
import re
import time
import pickle
import pandas as pd
import numpy as np
from tqdm import tqdm
from bs4 import BeautifulSoup
from google.colab import files
from unicodedata import normalize
from urllib.request import urlopen, Request

## **Web crawling**

In [0]:
# URL pieces of the ruscorpora search interface.  The crawl appends a page
# number to the first two and a document id to the last one.
search_url = (
    'http://processing.ruscorpora.ru/search.xml'
    '?sort=i_grtagging&lang=ru&dpp=50&text=meta&mode=para&env=alpha&p='
)
explain_url = (
    'http://processing.ruscorpora.ru/search-explan.xml'
    '?sort=i_grtagging&lang=ru&dpp=50&&mode=para&env=alpha&p='
)
docinfo_url = '&text=document-info&language=ru&docid='

In [0]:
# One accumulator list per metadata field; the crawl appends to them
# text by text.
texts_original, texts_translated = [], []
authors, authors_rus = [], []
second_authors, second_authors_rus = [], []
translators, second_translators = [], []
langs, spheres = [], []
creation_dates, translation_dates = [], []
sent_numbers, token_numbers = [], []

In [0]:
count = 0

# Walk search result pages 0-54 and, for every document listed on a page,
# fetch its info table and append one entry to each accumulator list.
for page_nbr in tqdm(range(55)):
    page = urlopen(Request(url=search_url + str(page_nbr))).read()
    page_parsed = BeautifulSoup(page, 'html.parser')
    tags = page_parsed.find_all('span', class_='b-doc-expl')
    for tag in tqdm(tags):
        # Every field starts out missing and is filled in only when the
        # matching row is present in the document info table.
        text_original = np.nan
        text_translated = np.nan
        author = np.nan
        second_author = np.nan
        author_translated = np.nan
        second_author_translated = np.nan
        translator = np.nan
        second_translator = np.nan
        lang_from = np.nan
        lang_to = np.nan
        lang = np.nan
        sphere = np.nan
        creation_date = np.nan
        translation_date = np.nan
        sent_number = np.nan
        token_number = np.nan
        # The span's `explain` attribute is used as the document id.
        docinfo = urlopen(Request(url=explain_url + str(page_nbr) +
                                  docinfo_url + tag['explain'])).read()
        docinfo_parsed = BeautifulSoup(docinfo, 'html.parser')
        table = docinfo_parsed.find('table')
        rows = table.find_all('tr')
        for row in rows:
            tds = row.find_all('td')
            if len(tds) < 2:
                # Not a key/value row, nothing to read from it.
                continue
            key, value = tds[0].get_text(), tds[1].get_text()
            if key == 'Автор':
                # Drop numeric markers such as "(1)" before splitting the
                # "original (translated)" pair.
                value = re.sub(r'\(\d+\)', '', value)
                match = re.search(r'(.*)\((.*)\)', value)
                if match is None:
                    # No parenthesised part: only the original names exist.
                    authors_ = value.split(',')
                    authors_translated = []
                else:
                    authors_ = match.group(1).split(',')
                    authors_translated = match.group(2).split(',')
                author = authors_[0].strip()
                if authors_translated:
                    author_translated = authors_translated[0].strip()
                if len(authors_) > 1:
                    second_author = authors_[1].strip()
                if len(authors_translated) > 1:
                    second_author_translated = authors_translated[1].strip()
            elif key == 'Название':
                # Turn "(1)".."(3)" into plain digits so they are not
                # mistaken for the parenthesised translated title.
                value = value.replace('(1)', '1')
                value = value.replace('(2)', '2')
                value = value.replace('(3)', '3')
                match = re.search(r'(.*)\((.*)\)', value)
                if match is None:
                    # No parenthesised part: keep the whole value as the
                    # original title and leave the translation missing.
                    text_original = value.strip()
                else:
                    text_original = match.group(1).strip()
                    text_translated = match.group(2).strip()
            elif key == 'Дата создания':
                # The substitutions are order-dependent; keep this order.
                value = re.sub(r'\(\d*\)', '', value)
                value = re.sub(r' \| \d*', '', value)
                value = re.sub(r'\.\d*', '', value)
                value = re.sub(r'\d*-', '', value)
                creation_date = int(value)
            elif key == 'Сфера функционирования':
                sphere = value
            elif key == 'Предложений':
                sent_number = int(value)
            elif key == 'Словоформ':
                token_number = int(value)
            elif key == 'Язык':
                lang_from = value
            elif key == 'Переводчик':
                value = re.sub(r'\(\d+\)', '', value)
                translators_ = value.split(',')
                translator = translators_[0].strip()
                if len(translators_) > 1:
                    second_translator = translators_[1].strip()
            elif key == 'Язык перевода':
                lang_to = value
            elif key == 'Год перевода':
                # Same substitutions as for the creation date, but with the
                # last two applied in the opposite order (as in the original).
                value = re.sub(r'\(\d*\)', '', value)
                value = re.sub(r' \| \d*', '', value)
                value = re.sub(r'\d*-', '', value)
                value = re.sub(r'\.\d*', '', value)
                translation_date = int(value)
        # NaN never compares equal to anything, itself included, so
        # `x == np.nan` is always False; pd.isna performs the real check.
        if pd.isna(lang_from) and pd.isna(lang_to):
            continue
        authors.append(author)
        second_authors.append(second_author)
        if lang_from == 'rus':
            lang = lang_to
            authors_rus.append(author)
            second_authors_rus.append(second_author)
        elif lang_to == 'rus':
            lang = lang_from
            authors_rus.append(author_translated)
            second_authors_rus.append(second_author_translated)
        else:
            # Neither side is Russian: still append placeholders so the
            # *_rus lists stay index-aligned with the other lists.
            authors_rus.append(np.nan)
            second_authors_rus.append(np.nan)
        langs.append(lang)
        spheres.append(sphere)
        texts_original.append(text_original)
        texts_translated.append(text_translated)
        translators.append(translator)
        second_translators.append(second_translator)
        creation_dates.append(creation_date)
        translation_dates.append(translation_date)
        sent_numbers.append(sent_number)
        token_numbers.append(token_number)
        count += 1
        print(f'\n\n  {count} texts parsed.')
        # Pause between document requests.
        time.sleep(2)

clear_output()

In [0]:
# Write every accumulator list to its own '<name>.pkl' file.
to_dump = {
    'texts_original': texts_original,
    'texts_translated': texts_translated,
    'authors': authors,
    'authors_rus': authors_rus,
    'second_authors': second_authors,
    'second_authors_rus': second_authors_rus,
    'translators': translators,
    'second_translators': second_translators,
    'langs': langs,
    'spheres': spheres,
    'creation_dates': creation_dates,
    'translation_dates': translation_dates,
    'sent_numbers': sent_numbers,
    'token_numbers': token_numbers,
}

for stem, payload in to_dump.items():
    with open(stem + '.pkl', 'wb') as f:
        pickle.dump(payload, f)

In [0]:
# Bundle all pickle files into one gzip-compressed tar archive.
!tar -cvzf db_corpora_data.tar.gz *.pkl
clear_output()

In [0]:
# Download the archive through google.colab's `files` helper.
files.download('db_corpora_data.tar.gz')

## **Code that loads the pkl files produced above**

In [0]:
# Fetch the archive from the project's GitHub repository.
!wget https://github.com/vyhuholl/parallel_corpora_database/raw/master/data/db_corpora_data.tar.gz
clear_output()

In [6]:
# Unpack the archive; tar lists the extracted file names.
!tar xvzf db_corpora_data.tar.gz

authors.pkl
authors_rus.pkl
creation_dates.pkl
langs.pkl
second_authors.pkl
second_authors_rus.pkl
second_translators.pkl
sent_numbers.pkl
spheres.pkl
texts_original.pkl
texts_translated.pkl
token_numbers.pkl
translation_dates.pkl
translators.pkl


In [0]:
# Delete the archive now that its contents have been extracted.
!rm db_corpora_data.tar.gz

In [0]:
def _load_pickle(stem):
    """Return the object stored in the file '<stem>.pkl'."""
    # NOTE(review): unpickling can execute arbitrary code, and these files
    # come from a downloaded archive; only load archives you trust.
    with open(stem + '.pkl', 'rb') as f:
        return pickle.load(f)


texts_original = _load_pickle('texts_original')
texts_translated = _load_pickle('texts_translated')
authors = _load_pickle('authors')
authors_rus = _load_pickle('authors_rus')
second_authors = _load_pickle('second_authors')
second_authors_rus = _load_pickle('second_authors_rus')
translators = _load_pickle('translators')
second_translators = _load_pickle('second_translators')
langs = _load_pickle('langs')
spheres = _load_pickle('spheres')
creation_dates = _load_pickle('creation_dates')
translation_dates = _load_pickle('translation_dates')
# The two count lists are deliberately bound to plural names, which is
# what the rest of the notebook uses.
sents_numbers = _load_pickle('sent_numbers')
tokens_numbers = _load_pickle('token_numbers')

## **Data cleaning**

In [0]:
# Clean the original-language author names in place.
# Names are NFKD-normalised first, so any literal fragment containing an
# accented letter must be normalised the same way or it can never match.
for i, name in enumerate(authors):
    if not isinstance(name, str):
        # Missing value (NaN): nothing to clean.
        continue
    name = normalize('NFKD', name)
    # Strip one kind of edge character at a time; the order matters.
    for edge in ('(', ')', '[', ']', '.'):
        name = name.strip(edge)
    name = name.replace(normalize('NFKD', ' (Die Brüder Karamasow'), '')
    name = name.replace('_', ' ')
    name = name.replace(' и др', '')
    name = name.replace('\ufeff', '')
    name = name.replace('(BZgA', '(BZgA)')
    name = name.replace(' (Г. Е. Аронов', '')
    name = name.replace(' (Anna Karenina', '')
    name = name.replace(
        normalize('NFKD', ' (Carlos Salvador) (TURESPAÑA'), '')
    # Insert a space after initials ("А.Б." -> "А. Б.").  Applied twice
    # because consecutive initials overlap and one pass misses every other.
    name = re.sub(r'([А-ЯA-ZЁ]\.)([А-ЯA-ZЁ])', r'\1 \2', name)
    name = re.sub(r'([А-ЯA-ZЁ]\.)([А-ЯA-ZЁ])', r'\1 \2', name)
    authors[i] = name

In [0]:
# Manual corrections for original-language author names (old -> new).
# The names being looked up were NFKD-normalised by the cleaning loop, so
# keys and string values are normalised here too; otherwise entries with
# precomposed letters such as 'й' would never match or merge.
to_replace = {
    normalize('NFKD', key):
        normalize('NFKD', value) if isinstance(value, str) else value
    for key, value in {
        '': np.nan,
        'EDIТORIAL FISA ESCUDO DE ORO S. A': 'EDITORIAL FISA ESCUDO DE ORO',
        'Vladimir Nabokov': 'Владимир Набоков',
        'Іван Котлярівський': 'Іван Котляревський',
        'Алесь Адамович': 'Алесь Адамовіч',
        'Борис Гринченко': 'Борис Грінченко',
        'В. В. Набоков': 'Владимир Набоков',
        'Григорий Квитка-Основьяненко': "Григорій Квітка-Основ'яненко",
        'И. А. Гончаров': 'Иван Гончаров',
        'М. Горький': 'Максим Горький',
        'М. А. Булгаков': 'Михаил Булгаков',
        'М. М. Бахтин': 'Михаил Бахтин',
        'Н. В. Гоголь': 'Николай Гоголь',
        'Народная казка': 'Народна казка',
        'С. А. Алексиевич': 'Светлана Алексиевич',
        'С. В. Михалков': 'Сергей Михалков',
        'С. Д. Довлатов': 'Сергей Довлатов',
        'С. Козлов': 'С. Г. Козлов',
        'Т. Г. Шевченко': 'Тарас Шевченко',
        'Ю. Нікітін': 'Ю. Никитин'
    }.items()
}

In [0]:
# Swap in the corrected spelling where one exists; keep the name otherwise.
authors = [to_replace.get(elem, elem) for elem in authors]

In [0]:
# Clean the Russian author names in place.
# Names are NFKD-normalised first, so any literal fragment containing an
# accented letter must be normalised the same way or it can never match.
for i, name in enumerate(authors_rus):
    if not isinstance(name, str):
        # Missing value (NaN): nothing to clean.
        continue
    name = normalize('NFKD', name)
    # Strip one kind of edge character at a time; the order matters.
    for edge in (')', '[', ']', '.'):
        name = name.strip(edge)
    # Insert a space after initials ("А.Б." -> "А. Б.").  Applied twice
    # because consecutive initials overlap and one pass misses every other.
    name = re.sub(r'([А-ЯA-ZЁ]\.)([А-ЯA-ZЁ])', r'\1 \2', name)
    name = re.sub(r'([А-ЯA-ZЁ]\.)([А-ЯA-ZЁ])', r'\1 \2', name)
    name = name.replace(normalize('NFKD', ' (Die Brüder Karamasow'), '')
    name = name.replace(' и др', '')
    name = name.replace(' (Г. Е. Аронов', '')
    name = name.replace(' (Anna Karenina', '')
    name = name.replace(
        normalize('NFKD', ' (Carlos Salvador) (TURESPAÑA'), '')
    authors_rus[i] = name

In [0]:
# Manual corrections for Russian author names (old -> new).
# The names being looked up were NFKD-normalised by the cleaning loop, so
# keys and string values are normalised here too; otherwise entries with
# precomposed letters such as 'й' would never match or merge.
to_replace = {
    normalize('NFKD', key):
        normalize('NFKD', value) if isinstance(value, str) else value
    for key, value in {
        '': np.nan,
        'EDITORIAL FISA ESCUDO DE ORO': 'Издательство "EDITORIAL FISA ESCUDO DE ORO S. A."',
        'Издательство "EDIТORIAL FISA ESCUDO DE ORO S. A."': 'Издательство "EDITORIAL FISA ESCUDO DE ORO S. A."',
        'В. В. Набоков': 'Владимир Набоков',
        'И. А. Бунин': 'Иван Бунин',
        'И. А. Гончаров': 'Иван Гончаров',
        'М. А. Булгаков': 'Михаил Булгаков',
        'М. М. Бахтин': 'Михаил Бахтин',
        'М. Горький': 'Максим Горький',
        'Микола Хвылевой': 'Микола Хвылевый',
        'Н. В. Гоголь': 'Николай Гоголь',
        'Народна сказка': 'Народная сказка',
        'С. А. Алексиевич': 'Светлана Алексиевич',
        'С. В. Михалков': 'Сергей Михалков',
        'С. Д. Довлатов': 'Сергей Довлатов',
        'С. Козлов': 'С. Г. Козлов',
        'Т. Г. Шевченко': 'Тарас Шевченко',
        'Ю. Нікітін': 'Ю. Никитин'
    }.items()
}

In [0]:
# Swap in the corrected spelling where one exists; keep the name otherwise.
authors_rus = [to_replace.get(elem, elem) for elem in authors_rus]

In [0]:
# Clean the original-language second-author names in place.
# Names are NFKD-normalised first, so the literal fragment removed below
# (it contains 'й') is normalised the same way; otherwise it never matches.
for i, name in enumerate(second_authors):
    if not isinstance(name, str):
        # Missing value (NaN): nothing to clean.
        continue
    name = normalize('NFKD', name)
    # Strip one kind of edge character at a time; the order matters.
    for edge in ('(', ')', '[', ']', '.'):
        name = name.strip(edge)
    name = name.replace(' и др', '')
    name = name.replace(normalize(
        'NFKD', ' [перевод с английского повести Э. Хемингуэя'), '')
    second_authors[i] = name

In [0]:
# Manual corrections for original-language second-author names (old -> new).
to_replace = dict([
    ('', np.nan),
    ('пересказ повести А. А. Милна', 'A. A. Milne'),
    ('пересказ повести П. Трэверс', 'P. L. Travers'),
    ('Том Доннелли', 'Tom Donnelly'),
    ('Эрнст Вельтеке', 'Ernst Welteke'),
])

In [0]:
# Swap in the corrected spelling where one exists; keep the name otherwise.
second_authors = [to_replace.get(elem, elem) for elem in second_authors]

In [0]:
# Clean the Russian second-author names in place.
# Names are NFKD-normalised first, so the literal fragment removed below
# (it contains 'й') is normalised the same way; otherwise it never matches.
for i, name in enumerate(second_authors_rus):
    if not isinstance(name, str):
        # Missing value (NaN): nothing to clean.
        continue
    name = normalize('NFKD', name)
    # Strip one kind of edge character at a time; the order matters.
    for edge in ('(', ')', '[', ']', '.'):
        name = name.strip(edge)
    name = name.replace(' и др', '')
    name = name.replace(normalize(
        'NFKD', ' [перевод с английского повести Э. Хемингуэя'), '')
    second_authors_rus[i] = name

In [0]:
# Manual corrections for Russian second-author names (old -> new).
to_replace = dict([
    ('', np.nan),
    ('пересказ повести А. А. Милна', 'А. А. Милн'),
    ('пересказ повести П. Трэверс', 'П. Трэверс'),
])

In [0]:
# Swap in the corrected spelling where one exists; keep the name otherwise.
second_authors_rus = [to_replace.get(elem, elem)
                      for elem in second_authors_rus]

In [0]:
# Clean the translator names in place.
for i, raw in enumerate(translators):
    if type(raw) is float:
        # Missing value (NaN): nothing to clean.
        continue
    name = normalize('NFKD', raw)
    # Keep only the part before a bare '/', unless the slash is a spaced
    # separator (' / ') or the value is a URL.
    if ' / ' not in name and not name.startswith('http'):
        name = name.split('/')[0]
    # Any value mentioning inosmi.ru collapses to the bare site name.
    if 'inosmi.ru' in name:
        name = 'inosmi.ru'
    name = name.replace('_', ' ').replace(' (?)', '')
    # Insert a space after initials; two passes because consecutive
    # initials overlap and one pass misses every other one.
    for _ in range(2):
        name = re.sub(r'([А-ЯA-ZЁ]\.)([А-ЯA-ZЁ])', r'\1 \2', name)
    name = name.replace(' (Современник', '')
    translators[i] = name.strip()

In [0]:
# Manual corrections for translator names (old -> new, NaN = drop).
# The names being looked up were NFKD-normalised by the cleaning loop, so
# keys and string values are normalised here too; otherwise entries with
# precomposed letters such as 'й' or 'ё' would never match or merge.
to_replace = {
    normalize('NFKD', key):
        normalize('NFKD', value) if isinstance(value, str) else value
    for key, value in {
        '': np.nan, '/': np.nan,
        'H. Волжина': 'Н. Волжина',
        'Inosmi.ru': 'inosmi.ru',
        'http://www.happydoctor.ru/': 'www.happydoctor.ru',
        'Vladimir Nabokov': 'Владимир Набоков',
        'А. Жук': 'Алесь Жук',
        'В. Набоков': 'Владимир Набоков',
        'В. Щедрина': 'Валентина Щедрина',
        'ГАИ усилит контроль за соблюдением ПДД на дорогах Беларуси': np.nan,
        'Г. Киселев': 'Геннадий Киселев',
        'Г. Петников': 'Григорий Петников',
        'Е. Костюкович': 'Елена Костюкович',
        'Ж. Эзит': 'Жанна Эзит',
        'И. Карабутенко': 'Иван Карабутенко',
        'ИноСМИ': 'inosmi.ru',
        'И. Шкаровская': 'Ирина Шкаровская',
        'Кирила Георгиева': 'Кирил Георгиев',
        'Л. Блюмфильд': 'Л. Блюмфельд',
        'Л. Салавей': 'Леў Салавей',
        'Лев Соловей': 'Леў Салавей',
        'Леся Украинка': 'Леся Українка',
        'Л. Кремнева': 'Людмила Кремнева',
        'М. Горбачев': 'М. Горбачёв',
        'М. Пригара': 'Мария Пригара',
        'Н. Вольпин': 'Н. Д. Вольпин',
        'Н. Чуковский': 'Н. К. Чуковский',
        'Народная сказка': np.nan,
        'Р. Райт-Ковалева': 'Р. Райт-Ковалёва',
        'С 25 марта "Белавиа" переходит на летнее расписание рейсов': np.nan,
        'С. Апт': 'С. К. Апт',
        normalize('NFKD', 'Трамвай Вар’ят'): np.nan,
        'Христо Радески': 'Христо Радевски',
        'Ю. Абызов': 'Юрий Абызов',
        'Ю. Каппе': 'Ю. С. Каппе',
        'Ю. Яхнина': 'Ю. Я. Яхнина',
        'Եսայան Դ. / Esayan D.': 'Եսայան Դորա / Esayan Dora'
    }.items()
}

In [0]:
# Swap in the corrected spelling where one exists; keep the name otherwise.
translators = [to_replace.get(elem, elem) for elem in translators]

In [0]:
# Clean the second-translator names in place.
for i, raw in enumerate(second_translators):
    if type(raw) is float:
        # Missing value (NaN): nothing to clean.
        continue
    name = normalize('NFKD', raw).replace('1838 г.)', '')
    # Insert a space after initials; two passes because consecutive
    # initials overlap and one pass misses every other one.
    for _ in range(2):
        name = re.sub(r'([А-ЯA-ZЁ]\.)([А-ЯA-ZЁ])', r'\1 \2', name)
    second_translators[i] = name

In [0]:
# Manual corrections for second-translator names (old -> new).
# The names in the data were NFKD-normalised by the cleaning loop, so keys
# and string values are normalised here too; otherwise a corrected name
# with a precomposed 'й' or 'ё' would differ from the same name in the data.
to_replace = {
    normalize('NFKD', key):
        normalize('NFKD', value) if isinstance(value, str) else value
    for key, value in {
        '': np.nan,
        'Michel-R. de Hoffmann': 'Michel-Rostislav Hofmann',
        'Vladimir Nabokov': 'Владимир Набоков',
        'К. Старосельская': 'К. Я. Старосельская',
        'М. Горбачев': 'М. Горбачёв',
        'Н. Тренева': 'Н. К. Тренева',
        'Ю. Абызов': 'Юрий Абызов',
        'Ю. Каппе': 'Ю. С. Каппе',
        'Ю. Яхнина': 'Ю. Я. Яхнина',
    }.items()
}

In [0]:
# Swap in the corrected spelling where one exists; keep the name otherwise.
second_translators = [to_replace.get(elem, elem)
                      for elem in second_translators]

In [0]:
# Convert the cleaned lists to NumPy arrays.
# Titles are NFKD-normalised; a missing title (NaN float) is passed through
# untouched, because unicodedata.normalize raises TypeError on a non-string.
# It then becomes the string 'nan', exactly like the name arrays below.
texts_original = np.array(
    [normalize('NFKD', elem) if isinstance(elem, str) else elem
     for elem in texts_original]).astype(str)
texts_translated = np.array(
    [normalize('NFKD', elem) if isinstance(elem, str) else elem
     for elem in texts_translated]).astype(str)

# astype(str) turns missing names into the literal string 'nan'.
authors = np.array(authors).astype(str)
authors_rus = np.array(authors_rus).astype(str)
second_authors = np.array(second_authors).astype(str)
second_authors_rus = np.array(second_authors_rus).astype(str)
translators = np.array(translators).astype(str)
second_translators = np.array(second_translators).astype(str)

langs = np.array(langs)
spheres = np.array(spheres)

# Missing years are encoded as -1 so the arrays can hold integers.
creation_dates = np.array(np.nan_to_num(
    creation_dates, nan=-1)).astype(int)
translation_dates = np.array(np.nan_to_num(
    translation_dates, nan=-1)).astype(int)

sents_numbers = np.array(sents_numbers)
tokens_numbers = np.array(tokens_numbers)

## **Building tables**

In [0]:
# Language codes of the corpora; position + 1 serves as the corpus id.
lang_names = np.array(
    'eng arm bash bel bul bua esp ita zho lav lit ger '
    'pol ukr fra fin cze sve est'.split()
)

In [0]:
# Map each language code to a 1-based corpus id; 0 means "unknown".
lang_to_id = {code: idx for idx, code in enumerate(lang_names, start=1)}
lang_to_id[np.nan] = 0
# np.array() over a list mixing strings and NaN yields a string array, so
# a missing language reaches the lookup as the string 'nan', not as NaN.
lang_to_id['nan'] = 0

In [0]:
# Sorted unique names over first and second authors, without the 'nan'
# placeholder that stands for a missing author.
all_author_names = np.union1d(authors, second_authors)
author_names = all_author_names[all_author_names != 'nan']

In [0]:
# Map each author name to a 1-based id; 0 stands for a missing author.
author_to_id = {name: idx for idx, name in enumerate(author_names, start=1)}
author_to_id['nan'] = 0

In [0]:
# Sorted unique names over first and second translators, without the 'nan'
# placeholder that stands for a missing translator.
all_translator_names = np.union1d(translators, second_translators)
translator_names = all_translator_names[all_translator_names != 'nan']

In [0]:
# Map each translator name to a 1-based id; 0 stands for a missing one.
translator_to_id = {name: idx
                    for idx, name in enumerate(translator_names, start=1)}
translator_to_id['nan'] = 0

In [0]:
# Column skeletons of the four output tables.  Columns already known are
# filled in here; the empty lists are placeholders assigned further down.
corp_data = dict(
    lang=lang_names,
    texts_number=[],
    sents_number=[],
    tokens_number=[],
    tokens_percent=[],
)

text_data = dict(
    corpus_id=[],
    author_id=[],
    second_author_id=[],
    translator_id=[],
    second_translator_id=[],
    name_original=texts_original,
    name_translated=texts_translated,
    sphere=spheres,
    creation_date=creation_dates,
    translation_date=translation_dates,
    sents_number=sents_numbers,
    tokens_number=tokens_numbers,
)

auth_data = dict(
    name=author_names,
    name_rus=[],
    texts_number=[],
    texts_percent=[],
)

tran_data = dict(
    name=translator_names,
    texts_number=[],
    texts_percent=[],
)

### **Texts table**

In [0]:
def _lookup_ids(mapping, keys):
    """Return an array holding the id of every key, in the same order."""
    return np.array([mapping[key] for key in keys])


# Replace languages and names with their integer ids.
text_data['corpus_id'] = _lookup_ids(lang_to_id, langs)

text_data['author_id'] = _lookup_ids(author_to_id, authors)
text_data['second_author_id'] = _lookup_ids(author_to_id, second_authors)

text_data['translator_id'] = _lookup_ids(translator_to_id, translators)
text_data['second_translator_id'] = _lookup_ids(translator_to_id,
                                                second_translators)

In [0]:
# Build the texts table with ids starting at 1.
texts_df = pd.DataFrame(text_data)
texts_df.index = np.arange(len(texts_df)) + 1

In [37]:
# Preview the first rows of the texts table.
texts_df.head()

Unnamed: 0,corpus_id,author_id,second_author_id,translator_id,second_translator_id,name_original,name_translated,sphere,creation_date,translation_date,sents_number,tokens_number
1,1,3,0,196,0,Preventing the Balkanization of the Internet [...,Как предотвратить балканизацию интернета,"нехудожественная, публицистика",2018,2018,72,1720
2,1,4,0,196,0,The reality behind Putin’s fire and fury [The ...,Что кроется за путинским гневом и пламенем,"нехудожественная, публицистика",2018,2018,66,1416
3,1,17,0,196,0,Chechnya’s leader uses World Cup to extend his...,Руководитель Чечни воспользовался ЧМ-2018 для ...,"нехудожественная, публицистика",2018,2018,86,1906
4,1,21,0,196,0,Israeli intervention in US elections ‘vastly o...,Вмешательство Израиля в американские выборы за...,"нехудожественная, публицистика",2018,2018,46,1210
5,1,23,0,196,0,Rome and Moscow Fight for Control of Eastern O...,Рим и Москва борются за контроль над православ...,"нехудожественная, публицистика",2018,2018,74,1532


In [0]:
# Write the texts table to CSV.
texts_df.to_csv('texts.csv')

In [0]:
# Download the CSV through google.colab's `files` helper.
files.download('texts.csv')

### **Corpora table**

In [0]:
# Per-corpus statistics, one entry per language in `lang_names`.
total = np.sum(texts_df['tokens_number'])

# Reindex on the full id range so every array lines up with `lang_names`,
# even if a language has no texts or some texts carry the "unknown" id 0.
corpus_ids = np.arange(1, len(lang_names) + 1)

# Sum only the two numeric columns that are needed, and only once;
# summing every column would also aggregate the title and sphere strings.
per_corpus = (
    texts_df.groupby('corpus_id')[['sents_number', 'tokens_number']]
    .sum()
    .reindex(corpus_ids, fill_value=0)
)

corp_data['texts_number'] = np.array(
    texts_df['corpus_id'].value_counts().reindex(corpus_ids, fill_value=0))

corp_data['sents_number'] = np.array(per_corpus['sents_number'])

corp_data['tokens_number'] = np.array(per_corpus['tokens_number'])

# Share of all tokens (texts with an unknown language included in `total`).
corp_data['tokens_percent'] = np.array(per_corpus['tokens_number'] / total)

In [0]:
# Build the corpora table with ids starting at 1.
corpora_df = pd.DataFrame(corp_data)
corpora_df.index = np.arange(len(corpora_df)) + 1

In [42]:
# Preview the first rows of the corpora table.
corpora_df.head()

Unnamed: 0,lang,texts_number,sents_number,tokens_number,tokens_percent
1,eng,677,1852353,28363771,0.289153
2,arm,28,188336,2211635,0.022546
3,bash,124,124246,550286,0.00561
4,bel,286,1108749,10855454,0.110666
5,bul,42,306763,3774650,0.038481


In [0]:
# Write the corpora table to CSV.
corpora_df.to_csv('corpora.csv')

In [0]:
# Download the CSV through google.colab's `files` helper.
files.download('corpora.csv')

### **Authors table**

In [0]:
# Stack first and second authors so that position i of both lists refers
# to the same (original name, Russian name) pair.
authors_all = [*authors, *second_authors]
authors_all_rus = [*authors_rus, *second_authors_rus]

# Positions at which both the original and the Russian name are present.
keep = [i for i in range(len(authors_all))
        if authors_all[i] != 'nan' and authors_all_rus[i] != 'nan']

clean = [authors_all[i] for i in keep]

clean_rus = [authors_all_rus[i] for i in keep]

In [0]:
# Russian name of every author, or 'nan' when none is known.
# A dict lookup replaces the per-author list scan (`in` plus `.index`).
# The pairs are fed in reverse so that the FIRST occurrence of a name
# wins, exactly as `list.index` did.
rus_by_name = dict(zip(clean[::-1], clean_rus[::-1]))

auth_data['name_rus'] = np.array([rus_by_name.get(name, np.nan)
                                  for name in author_names]).astype(str)

In [0]:
# Number of texts per author, counting first- and second-author roles.
value_counts_1 = texts_df['author_id'].value_counts()
value_counts_2 = texts_df['second_author_id'].value_counts()

# Derive the id range from the author list instead of a hard-coded bound,
# so the counts always have one entry per author.  Id 0 (missing author)
# is left out on purpose.
author_ids = np.arange(1, len(author_names) + 1)

id_counts_1 = value_counts_1.reindex(author_ids, fill_value=0).tolist()
id_counts_2 = value_counts_2.reindex(author_ids, fill_value=0).tolist()

id_counts_total = [x + y for x, y in zip(id_counts_1, id_counts_2)]

In [0]:
# Store the per-author text counts as an array.
auth_data['texts_number'] = np.asarray(id_counts_total)

In [0]:
# Each author's share of all author credits.
total = auth_data['texts_number'].sum()

auth_data['texts_percent'] = auth_data['texts_number'] / total

In [0]:
# Build the authors table with ids starting at 1.
authors_df = pd.DataFrame(auth_data)
authors_df.index = np.arange(len(authors_df)) + 1

In [53]:
# Preview the first rows of the authors table.
authors_df.head()

Unnamed: 0,name,name_rus,texts_number,texts_percent
1,4th Earl of Chesterfield,,1,0.00038
2,A. A. Milne,Алан Александр Милн,3,0.001141
3,A. Michael Spence,,1,0.00038
4,AUREL BRAUN,,1,0.00038
5,Adalbert Stifter,Адальберт Штифтер,7,0.002663


In [0]:
# Write the authors table to CSV.
authors_df.to_csv('authors.csv')

In [0]:
# Download the CSV through google.colab's `files` helper.
files.download('authors.csv')

### **Translators table**

In [0]:
# Number of texts per translator, counting first and second roles.
value_counts_1 = texts_df['translator_id'].value_counts()
value_counts_2 = texts_df['second_translator_id'].value_counts()

# Derive the id range from the translator list instead of a hard-coded
# bound, so the counts always have one entry per translator.  Id 0
# (missing translator) is left out on purpose.
translator_ids = np.arange(1, len(translator_names) + 1)

id_counts_1 = value_counts_1.reindex(translator_ids, fill_value=0).tolist()
id_counts_2 = value_counts_2.reindex(translator_ids, fill_value=0).tolist()

id_counts_total = [x + y for x, y in zip(id_counts_1, id_counts_2)]

In [0]:
# Store the per-translator text counts as an array.
tran_data['texts_number'] = np.asarray(id_counts_total)

In [0]:
# Each translator's share of all translator credits.
total = tran_data['texts_number'].sum()

tran_data['texts_percent'] = tran_data['texts_number'] / total

In [0]:
# Build the translators table with ids starting at 1.
translators_df = pd.DataFrame(tran_data)
translators_df.index = np.arange(len(translators_df)) + 1

In [66]:
# Preview the first rows of the translators table.
translators_df.head()

Unnamed: 0,name,texts_number,texts_percent
1,Adrian Esa,1,0.000385
2,Adriano Dell'Asta,1,0.000385
3,Aina Rudzroga,2,0.00077
4,Alena Morávková,2,0.00077
5,Alessandro Lazzari,2,0.00077


In [0]:
# Write the translators table to CSV.
translators_df.to_csv('translators.csv')

In [0]:
# Download the CSV through google.colab's `files` helper.
files.download('translators.csv')