In [25]:
import os
from collections import Counter

In [3]:
# Corpus location: both language files sit side by side in one folder.
corpus_folder_path = 'corpus/'
en_path, ru_path = (
    os.path.join(corpus_folder_path, file_name)
    for file_name in ('corpus.en_ru.1m.en', 'corpus.en_ru.1m.ru')
)

In [7]:
def read_corpus(corpus_path, encoding='utf-8'):
    '''Load one side of the En/Ru corpus from the given path.

    Args:
        corpus_path: Path to a plain-text file.
        encoding: Text encoding used to decode the file. Defaults to UTF-8
            so the result does not depend on the platform's locale encoding
            (presumably the corpus files are UTF-8 -- confirm if decoding
            errors appear).

    Returns:
        A list with one string per line of the file. Lines are returned
        exactly as `readlines()` yields them, i.e. each keeps its trailing
        newline.
    '''
    with open(corpus_path, "r", encoding=encoding) as f:
        return f.readlines()

# Read both files in order: English first, then Russian.
english, russian = map(read_corpus, (en_path, ru_path))

In [8]:
# Report how many lines were loaded for each language.
n_english, n_russian = len(english), len(russian)
print(f'Total number of sentences: EN: {n_english}, RU: {n_russian}')

Total number of sentences: EN: 1000000, RU: 1000000


In [22]:
# Show the entries at indices 3 and 4 from each list. The loaded lines keep
# their trailing newline, so the joined text contains blank lines.
example_rows = slice(3, 5)
example_separator = "\n  "
en_examples = example_separator.join(english[example_rows])
ru_examples = example_separator.join(russian[example_rows])
print(f'Examples:\nEN:\n  {en_examples}\nRU:\n  {ru_examples}')

Examples:
EN:
  Now you have Black Sabbath and Kiss tribute albums.

  I was the one who sat down and copied them.

RU:
  А сейчас куча триьютов тем же самым BLACK SABBATH и KISS.

  Я был единственным, кто занялся копированием демо на кассете.



In [34]:
def print_stats(corpus, language='English'):
    '''Print whitespace-token statistics for a corpus.

    Tokens are whatever `str.split()` yields for each sentence, so
    punctuation attached to a word is part of that token and no case
    folding is applied.

    Args:
        corpus: Sized collection of sentence strings (`len()` is called on
            it and it is iterated once).
        language: Label used in the printed header.

    Prints:
        Total token count, number of distinct tokens, average tokens per
        sentence (0.0 for an empty corpus) and the 10 most frequent tokens.
    '''
    # Single pass over the corpus: feed each sentence's tokens straight into
    # the counter instead of splitting every sentence several times and
    # materialising one huge list of all tokens.
    counter = Counter()
    for sentence in corpus:
        counter.update(sentence.split())

    # The counter already holds every token occurrence, so the total is the
    # sum of its counts.
    total_count = sum(counter.values())
    num_sentences = len(corpus)
    # Guard against division by zero when the corpus is empty.
    avg_sentence_length = total_count / num_sentences if num_sentences else 0.0

    print(f'Stats for {language} corpus')
    print(f'\tTotal number of tokens: {total_count}')
    print(f'\tNumber of unique tokens: {len(counter)}')
    print(f'\tAverage number of words per sentence: {avg_sentence_length}')
    print(f'\tMost frequent tokens: {[x[0] for x in counter.most_common(10)]}')

In [36]:
# Print the same statistics for each language, English first.
for lang_corpus, lang_label in ((english, 'English'), (russian, 'Russian')):
    print_stats(lang_corpus, language=lang_label)

Stats for English corpus
	Total number of tokens: 21252975
	Number of unique tokens: 796290
	Average number of words per sentence: 21.252975
	Most frequent tokens: ['the', 'of', 'and', 'to', 'in', 'a', 'is', 'for', 'that', 'with']
Stats for Russian corpus
	Total number of tokens: 18680351
	Number of unique tokens: 1323932
	Average number of words per sentence: 18.680351
	Most frequent tokens: ['и', 'в', 'на', 'с', 'не', 'что', '-', 'для', 'по', 'к']
