In [2]:
import os
from collections import Counter

In [3]:
# Folder holding the corpus files, and the per-language file paths inside it.
corpus_folder_path = 'corpus/'
en_path, ru_path = (
    os.path.join(corpus_folder_path, file_name)
    for file_name in ('news-commentary-v12.ru-en.en', 'news-commentary-v12.ru-en.ru')
)

In [4]:
def read_corpus(corpus_path):
    ''' Loads En/Ru corpus from given path.

    The file is decoded explicitly as UTF-8 so that the result does not depend
    on the platform's default locale encoding (which matters for the Cyrillic
    text of the Russian corpus).

    Args:
        corpus_path: path to a plain-text corpus file.

    Returns:
        A list with one string per line of the file; each string keeps its
        trailing line terminator, exactly as produced by ``readlines()``.

    Raises:
        OSError: if the file cannot be opened.
        UnicodeDecodeError: if the file content is not valid UTF-8.
    '''

    # NOTE(review): in the default universal-newlines mode a lone '\r' inside
    # the text also ends a line; if the line counts of the two corpus files
    # are expected to match, check the data for such characters.
    with open(corpus_path, "r", encoding="utf-8") as f:
        return f.readlines()

# Load both sides of the corpus, English first, then Russian.
english, russian = map(read_corpus, (en_path, ru_path))

In [5]:
# Report how many lines were read from each corpus file.
# NOTE(review): the captured output shows different counts for EN and RU; if the
# two files are meant to be line-aligned, this mismatch should be investigated.
print(f'Total number of sentences: EN: {len(english)}, RU: {len(russian)}')

Total number of sentences: EN: 222988, RU: 222732


In [6]:
# Take the lines at indices 3 and 4 from each corpus and join them so that the
# second sample is indented like the first when printed.
en_examples, ru_examples = (
    "\n  ".join(lines[3:5]) for lines in (english, russian)
)
print(f'Examples:\nEN:\n  {en_examples}\nRU:\n  {ru_examples}')

Examples:
EN:
  Wouldn’t you know it?

  Since their articles appeared, the price of gold has moved up still further.

RU:
  И что бы вы думали?

  С тех пор как вышли их статьи, стоимость золота повысилась еще больше.



In [7]:
def print_stats(corpus, language='English'):
    ''' Prints basic token statistics for a corpus.

    Tokens are obtained by whitespace splitting (``str.split()``) of every
    sentence. The corpus is tokenized once; the total count, vocabulary size
    and average sentence length are all derived from that single pass.

    Args:
        corpus: sized sequence of sentence strings (``len()`` is called on it).
        language: label used in the header line of the report.

    Prints:
        total number of tokens, number of unique tokens, average number of
        tokens per sentence (0.0 for an empty corpus) and the ten most
        frequent tokens.
    '''

    counter = Counter()
    for sentence in corpus:
        counter.update(sentence.split())

    # Every token was counted exactly once, so the sum of the counts equals
    # the total number of tokens in the corpus.
    total_count = sum(counter.values())

    # Guard against an empty corpus instead of raising ZeroDivisionError.
    sentence_count = len(corpus)
    avg_sentence_length = total_count / sentence_count if sentence_count else 0.0

    print(f'Stats for {language} corpus')
    print(f'\tTotal number of tokens: {total_count}')
    print(f'\tNumber of unique tokens: {len(counter)}')
    print(f'\tAverage number of words per sentence: {avg_sentence_length}')
    print(f'\tMost frequent tokens: {[x[0] for x in counter.most_common(10)]}')

In [8]:
# Print the same report for each side of the corpus, English first.
for corpus, language in ((english, 'English'), (russian, 'Russian')):
    print_stats(corpus, language=language)

Stats for English corpus
	Total number of tokens: 5066942
	Number of unique tokens: 182109
	Average number of words per sentence: 22.722935763359462
	Most frequent tokens: ['the', 'of', 'to', 'and', 'in', 'a', 'is', 'that', 'for', 'be']
Stats for Russian corpus
	Total number of tokens: 4759367
	Number of unique tokens: 313884
	Average number of words per sentence: 21.368133002891366
	Most frequent tokens: ['в', 'и', 'на', 'не', 'что', 'с', 'как', 'для', 'к', 'по']
