In [1]:
import sys

# Make the parent directory and its `src` folder importable from this notebook.
sys.path.extend(['..', '../src'])

Here we preprocess the Classics and News corpora:
* split each one into individual sentences;
* tokenize each one and replace named entities with special tokens;
* measure how much the two vocabularies diverge (I really hope that they are almost the same);
* learn joint BPEs;
* apply the learnt BPEs.

In [6]:
!cat ../data/classics/*.txt >> ../data/generated/classics.txt

In [2]:
def read_lines(path):
    """Return the lines of the UTF-8 text file at `path`, without line terminators."""
    # The context manager closes the file handle even if reading fails.
    with open(path, encoding='utf-8') as in_f:
        return in_f.read().splitlines()


# Raw corpora, one list entry per line of the input file.
classics = read_lines('../data/generated/classics.txt')
news = read_lines('../data/news.2016.ru.shuffled')

In [3]:
import multiprocessing as mp

import nltk
from nltk import sent_tokenize

# Number of worker processes used for sentence splitting.
num_threads = 20

# Fetch the Punkt models that sent_tokenize relies on.
nltk.download('punkt')

# Split every input line into sentences, in parallel.
# The author found no Russian Punkt model in NLTK, so sent_tokenize is called
# with its default (English) model in the hope that it is good enough.
# NOTE(review): newer Punkt releases may ship a Russian model; if so,
# language='russian' would be worth trying - confirm against the installed data.
workers = mp.Pool(processes=num_threads)
with workers:
    classics_sents = workers.map(sent_tokenize, classics)
    news_sents = workers.map(sent_tokenize, news)

[nltk_data] Downloading package punkt to /home/universome/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
# sent_tokenize yields one list of sentences per input line; flatten those
# lists so that every entry of `classics` / `news` is a single sentence.
def _flatten(groups):
    """Concatenate the list items of `groups`; non-list items are kept as-is."""
    flat = []
    for group in groups:
        if type(group) is list:
            flat.extend(group)
        else:
            flat.append(group)
    return flat


print('Num sentences before:', len(classics), len(news))
classics = _flatten(classics_sents)
news = _flatten(news_sents)
print('Num sentences after :', len(classics), len(news))

Num sentences before: 311458 7159447
Num sentences after : 709647 7476194


In [5]:
# Persist the sentence-split corpora, one sentence per line.
split_outputs = (
    ('../data/generated/classics.split', classics),
    ('../data/generated/news.ru.split', news),
)
for out_path, sentences in split_outputs:
    with open(out_path, 'w', encoding='utf-8') as out_f:
        out_f.writelines(sentence + '\n' for sentence in sentences)

Ok, let's tokenize our data and find named entities

In [7]:
# Sample Russian sentence, written with spaces around punctuation.
# NOTE(review): presumably kept for trying the tagging pipeline on a single
# input by hand - no other visible cell assigns or documents it.
sentence = "Чавушоглу сказал , что РФ и раньше отрицала случай нарушения воздушного пространства Турции , когда был сбит истребитель Су-24 , и добавил , но последнее нарушение является зафиксированным фактом ."

In [18]:
from nltk import word_tokenize, pos_tag, ne_chunk

# `chunks` was never defined at notebook level (it is only a local variable
# inside replace_nes), so this cell raised NameError on a fresh kernel.
# Rebuild it from `sentence` with the same pipeline replace_nes uses.
# Requires the punkt, averaged_perceptron_tagger_ru, maxent_ne_chunker and
# words NLTK resources to be present locally.
chunks = ne_chunk(pos_tag(word_tokenize(sentence), lang='rus'))

# Inspect a single element of the chunked sentence.
chunks[6]

('раньше', 'ADV')

In [8]:
from tqdm import tqdm
from nltk import word_tokenize, pos_tag, ne_chunk

# Download the NLTK resources used below: the Russian POS tagger model,
# the NE chunker model and the `words` corpus.
for resource in ('averaged_perceptron_tagger_ru', 'maxent_ne_chunker', 'words'):
    nltk.download(resource)

def replace_nes(sentence):
    """Tokenize `sentence` and replace each named-entity chunk with a placeholder.

    The sentence is word-tokenized, POS-tagged with the Russian tagger and
    passed through `ne_chunk`; every resulting chunk is mapped to a single
    token by `chunk_to_token`.

    Returns the resulting tokens joined with single spaces.
    """
    tagged = pos_tag(word_tokenize(sentence), lang='rus')
    return ' '.join(map(chunk_to_token, ne_chunk(tagged)))


def chunk_to_token(chunk):
    """Map one element of an `ne_chunk` result to a single output token.

    Elements exposing a `label` attribute are treated as named entities and
    become '__NE_<label>__'. Any other element is indexed at position 0,
    which for a (word, tag) pair is the word itself.
    """
    # Checking for a `label` attribute is the only way the author found to
    # recognise named-entity chunks in the nltk result.
    if not hasattr(chunk, 'label'):
        return chunk[0]

    entity_label = chunk.label()
    return '__NE_' + entity_label + '__'


# with mp.Pool(processes=num_threads) as pool:
#     # Can't use tqdm here because it crashes the notebook page
#     # classics = list(tqdm(pool.imap(replace_nes, classics), total=len(classics)))
#     # news = list(tqdm(pool.imap(replace_nes, news), total=len(news)))
#     classics = pool.map(replace_nes, classics)
#     news = pool.map(replace_nes, news)

[nltk_data] Downloading package averaged_perceptron_tagger_ru to
[nltk_data]     /home/universome/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_ru is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /home/universome/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /home/universome/nltk_data...
[nltk_data]   Package words is already up-to-date!


Let's save results

In [6]:
# Write the current contents of `classics` and `news`, one entry per line.
# NOTE(review): the pool.map(replace_nes, ...) calls in the cell above are
# commented out, so on a straight top-to-bottom run these lists presumably
# still hold the un-tokenized sentences - confirm before trusting the files.
ner_outputs = (
    ('../data/generated/classics.ner.tok', classics),
    ('../data/generated/news.ru.ner.tok', news),
)
for out_path, sentences in ner_outputs:
    with open(out_path, 'w', encoding='utf-8') as out_f:
        out_f.writelines(sentence + '\n' for sentence in sentences)

Let's now see how many tokens the two vocabularies share

In [7]:
from src.vocab import Vocab

# Reload the saved corpora from disk; the context managers make sure the
# file handles are closed once the contents have been read.
with open('../data/generated/news.ru.ner.tok', encoding='utf-8') as in_f:
    news = in_f.read().splitlines()
with open('../data/generated/classics.ner.tok', encoding='utf-8') as in_f:
    classics = in_f.read().splitlines()

# Build one Vocab per corpus. Separate names are used for the Vocab objects so
# that `vocab_news` / `vocab_classics` only ever refer to plain token sets.
news_vocab = Vocab.from_sequences(news)
classics_vocab = Vocab.from_sequences(classics)

vocab_news = set(news_vocab.token2id.keys())
vocab_classics = set(classics_vocab.token2id.keys())

print('Size of news vocabulary', len(vocab_news))
print('Size of classics vocabulary', len(vocab_classics))
print('How much tokens intersect?', len(vocab_news & vocab_classics))

Size of news vocabulary 1347682
Size of classics vocabulary 365282
How much tokens intersect? 187391


Now we can learn and apply BPEs.

In [8]:
%%bash

# Abort on the first failing command so that BPE codes are never applied
# after a failed learning step.
set -euo pipefail

subword_nmt="../ext-libs/subword-nmt"
data_dir="../data"
generated_data_dir="$data_dir/generated"
data_src="$generated_data_dir/news.ru.ner.tok"
data_trg="$generated_data_dir/classics.ner.tok"

# We purposely use such a low number of BPE merges
# so that our model behaves more like a char-rnn
num_bpes=1000

bpes="$generated_data_dir/news-classics.bpes"
# One vocabulary file is written per --input file, in the same order, so each
# name has to match its corpus: src = news, trg = classics. (These two names
# used to be swapped, which stored the news vocabulary in classics.vocab.)
vocab_src="$generated_data_dir/news.ru.vocab"
vocab_trg="$generated_data_dir/classics.vocab"

# Learn joint BPE codes on both corpora and write one vocabulary per corpus
python "$subword_nmt/learn_joint_bpe_and_vocab.py" --input "$data_src" "$data_trg" \
    -s "$num_bpes" -o "$bpes" --write-vocabulary "$vocab_src" "$vocab_trg"

# Apply the learnt BPE codes to both tokenized files
python "$subword_nmt/apply_bpe.py" -c "$bpes" < "$data_src" > "$data_src.bpe"
python "$subword_nmt/apply_bpe.py" -c "$bpes" < "$data_trg" > "$data_trg.bpe"

Process is terminated.
