In [1]:
import joblib
import os
import itertools
import re
import unidecode
import string

import numpy as np
import pandas as pd

from zipfile import ZipFile
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

# Functions

In [2]:
def create_ngram_base(alphabet, n_order):
    assert type(alphabet) == list, 'Alphabet is not list.'
    assert len(set(alphabet)) == len(alphabet), 'Alphabet is not correct.'
    alphabet.sort()
    n_gram_base = set()
    for combination in itertools.combinations_with_replacement(alphabet, n_order):
        for permutation in itertools.permutations(combination, n_order):
            n_gram = ''
            for letter in permutation:
                n_gram += letter
            n_gram_base.add(n_gram)
    assert len(n_gram_base) == len(alphabet)**n_order, 'Incorrect result'
    print("Created", str(n_order)+"-gram base with", len(n_gram_base), str(n_order)+"-grams.")
    n_gram_base = list(n_gram_base)
    n_gram_base.sort()
    return n_gram_base

def alphabet():
    alphabet = list(string.ascii_lowercase)
    alphabet.append(' ')
    return alphabet

def process_sentenses(path_to_file):
    alph = alphabet()
    delimiters = "\t", "\n"
    regexPattern = '|'.join(map(re.escape, delimiters))
    regexPattern

    raw_lines = []
    with open(path_to_file, 'r', encoding="utf-8") as file:
        for line in file:
            raw_lines.append(line)
        
    lines_split = [re.split(regexPattern, line)[1].lower() for line in raw_lines]
    
    lines_final = []

    for i in tqdm(range(len(lines_split))):
        line = lines_split[i]
        line_ = [unidecode.unidecode(x) for x in line if x.isalpha() or x.isspace()]
        
        line_ = [x for x in ''.join(line_).split(' ') if x]
        new_line = []
        for word in line_:
            word = [x for x in word if x in alph]
            new_line.append(''.join(word).lower())
        lines_final.append(new_line)
    del delimiters, regexPattern, raw_lines, lines_split, line, line_
    return lines_final

def create_blocks(lines):    
    blocks = []
    current_block = ''
    
    for i in tqdm(range(len(lines))):
        line = ' '.join(lines[i])
        if len(current_block) < 1000:
            if len(current_block) > 0:
                if current_block[-1] != ' ': current_block += ' '
            current_block += line
        elif len(current_block) >= 1000:
            blocks.append(current_block[:1000])
            current_block = line
    del current_block, line
    return blocks

def base_26_latin_ngrams(n):
    alb = alphabet()
    base_ngrams = create_ngram_base(alb, n)
    del alb
    return base_ngrams

def create_rand_atom_vect(order=1000):
    atom = np.random.randint(-1, 1, order)
    atom = np.where(atom==0, 1, atom)
    atom.reshape(-1,1)
    return atom

def create_item_memory(order=1000, alphabet_size=27):
    item_memory = []
    for i in range(alphabet_size):
        item_memory.append(create_rand_atom_vect(order))
    item_memory = np.array(item_memory)
    return item_memory    

def create_fixed_permutations(order=1000, n=3):
    index = [i for i in range(order)]
    permutations = []
    for i in range(n):
        np.random.shuffle(index)
        permutations.append(index.copy())
    return permutations

def bind(hdv1, hdv2):
    assert type(hdv1) == np.ndarray and type(hdv2) == np.ndarray, 'Wrong HD vectors format'
    binding = hdv1*hdv2
    return binding
    
def permute(hdv, permutation, n_times_to_permute):
    assert type(hdv) == np.ndarray, 'Wrong HD vector format'
    assert type(permutation) == list, 'Wrong permutation format'
    assert len(hdv) == len(permutation), 'Inconsistent vector/permutation shape'
    for i in range(n_times_to_permute):
        hdv = hdv[permutation]
    return hdv
    
def bundle(hdv1, hdv2):
    assert type(hdv1) == np.ndarray and type(hdv2) == np.ndarray, 'Wrong HD vectors format'
    assert len(hdv1) == len(hdv2), 'Inconsistent vectors shape'
    bundling = hdv1 + hdv2
    return bundling

def measure_cosine_similarity(hdv1, hdv2):
    return(cosine_similarity([hdv1], [hdv2]))

def jobsave(object_to_save, filename):
    joblib.dump(object_to_save, filename)

def jobload(filename):
    return joblib.load(filename)

# Data

In [18]:
train_data_root = os.path.join('data', 'train')
data_lang_paths = os.listdir(train_data_root)

In [19]:
data_lang_paths

['bul_newscrawl_2017_1M',
 'ces_newscrawl_2019_1M',
 'dan_newscrawl_2019_1M',
 'deu_newscrawl-public_2018_1M',
 'ell_newscrawl_2017_1M',
 'eng_newscrawl-public_2018_1M',
 'est_newscrawl_2017_1M',
 'fin_newscrawl_2017_1M',
 'fra_newscrawl-public_2019_1M',
 'hun_newscrawl_2017_1M',
 'ita_newscrawl_2019_1M',
 'lav_newscrawl_2016_1M',
 'lit_newscrawl_2016_1M',
 'nld_newscrawl_2019_1M',
 'pol_newscrawl_2018_1M',
 'por_newscrawl_2018_1M',
 'ron_newscrawl_2015_1M',
 'slk_newscrawl_2016_1M',
 'slv_newscrawl_2016_1M',
 'spa_newscrawl_2015_1M',
 'swe_newscrawl_2018_1M']

In [20]:
lang_paths = defaultdict()
for path in data_lang_paths:
    lang_paths[path[:3]] = os.path.join(train_data_root, path, path+'-sentences.txt')

In [21]:
lang_paths

defaultdict(None,
            {'bul': 'data\\train\\bul_newscrawl_2017_1M\\bul_newscrawl_2017_1M-sentences.txt',
             'ces': 'data\\train\\ces_newscrawl_2019_1M\\ces_newscrawl_2019_1M-sentences.txt',
             'dan': 'data\\train\\dan_newscrawl_2019_1M\\dan_newscrawl_2019_1M-sentences.txt',
             'deu': 'data\\train\\deu_newscrawl-public_2018_1M\\deu_newscrawl-public_2018_1M-sentences.txt',
             'ell': 'data\\train\\ell_newscrawl_2017_1M\\ell_newscrawl_2017_1M-sentences.txt',
             'eng': 'data\\train\\eng_newscrawl-public_2018_1M\\eng_newscrawl-public_2018_1M-sentences.txt',
             'est': 'data\\train\\est_newscrawl_2017_1M\\est_newscrawl_2017_1M-sentences.txt',
             'fin': 'data\\train\\fin_newscrawl_2017_1M\\fin_newscrawl_2017_1M-sentences.txt',
             'fra': 'data\\train\\fra_newscrawl-public_2019_1M\\fra_newscrawl-public_2019_1M-sentences.txt',
             'hun': 'data\\train\\hun_newscrawl_2017_1M\\hun_newscrawl_2017_1M-senten

In [22]:
lines_raw = defaultdict()
for lang in lang_paths:
    lines_raw[lang] = process_sentenses(lang_paths[lang])

100%|██████████████████████████████████████████████████████████████████████| 1000000/1000000 [02:31<00:00, 6591.01it/s]
100%|█████████████████████████████████████████████████████████████████████| 1000000/1000000 [01:05<00:00, 15349.04it/s]
100%|█████████████████████████████████████████████████████████████████████| 1000000/1000000 [01:12<00:00, 13768.55it/s]
100%|█████████████████████████████████████████████████████████████████████| 1000000/1000000 [01:13<00:00, 13623.23it/s]
100%|██████████████████████████████████████████████████████████████████████| 1000000/1000000 [03:03<00:00, 5450.21it/s]
100%|█████████████████████████████████████████████████████████████████████| 1000000/1000000 [01:15<00:00, 13277.22it/s]
100%|█████████████████████████████████████████████████████████████████████| 1000000/1000000 [01:20<00:00, 12476.27it/s]
100%|█████████████████████████████████████████████████████████████████████| 1000000/1000000 [01:12<00:00, 13789.46it/s]
100%|███████████████████████████████████

In [23]:
lang_blocks = defaultdict()
for lang in lines_raw:
    lang_blocks[lang] = create_blocks(lines_raw[lang])

100%|████████████████████████████████████████████████████████████████████| 1000000/1000000 [00:02<00:00, 354724.23it/s]
100%|████████████████████████████████████████████████████████████████████| 1000000/1000000 [00:01<00:00, 602374.74it/s]
100%|████████████████████████████████████████████████████████████████████| 1000000/1000000 [00:02<00:00, 405260.90it/s]
100%|████████████████████████████████████████████████████████████████████| 1000000/1000000 [00:02<00:00, 376549.20it/s]
100%|████████████████████████████████████████████████████████████████████| 1000000/1000000 [00:03<00:00, 298094.32it/s]
100%|████████████████████████████████████████████████████████████████████| 1000000/1000000 [00:02<00:00, 388254.96it/s]
100%|████████████████████████████████████████████████████████████████████| 1000000/1000000 [00:01<00:00, 570713.24it/s]
100%|████████████████████████████████████████████████████████████████████| 1000000/1000000 [00:02<00:00, 437235.69it/s]
100%|███████████████████████████████████

In [24]:
# save the data
fname = 'lang_blocks_train.jbl'
jobsave(lang_blocks, fname)

In [25]:
# load the data
fname = 'lang_blocks_train.jbl'
lang_blocks_train = jobload(fname)

# Experiment 1

# 1. Encode train data using HD vectors, 3-grams and 21 languages

## 1.1 create latin alphabet and item memory using 27 atom HD binary (-1, 1) vectors with size 1000

In [21]:
# create 26 letter + 1 space symbol alphabet:
latin_alphabet = alphabet()

In [22]:
# create item memory
item_memory = create_item_memory(order=1000, alphabet_size=27)

# check the shape; sum of item memory can show if some bias exist in random destribution
item_memory.shape, np.sum(item_memory)

((27, 1000), -90)

## 1.2 create fixed permutations 

In [30]:
fixed_permutations = create_fixed_permutations(order=1000, n=100)

## 1.3 encode hd vector representations for all possible 3-grams

In [24]:
base_ngrams = base_26_latin_ngrams(3)

Created 3-gram base with 19683 3-grams.


In [25]:
latin_alphabet = alphabet()

In [26]:
print(base_ngrams[1000])
for letter in base_ngrams[1000]:
    ind = latin_alphabet.index(letter)
    print(ind)
    print(item_memory[ind].shape)

aja
0
(1000,)
9
(1000,)
0
(1000,)


In [33]:
def encode_ngram(ngram):
    vectors = []
    for order in range(len(ngram)):
        letter = ngram[order]
        item_id = latin_alphabet.index(letter)
        
        # this is atom vector for that simbol
        item_itself = item_memory[item_id]
        permutation_n = fixed_permutations[order]
        permuted_hdv = permute(item_itself, permutation_n, order)
        vectors.append(permuted_hdv)
    output_vector = vectors[0]
    for hdv in vectors[1:]:
        output_vector = bind(output_vector, hdv)
    return output_vector

In [63]:
encoded_trigrams = defaultdict()
for trigram in base_ngrams:
    encoded_trigrams[trigram] =encode_ngram(trigram)

## Saving the intermediate results

In [67]:
# save the results
jobsave(item_memory, 'item_memory.jbl')
jobsave(fixed_permutations, 'fixed_permutations.jbl')
jobsave(base_ngrams, 'base_ngrams.jbl')
jobsave(encoded_trigrams, 'encoded_trigrams.jbl')

In [26]:
# load the results
item_memory = jobload('item_memory.jbl')
fixed_permutations = jobload('fixed_permutations.jbl')
base_ngrams = jobload('base_ngrams.jbl')
encoded_trigrams = jobload('encoded_trigrams.jbl')

# 1.4 Unsing encoded n-grams, encode the training data blocks

In [27]:
def parse_encode_text_block(data_block, encoded_trigrams):

    tri_grams = [data_block[i:i+3] for i in range(len(data_block)-2)]
    encoded_block_trigrams = []
    for tg in tri_grams:
        encoded_block_trigrams.append(encoded_trigrams[tg.lower()])
    
    enc_tg_start = encoded_block_trigrams[0]
    for enc_tg in encoded_block_trigrams:
        enc_tg_start = bundle(enc_tg_start, enc_tg)
    return enc_tg_start

In [28]:
encoded_lang_blocks_train = defaultdict()
for language in lang_blocks_train:
    print(language)
    encoded_list = []
    for b in tqdm(range(len(lang_blocks_train[language]))):
        #print(b)
        block = lang_blocks_train[language][b]
        encoded_list.append(parse_encode_text_block(block, encoded_trigrams))
    encoded_lang_blocks_train[language] = encoded_list

  0%|                                                                              | 59/99553 [00:00<02:51, 580.12it/s]

bul


100%|███████████████████████████████████████████████████████████████████████████| 99553/99553 [03:24<00:00, 486.47it/s]
  0%|                                                                             | 106/78026 [00:00<02:29, 520.34it/s]

ces


100%|███████████████████████████████████████████████████████████████████████████| 78026/78026 [02:25<00:00, 538.00it/s]
  0%|                                                                             | 111/96993 [00:00<02:56, 548.27it/s]

dan


100%|███████████████████████████████████████████████████████████████████████████| 96993/96993 [02:57<00:00, 545.05it/s]
  0%|                                                                             | 109/96488 [00:00<02:59, 538.37it/s]

deu


100%|███████████████████████████████████████████████████████████████████████████| 96488/96488 [02:56<00:00, 548.02it/s]
  0%|                                                                             | 57/116989 [00:00<03:28, 560.45it/s]

ell


100%|█████████████████████████████████████████████████████████████████████████| 116989/116989 [03:30<00:00, 556.41it/s]
  0%|                                                                             | 53/103681 [00:00<03:17, 523.68it/s]

eng


100%|█████████████████████████████████████████████████████████████████████████| 103681/103681 [03:35<00:00, 482.22it/s]
  0%|                                                                             | 48/101016 [00:00<03:33, 472.07it/s]

est


100%|█████████████████████████████████████████████████████████████████████████| 101016/101016 [03:31<00:00, 476.63it/s]
  0%|                                                                              | 48/93543 [00:00<03:16, 476.61it/s]

fin


100%|███████████████████████████████████████████████████████████████████████████| 93543/93543 [03:14<00:00, 480.46it/s]
  0%|                                                                            | 110/105407 [00:00<03:11, 548.98it/s]

fra


100%|█████████████████████████████████████████████████████████████████████████| 105407/105407 [03:36<00:00, 487.68it/s]
  0%|                                                                             | 93/113367 [00:00<04:08, 455.93it/s]

hun


100%|█████████████████████████████████████████████████████████████████████████| 113367/113367 [04:00<00:00, 470.46it/s]
  0%|                                                                             | 50/112002 [00:00<03:48, 489.35it/s]

ita


100%|█████████████████████████████████████████████████████████████████████████| 112002/112002 [03:57<00:00, 471.16it/s]
  0%|                                                                             | 43/100529 [00:00<03:55, 426.94it/s]

lav


100%|█████████████████████████████████████████████████████████████████████████| 100529/100529 [03:35<00:00, 465.67it/s]
  0%|                                                                             | 48/100053 [00:00<03:28, 479.08it/s]

lit


100%|█████████████████████████████████████████████████████████████████████████| 100053/100053 [03:20<00:00, 500.21it/s]
  0%|                                                                             | 113/86687 [00:00<02:36, 552.86it/s]

nld


100%|███████████████████████████████████████████████████████████████████████████| 86687/86687 [02:40<00:00, 540.83it/s]
  0%|                                                                              | 53/93810 [00:00<02:58, 526.37it/s]

pol


100%|███████████████████████████████████████████████████████████████████████████| 93810/93810 [02:55<00:00, 535.62it/s]
  0%|                                                                            | 114/106316 [00:00<03:08, 562.22it/s]

por


100%|█████████████████████████████████████████████████████████████████████████| 106316/106316 [03:16<00:00, 541.68it/s]
  0%|                                                                             | 54/111425 [00:00<03:26, 538.85it/s]

ron


100%|█████████████████████████████████████████████████████████████████████████| 111425/111425 [03:28<00:00, 535.07it/s]
  0%|                                                                             | 105/97539 [00:00<03:08, 516.84it/s]

slk


100%|███████████████████████████████████████████████████████████████████████████| 97539/97539 [03:08<00:00, 516.15it/s]
  0%|                                                                            | 108/101224 [00:00<03:07, 538.53it/s]

slv


100%|█████████████████████████████████████████████████████████████████████████| 101224/101224 [03:32<00:00, 476.60it/s]
  0%|                                                                             | 46/122227 [00:00<04:28, 454.49it/s]

spa


100%|█████████████████████████████████████████████████████████████████████████| 122227/122227 [04:08<00:00, 492.75it/s]
  0%|                                                                             | 106/85730 [00:00<02:40, 532.71it/s]

swe


100%|███████████████████████████████████████████████████████████████████████████| 85730/85730 [02:42<00:00, 529.13it/s]


In [29]:
jobsave(encoded_lang_blocks_train, 'encoded_lang_blocks_train.jbl')

In [31]:
encoded_lang_blocks_train['bul'][1052]

array([-135,   89,  -39,   45,  -83,  -69, -125,   53,   11,  -63,   21,
         -9,   59,   -9,  -15,  -29,   35,  -91, -119,    9,  101,   65,
        -13,   -3,    7,  -99,   77,  -29,   75,   63,   27,  -11,   39,
         49, -137,  -49,  -47,   29, -105,    9,   45,  -61,  163,   35,
          1,   -9,  -17,    5,   39,  -41,    5,  -29,  -27,   51,  147,
         -1,  -51,   77,  101,    9,   67,  103,  -85,   71,   67,  137,
         33,   35,  -61,    7,    5,   31,   65,   -9,    7,  -57,  -29,
        -15, -105,    1,  -41,  -53, -119,  -51,  -73,  -27,   -9,  -75,
         47,  -75,  -15,  -85,  125,  -93,  -19,  -49,  -43,  -11,   -5,
        -23,   33,   -1,  105,   21,   55,   -1,  107,  -81,  -15,   37,
        -89,    1,    7,   47,   71,   41,   47,    1,  -75,  141,  -73,
        -47,   55,  -59,  -91,  -59,   -5,  -31, -107, -139,  -55,  -93,
         81,  -45,   73,   35,  117,    5,   55,  -45,   55,   11,   87,
         29,   -9,  135,   -9,   43,   -5,  109,  -

# Create test data

In [96]:
def process_sentenses_eurpar(path_to_file):
    alph = alphabet()
    
    raw_lines = []
    
    with open(path_to_file, 'r', encoding="utf-8") as file:
        for line in file:
            raw_lines.append(unidecode.unidecode(line[:-1].lower()))
    
    lines_final = []
    for i in tqdm(range(len(raw_lines))):
        line = raw_lines[i]
        line = [x for x in ''.join(line).split(' ') if x]
        new_line = []
        for word in line:
            word = [x for x in word if x in alph]
            new_line.append(''.join(word).lower())
        lines_final.append(new_line)
    lines_final = [x for x in lines_final if x]
    lines_final = [x for x in lines_final if x != ['']]
    del raw_lines, line
    return lines_final

def create_blocks_eurpar(lines):    
    blocks = []
    for i in tqdm(range(len(lines))):
        line = ' '.join(lines[i])
        blocks.append(line)
    del line
    return blocks

In [97]:
test_data_root = os.path.join('data', 'test')
data_lang_paths = os.listdir(test_data_root)

In [98]:
data_lang_paths

['bg-en',
 'cs-en',
 'da-en',
 'de-en',
 'el-en',
 'en-en',
 'es-en',
 'et-en',
 'fi-en',
 'fr-en',
 'hu-en',
 'it-en',
 'lt-en',
 'lv-en',
 'pl-en',
 'ro-en']

In [99]:
lang_paths_test = defaultdict()
for path in data_lang_paths:
    lang_paths_test[path[:2]] = os.path.join(test_data_root, path, 'europarl-v7.'+path+'.'+path[:2])

In [100]:
lang_paths_test

defaultdict(None,
            {'bg': 'data\\test\\bg-en\\europarl-v7.bg-en.bg',
             'cs': 'data\\test\\cs-en\\europarl-v7.cs-en.cs',
             'da': 'data\\test\\da-en\\europarl-v7.da-en.da',
             'de': 'data\\test\\de-en\\europarl-v7.de-en.de',
             'el': 'data\\test\\el-en\\europarl-v7.el-en.el',
             'en': 'data\\test\\en-en\\europarl-v7.en-en.en',
             'es': 'data\\test\\es-en\\europarl-v7.es-en.es',
             'et': 'data\\test\\et-en\\europarl-v7.et-en.et',
             'fi': 'data\\test\\fi-en\\europarl-v7.fi-en.fi',
             'fr': 'data\\test\\fr-en\\europarl-v7.fr-en.fr',
             'hu': 'data\\test\\hu-en\\europarl-v7.hu-en.hu',
             'it': 'data\\test\\it-en\\europarl-v7.it-en.it',
             'lt': 'data\\test\\lt-en\\europarl-v7.lt-en.lt',
             'lv': 'data\\test\\lv-en\\europarl-v7.lv-en.lv',
             'pl': 'data\\test\\pl-en\\europarl-v7.pl-en.pl',
             'ro': 'data\\test\\ro-en\\europarl-v7.r

In [None]:
lines_raw_test = defaultdict()
for lang in lang_paths_test:
    lines_raw_test[lang] = process_sentenses_eurpar(lang_paths_test[lang])

100%|███████████████████████████████████████████████████████████████████████| 406934/406934 [00:13<00:00, 29205.18it/s]
100%|███████████████████████████████████████████████████████████████████████| 646605/646605 [00:19<00:00, 33966.53it/s]
 82%|████████████████████████████████████████████████████████▉            | 1623299/1968800 [00:49<00:11, 31073.75it/s]

In [None]:
lang_blocks_test = defaultdict()
for lang in lines_raw_test:
    lang_blocks_test[lang] = create_blocks_eurpar(lines_raw_test[lang])

In [None]:
encoded_lang_blocks_test = defaultdict()
for language in lang_blocks_test:
    print(language)
    encoded_list = []
    for b in tqdm(range(len(lang_blocks_test[language]))):
        #print(b)
        block = lang_blocks_test[language][b]
        encoded_list.append(parse_encode_text_block(block, encoded_trigrams))
    encoded_lang_blocks_test[language] = encoded_list

In [None]:
# save the results
jobsave(encoded_lang_blocks_test, 'encoded_lang_blocks_test.jbl')

In [3]:
# Load and run experiment

In [None]:
encoded_lang_blocks_train = jobload('encoded_lang_blocks_train.jbl')