In [6]:
import joblib
import os
import itertools
import re
import unidecode
import string
import cmath

import numpy as np
import pandas as pd

from zipfile import ZipFile
from collections import defaultdict
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

# Functions

In [7]:
def create_ngram_base(alphabet, n_order):
    """Enumerate every n-gram over ``alphabet`` in sorted order.

    Parameters
    ----------
    alphabet : list of str
        Distinct symbols. The list is left unmodified.
    n_order : int
        Number of symbols per n-gram.

    Returns
    -------
    list of str
        All ``len(alphabet) ** n_order`` n-grams, sorted lexicographically.

    Raises
    ------
    AssertionError
        If ``alphabet`` is not a list, contains duplicates, or the joined
        n-grams are not all distinct.
    """
    assert type(alphabet) == list, 'Alphabet is not list.'
    assert len(set(alphabet)) == len(alphabet), 'Alphabet is not correct.'
    # itertools.product yields every ordered tuple exactly once, so there is
    # no need to permute combinations (and no need to sort the caller's list).
    n_gram_base = sorted(
        {''.join(symbols) for symbols in itertools.product(alphabet, repeat=n_order)}
    )
    assert len(n_gram_base) == len(alphabet)**n_order, 'Incorrect result'
    print("Created", str(n_order)+"-gram base with", len(n_gram_base), str(n_order)+"-grams.")
    return n_gram_base

def alphabet():
    """Return a new list with the lowercase ASCII letters followed by a space (27 symbols)."""
    return [*string.ascii_lowercase, ' ']

def process_sentenses(path_to_file):
    """Read a tab-separated sentence file and reduce every sentence to clean words.

    Each line is split on tab/newline and its second field is taken as the
    sentence. The sentence is lowercased, restricted to alphabetic and
    whitespace characters, transliterated character by character with
    ``unidecode``, split on single spaces, and every word is filtered down to
    the symbols returned by ``alphabet()``.

    Parameters
    ----------
    path_to_file : str
        Path to a UTF-8 text file.

    Returns
    -------
    list of list of str
        One list of cleaned words per input line. A word may become an empty
        string when none of its characters survives the alphabet filter.

    Raises
    ------
    IndexError
        If a line has no second field (neither a tab nor a newline in it).
    """
    allowed = set(alphabet())
    field_pattern = '|'.join(map(re.escape, ("\t", "\n")))

    # Iterate the file directly instead of first copying every raw line.
    with open(path_to_file, 'r', encoding="utf-8") as file:
        sentences = [re.split(field_pattern, line)[1].lower() for line in file]

    lines_final = []
    for sentence in tqdm(sentences):
        kept = ''.join(
            unidecode.unidecode(ch) for ch in sentence if ch.isalpha() or ch.isspace()
        )
        lines_final.append(
            [''.join(ch for ch in word if ch in allowed) for word in kept.split(' ') if word]
        )
    # No explicit `del` of the loop locals: they are unbound when the file is
    # empty, and they are released on return anyway.
    return lines_final

def create_blocks(lines, block_size=1000):
    """Pack word lists into text blocks of exactly ``block_size`` characters.

    Lines are joined with single spaces and appended to the current block
    until it holds at least ``block_size`` characters. The block is then
    truncated to ``block_size`` characters (the overflow is discarded) and the
    next line starts a new block.

    Parameters
    ----------
    lines : sequence of list of str
        One list of words per line.
    block_size : int, optional
        Length of every emitted block. Defaults to 1000.

    Returns
    -------
    list of str
        The completed blocks. A trailing block shorter than ``block_size`` is
        dropped.
    """
    blocks = []
    current_block = ''

    for words in tqdm(lines):
        line = ' '.join(words)
        if len(current_block) >= block_size:
            blocks.append(current_block[:block_size])
            current_block = line
            continue
        # Separate consecutive lines with exactly one space.
        if current_block and current_block[-1] != ' ':
            current_block += ' '
        current_block += line

    # A block that reached full size on the very last line would otherwise
    # never be emitted, because the size check only runs on the next line.
    if len(current_block) >= block_size:
        blocks.append(current_block[:block_size])
    return blocks

def base_26_latin_ngrams(n):
    """Return the sorted list of all ``n``-grams over the symbols given by ``alphabet()``."""
    return create_ngram_base(alphabet(), n)

def create_rand_atom_vect(order=1000):
    """Draw a random atom vector with entries -1 or 1.

    Parameters
    ----------
    order : int, optional
        Length of the vector. Defaults to 1000.

    Returns
    -------
    numpy.ndarray
        1-D integer array of length ``order``.
    """
    # randint's upper bound is exclusive, so the draw is from {-1, 0};
    # the zeros are then mapped to +1.
    atom = np.random.randint(-1, 1, order)
    # The former `atom.reshape(-1, 1)` discarded its result and was a no-op.
    return np.where(atom == 0, 1, atom)

def create_item_memory(order=1000, alphabet_size=27):
    """Build an item memory from ``alphabet_size`` random atom vectors.

    Returns a ``numpy.ndarray`` made of the vectors returned by
    ``create_rand_atom_vect(order)``, one row per symbol.
    """
    atoms = [create_rand_atom_vect(order) for _ in range(alphabet_size)]
    return np.array(atoms)

def create_fixed_permutations(order=1000, n=3):
    """Generate ``n`` random permutations of the indices ``0 .. order-1``.

    One index list is reshuffled in place with ``np.random.shuffle`` for each
    permutation, and a snapshot of it is stored after every shuffle.

    Returns a list of ``n`` lists of ints.
    """
    index = list(range(order))
    permutations = []
    for _ in range(n):
        np.random.shuffle(index)
        permutations.append(list(index))
    return permutations

def bind(hdv1, hdv2):
    """Bind two hypervectors: return their product ``hdv1 * hdv2``."""
    return hdv1 * hdv2
    
def permute(hdv, permutation, n_times_to_permute):
    """Re-index ``hdv`` with ``permutation`` ``n_times_to_permute`` times in a row.

    With ``n_times_to_permute == 0`` the input object is returned unchanged.
    """
    for _ in range(n_times_to_permute):
        hdv = hdv[permutation]
    return hdv
    
def bundle(hdv1, hdv2):
    """Bundle two hypervectors: return their sum ``hdv1 + hdv2``."""
    return hdv1 + hdv2

def measure_cosine_similarity(hdv1, hdv2):
    """Return sklearn's pairwise ``cosine_similarity`` result for the two vectors.

    Each vector is wrapped in a list so that it is passed as a single sample.
    """
    similarity = cosine_similarity([hdv1], [hdv2])
    return similarity

def jobsave(object_to_save, filename):
    """Write ``object_to_save`` to ``filename`` with ``joblib.dump``; returns ``None``."""
    joblib.dump(object_to_save, filename)
    return None

def jobload(filename):
    """Load and return the object stored in ``filename`` via ``joblib.load``."""
    # NOTE(review): joblib files are pickle-based; only load files you trust.
    loaded = joblib.load(filename)
    return loaded

# Data

In [None]:
# Root folder of the training data and the entries found directly inside it.
train_data_root = os.path.join('data', 'train')
data_lang_paths = os.listdir(train_data_root)

In [None]:
# Display the entries found in the training data folder.
data_lang_paths

In [None]:
# Map the first three characters of every entry name (presumably the language
# code — verify against the folder layout) to that entry's sentence file.
lang_paths = defaultdict()
for path in data_lang_paths:
    sentences_file = path + '-sentences.txt'
    lang_paths[path[:3]] = os.path.join(train_data_root, path, sentences_file)

In [None]:
# Display the key -> sentence-file mapping.
lang_paths

In [None]:
# Clean the sentence file of every language into lists of words.
lines_raw = defaultdict()
for lang, sentences_path in lang_paths.items():
    lines_raw[lang] = process_sentenses(sentences_path)

In [None]:
# Pack the cleaned lines of every language into text blocks.
lang_blocks = defaultdict()
for lang, cleaned_lines in lines_raw.items():
    lang_blocks[lang] = create_blocks(cleaned_lines)

In [None]:
# Save the training text blocks of all languages to disk.
fname = 'lang_blocks_train.jbl'
jobsave(lang_blocks, fname)

In [None]:
# Load the training text blocks back from disk (lets later cells run
# without redoing the preprocessing cells).
fname = 'lang_blocks_train.jbl'
lang_blocks_train = jobload(fname)

# Experiment 1

# 1. Encode train data using HD vectors, 3-grams and 21 languages

## 1.1 Create the Latin alphabet and an item memory of 27 atomic bipolar (-1, 1) HD vectors of size 1000

In [3]:
# Create the 27-symbol alphabet (26 letters + space). A symbol's position in
# this list is used by encode_ngram as its row index into the item memory.
latin_alphabet = alphabet()

In [None]:
# Create the item memory: one random atom vector per alphabet symbol.
# NOTE(review): no random seed is set in the visible cells, so the atoms
# presumably differ between runs — confirm that results rely on the saved file.
item_memory = create_item_memory(order=1000, alphabet_size=27)

# Check the shape; the sum of the item memory shows whether the random
# distribution of -1/+1 entries is biased.
item_memory.shape, np.sum(item_memory)

## 1.2 create fixed permutations 

In [None]:
# One random index permutation per position inside a trigram.
fixed_permutations = create_fixed_permutations(order=1000, n=3)

## 1.3 encode hd vector representations for all possible 3-grams

In [None]:
# All possible trigrams over the 27-symbol alphabet, sorted.
base_ngrams = base_26_latin_ngrams(3)

In [None]:
def encode_ngram(ngram):
    """Encode an n-gram as a single hypervector.

    For each symbol, its atom vector is read from the global ``item_memory``
    (row = the symbol's index in the global ``latin_alphabet``) and permuted
    once with the global ``fixed_permutations`` entry for the symbol's position
    in the n-gram. The permuted vectors are then bound together left to right.

    Raises ``ValueError`` if a symbol is not in ``latin_alphabet`` and
    ``IndexError`` if ``ngram`` is empty or longer than ``fixed_permutations``.
    """
    permuted = [
        permute(item_memory[latin_alphabet.index(letter)], fixed_permutations[position], 1)
        for position, letter in enumerate(ngram)
    ]
    output_vector = permuted[0]
    for hdv in permuted[1:]:
        output_vector = bind(output_vector, hdv)
    return output_vector

In [None]:
# Pre-compute the hypervector of every possible trigram, keyed by the trigram.
encoded_trigrams = defaultdict()
for trigram in base_ngrams:
    trigram_hdv = encode_ngram(trigram)
    encoded_trigrams[trigram] = trigram_hdv

## Saving the intermediate results

In [None]:
# Save the item memory, the permutations, the trigram list and the encoded
# trigram table, so the random draws can be reused in later sessions.
jobsave(item_memory, 'item_memory.jbl')
jobsave(fixed_permutations, 'fixed_permutations.jbl')
jobsave(base_ngrams, 'base_ngrams.jbl')
jobsave(encoded_trigrams, 'encoded_trigrams.jbl')

In [None]:
# Load the previously saved intermediate results.
# NOTE(review): joblib files are pickle-based — only load files you trust.
item_memory = jobload('item_memory.jbl')
fixed_permutations = jobload('fixed_permutations.jbl')
base_ngrams = jobload('base_ngrams.jbl')
encoded_trigrams = jobload('encoded_trigrams.jbl')

# 1.4 Using the encoded n-grams, encode the training data blocks

In [10]:
def parse_encode_text_block(data_block, encoded_trigrams):
    """Encode a text block as the bundle (elementwise sum) of its trigram vectors.

    Parameters
    ----------
    data_block : str
        Text of at least three characters. Trigrams are taken with a sliding
        window of stride 1 and lowercased before lookup.
    encoded_trigrams : mapping
        Maps each trigram string to its hypervector.

    Returns
    -------
    numpy.ndarray
        Sum of the vectors of all ``len(data_block) - 2`` trigrams, each one
        counted exactly once. The result never aliases a table entry.

    Raises
    ------
    IndexError
        If ``data_block`` has fewer than three characters.
    KeyError
        If a trigram is missing from ``encoded_trigrams``.
    """
    if len(data_block) < 3:
        raise IndexError('data_block must contain at least 3 characters')

    trigram_vectors = (
        encoded_trigrams[data_block[i:i+3].lower()]
        for i in range(len(data_block) - 2)
    )
    # Seed the sum with a copy of the first trigram and add only the remaining
    # ones; looping over the full list again would count the first one twice.
    block_vector = np.array(next(trigram_vectors))
    for vector in trigram_vectors:
        block_vector = block_vector + vector
    return block_vector

In [None]:
# Build one profile vector per language by bundling the encodings of all of
# that language's training blocks.
encoded_lang_blocks_train = defaultdict()
for language in lang_blocks_train:
    print(language)
    language_blocks = lang_blocks_train[language]
    encoded_list = parse_encode_text_block(language_blocks[0], encoded_trigrams)
    for b in tqdm(range(1, len(language_blocks))):
        block = language_blocks[b]
        block_hdv = parse_encode_text_block(block, encoded_trigrams)
        encoded_list = bundle(encoded_list, block_hdv)
    encoded_lang_blocks_train[language] = encoded_list

In [None]:
# Save the per-language profile vectors.
jobsave(encoded_lang_blocks_train, 'encoded_lang_blocks_train.jbl')

In [None]:
# Peek at the first components of the bundled 'bul' vector. Each language entry
# is a single vector with one component per hypervector dimension (1000 as
# configured above), so a fixed index such as 1052 is out of range.
encoded_lang_blocks_train['bul'][:10]

# Create test data

In [None]:
def process_sentenses_eurpar(path_to_file):
    """Read a plain-text file and clean it line by line.

    Every line is lowercased, transliterated with ``unidecode``, reduced to
    the symbols returned by ``alphabet()``, and has its whitespace runs
    collapsed to single spaces.

    Parameters
    ----------
    path_to_file : str
        Path to a UTF-8 text file.

    Returns
    -------
    list of str
        One cleaned string per line that still has at least one symbol after
        filtering. A line reduced to spaces only is kept as an empty string.
    """
    # Use the function's own alphabet rather than the notebook-level
    # `latin_alphabet`, so the function does not depend on cell execution order.
    allowed = set(alphabet())

    _lines = []
    with open(path_to_file, 'r', encoding="utf-8") as file:
        for line in file:
            # The line terminator is not in the alphabet, so the filter drops
            # it. Slicing off the last character instead would also remove a
            # real letter when the final line has no trailing newline.
            new_line = ''.join(x for x in unidecode.unidecode(line.lower()) if x in allowed)
            if len(new_line) > 0:
                _lines.append(' '.join(new_line.split()))

    return _lines

def process_sentenses_nn(path_to_file, nn):
    """Read and clean the first lines of a plain-text file.

    Every line is lowercased, transliterated with ``unidecode``, reduced to
    the symbols returned by ``alphabet()``, and has its whitespace runs
    collapsed to single spaces. Reading stops once ``nn + 1`` lines have been
    kept.

    Parameters
    ----------
    path_to_file : str
        Path to a UTF-8 text file.
    nn : int
        Line budget; at most ``nn + 1`` cleaned lines are returned.

    Returns
    -------
    list of str
        The cleaned lines, skipping lines with no symbol left after filtering.
    """
    # Use the function's own alphabet rather than the notebook-level
    # `latin_alphabet`, so the function does not depend on cell execution order.
    allowed = set(alphabet())

    _lines = []
    with open(path_to_file, 'r', encoding="utf-8") as file:
        for line in file:
            # NOTE(review): this keeps nn + 1 lines rather than nn — preserved
            # from the original; confirm which count was intended.
            if len(_lines) >= nn + 1:
                break
            # The line terminator is not in the alphabet, so the filter drops
            # it. Slicing off the last character instead would also remove a
            # real letter when the final line has no trailing newline.
            new_line = ''.join(x for x in unidecode.unidecode(line.lower()) if x in allowed)
            if len(new_line) > 0:
                _lines.append(' '.join(new_line.split()))

    return _lines

def create_blocks_eurpar(lines):
    """Join every entry of ``lines`` with single spaces, one block per entry.

    Returns a list of str with the same length as ``lines``; an empty input
    gives an empty list.
    """
    # NOTE(review): presumably each entry is a list of words. If entries are
    # plain strings, join() puts a space between every character — confirm.
    return [' '.join(line) for line in tqdm(lines)]

In [None]:
# Root folder of the test data and the entries found directly inside it.
# Note: this rebinds `data_lang_paths`, which earlier held the training entries.
test_data_root = os.path.join('data', 'test')
data_lang_paths = os.listdir(test_data_root)

In [None]:
# Display the entries found in the test data folder.
data_lang_paths

In [None]:
# Map the first two characters of every entry name (presumably the language
# code — verify against the folder layout) to its Europarl text file.
lang_paths_test = defaultdict()
for path in data_lang_paths:
    lang_code = path[:2]
    corpus_name = 'europarl-v7.' + path + '.' + lang_code
    lang_paths_test[lang_code] = os.path.join(test_data_root, path, corpus_name)

In [None]:
# Display the key -> test-file mapping.
lang_paths_test

In [None]:
# Clean a capped number of lines per test language and save the result.
lines_test = defaultdict()
for lang, corpus_path in lang_paths_test.items():
    print(lang)
    lines_test[lang] = process_sentenses_nn(corpus_path, 3000)

jobsave(lines_test, 'lines_test_nn.jbl')

In [None]:
# Clean every line of every test language and save the result.
# This rebinds `lines_test`, replacing the capped version built earlier.
lines_test = defaultdict()
for lang, corpus_path in lang_paths_test.items():
    print(lang)
    lines_test[lang] = process_sentenses_eurpar(corpus_path)

jobsave(lines_test, 'lines_test_1000.jbl')

In [None]:
# NOTE(review): 'lines_test.jbl' and 'lines_test_2.jbl' are not written by any
# visible cell (the cells above save 'lines_test_nn.jbl' and
# 'lines_test_1000.jbl') — confirm where these files come from.
lines_test = jobload('lines_test.jbl')
lines_test_2 = jobload('lines_test_2.jbl')

In [None]:
def parse_encode_text_block(data_block, encoded_trigrams):
    """Encode a text block as the bundle (elementwise sum) of its trigram vectors.

    Parameters
    ----------
    data_block : str
        Text of at least three characters. Trigrams are taken with a sliding
        window of stride 1 and lowercased before lookup.
    encoded_trigrams : mapping
        Maps each trigram string to its hypervector.

    Returns
    -------
    numpy.ndarray
        Sum of the vectors of all ``len(data_block) - 2`` trigrams, each one
        counted exactly once. The result never aliases a table entry.

    Raises
    ------
    IndexError
        If ``data_block`` has fewer than three characters.
    KeyError
        If a trigram is missing from ``encoded_trigrams``.
    """
    if len(data_block) < 3:
        raise IndexError('data_block must contain at least 3 characters')

    trigram_vectors = (
        encoded_trigrams[data_block[i:i+3].lower()]
        for i in range(len(data_block) - 2)
    )
    # Seed the sum with a copy of the first trigram and add only the remaining
    # ones; looping over the full list again would count the first one twice.
    block_vector = np.array(next(trigram_vectors))
    for vector in trigram_vectors:
        block_vector = block_vector + vector
    return block_vector

In [None]:
# L2-normalise one encoded trigram as a quick check of `normalize`.
# The bare name `aaa` was never defined; the trigram table entry is used instead.
normalize([encoded_trigrams['aaa']], norm='l2')

In [None]:
# Load the capped test lines saved earlier as 'lines_test_nn.jbl'.
lines_test_nn = jobload('lines_test_nn.jbl')

In [None]:
# Display the hypervector stored for the trigram 'aaa'.
encoded_trigrams['aaa']

In [None]:
# Encode and L2-normalise every capped test line (at least one trigram long)
# and save one file per language.
root = 'encoded_langs_test_2'
os.makedirs(root, exist_ok=True)  # joblib does not create missing folders
for language in lines_test_nn:
    print(language)
    encoded_list = []
    for bl in tqdm(range(len(lines_test_nn[language]))):
        # Read from the same dict that drives the loop; indexing `lines_test`
        # here mixed two datasets whose keys and lengths need not match.
        block = lines_test_nn[language][bl]
        if len(block) >= 3:
            encoded_list.append(normalize([parse_encode_text_block(block, encoded_trigrams)], norm='l2'))
    filename = language +'_encoded_test.jbl'
    path = os.path.join(root, filename)
    jobsave(encoded_list, path)
    del encoded_list

In [None]:
# Encode and L2-normalise every line of `lines_test_2` that is at least one
# trigram long, and save one file per language in the working directory.
for language in lines_test_2:
    print(language)
    encoded_list = []
    for bl in tqdm(range(len(lines_test_2[language]))):
        block = lines_test_2[language][bl]
        if len(block) < 3:
            continue
        block_hdv = parse_encode_text_block(block, encoded_trigrams)
        encoded_list.append(normalize([block_hdv], norm='l2'))
    filename = language +'_encoded_test.jbl'
    jobsave(encoded_list, filename)
    del encoded_list

In [None]:
# save the results
# NOTE(review): `encoded_blocks_test` is never assigned in the visible cells
# (its only other mention is commented out), so this presumably raises
# NameError on a fresh kernel — confirm and remove or fix.
jobsave(encoded_blocks_test, 'encoded_blocks_test.jbl')

In [None]:
# Reload the per-language profile vectors saved earlier.
encoded_lang_blocks_train = jobload('encoded_lang_blocks_train.jbl')

In [None]:
# Three-letter training-set codes mapped to English language names.
train_keys = dict(
    bul='Bulgarian',
    ces='Czech',
    dan='Danish',
    deu='German',
    ell='Greek',
    eng='English',
    est='Estonian',
    fin='Finnish',
    fra='French',
    hun='Hungarian',
    ita='Italian',
    lav='Latvian',
    lit='Lithuanian',
    nld='Dutch',
    pol='Polish',
    por='Portuguese',
    ron='Romanian',
    slk='Slovak',
    slv='Slovene',
    spa='Spanish',
    swe='Swedish',
)

In [None]:
# L2-normalise every language profile and re-key it by the language's name.
nor_lang_train = defaultdict()
for key, language_vector in encoded_lang_blocks_train.items():
    new_key = train_keys[key]
    norm_vect = normalize([language_vector], norm='l2')
    nor_lang_train[new_key] = norm_vect

In [None]:
# Save the normalised language profiles.
jobsave(nor_lang_train, 'nor_lang_train.jbl')

# complex vectors

In [None]:
# Load the cleaned test lines.
# NOTE(review): neither file name is written by a visible cell — confirm
# where 'lines_test.jbl' and 'lines_test_2.jbl' are produced.
lines_test = jobload('lines_test.jbl')
lines_test_2 = jobload('lines_test_2.jbl')

In [17]:
def img_counterpart(real):
    """Return ``(1 - real**2) ** 0.5``: the non-negative imaginary part that
    puts ``real + i*y`` on the unit circle when ``|real| <= 1``."""
    return (1- real**2)**0.5

def create_complex_range_u1(decimals):
    """Build a sorted array of complex values ``x ± i*img_counterpart(x)``.

    The real parts are ``2 * 10**decimals + 1`` evenly spaced values from -1
    to 1. Each real part contributes both signs of its imaginary counterpart,
    or a single value when that counterpart is zero.

    Returns a 1-D complex ``numpy.ndarray`` sorted with ``ndarray.sort``.
    """
    n_points = (10**decimals)*2+1
    complex_range = []
    for x in np.linspace(-1, 1, n_points):
        y = img_counterpart(x)
        if y == 0:
            complex_range.append(complex(x, y))
        elif y != 0:
            complex_range.extend((complex(x, y), complex(x, -y)))
    complex_range = np.array(complex_range)
    complex_range.sort()
    return complex_range

def create_random_complex_vector(source, order = 1000):
    """Draw ``order`` values from ``source`` with ``np.random.choice``."""
    return np.random.choice(source, size=order)

def bind_comp(hdv1, hdv2):
    """Bind two complex hypervectors: return their product ``hdv1 * hdv2``."""
    return hdv1 * hdv2
    
def permute_comp(hdv, place):
    """Cyclically shift ``hdv`` by ``place`` positions.

    Parameters
    ----------
    hdv : numpy.ndarray
        Vector to shift; it is not modified.
    place : int
        Number of positions to roll by (0 leaves the order unchanged).

    Returns
    -------
    numpy.ndarray
        The rolled copy produced by ``np.roll``.
    """
    # np.roll returns a new array. The result used to be discarded, so the
    # unshifted input was returned and the position had no effect.
    return np.roll(hdv, place)
    
def bundle_comp(hdv1, hdv2):
    """Bundle two complex hypervectors: return their sum ``hdv1 + hdv2``."""
    return hdv1 + hdv2

def comp_cos_sim(hdv1, hdv2):
    """Return sklearn's ``cosine_similarity`` of the real parts of two vectors.

    Only ``.real`` is compared; the imaginary parts are ignored.
    """
    real_1 = hdv1.real
    real_2 = hdv2.real
    return cosine_similarity([real_1], [real_2])

def create_comp_item_memory(order=1000, alphabet_size=27, source=None):
    """Build a complex item memory with one random vector per symbol.

    Parameters
    ----------
    order : int, optional
        Length of every vector. Defaults to 1000.
    alphabet_size : int, optional
        Number of vectors (rows). Defaults to 27.
    source : array-like, optional
        Pool of values to draw from. When omitted, the notebook-level
        ``complex_source`` is used, which must already be defined.

    Returns
    -------
    numpy.ndarray
        Array built from ``alphabet_size`` vectors returned by
        ``create_random_complex_vector(source, order)``.
    """
    if source is None:
        # Backward-compatible default: fall back to the global value pool.
        source = complex_source
    atoms = [create_random_complex_vector(source, order) for _ in range(alphabet_size)]
    return np.array(atoms)

In [None]:
# Pool of complex values used to draw the atom vectors; with decimals=6 the
# real parts are 2 * 10**6 + 1 evenly spaced values from -1 to 1.
complex_source = create_complex_range_u1(6)
len(complex_source)

# Experiment 2

### prepare complex data

In [None]:
# 27-symbol alphabet (26 letters + space); its order gives each symbol's row
# index in the complex item memory.
latin_alphabet = alphabet()

In [18]:
# All possible trigrams over the 27-symbol alphabet, sorted.
base_ngrams = base_26_latin_ngrams(3)

Created 3-gram base with 19683 3-grams.


In [None]:
# Pool of complex values from which the complex atom vectors are drawn.
complex_source = create_complex_range_u1(6)

In [None]:
# Complex item memory: one random vector of length 1000 per alphabet symbol
# (alphabet_size keeps its default of 27).
comp_item_memory = create_comp_item_memory(order=1000)

In [15]:
def encode_comp_ngram(ngram):
    """Encode an n-gram as a single complex hypervector.

    For each symbol, its vector is read from the global ``comp_item_memory``
    (row = the symbol's index in the global ``latin_alphabet``) and passed
    through ``permute_comp`` with the symbol's position in the n-gram. The
    resulting vectors are bound together left to right with ``bind_comp``.

    Raises ``ValueError`` if a symbol is not in ``latin_alphabet`` and
    ``IndexError`` if ``ngram`` is empty.
    """
    vectors = []
    for order, letter in enumerate(ngram):
        item_itself = comp_item_memory[latin_alphabet.index(letter)]
        vectors.append(permute_comp(item_itself, order))
    output_vector = vectors[0]
    for hdv in vectors[1:]:
        # Use the complex-section binding helper, consistent with the other
        # *_comp functions (same elementwise product as `bind`).
        output_vector = bind_comp(output_vector, hdv)
    return output_vector

In [19]:
# Pre-compute the complex hypervector of every possible trigram.
comp_encoded_trigrams = defaultdict()
for trigram in base_ngrams:
    trigram_hdv = encode_comp_ngram(trigram)
    comp_encoded_trigrams[trigram] = trigram_hdv

In [20]:
# Save the complex trigram table.
jobsave(comp_encoded_trigrams, 'comp_encoded_trigrams.jbl')

In [None]:
# Save the complex item memory and the complex trigram table.
# The trigram table is also written by the previous cell; this rewrites it.
jobsave(comp_item_memory, 'comp_item_memory.jbl')
jobsave(comp_encoded_trigrams, 'comp_encoded_trigrams.jbl')

In [12]:
# load the results
comp_item_memory = jobload('comp_item_memory.jbl')
base_ngrams = jobload('base_ngrams.jbl')
# Load the complex trigram table saved above. 'encoded_trigrams.jbl' holds the
# bipolar table from Experiment 1 and would silently replace the complex one.
comp_encoded_trigrams = jobload('comp_encoded_trigrams.jbl')

In [None]:
# Reload the training text blocks saved as 'lang_blocks_train.jbl'.
lang_blocks_train = jobload('lang_blocks_train.jbl')

### encode text blocks

In [None]:
def parse_encode_text_block(data_block, encoded_trigrams):
    """Encode a text block as the bundle (elementwise sum) of its trigram vectors.

    Parameters
    ----------
    data_block : str
        Text of at least three characters. Trigrams are taken with a sliding
        window of stride 1 and lowercased before lookup.
    encoded_trigrams : mapping
        Maps each trigram string to its hypervector.

    Returns
    -------
    numpy.ndarray
        Sum of the vectors of all ``len(data_block) - 2`` trigrams, each one
        counted exactly once. The result never aliases a table entry.

    Raises
    ------
    IndexError
        If ``data_block`` has fewer than three characters.
    KeyError
        If a trigram is missing from ``encoded_trigrams``.
    """
    if len(data_block) < 3:
        raise IndexError('data_block must contain at least 3 characters')

    trigram_vectors = (
        encoded_trigrams[data_block[i:i+3].lower()]
        for i in range(len(data_block) - 2)
    )
    # Seed the sum with a copy of the first trigram and add only the remaining
    # ones; looping over the full list again would count the first one twice.
    block_vector = np.array(next(trigram_vectors))
    for vector in trigram_vectors:
        block_vector = block_vector + vector
    return block_vector

In [None]:
# Build one complex profile vector per language by bundling the encodings of
# all of its training blocks; results are keyed by the language's name.
comp_encoded_lang_blocks_train = defaultdict()
for language in lang_blocks_train:
    new_language = train_keys[language]
    print(language)
    print(new_language)
    language_blocks = lang_blocks_train[language]
    encoded_list = parse_encode_text_block(language_blocks[0], comp_encoded_trigrams)
    for b in tqdm(range(1, len(language_blocks))):
        block = language_blocks[b]
        block_hdv = parse_encode_text_block(block, comp_encoded_trigrams)
        encoded_list = bundle(encoded_list, block_hdv)
    comp_encoded_lang_blocks_train[new_language] = encoded_list

In [None]:
# Save the complex per-language profile vectors.
jobsave(comp_encoded_lang_blocks_train, 'comp_encoded_lang_blocks_train.jbl')

# create complex test data

In [8]:
# Load the cleaned test lines.
# NOTE(review): neither file name is written by a visible cell — confirm
# where 'lines_test.jbl' and 'lines_test_2.jbl' are produced.
lines_test = jobload('lines_test.jbl')
lines_test_2 = jobload('lines_test_2.jbl')

In [None]:
# Load the complex trigram table saved as 'comp_encoded_trigrams.jbl'.
comp_encoded_trigrams = jobload('comp_encoded_trigrams.jbl')

In [None]:
# Encode every line of `lines_test_2` that is at least one trigram long with
# the complex trigram table, and save one file per language.
root = 'comp_encoded_langs_test'
os.makedirs(root, exist_ok=True)  # joblib does not create missing folders
for language in lines_test_2:
    print(language)
    encoded_list = []
    for bl in tqdm(range(len(lines_test_2[language]))):

        block = lines_test_2[language][bl]
        if len(block) >= 3:
            encoded_list.append(parse_encode_text_block(block, comp_encoded_trigrams))
    # File name is kept as-is (no separator after the language code), because
    # readers of these files are not visible here.
    filename = language +'comp_encoded_test.jbl'
    jobsave(encoded_list, os.path.join(root, filename))
    del encoded_list

  0%|          | 403/1990304 [00:00<16:47, 1975.45it/s]

nl


100%|██████████| 1990304/1990304 [16:05<00:00, 2062.09it/s]
  0%|          | 16/637649 [00:00<1:08:49, 154.41it/s]

sk


100%|██████████| 637649/637649 [03:57<00:00, 2684.34it/s]
  0%|          | 1389/620457 [00:00<01:27, 7109.57it/s]

sl


100%|██████████| 620457/620457 [03:41<00:00, 2803.81it/s]
  0%|          | 385/1852652 [00:00<18:47, 1643.28it/s]

sv


  6%|▌         | 113500/1852652 [00:41<11:03, 2621.71it/s]

In [24]:
#encoded_blocks_test = defaultdict()
for language in lines_test:
    print(language)
    encoded_list = []
    for bl in tqdm(range(len(lines_test[language]))):

        block = lines_test[language][bl]
        if len(block) >= 3:
            encoded_list.append(parse_encode_text_block(block, comp_encoded_trigrams))
    filename = language +'comp_encoded_test.jbl'
    root = 'comp_encoded_langs_test'
    jobsave(encoded_list, os.path.join(root, filename))
    del encoded_list

  0%|          | 121/404381 [00:00<05:35, 1206.34it/s]

bg


100%|██████████| 404381/404381 [02:40<00:00, 2517.48it/s]
  0%|          | 831/643491 [00:00<01:17, 8294.13it/s]

cs


100%|██████████| 643491/643491 [03:34<00:00, 2997.68it/s]
  0%|          | 251/1961219 [00:00<13:03, 2501.49it/s]

da


100%|██████████| 1961219/1961219 [12:37<00:00, 2589.63it/s]
  0%|          | 3/1915852 [00:00<17:46:24, 29.94it/s]

de


100%|██████████| 1915852/1915852 [14:17<00:00, 2234.71it/s]
  0%|          | 3/1226281 [00:00<11:56:07, 28.54it/s]

el


100%|██████████| 1226281/1226281 [08:54<00:00, 2294.77it/s]
  0%|          | 772/396827 [00:00<00:51, 7697.21it/s]

en


100%|██████████| 396827/396827 [02:23<00:00, 2770.86it/s]
  0%|          | 260/1957136 [00:00<12:34, 2592.59it/s]

es


100%|██████████| 1957136/1957136 [13:39<00:00, 2387.31it/s]
  0%|          | 16/648655 [00:00<1:10:29, 153.37it/s]

et


100%|██████████| 648655/648655 [04:00<00:00, 2698.12it/s]
  0%|          | 442/1917127 [00:00<14:30, 2202.16it/s]

fi


100%|██████████| 1917127/1917127 [14:21<00:00, 2225.82it/s]
  0%|          | 4/2000544 [00:00<14:45:04, 37.67it/s]

fr


100%|██████████| 2000544/2000544 [14:40<00:00, 2270.99it/s]
  0%|          | 7/621856 [00:00<2:31:02, 68.62it/s]

hu


100%|██████████| 621856/621856 [04:13<00:00, 2448.60it/s]
  0%|          | 441/1899872 [00:00<14:11, 2231.08it/s]

it


100%|██████████| 1899872/1899872 [15:58<00:00, 1982.77it/s]
  0%|          | 16/631187 [00:00<1:06:32, 158.09it/s]

lt


100%|██████████| 631187/631187 [03:42<00:00, 2841.86it/s]
  0%|          | 1420/635078 [00:00<01:26, 7341.80it/s]

lv


100%|██████████| 635078/635078 [04:05<00:00, 2589.36it/s]
  0%|          | 436/629205 [00:00<02:24, 4358.78it/s]

pl


100%|██████████| 629205/629205 [04:15<00:00, 2460.63it/s]
  0%|          | 245/1952785 [00:00<13:19, 2440.78it/s]

pt


100%|██████████| 1952785/1952785 [15:20<00:00, 2122.55it/s]
  0%|          | 19/396797 [00:00<35:02, 188.70it/s]

ro


100%|██████████| 396797/396797 [02:51<00:00, 2315.24it/s]
