In [1]:
import string
import xml.etree.ElementTree as ET
from collections import OrderedDict, defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from trie import Trie
import re
import unicodedata
import tqdm

2023-05-24 16:25:39.336159: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-05-24 16:25:39.374312: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-05-24 16:25:39.374961: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [19]:
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


## Load from xml file

In [20]:
#Import dataset
tree = ET.parse('lag1734.xml/lag1734.xml')
root = tree.getroot()
root.attrib

{'id': 'lag1734'}

In [23]:
chapters = []
for child in root:
    chapters.append(child)

sentances = []
for chapter in chapters:
    for paragraph in chapter:
        for sentence in paragraph:
            sentance = ''
            for child in sentence:
                sentance += f'{child.text} '
            sentances.append(sentance[:-1])
sentances

['D O M A R E R E G L E R. Någre almennelige Regler , ther en Domare skal sigh aldeles effter rätta .',
 'En Domare skal först besinna , at han en Gudz Befalningsman , och thet Embete han förer , thet hörer Gudh til , och icke honom sielffuom , och therföre hörer Domen , som han afsäger , Gudhi til , efter thet han afsagd warder i Gudz Embete på Gudz wegna , så at thet är wisserliga Gudz Dom , och icke Menniskiors .',
 'Och ty ligger Domaren ther Macht vppå , at han seer sigh wijsligen före , at han icke på Gudz wegna dömer en falskan Dom , med hwilken han dömer sig til en ewigh Fördömelse , effter thet han misbrukat Guds Dom och Befalning til Öffuerwold och Orätt , som til Rätt af Gudhi insatt är .',
 'Men ther han haffuer wilia til at döma Rätt , och ransakar grant effter sitt ytersta Förstånd om Rätten , och kan dock icke för sin Oförståndigheet finna på Rätten , och säger så en falsk Dom , tå haffuer han någor Vrsächt , at han är kommen på then falska Domen , emot sin wilia aff wåd

## Load from txt file

In [2]:
with open('wikipedia-sv.xml/wikipedia-sv.txt', 'r', encoding='utf-8') as f:
    words = f.readlines()
    sentances = [word.strip().lower() for word in words]
sentances = sentances[:len(sentances)//10]

In [3]:
len(sentances)

19390072

# Stopwords

In [21]:
# Import stopwords
with open('stopwords.txt', 'r', encoding='utf-8') as f:
    stop_words = [line.strip() for line in f.readlines()]
stop_words

['aderton',
 'adertonde',
 'adjö',
 'aldrig',
 'alla',
 'allas',
 'allt',
 'alltid',
 'alltså',
 'andra',
 'andras',
 'annan',
 'annat',
 'artonde',
 'artonn',
 'att',
 'av',
 'bakom',
 'bara',
 'behöva',
 'behövas',
 'behövde',
 'behövt',
 'beslut',
 'beslutat',
 'beslutit',
 'bland',
 'blev',
 'bli',
 'blir',
 'blivit',
 'bort',
 'borta',
 'bra',
 'bäst',
 'bättre',
 'båda',
 'bådas',
 'dag',
 'dagar',
 'dagarna',
 'dagen',
 'de',
 'del',
 'delen',
 'dem',
 'den',
 'denna',
 'deras',
 'dess',
 'dessa',
 'det',
 'detta',
 'dig',
 'din',
 'dina',
 'dit',
 'ditt',
 'dock',
 'dom',
 'du',
 'där',
 'därför',
 'då',
 'e',
 'efter',
 'eftersom',
 'ej',
 'elfte',
 'eller',
 'elva',
 'emot',
 'en',
 'enkel',
 'enkelt',
 'enkla',
 'enligt',
 'ens',
 'er',
 'era',
 'ers',
 'ert',
 'ett',
 'ettusen',
 'fanns',
 'fem',
 'femte',
 'femtio',
 'femtionde',
 'femton',
 'femtonde',
 'fick',
 'fin',
 'finnas',
 'finns',
 'fjorton',
 'fjortonde',
 'fjärde',
 'fler',
 'flera',
 'flesta',
 'fram',
 'framf

In [22]:
stopword_trie = Trie()
stopword_trie.add_multiple(*stop_words)
stop_words_regex = re.compile(r'\b' + stopword_trie.pattern() + r'\b', re.IGNORECASE)


In [4]:
#Pre-processing
for idx, sentance in enumerate(sentances):
    sentance = unicodedata.normalize('NFKC', sentance)
    # sentance = stop_words_regex.sub('', sentance)
    sentances[idx] = sentance

In [25]:
#Tokenization
tfidf = TfidfVectorizer(stop_words = stop_words)
X = tfidf.fit_transform(sentances)
print(*tfidf.get_feature_names_out())



In [26]:
feature_array = np.array(tfidf.get_feature_names_out())
tfidf_sorting = np.argsort(X.toarray().sum(axis=0))[::-1]
n = 100
top_n = feature_array[tfidf_sorting[:n]]
print(top_n)

['cap' 'thet' 'then' 'ther' 'til' 'at' 'tå' 'the' 'af' 'böte' 'daler'
 'sagdt' 'må' 'någor' 'ock' 'skal' '10' 'vare' 'them' 'hafver' '11' 'äro'
 'domaren' 'giör' 'konungens' 'åter' 'lag' 'hafve' 'sägs' 'saken' 'förr'
 'thes' 'gånge' 'gods' 'vil' '12' 'hvar' 'skada' 'tid' '13' 'bör' 'sker'
 'huru' 'rätten' 'hus' 'hofrätten' 'laga' 'annars' 'tijo' 'ware' 'äntå'
 'balken' 'ifrån' 'staden' 'sielf' 'jord' 'skadan' 'alt' 'skola' 'äger'
 'barn' 'landet' 'thertil' 'niute' 'plichte' 'hvad' 'konungen' 'stadgadt'
 '14' 'gifve' 'tage' 'hafva' 'gälde' 'finnes' 'varder' 'måge' 'theras'
 'tiugu' 'öfver' 'fä' 'thy' 'miste' 'up' 'åhr' 'hos' 'hafwer' 'emellan'
 'skiäl' 'therom' 'ske' 'arf' 'fängelse' 'befalningshafvande' 'mål' '16'
 '15' 'theraf' 'mans' 'stånde' 'lof']


In [27]:
feature_array = np.array(tfidf.get_feature_names_out())
tfidf_sorting = np.argsort(X.toarray().sum(axis=0))
n = 1000
top_n = feature_array[tfidf_sorting[:n]]
print(top_n)

['västerbotn' 'södermanland' 'skaraborgs' 'rautalambi' 'nyland' 'kalmare'
 'skåne' 'kymmenegårds' 'bohuslän' 'dahl' 'gestrikeland' 'ångermanland'
 'vermeland' 'ingifve' 'upland' 'nerike' 'kopparbergs' 'västmanland'
 'blekinge' 'halland' 'östergöthland' 'österbotn' 'herjedalen' 'jämteland'
 'gothland' 'helsingeland' 'småland' 'göta' 'tavastehus' 'lagmansdomen'
 'medelpad' 'elfsborgs' 'förändringar' 'riksdag' 'företrädare' 'brukliga'
 'adolph' 'samtelige' 'stadslagens' 'behöringen' 'sorgfällighet'
 'esomoftast' 'ändskap' 'berömmelig' 'brukelig' 'mångahanda' 'våre'
 'vittre' 'gångne' 'gustaf' 'förändrat' 'ändrade' 'hälsosamt' 'sedvana'
 'angelägit' 'förständiga' 'svårigheter' 'förbättra' '1731' '1618' 'daga'
 'öfversedde' 'emellankomna' 'förbättrade' 'önskan' 'högloflige' 'härtils'
 'ehuruväl' 'sinnande' 'svänska' 'nöigt' 'lagfarne' 'utgifvande'
 'utesluta' 'bringas' 'högtärade' 'lärda' 'moederfaders' 'grundval' 'fant'
 'förenad' 'månsons' 'stadfästades' 'efterlevande' 'spa' 'födelse'
 'p

In [5]:
from collections import Counter
counter = Counter(sentances)
bpe_words = [(' '.join(list(key)) + ' </w>', item) for key, item in counter.items()]
bpe_words[:100]

[('. d k </w>', 7),
 ('ä r </w>', 195042),
 ('" </w>', 124671),
 ('c o d e </w>', 78),
 ('t o p - l e v e l </w>', 1),
 ('d o m a i n </w>', 25),
 ('( </w>', 241543),
 ('c c t l d </w>', 1),
 (') </w>', 241488),
 ('f ö r </w>', 149614),
 ('d a n m a r k </w>', 3029),
 ('. </w>', 938761),
 ('ö v e r s e e n d e t </w>', 2),
 ('a v </w>', 275343),
 ('t o p p n i v å d o m ä n e r </w>', 1),
 ('h a n t e r a s </w>', 91),
 ('h e l t </w>', 4112),
 ('o c h </w>', 504766),
 ('h å l l e t </w>', 258),
 ('d k </w>', 18),
 ('h o s t m a s t e r </w>', 2),
 ('a l l a </w>', 11607),
 ('n y a </w>', 10708),
 ('d o m ä n n a m n </w>', 13),
 ('m å s t e </w>', 3069),
 ('a n s ö k a s </w>', 2),
 ('o m </w>', 54782),
 ('v i a </w>', 3240),
 ('e n </w>', 271444),
 ('g o d k ä n d </w>', 113),
 ('r e g i s t r a t o r </w>', 12),
 ('d ä r e f t e r </w>', 7172),
 ('k a n </w>', 25897),
 ('d e n </w>', 185239),
 ('s ö k a n d e </w>', 119),
 ('b e </w>', 1029),
 ('r e g i s t r a t o r n </w>', 1),
 (

In [4]:
counter = CountVectorizer()
Y = counter.fit_transform(sentances)
Y_sum = Y.toarray().sum(axis=0)
bpe_words = list(zip(map(lambda x: ' '.join(list(x)) + ' </w>', counter.get_feature_names_out()), Y_sum))
bpe_words

MemoryError: Unable to allocate 4.20 PiB for an array with shape (193900721, 3049430) and data type int64

In [7]:
import itertools
def get_pair_stats(vocab: 'list[tuple[str, int]]'):
    pairs: 'dict[tuple[str], int]' = defaultdict(lambda: 0)
    for word, frequency in vocab:
        symbols = word.split()

        # count occurrences of pairs
        for pair in itertools.pairwise(symbols):
            pairs[pair] += frequency

    return pairs

# pairs = get_pair_stats(bpe_words)
# pairs[:100]

In [8]:
from multiprocessing import Pool
import itertools

def mp_replace(word, regx):
    word, freq = word
    pattern, replacement = regx
    word_out = re.sub(pattern, replacement, word)
    return word_out, freq

def merge_vocab(best_pair: 'tuple[str, str]', vocab_in: 'list[tuple[str, int]]'):

    vocab_out: 'dict[str, int]' = {}

    # re.escape
    # ensures the characters of our input pair will be handled as is and
    # not get mistreated as special characters in the regular expression.
    pattern = r'( |^)' + re.escape(' '.join(best_pair)) + r'( |$)'
    replacement = r'\g<1>' + ''.join(best_pair) + r'\g<2>'
    vocab_in = zip(vocab_in, itertools.repeat((pattern, replacement)))
    # print(next(vocab_in))
    p = Pool(16)
    with p:
        vocab_out = p.starmap(mp_replace, vocab_in, chunksize=56730) #len(vocab_in)//64
    # for word_in, freq in vocab_in:
    #     # replace most frequent pair in all vocabulary
    #     word_out = re.sub(pattern, replacement, word_in)
    #     vocab_out[word_out] = freq

    return vocab_out

In [12]:
def merge_vocab(best_pair: 'tuple[str, str]', vocab_in: 'list[tuple[str, int]]'):

    vocab_out: 'dict[str, int]' = {}

    # re.escape
    # ensures the characters of our input pair will be handled as is and
    # not get mistreated as special characters in the regular expression.
    pattern = r'( |^)' + re.escape(' '.join(best_pair)) + r'( |$)'
    replacement = r'\g<1>' + ''.join(best_pair) + r'\g<2>'

    for word_in, freq in vocab_in:
        # replace most frequent pair in all vocabulary
        word_out = re.sub(pattern, replacement, word_in)
        vocab_out[word_out] = freq

    return [(word, frequency) for word, frequency in vocab_out.items()]

In [24]:
best_pair = max(pairs, key=pairs.get)
print(best_pair)

new_vocab = merge_vocab(best_pair, bpe_words)
new_vocab[:100]

('e', 'n</w')


[('. d k</w', 14),
 ('ä r</w', 1940078),
 ('"</w', 1260961),
 ('c o d e</w', 998),
 ('t o p - l e v e l</w', 7),
 ('d o m a i n</w', 252),
 ('(</w', 2465049),
 ('c c t l d</w', 3),
 (')</w', 2464100),
 ('f ö r</w', 1475139),
 ('d a n m a r k</w', 31242),
 ('.</w', 9372584),
 ('ö v e r s e e n d e t</w', 7),
 ('a v</w', 2747729),
 ('t o p p n i v å d o m ä n e r</w', 1),
 ('h a n t e r a s</w', 930),
 ('h e l t</w', 41249),
 ('o c h</w', 5036651),
 ('h å l l e t</w', 2329),
 ('d k</w', 269),
 ('h o s t m a s t e r</w', 2),
 ('a l l a</w', 115821),
 ('n y a</w', 108328),
 ('d o m ä n n a m n</w', 148),
 ('m å s t e</w', 31439),
 ('a n s ö k a s</w', 11),
 ('o m</w', 549212),
 ('v i a</w', 32608),
 ('en</w', 2726754),
 ('g o d k ä n d</w', 1247),
 ('r e g i s t r a t o r</w', 121),
 ('d ä r e f t e r</w', 70146),
 ('k a n</w', 257758),
 ('d en</w', 1852171),
 ('s ö k a n d e</w', 1326),
 ('b e</w', 10349),
 ('r e g i s t r a t o r n</w', 18),
 ('a t t</w', 1664158),
 ('t a</w', 42934),
 (

In [16]:
import timeit

print(timeit.timeit('c[1]', setup = 'from collections import defaultdict; c = defaultdict(lambda: 0); c[1]=1'))
timeit.timeit('c.get(1, 0)', setup = 'c = {1: 1}')

0.023796564999429393


0.0362754669986316

In [18]:
vocab = bpe_words
for pair in tqdm.tqdm(bpe_codes):
    # print(pair)
    vocab = merge_vocab(pair, vocab)

  1%|          | 27/2588 [03:46<5:41:48,  8.01s/it]

In [9]:
import pickle


bpe_codes = OrderedDict()
num_merges = 10_000  # hyperparameter
vocab = bpe_words
for i in tqdm.tqdm(range(num_merges)):
    # print('\niteration', i)
    pair_stats = get_pair_stats(vocab)
    if not pair_stats:
        break

    best_pair = max(pair_stats, key=pair_stats.get)
    bpe_codes[best_pair] = i

    with open('bpe.pckl', 'wb') as f:
        pickle.dump(bpe_codes, f)
    # print('vocabulary: ', vocab)
    # print('best pair:', best_pair)
    vocab = merge_vocab(best_pair, vocab)

# print('\nfinal vocabulary: ', vocab)
# print('\nbyte pair encoding: ', bpe_codes)


100%|██████████| 10000/10000 [14:12:55<00:00,  5.12s/it] 


In [13]:
# import pickle
# with open('bpe.pckl', 'rb') as fd:
#     bpe_codes = pickle.load(fd)
# len(bpe_codes)

2588

In [37]:
string.printable

'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c'

In [54]:
single_char_vocab = list(set((string.printable[:-5]+ 'åäö').lower())) + ['</w>', '[END]']
tokenizer_dict = defaultdict(lambda: 1)
tokenizer_dict['<pad>'] = 0
for idx, char in enumerate(single_char_vocab):
    tokenizer_dict[char] = idx+2
tokenizer_dict.update({''.join(word): idx+len(tokenizer_dict)+2 for word, idx in bpe_codes.items()})
# tokenizer_list
with open('tokens.pckl', 'wb') as f:
    pickle.dump(dict(tokenizer_dict), f)
tokenizer_dict


defaultdict(<function __main__.<lambda>()>,
            {'<pad>': 0,
             '$': 2,
             ')': 3,
             '!': 4,
             'k': 5,
             ']': 6,
             '^': 7,
             ' ': 8,
             '8': 9,
             'c': 10,
             'p': 11,
             'l': 12,
             '.': 13,
             '/': 14,
             '"': 15,
             'r': 16,
             'v': 17,
             'å': 18,
             '_': 19,
             'z': 20,
             'x': 21,
             '@': 22,
             '\\': 23,
             "'": 24,
             't': 25,
             '#': 26,
             '~': 27,
             '*': 28,
             '1': 29,
             '<': 30,
             'y': 31,
             '=': 32,
             '&': 33,
             'b': 34,
             'n': 35,
             'f': 36,
             'q': 37,
             'e': 38,
             'a': 39,
             ':': 40,
             '(': 41,
             'u': 42,
             '+': 43,
             '

In [47]:
print(*tokenizer_dict.items(), sep='\n')

('<pad>', 0)
('$', 2)
(')', 3)
('!', 4)
('k', 5)
(']', 6)
('^', 7)
(' ', 8)
('8', 9)
('c', 10)
('p', 11)
('l', 12)
('.', 13)
('/', 14)
('"', 15)
('r', 16)
('v', 17)
('å', 18)
('_', 19)
('z', 20)
('x', 21)
('@', 22)
('\\', 23)
("'", 24)
('t', 25)
('#', 26)
('~', 27)
('*', 28)
('1', 29)
('<', 30)
('y', 31)
('=', 32)
('&', 33)
('b', 34)
('n', 35)
('f', 36)
('q', 37)
('e', 38)
('a', 39)
(':', 40)
('(', 41)
('u', 42)
('+', 43)
('>', 44)
('6', 45)
('s', 46)
('?', 47)
('5', 48)
('4', 49)
('-', 50)
('}', 51)
('ä', 52)
('[', 53)
('m', 54)
('9', 55)
('%', 56)
('d', 57)
('{', 58)
('o', 59)
('w', 60)
('`', 61)
('ö', 62)
(';', 63)
('3', 64)
(',', 65)
('j', 66)
('2', 67)
('i', 68)
('0', 69)
('h', 70)
('g', 71)
('7', 72)
('|', 73)
('</w>', 74)
('[END]', 75)
('n</w>', 77)
('r</w>', 78)
('de', 79)
('t</w>', 80)
('a</w>', 81)
('s</w>', 82)
('en</w>', 83)
('an', 84)
('.</w>', 85)
('in', 86)
('st', 87)
('er', 88)
('ar', 89)
('e</w>', 90)
('i</w>', 91)
(',</w>', 92)
('d</w>', 93)
('ll', 94)
('om', 95)
('ch

In [89]:
with open('tokens_fixed.pckl', 'wb') as f:
    pickle.dump(dict(tokenizer_dict), f)

### Regex challenges
1. Naive implementation - Slow (45 minutes entire 1734 law 1000 tokens)
2. Trie - No priority
3. \b Word boundaries - '>' counts as a word boundary
4. Tensorflow - Final solution (2 minutes entire 1734 law 5000 tokens)

In [50]:
list(tokenizer_dict.items())[:200]

[('<pad>', -2),
 ('$', 0),
 (')', 1),
 ('!', 2),
 ('k', 3),
 (']', 4),
 ('^', 5),
 (' ', 6),
 ('8', 7),
 ('c', 8),
 ('p', 9),
 ('l', 10),
 ('.', 11),
 ('/', 12),
 ('"', 13),
 ('r', 14),
 ('v', 15),
 ('å', 16),
 ('_', 17),
 ('z', 18),
 ('x', 19),
 ('@', 20),
 ('\\', 21),
 ("'", 22),
 ('t', 23),
 ('#', 24),
 ('~', 25),
 ('*', 26),
 ('1', 27),
 ('<', 28),
 ('y', 29),
 ('=', 30),
 ('&', 31),
 ('b', 32),
 ('n', 33),
 ('f', 34),
 ('q', 35),
 ('e', 36),
 ('a', 37),
 (':', 38),
 ('(', 39),
 ('u', 40),
 ('+', 41),
 ('>', 42),
 ('6', 43),
 ('s', 44),
 ('?', 45),
 ('5', 46),
 ('4', 47),
 ('-', 48),
 ('}', 49),
 ('ä', 50),
 ('[', 51),
 ('m', 52),
 ('9', 53),
 ('%', 54),
 ('d', 55),
 ('{', 56),
 ('o', 57),
 ('w', 58),
 ('`', 59),
 ('ö', 60),
 (';', 61),
 ('3', 62),
 (',', 63),
 ('j', 64),
 ('2', 65),
 ('i', 66),
 ('0', 67),
 ('h', 68),
 ('g', 69),
 ('7', 70),
 ('|', 71),
 ('</w>', 72),
 ('[END]', 73),
 ('n</w>', 75),
 ('r</w>', 76),
 ('de', 77),
 ('t</w>', 78),
 ('a</w>', 79),
 ('s</w>', 80),
 ('en

In [55]:
def tokenize(corpus: list[str], bpe: 'OrderedDict[tuple[str, str], int]'):
    corpus = [sentance.lower() for sentance in corpus]
    words = []
    for sentance in corpus:
        words += [' '.join(list(word)) + ' </w>' for word in sentance.split()]
        if sentance == '.':
            words += ['[END]']
    del corpus[:]
    del corpus
    str_tensor = tf.constant(words)
    print(len(words))
    for pair in tqdm.tqdm(bpe):
        pattern = r'( |^)' + re.escape(' '.join(pair)) + r'( |$)'
        replacement = r'\1' + ''.join(pair) + r'\2'
        str_tensor = tf.strings.regex_replace(str_tensor, pattern, replacement)
    tokens = []
    token_sentance = []
    for idx, word in enumerate(str_tensor):
        word = word.numpy().decode('utf-8')
        bpe_tokens = word.split()
        tokenization = [tokenizer_dict[token] for token in bpe_tokens]
        token_sentance += tokenization
        if word == '[END]':
            tokens.append(token_sentance)
            token_sentance = []
    return tokens

tokenized_corpus = tokenize(sentances, bpe_codes)
with open('tokenized_corpus_fixed.pckl', 'wb') as f:
    pickle.dump(tokenized_corpus, f)
# print(*tokenized_corpus, sep='\n')

19579155


100%|██████████| 9998/9998 [6:41:40<00:00,  2.41s/it]  


In [2]:
import pickle
with open('tokenized_corpus_fixed.pckl', 'rb') as f:
    tokenized_corpus = pickle.load(f)
with open('tokens_fixed.pckl', 'rb') as f:
    tokenizer_dict = pickle.load(f)

# Embedding

In [3]:
inverse_vocab = {index: token for token, index in tokenizer_dict.items()}

In [8]:
print(*tokenizer_dict.items(), sep='\n')

('<pad>', 0)
('$', 2)
(')', 3)
('!', 4)
('k', 5)
(']', 6)
('^', 7)
(' ', 8)
('8', 9)
('c', 10)
('p', 11)
('l', 12)
('.', 13)
('/', 14)
('"', 15)
('r', 16)
('v', 17)
('å', 18)
('_', 19)
('z', 20)
('x', 21)
('@', 22)
('\\', 23)
("'", 24)
('t', 25)
('#', 26)
('~', 27)
('*', 28)
('1', 29)
('<', 30)
('y', 31)
('=', 32)
('&', 33)
('b', 34)
('n', 35)
('f', 36)
('q', 37)
('e', 38)
('a', 39)
(':', 40)
('(', 41)
('u', 42)
('+', 43)
('>', 44)
('6', 45)
('s', 46)
('?', 47)
('5', 48)
('4', 49)
('-', 50)
('}', 51)
('ä', 52)
('[', 53)
('m', 54)
('9', 55)
('%', 56)
('d', 57)
('{', 58)
('o', 59)
('w', 60)
('`', 61)
('ö', 62)
(';', 63)
('3', 64)
(',', 65)
('j', 66)
('2', 67)
('i', 68)
('0', 69)
('h', 70)
('g', 71)
('7', 72)
('|', 73)
('</w>', 74)
('[END]', 75)
('n</w>', 76)
('r</w>', 77)
('de', 78)
('t</w>', 79)
('a</w>', 80)
('s</w>', 81)
('en</w>', 82)
('an', 83)
('.</w>', 84)
('in', 85)
('st', 86)
('er', 87)
('ar', 88)
('e</w>', 89)
('i</w>', 90)
(',</w>', 91)
('d</w>', 92)
('ll', 93)
('om', 94)
('ch

In [4]:
window_size = 1
vocab_size = len([token for token in tokenizer_dict.values() if token != 1]) +1
positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
      tokenized_corpus[0],
      vocabulary_size=vocab_size,
      window_size=window_size,
      negative_samples=0)
positive_skip_grams

[[2353, 84],
 [172, 133],
 [133, 10],
 [169, 57],
 [10, 133],
 [13, 57],
 [25, 10],
 [84, 2353],
 [157, 2353],
 [292, 103],
 [6079, 652],
 [124, 169],
 [57, 13],
 [10, 25],
 [652, 6079],
 [652, 290],
 [290, 652],
 [2353, 157],
 [169, 57],
 [409, 103],
 [3275, 292],
 [10, 10],
 [10, 10],
 [2154, 25],
 [409, 172],
 [157, 132],
 [132, 2154],
 [75, 84],
 [2154, 132],
 [133, 172],
 [57, 13],
 [1416, 290],
 [1416, 3275],
 [124, 172],
 [57, 169],
 [103, 409],
 [172, 6079],
 [169, 13],
 [169, 124],
 [172, 124],
 [25, 2154],
 [3275, 1416],
 [132, 157],
 [6079, 172],
 [172, 409],
 [57, 169],
 [84, 75],
 [292, 3275],
 [13, 57],
 [13, 169],
 [290, 1416],
 [103, 292]]

In [13]:
print([inverse_vocab[t] for t in tokenized_corpus[0]])

['.', 'd', 'k</w>', '.', 'd', 'k</w>', 'är</w>', '"</w>', 'co', 'de</w>', 'to', 'p-', 'lev', 'el</w>', 'dom', 'ain</w>', '"</w>', '(</w>', 'c', 'c', 't', 'ld</w>', ')</w>', 'för</w>', 'danmark</w>', '.</w>', '[END]']


In [16]:
for target, context in positive_skip_grams[:5]:
  print(f"({target}, {context}): ({inverse_vocab[target]}, {inverse_vocab[context]})")

(172, 133): ("</w>, (</w>)
(84, 75): (.</w>, [END])
(103, 292): (de</w>, to)
(133, 172): ((</w>, "</w>)
(1416, 290): (lev, el</w>)


In [5]:
# Get target and context words for one positive skip-gram.
target_word, context_word = positive_skip_grams[0]

# Set the number of negative samples per positive context.
num_ns = 4

context_class = tf.reshape(tf.constant(context_word, dtype="int64"), (1, 1))
negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
    true_classes=context_class,  # class that should be sampled as 'positive'
    num_true=1,  # each positive skip-gram has 1 positive context class
    num_sampled=num_ns,  # number of negative context words to sample
    unique=True,  # all the negative samples should be unique
    range_max=vocab_size,  # pick index of the samples from [0, vocab_size]
    # seed=SEED,  # seed for reproducibility
    name="negative_sampling"  # name of this operation
)
print(negative_sampling_candidates)
print([inverse_vocab[index.numpy()] for index in negative_sampling_candidates])

tf.Tensor([4964    1 2430    0], shape=(4,), dtype=int64)
['nie</w>', 'ѡ', 'allt', '<pad>']


2023-05-24 16:25:59.363581: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1956] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [6]:
# Reduce a dimension so you can use concatenation (in the next step).
squeezed_context_class = tf.squeeze(context_class, 1)

# Concatenate a positive context word with negative sampled words.
context = tf.concat([squeezed_context_class, negative_sampling_candidates], 0)

# Label the first context word as `1` (positive) followed by `num_ns` `0`s (negative).
label = tf.constant([1] + [0]*num_ns, dtype="int64")
target = target_word

In [19]:
print(f"target_index    : {target}")
print(f"target_word     : {inverse_vocab[target_word]}")
print(f"context_indices : {context}")
print(f"context_words   : {[inverse_vocab[c.numpy()] for c in context]}")
print(f"label           : {label}")

target_index    : 172
target_word     : "</w>
context_indices : [ 133    6   96 4028 2600]
context_words   : ['(</w>', ']', 'er</w>', 'spanska</w>', 'spela</w>']
label           : [1 0 0 0 0]


In [20]:
print("target  :", target)
print("context :", context)
print("label   :", label)

target  : 172
context : tf.Tensor([ 133    6   96 4028 2600], shape=(5,), dtype=int64)
label   : tf.Tensor([1 0 0 0 0], shape=(5,), dtype=int64)


In [6]:
# Generates skip-gram pairs with negative sampling for a list of sequences
# (int-encoded sentences) based on window size, number of negative samples
# and vocabulary size.
from multiprocessing import Pool
import itertools

def generate_skipgram(sequence, params):
  targets, contexts = [], []
  window_size, num_ns, vocab_size, sampling_table = params
  positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
        sequence,
        vocabulary_size=vocab_size,
        sampling_table=sampling_table,
        window_size=window_size,
        negative_samples=0)

  # Iterate over each positive skip-gram pair to produce training examples
  # with a positive context word and negative samples.
  for target_word, context_word in positive_skip_grams:
    context_class = tf.expand_dims(
        tf.constant([context_word], dtype="int64"), 1)
    negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
        true_classes=context_class,
        num_true=1,
        num_sampled=num_ns,
        unique=True,
        range_max=vocab_size,
        seed=42,
        name="negative_sampling")
    # Build context and label vectors (for one target word)
    context = tf.concat([tf.squeeze(context_class,1), negative_sampling_candidates], 0)

    targets.append(target_word)
    contexts.append(tf.cast(context, 'uint16'))
  return targets, contexts

def generate_training_data(sequences, window_size, num_ns, vocab_size, seed):
  # Elements of each training example are appended to these lists.
  targets, contexts, labels = [], [], []

  # Build the sampling table for `vocab_size` tokens.
  sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)

  # Iterate over all sequences (sentences) in the dataset.
  pool = Pool(16)
  
  params = zip(sequences, itertools.repeat((window_size, num_ns, vocab_size, sampling_table)))
  res = pool.starmap(generate_skipgram, tqdm.tqdm(params, total=len(sequences)), chunksize=len(sequences)//64)
  # for sequence in tqdm.tqdm(sequences):

  #   # Generate positive skip-gram pairs for a sequence (sentence).
  #   positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
  #         sequence,
  #         vocabulary_size=vocab_size,
  #         sampling_table=sampling_table,
  #         window_size=window_size,
  #         negative_samples=0)

  #   # Iterate over each positive skip-gram pair to produce training examples
  #   # with a positive context word and negative samples.
  #   for target_word, context_word in positive_skip_grams:
  #     context_class = tf.expand_dims(
  #         tf.constant([context_word], dtype="int64"), 1)
  #     negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
  #         true_classes=context_class,
  #         num_true=1,
  #         num_sampled=num_ns,
  #         unique=True,
  #         range_max=vocab_size,
  #         seed=seed,
  #         name="negative_sampling")

  #     # Build context and label vectors (for one target word)
  #     context = tf.concat([tf.squeeze(context_class,1), negative_sampling_candidates], 0)
  #     label = tf.constant([1] + [0]*num_ns, dtype="int64")

  #     # Append each element from the training example to global lists.
  #     targets.append(target_word)
  #     contexts.append(context)
  #     labels.append(label)

  return res # targets, contexts, labels

In [7]:
res = generate_training_data(tokenized_corpus, window_size=1, num_ns=4, vocab_size=vocab_size, seed=42)

100%|██████████| 938761/938761 [06:37<00:00, 2362.76it/s]  


In [29]:
with open('labels.pckl', 'wb') as fd:
    pickle.dump(labels, fd)

In [28]:
labels = np.zeros(contexts.shape, dtype='uint8')
labels[:,0] = 1
labels

array([[1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       ...,
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0]], dtype=uint8)

In [8]:
res[0]

([103, 103, 290, 172, 172, 290, 3275, 3275],
 [<tf.Tensor: shape=(5,), dtype=uint16, numpy=array([ 409,  297,   27, 1709,   88], dtype=uint16)>,
  <tf.Tensor: shape=(5,), dtype=uint16, numpy=array([292,   1,  29, 136,  16], dtype=uint16)>,
  <tf.Tensor: shape=(5,), dtype=uint16, numpy=array([1416,  516, 3637,    4,   12], dtype=uint16)>,
  <tf.Tensor: shape=(5,), dtype=uint16, numpy=array([6079,   66, 9049, 2188,    0], dtype=uint16)>,
  <tf.Tensor: shape=(5,), dtype=uint16, numpy=array([ 133,    3,   11, 6248,    2], dtype=uint16)>,
  <tf.Tensor: shape=(5,), dtype=uint16, numpy=array([ 652,   99, 2041,   78,    1], dtype=uint16)>,
  <tf.Tensor: shape=(5,), dtype=uint16, numpy=array([1416, 3621,  561,  841,  261], dtype=uint16)>,
  <tf.Tensor: shape=(5,), dtype=uint16, numpy=array([292, 105,   1, 525, 114], dtype=uint16)>])

In [10]:
targets = []
for result in tqdm.tqdm(res):
    targets += result[0]
print(len(targets))
targets = np.fromiter(tqdm.tqdm(targets), dtype='uint16')
targets

100%|██████████| 938761/938761 [00:00<00:00, 2094875.84it/s]


15810050


100%|██████████| 15810050/15810050 [00:02<00:00, 7822925.67it/s]


array([ 103,  103,  290, ..., 6012, 3252,  493], dtype=uint16)

In [11]:
contexts.shape

(15810050, 5)

In [21]:
labels

array([[1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       ...,
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0]], dtype=uint8)

In [85]:
targets.shape

(31247556,)

In [2]:
import pickle
with open('tokens_fixed.pckl', 'rb') as f:
    tokenizer_dict = pickle.load(f)
with open('targets.pckl', 'rb') as fd:
    targets = pickle.load(fd)
with open('contexts.pckl', 'rb') as fd:
    contexts = pickle.load(fd)
with open('labels.pckl', 'rb') as fd:
    labels = pickle.load(fd)
vocab_size = len([token for token in tokenizer_dict.values() if token != 1]) +1
num_ns = 4

In [34]:

contexts

array([[2154, 2154, 2154, 2154, 2154],
       [ 133,  133,  133,  133,  133],
       [ 172,  172,  172,  172,  172],
       ...,
       [ 464,  464,  464,  464,  464],
       [ 392,  392,  392,  392,  392],
       [4412, 4412, 4412, 4412, 4412]], dtype=uint16)

In [19]:
BATCH_SIZE = 8192
BUFFER_SIZE = 5_000_000
dataset = tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
dataset = dataset.cache().prefetch(buffer_size=tf.data.AUTOTUNE)
print(dataset)

<_PrefetchDataset element_spec=((TensorSpec(shape=(8192,), dtype=tf.uint16, name=None), TensorSpec(shape=(8192, 5), dtype=tf.uint16, name=None)), TensorSpec(shape=(8192, 5), dtype=tf.float64, name=None))>


In [31]:
list(dataset)

2023-05-24 16:04:40.341044: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuffle buffer (this may take a while): 7570067 of 50000000
2023-05-24 16:04:50.341041: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuffle buffer (this may take a while): 15185093 of 50000000
2023-05-24 16:05:00.341040: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuffle buffer (this may take a while): 22925706 of 50000000
2023-05-24 16:05:10.341041: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuffle buffer (this may take a while): 30671294 of 50000000
2023-05-24 16:05:11.085612: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:417] Shuffle buffer filled.


KeyboardInterrupt: 

In [17]:
class EmbeddingModel(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim):
    super(EmbeddingModel, self).__init__()
    self.target_embedding = Embedding(vocab_size,
                                      embedding_dim,
                                      input_length=1,
                                      name="w2v_embedding")
    self.context_embedding = Embedding(vocab_size,
                                       embedding_dim,
                                       input_length=num_ns+1)

  def call(self, pair):
    # target: (batch, dummy?)  # The dummy axis doesn't exist in TF2.7+
    target, context = pair
    # context: (batch, context)
    if len(target.shape) == 2:
      target = tf.squeeze(target, axis=1)
    # target: (batch,)
    word_emb = self.target_embedding(target)
    # word_emb: (batch, embed)
    context_emb = self.context_embedding(context)
    # context_emb: (batch, context, embed)
    dots = tf.einsum('be,bce->bc', word_emb, context_emb)
    # dots: (batch, context)
    return dots

In [20]:
embedding_dim = 500

model = EmbeddingModel(vocab_size, embedding_dim)

early_stopping_callback = tf.keras.callbacks.EarlyStopping(monitor='accuracy', min_delta=0.005, patience=2)

model.compile(optimizer='adam', loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True), metrics=['accuracy'])
model.fit(dataset, epochs=100, callbacks=[early_stopping_callback])

Epoch 1/100


2023-05-24 16:38:24.879402: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_1' with dtype uint16 and shape [15810050,5]
	 [[{{node Placeholder/_1}}]]
2023-05-24 16:38:24.879680: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_1' with dtype uint16 and shape [15810050,5]
	 [[{{node Placeholder/_1}}]]


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100


<keras.callbacks.History at 0x7fa2e8386ec0>

In [72]:
tf.keras.backend.clear_session()

In [23]:
model.summary()

Model: "embedding_model_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 w2v_embedding (Embedding)   multiple                  2014800   
                                                                 
 embedding_6 (Embedding)     multiple                  2014800   
                                                                 
Total params: 4,029,600
Trainable params: 4,029,600
Non-trainable params: 0
_________________________________________________________________


In [60]:
%tensorboard --logdir logs

In [21]:
weights = model.get_layer('w2v_embedding').get_weights()[0]
# vocab = list(tokenizer_dict.keys())

In [22]:
import io
out_v = io.open('vectors.tsv', 'w', encoding='utf-8')
out_m = io.open('metadata.tsv', 'w', encoding='utf-8')

out_v.write('\t'.join([str(0) for _ in range(embedding_dim)]) + '\n')
out_m.write('<pad>\t0\n')
out_v.write('\t'.join([str(0) for _ in range(embedding_dim)]) + '\n')
out_m.write('<oov>\t1\n')

for index, (word, token) in enumerate([(word, token) for word, token in tokenizer_dict.items() if token >= 2], start=2):
  vec = weights[index]
  # out_v.write(word + ',')
  out_v.write('\t'.join([str(x) for x in vec]) + "\n")
  out_m.write(f'{word}\t{token+2}\n')
out_v.close()
out_m.close()

In [27]:
with open('bpe_fixed.pckl', 'rb') as fd:
    bpe = pickle.load(fd)
print(*bpe.items(), sep='\n')

(('n', '</w>'), 0)
(('r', '</w>'), 1)
(('d', 'e'), 2)
(('t', '</w>'), 3)
(('a', '</w>'), 4)
(('s', '</w>'), 5)
(('e', 'n</w>'), 6)
(('a', 'n'), 7)
(('.', '</w>'), 8)
(('i', 'n'), 9)
(('s', 't'), 10)
(('e', 'r'), 11)
(('a', 'r'), 12)
(('e', '</w>'), 13)
(('i', '</w>'), 14)
((',', '</w>'), 15)
(('d', '</w>'), 16)
(('l', 'l'), 17)
(('o', 'm'), 18)
(('c', 'h'), 19)
(('e', 'r</w>'), 20)
(('t', 'i'), 21)
(('s', 'k'), 22)
(('e', 'n'), 23)
(('ch', '</w>'), 24)
(('o', 'r'), 25)
(('o', 'ch</w>'), 26)
(('de', '</w>'), 27)
(('f', 'ö'), 28)
(('g', '</w>'), 29)
(('a', 't'), 30)
(('a', 'r</w>'), 31)
(('a', 'n</w>'), 32)
(('a', 'l'), 33)
(('om', '</w>'), 34)
(('a', 'v'), 35)
(('o', 'n'), 36)
(('a', 'm'), 37)
(('l', 'i'), 38)
(('r', 'e'), 39)
(('u', 'n'), 40)
(('e', 't</w>'), 41)
(('r', 'i'), 42)
(('e', 't'), 43)
(('1', '9'), 44)
(('å', '</w>'), 45)
(('m', 'e'), 46)
(('av', '</w>'), 47)
(('ä', 'r</w>'), 48)
(('e', 'l'), 49)
(('s', 'i'), 50)
(('de', 'n</w>'), 51)
(('c', 'k'), 52)
(('v', 'i'), 53)
(('s',

# TODO
* <s>Load stopwords from file</s>
* <s>Remove stopwords from corpus and vocab</s>
* <s>Use entire law corpus</s>
* <s>Improve BPE performance</s>
* Use Wikipedia corpus
* Create embedder from weights