In [1]:
import os
import app
import Models.pytorch_joy_and_anger.joy_and_anger_utils as utils

In [2]:
from torchtext.data.utils import get_tokenizer

Establish the PyTorch (torchtext) factory methods that we are trying to reverse-engineer from Python code into Java code:

In [3]:
# Reference tokenizer obtained from torchtext's factory; the cells below
# re-create it by hand and compare against this one.
pt_tokenizer = get_tokenizer("basic_english")

In [4]:
import re

# Ordered (regex, replacement) rules. Each pair is compiled below and the
# normalizer applies the pairs one after another in this order.
_normalization_rules = [
    (r'\'', ' \'  '),
    (r'\"', ''),
    (r'\.', ' . '),
    (r'<br \/>', ' '),
    (r',', ' , '),
    (r'\(', ' ( '),
    (r'\)', ' ) '),
    (r'\!', ' ! '),
    (r'\?', ' ? '),
    (r'\;', ' '),
    (r'\:', ' '),
    (r'\s+', ' '),
]

# Parallel views of the table, kept under their original names.
_patterns = [regex for regex, _ in _normalization_rules]
_replacements = [substitute for _, substitute in _normalization_rules]

# Despite the name, this is a list of (compiled_pattern, replacement) tuples.
_patterns_dict = [
    (re.compile(regex), substitute)
    for regex, substitute in zip(_patterns, _replacements)
]

In [5]:
# Display the compiled (pattern, replacement) pairs to check they were zipped in order.
_patterns_dict

[(re.compile(r"\'", re.UNICODE), " '  "),
 (re.compile(r'\"', re.UNICODE), ''),
 (re.compile(r'\.', re.UNICODE), ' . '),
 (re.compile(r'<br \/>', re.UNICODE), ' '),
 (re.compile(r',', re.UNICODE), ' , '),
 (re.compile(r'\(', re.UNICODE), ' ( '),
 (re.compile(r'\)', re.UNICODE), ' ) '),
 (re.compile(r'\!', re.UNICODE), ' ! '),
 (re.compile(r'\?', re.UNICODE), ' ? '),
 (re.compile(r'\;', re.UNICODE), ' '),
 (re.compile(r'\:', re.UNICODE), ' '),
 (re.compile(r'\s+', re.UNICODE), ' ')]

In [6]:
def _basic_english_normalize(line):
    r"""
    Lowercase a line of text, apply the substitution rules held in
    ``_patterns_dict`` in order, and split the result into tokens.

    The rules perform this basic normalization for English text:
        pad '\'' with spaces on both sides
        drop '\"'
        pad '.' with spaces on both sides
        turn '<br \/>' into a single space
        pad ',' with spaces on both sides
        pad '(' with spaces on both sides
        pad ')' with spaces on both sides
        pad '!' with spaces on both sides
        pad '?' with spaces on both sides
        turn ';' into a single space
        turn ':' into a single space
        collapse runs of whitespace into a single space

    :param line: the text to normalize (must provide ``.lower()``).
    :return: list of tokens obtained by splitting the normalized text on whitespace.
    """
    text = line.lower()
    for regex, substitute in _patterns_dict:
        text = regex.sub(substitute, text)
    tokens = text.split()
    return tokens


In [7]:
def tokenizer(line):
    """
    Tokenize ``line`` with the hand-written basic-English normalizer.

    Defined with ``def`` rather than as a lambda bound to a name, so the
    callable carries a docstring and a real name in tracebacks.

    :param line: text to tokenize.
    :return: whatever ``_basic_english_normalize(line)`` returns (a list of tokens).
    """
    return _basic_english_normalize(line)

In [8]:
# Smoke check: the comma should come out as its own token.
tokenizer("hello, world")

['hello', ',', 'world']

In [9]:
# Load train.txt through the project's HappyClassifierDataset helper.
# NOTE(review): the meaning of probabilistic=True is defined in
# joy_and_anger_utils, which is not visible in this notebook.
train_ds = utils.HappyClassifierDataset("train.txt", probabilistic=True)

loaded 7520 items
{'joy': 0.0, 'anger': 1.0}
('im grabbing a minute to post i feel greedy wrong', 1)


In [10]:
#Create a test method in similar syntax to java
def test_tokenizer(pt_pipeline, deciphered_pipeline, ds):
    accuracy = 0
    total_count = 0
    for i, (text, label) in enumerate(ds.train_data):
        pt_text = pt_pipeline(text)
        new_text = deciphered_pipeline(text)

        # assert pt_text array equals new_text array
        accuracy += (pt_text == new_text)
        #print(pt_text, new_text)
        total_count += 1
        if i == 0:
            print(new_text)

        if (i + 1) % (len(ds.train_data) // 5) == 0:
            print(f"Iteration {i} | Accuracy: {accuracy / total_count} %.")
        if accuracy != total_count:
            print(pt_text, new_text)
            break
    try:
        assert(accuracy == total_count)
    except AssertionError:
        print("Not the same")

In [11]:
# Compare torchtext's tokenizer with the hand-written one over train_ds.train_data.
test_tokenizer(pt_tokenizer, tokenizer, train_ds)

['im', 'grabbing', 'a', 'minute', 'to', 'post', 'i', 'feel', 'greedy', 'wrong']
Iteration 1503 | Accuracy: 1.0 %.
Iteration 3007 | Accuracy: 1.0 %.
Iteration 4511 | Accuracy: 1.0 %.
Iteration 6015 | Accuracy: 1.0 %.
Iteration 7519 | Accuracy: 1.0 %.


In [12]:
# Sanity check of the `==` used by test_tokenizer: Python lists compare equal
# only when both length and every element match.
print(['a', 'b', 'c'] == ['a', 'b'])
print(['a', 'b', 'c'] == ['a', 'b', 'c'])
print(['ab'] == ['a', 'b'])

False
True
False


The tokenizer looks good: the re-implementation matches torchtext's output on every training example.

Now create the vocab...

In [13]:
from torchtext.vocab import build_vocab_from_iterator

In [14]:
# Reference vocabulary: torchtext's builder fed with the tokenized training texts.
vocab = build_vocab_from_iterator(
    [tokenizer(txt) for txt, _label in train_ds.train_data],
    specials=["<unk>"],
)

In [15]:
# Use the index of "<unk>" as the default index, so tokens missing from the
# vocabulary resolve to "<unk>" (see the 0 returned for "we're" in the next cell).
vocab.set_default_index(vocab["<unk>"])

In [16]:
# Look up a few raw tokens; a token the vocab does not contain comes back as
# the default index set above.
vocab(['great', 'day', "we're", 'having'])

[353, 96, 0, 171]

In [99]:
def vocab_pipeline(sentence):
    """
    Convert a raw sentence into vocabulary indices with the torchtext vocab.

    Defined with ``def`` rather than as a lambda bound to a name, so the
    callable carries a docstring and a real name in tracebacks.

    :param sentence: raw text; it is tokenized with ``tokenizer`` first.
    :return: the result of calling ``vocab`` on the token list.
    """
    return vocab(tokenizer(sentence))

In [102]:
# Reference output of the torchtext pipeline; the custom pipelines below are
# run on the same sentence for comparison.
print(vocab_pipeline("Hello, world!"))

[3825, 0, 191, 0]


In [103]:
import collections

In [59]:
# example input: normalized_sentence = ['im', 'grabbing', 'a', 'minute', 'to', 'post', 'i', 'feel', 'greedy', 'wrong'],
# specials = ["<unk>"]
def create_vocab(word_frequencies: dict[str, int], specials, min_freq=1, special_first=True):
    '''
    Build a token -> index mapping from an already ordered frequency map.

    Regular tokens keep the iteration order of ``word_frequencies``, so the
    caller decides the ranking by ordering that mapping. Special tokens are
    taken out of the regular tokens and placed as one group at the front or
    at the end. Words below ``min_freq`` are filtered out.

    :param word_frequencies: map of {word: freq}, iterated in the desired order.
    :param specials: special tokens such as ['<unk>']; repeated entries are
        ignored so that the indices stay contiguous.
    :param min_freq: minimum frequency a word needs to enter the vocabulary.
    :param special_first: truthy to put the specials at indices 0..k-1,
        falsy to append them after all regular tokens.
    :return: dict of { token: index }
    '''
    # Drop repeated specials but keep their order; a duplicate would
    # otherwise overwrite its own entry and leave an unused index.
    special_tokens = list(dict.fromkeys(specials))
    reserved = set(special_tokens)

    regular_tokens = [
        word
        for word, freq in word_frequencies.items()
        if freq >= min_freq and word not in reserved
    ]

    # Truthiness test instead of `is False`: a falsy non-bool such as 0 or
    # None must still get the specials appended rather than dropped.
    if special_first:
        ordered_tokens = special_tokens + regular_tokens
    else:
        ordered_tokens = regular_tokens + special_tokens

    return {token: index for index, token in enumerate(ordered_tokens)}



def build_vocab_from_iterator_custom(normalized_sentences_list: list[list[str]], specials: list[str] = ["<unk>"],
                                     min_freq: int = 1, special_first: bool = True):
    '''
    Build a {token: index} vocabulary from tokenized sentences.

    Words are counted over all sentences, ordered by descending frequency with
    ties broken lexicographically, and handed to ``create_vocab``, which
    places the specials and assigns the indices.

    :param normalized_sentences_list: List of sentences that have been tokenized. For example, [['this', ',', 'sentence'], ['hello', ',', 'world']]
    :param specials: list of specials (the default list is only read, never mutated here)
    :param min_freq: forwarded to ``create_vocab``; minimum count a word needs to be kept
    :param special_first: forwarded to ``create_vocab``; whether the specials lead the vocabulary
    :return: map of { token : index }
    '''
    word_frequencies = {}
    for sentence in normalized_sentences_list:
        for word in sentence:
            word_frequencies[word] = word_frequencies.get(word, 0) + 1

    # dict preserves insertion order, so the sorted ranking carries over into
    # create_vocab: highest count first, then alphabetical.
    ranked_frequencies = dict(sorted(word_frequencies.items(), key=lambda item: (-item[1], item[0])))

    return create_vocab(ranked_frequencies, specials, min_freq=min_freq, special_first=special_first)








In [61]:
# Run the detailed re-implementation on the tokenized training texts and display the mapping.
build_vocab_from_iterator_custom(
    [tokenizer(txt) for txt, _label in train_ds.train_data],
    specials=["<unk>"],
)

{'<unk>': 0,
 'i': 1,
 'feel': 2,
 'and': 3,
 'to': 4,
 'the': 5,
 'a': 6,
 'that': 7,
 'of': 8,
 'feeling': 9,
 'my': 10,
 'in': 11,
 'it': 12,
 'like': 13,
 'im': 14,
 'so': 15,
 'is': 16,
 'have': 17,
 'for': 18,
 'me': 19,
 'with': 20,
 'this': 21,
 'but': 22,
 'am': 23,
 'was': 24,
 'be': 25,
 'not': 26,
 'as': 27,
 'about': 28,
 'on': 29,
 'you': 30,
 'at': 31,
 'more': 32,
 'just': 33,
 'when': 34,
 'or': 35,
 'all': 36,
 'because': 37,
 'do': 38,
 'can': 39,
 'are': 40,
 'very': 41,
 'really': 42,
 'up': 43,
 'time': 44,
 't': 45,
 'out': 46,
 'if': 47,
 'been': 48,
 'get': 49,
 'what': 50,
 'now': 51,
 'they': 52,
 'know': 53,
 'myself': 54,
 'how': 55,
 'will': 56,
 'by': 57,
 'from': 58,
 'had': 59,
 'some': 60,
 'them': 61,
 'being': 62,
 'people': 63,
 'want': 64,
 'little': 65,
 'would': 66,
 'her': 67,
 'an': 68,
 'make': 69,
 'think': 70,
 'its': 71,
 'he': 72,
 'one': 73,
 'even': 74,
 'there': 75,
 'who': 76,
 'something': 77,
 'him': 78,
 'we': 79,
 'life': 80,
 'goi

In [65]:
def build_vocab_from_tokenized_sentences_optimized(normalized_sentences_list: list[list[str]], specials: list[str] = ["<unk>"]):
    '''
    Build a {token: index} vocabulary in a single function.

    Specials take the first indices in the order given. The remaining words
    follow, ordered by descending frequency with ties broken
    lexicographically. Occurrences of a special inside the sentences are not
    counted.

    :param normalized_sentences_list: list of tokenized sentences, e.g. [['hello', ',', 'world']]
    :param specials: list of specials (the default list is only read, never
        mutated here); repeated entries are ignored
    :return: map of { token : index }
    '''
    # Keep the specials in their given order. Ranking them by ever larger fake
    # frequencies would reverse that order when there is more than one.
    special_tokens = list(dict.fromkeys(specials))
    reserved = set(special_tokens)

    counts = {}
    for sentence in normalized_sentences_list:
        for word in sentence:
            if word not in reserved:
                counts[word] = counts.get(word, 0) + 1

    ranked_words = sorted(counts, key=lambda word: (-counts[word], word))
    return {token: index for index, token in enumerate(special_tokens + ranked_words)}

In [66]:
# Run the single-function variant on the same tokenized training texts and display the mapping.
build_vocab_from_tokenized_sentences_optimized(
    [tokenizer(txt) for txt, _label in train_ds.train_data],
    specials=["<unk>"],
)

{'<unk>': 0,
 'i': 1,
 'feel': 2,
 'and': 3,
 'to': 4,
 'the': 5,
 'a': 6,
 'that': 7,
 'of': 8,
 'feeling': 9,
 'my': 10,
 'in': 11,
 'it': 12,
 'like': 13,
 'im': 14,
 'so': 15,
 'is': 16,
 'have': 17,
 'for': 18,
 'me': 19,
 'with': 20,
 'this': 21,
 'but': 22,
 'am': 23,
 'was': 24,
 'be': 25,
 'not': 26,
 'as': 27,
 'about': 28,
 'on': 29,
 'you': 30,
 'at': 31,
 'more': 32,
 'just': 33,
 'when': 34,
 'or': 35,
 'all': 36,
 'because': 37,
 'do': 38,
 'can': 39,
 'are': 40,
 'very': 41,
 'really': 42,
 'up': 43,
 'time': 44,
 't': 45,
 'out': 46,
 'if': 47,
 'been': 48,
 'get': 49,
 'what': 50,
 'now': 51,
 'they': 52,
 'know': 53,
 'myself': 54,
 'how': 55,
 'will': 56,
 'by': 57,
 'from': 58,
 'had': 59,
 'some': 60,
 'them': 61,
 'being': 62,
 'people': 63,
 'want': 64,
 'little': 65,
 'would': 66,
 'her': 67,
 'an': 68,
 'make': 69,
 'think': 70,
 'its': 71,
 'he': 72,
 'one': 73,
 'even': 74,
 'there': 75,
 'who': 76,
 'something': 77,
 'him': 78,
 'we': 79,
 'life': 80,
 'goi

With our functions created, let's compare them all (some of the cells below repeat earlier ones).

In [68]:
from torchtext.vocab import build_vocab_from_iterator

# Reference vocabulary from torchtext's factory, rebuilt here for the comparison.
vocab_from_factory = build_vocab_from_iterator(
    [tokenizer(txt) for txt, _label in train_ds.train_data],
    specials=["<unk>"],
)
# Look up "<unk>" on this vocab itself rather than on the earlier `vocab`
# object, so the cell does not depend on unrelated notebook state.
vocab_from_factory.set_default_index(vocab_from_factory["<unk>"])
vocab_from_factory(['great', 'day', "we're", 'having'])

[353, 96, 0, 171]

In [93]:
# Vocabulary from the detailed re-implementation, using its default specials.
vocab_from_custom_detailed = build_vocab_from_iterator_custom(
    [tokenizer(txt) for txt, _label in train_ds.train_data]
)
def vocab_from_custom_lambda(vocab_map, tokenized, default_index=0):
    """
    Translate tokens into vocabulary indices.

    :param vocab_map: mapping of {token: index}.
    :param tokenized: iterable of tokens to look up.
    :param default_index: index returned for tokens missing from
        ``vocab_map``. The default 0 matches a vocabulary whose unknown
        token sits at index 0.
    :return: list of indices, one per token, in input order.
    """
    return [vocab_map.get(token, default_index) for token in tokenized]
def vocab_from_custom_detailed_pipeline(sentence):
    """
    Convert a raw sentence into indices with the detailed custom vocabulary.

    Defined with ``def`` rather than as a lambda bound to a name, so the
    callable carries a docstring and a real name in tracebacks.

    :param sentence: raw text; it is tokenized with ``tokenizer`` first.
    :return: list of indices looked up in ``vocab_from_custom_detailed``.
    """
    return vocab_from_custom_lambda(vocab_from_custom_detailed, tokenizer(sentence))

In [94]:
# Same sentence as the torchtext pipeline demo; the index lists should match.
vocab_from_custom_detailed_pipeline("Hello, world!")

[3825, 0, 191, 0]

In [95]:
# Vocabulary from the single-function variant. It is built with `tokenizer`,
# the same callable the pipeline below uses, so building and lookup cannot
# drift apart.
vocab_from_custom_optimized = build_vocab_from_tokenized_sentences_optimized(
    [tokenizer(txt) for txt, _label in train_ds.train_data]
)


def vocab_from_custom_optimized_pipeline(sentence):
    """
    Convert a raw sentence into indices with the optimized custom vocabulary.

    Defined with ``def`` rather than as a lambda bound to a name, so the
    callable carries a docstring and a real name in tracebacks.

    :param sentence: raw text; it is tokenized with ``tokenizer`` first.
    :return: list of indices looked up in ``vocab_from_custom_optimized``.
    """
    return vocab_from_custom_lambda(vocab_from_custom_optimized, tokenizer(sentence))

In [96]:
# Same sentence again, this time through the optimized custom vocabulary.
vocab_from_custom_optimized_pipeline("Hello, world!")

[3825, 0, 191, 0]

In [90]:
#Create a test method in similar syntax to java
def test_vocab(pt_pipeline, deciphered_pipeline, ds):
    accuracy = 0
    total_count = 0
    for i, (text, label) in enumerate(ds.train_data):
        pt_text = pt_pipeline(text)
        new_text = deciphered_pipeline(text)

        # assert pt_text array equals new_text array
        accuracy += (pt_text == new_text)
        #print(pt_text, new_text)
        total_count += 1
        if i == 0:
            print(new_text)

        if (i + 1) % (len(ds.train_data) // 5) == 0:
            print(f"Iteration {i} | Accuracy: {accuracy / total_count} %.")
        if accuracy != total_count:
            print(pt_text, new_text)
            break
    try:
        assert(accuracy == total_count)
    except AssertionError:
        print("Not the same")

In [97]:
# Check the two hand-written vocab pipelines against each other.
test_vocab(vocab_from_custom_optimized_pipeline, vocab_from_custom_detailed_pipeline, train_ds)

[14, 6945, 6, 1033, 4, 378, 1, 2, 263, 553]
Iteration 1503 | Accuracy: 1.0 %.
Iteration 3007 | Accuracy: 1.0 %.
Iteration 4511 | Accuracy: 1.0 %.
Iteration 6015 | Accuracy: 1.0 %.
Iteration 7519 | Accuracy: 1.0 %.


In [104]:
# Check the torchtext-backed pipeline against the optimized hand-written one.
test_vocab(vocab_pipeline, vocab_from_custom_optimized_pipeline, train_ds)

[14, 6945, 6, 1033, 4, 378, 1, 2, 263, 553]
Iteration 1503 | Accuracy: 1.0 %.
Iteration 3007 | Accuracy: 1.0 %.
Iteration 4511 | Accuracy: 1.0 %.
Iteration 6015 | Accuracy: 1.0 %.
Iteration 7519 | Accuracy: 1.0 %.
