In [0]:
import re, math
from collections import Counter, defaultdict
import nltk
from nltk.tokenize import TweetTokenizer
import numpy as np
import pandas as pd

# Part1:  Build a  2-gram model for the Twitter training data

## Fetching the training and testing Corpus
`get_corpus()` reads the JSON file, and then return training and testing data in dataframe format

In [0]:
def get_corpus():
    df_train = pd.read_json('https://raw.githubusercontent.com/bshmueli/108-nlp/master/tweets_train.txt', lines=True)
    df_test = pd.read_json('https://raw.githubusercontent.com/bshmueli/108-nlp/master/tweets_test.txt', lines=True)
    print("Dataset training columns", df_train.columns)
    print("Dataset training size", len(df_train))
    print("Dataset testing columns", df_test.columns)
    print("Dataset testing size", len(df_test))
    return df_train, df_test

## Perfomring tokenizer

In [0]:
def tokenize(document):
    tweet_tokenizer = TweetTokenizer()
    return tweet_tokenizer.tokenize(document.lower())

## Count frequencies of vocabulary

In [0]:
def count_vocab(corpus_tokenize):
    vocabulary = Counter()
    for document_tokenize in corpus_tokenize:
        vocabulary.update(document_tokenize)
    return vocabulary

## Get tokenize UNK corpus

In [0]:
def get_test_unk(test_tokenize, valid_vocab):
    test_tokenize_copy = test_tokenize.copy()
    corpus_tokenize_unk = []
    for document_tokenize in test_tokenize_copy:
        for token_id, token in enumerate(document_tokenize):
            if token not in valid_vocab:
                document_tokenize[token_id] = '<UNK>'
        corpus_tokenize_unk.append(document_tokenize)
    return corpus_tokenize_unk

## Computing bigram frequencies
`get_ngram(corpus)` computes the bigram frequencies.

In [0]:
def get_ngram(corpus_tokenize):
    vocabulary = defaultdict(lambda: defaultdict(lambda: 0))
    for document_tokenize in corpus_tokenize:
        twograms = nltk.ngrams(document_tokenize, 2)
        for w1, w2 in twograms:
            vocabulary[w1][w2] += 1
    return vocabulary

## Compute perplexity

In [0]:
def compute_perplexity(corpus_tokenize_unk, bigram_model, V):
    perplexity = 0
    for document_id, document_tokenize in enumerate(corpus_tokenize_unk):
        N = len(list(nltk.ngrams(document_tokenize, 2)))
        probabilities = []
        for w1, w2 in nltk.ngrams(document_tokenize, 2):
            numerator = 1 + bigram_model[w1][w2]
            denominator = V + sum(bigram_model[w1].values())
            probabilities.append(numerator / denominator)
        cross_entropy = -1 / N * sum([math.log(p, 2) for p in probabilities])
        perplexity += math.pow(2, cross_entropy)
        if document_id % 20000 == 0:
            print("NOW IS: {}".format(document_id))
            print("cross entropy: {}".format(cross_entropy))
            print("perplexity: {}".format(math.pow(2, cross_entropy)))

    perplexity /= len(corpus_tokenize_unk)
    return perplexity

## Main program

In [8]:
df_train, df_test = get_corpus()
df_train_tokenize = [['<s>'] + tokenize(document) + ['</s>'] for document in df_train['text']]
df_test_tokenize = [['<s>'] + tokenize(document) + ['</s>'] for document in df_test['text']]

Dataset training columns Index(['text'], dtype='object')
Dataset training size 132599
Dataset testing columns Index(['text'], dtype='object')
Dataset testing size 33137


In [0]:
THRESHOLD = 3
train_vocab = count_vocab(df_train_tokenize)
# train_rare_vocab = [key for key, value in train_vocab.items() if value < THRESHOLD]
train_valid_vocab = [key for key, value in train_vocab.items() if value >= THRESHOLD]
df_train_tokenize_unk = get_test_unk(df_train_tokenize, train_valid_vocab)
test_vocab = count_vocab(df_test_tokenize)
df_test_tokenize_unk = get_test_unk(df_test_tokenize, train_valid_vocab)

In [0]:
bigram_model = get_ngram(df_train_tokenize_unk)

In [11]:
V_train = len(train_valid_vocab)
training_perplexity = compute_perplexity(df_train_tokenize_unk, bigram_model, V_train)
print()
print("Training perplexity is: {}".format(training_perplexity))

NOW IS: 0
cross entropy: 11.788556698852915
perplexity: 3537.603638698599
NOW IS: 20000
cross entropy: 10.972909799373516
perplexity: 2009.9024950967184
NOW IS: 40000
cross entropy: 10.255117296199366
perplexity: 1222.075155849423
NOW IS: 60000
cross entropy: 8.943855448675427
perplexity: 492.45751236696424
NOW IS: 80000
cross entropy: 9.625676603888536
perplexity: 789.9823464670678
NOW IS: 100000
cross entropy: 8.61114157941023
perplexity: 391.0316516305096
NOW IS: 120000
cross entropy: 9.801505600818954
perplexity: 892.3745757896074

Training perplexity is: 1355.335648828969


In [12]:
testing_perplexity = compute_perplexity(df_test_tokenize_unk, bigram_model, V_train)
print()
print("Testing perplexity is: {}".format(testing_perplexity))

NOW IS: 0
cross entropy: 11.287989994561682
perplexity: 2500.4811164461944
NOW IS: 20000
cross entropy: 10.731215578398482
perplexity: 1699.8781009466716

Testing perplexity is: 1640.5500188061494


# Part2: Build a *bi-directional* 2-gram model by training on the Twitter training data

## Computing bigram frequencies
`get_ngram_part2(corpus)` computes the bigram frequencies.

In [0]:
def get_ngram_part2(corpus_tokenize):
    vocabulary_inverse = defaultdict(lambda: defaultdict(lambda: 0))
    for document_tokenize in corpus_tokenize:
        twograms_inverse = nltk.ngrams(document_tokenize, 2)
        for w1, w2 in twograms_inverse:
            vocabulary_inverse[w1][w2] += 1
    return vocabulary_inverse

## Compute perplexity of linear combination of bi-directional

In [0]:
def compute_perplexity_part2(corpus_tokenize_unk, bigram_model, bigram_model_inverse, V, step):
    perplexity = 0
    for document_id, document_tokenize in enumerate(corpus_tokenize_unk):
        N = len(list(nltk.ngrams(document_tokenize, 2)))
        probabilities = []
        document_tokenize_copy = document_tokenize.copy()
        for token_id, token in enumerate(document_tokenize):
            if token == '<s>' or token == '</s>':
                continue
            else:
                numerator_forward = 1 + bigram_model[document_tokenize_copy[token_id-1]][token]
                denominator_forward = V + sum(bigram_model[document_tokenize_copy[token_id-1]].values())
                numerator_backward = 1 + bigram_model_inverse[document_tokenize_copy[token_id+1]][token]
                denominator_backward = V + sum(bigram_model_inverse[document_tokenize_copy[token_id+1]].values())
                probability = step * (numerator_forward / denominator_forward) + (1 - step) * (numerator_backward / denominator_backward)
                probabilities.append(probability)                
            
        cross_entropy = -1 / N * sum([math.log(p, 2) for p in probabilities])
        perplexity += math.pow(2, cross_entropy)
    perplexity /= len(corpus_tokenize_unk)
    return perplexity

## Main program

In [15]:
GAMMA = np.arange(0.0, 1.0, 0.05)
GAMMA

array([0.  , 0.05, 0.1 , 0.15, 0.2 , 0.25, 0.3 , 0.35, 0.4 , 0.45, 0.5 ,
       0.55, 0.6 , 0.65, 0.7 , 0.75, 0.8 , 0.85, 0.9 , 0.95])

In [0]:
df_train_tokenize_unk_inverse = [document[::-1] for document in df_train_tokenize_unk]

In [0]:
bigram_model_inverse = get_ngram(df_train_tokenize_unk_inverse)

In [0]:
best_gamma = 0
best_training_perplexity = 0
best_testing_perplexity = 1e7

In [19]:
for gamma in GAMMA:
    print("NOW is: {}".format(gamma))
    training_perplexity = compute_perplexity_part2(df_train_tokenize_unk, bigram_model, bigram_model_inverse, V_train, gamma)
    testing_perplexity = compute_perplexity_part2(df_test_tokenize_unk, bigram_model, bigram_model_inverse, V_train, gamma)
    if testing_perplexity <= best_testing_perplexity:
        best_gamma = gamma
        best_training_perplexity = training_perplexity
        best_testing_perplexity = testing_perplexity
        print("best gamma: {}".format(best_gamma))
    print("training perplexity: {}".format(training_perplexity))
    print("testing perplexity: {}".format(testing_perplexity))

NOW is: 0.0
best gamma: 0.0
training perplexity: 1001.4953832652383
testing perplexity: 1219.236398266508
NOW is: 0.05
best gamma: 0.05
training perplexity: 832.5586991748836
testing perplexity: 973.4204425041922
NOW is: 0.1
best gamma: 0.1
training perplexity: 771.9906794150176
testing perplexity: 893.4705837491047
NOW is: 0.15000000000000002
best gamma: 0.15000000000000002
training perplexity: 734.5527346243008
testing perplexity: 845.1529361185156
NOW is: 0.2
best gamma: 0.2
training perplexity: 708.6434475185696
testing perplexity: 812.1590047277458
NOW is: 0.25
best gamma: 0.25
training perplexity: 689.8998470099566
testing perplexity: 788.5124504418635
NOW is: 0.30000000000000004
best gamma: 0.30000000000000004
training perplexity: 676.1940179736641
testing perplexity: 771.3455403204982
NOW is: 0.35000000000000003
best gamma: 0.35000000000000003
training perplexity: 666.3477657766954
testing perplexity: 759.0886191161483
NOW is: 0.4
best gamma: 0.4
training perplexity: 659.668899

In [20]:
print("best gamma: {}".format(best_gamma))
print("best training perplexity: {}".format(best_training_perplexity))
print("best testing perplexity: {}".format(best_testing_perplexity))

best gamma: 0.5
best training perplexity: 654.3808701553082
best testing perplexity: 744.4107434171606
