In [0]:
import re, math
from collections import Counter, defaultdict
import nltk
from nltk.tokenize import TweetTokenizer
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

## Part 1

In [0]:
# Load both JSON-Lines splits, then work on copies of the loaded frames
train, test = (
    pd.read_json(url, lines=True)
    for url in ("https://bit.ly/nlp-tweet-train", "https://bit.ly/nlp-tweet-test")
)
train_app, test_app = train.copy(), test.copy()

### Build a 2-gram model for the Twitter train data (add the EOS tokens)

In [0]:
# Surround every raw tweet with the start (<s>) and end (</s>) markers
train_app["text_proc"] = train_app['text'].map(lambda tweet: "".join(("<s> ", tweet, " </s>")))
test_app["text_proc"] = test_app['text'].map(lambda tweet: "".join(("<s> ", tweet, " </s>")))

### First convert to lowercase, then use the NLTK TweetTokenizer.

In [0]:
tknzr = TweetTokenizer()
# Flat list of all lower-cased training tokens (boundary markers included)
corpus = [
    tok
    for tweet in train_app['text_proc']
    for tok in tknzr.tokenize(tweet.lower())
    if tok  # tokens are strings, so this drops only ''
]

### Vocabulary: Find the frequency of the words

In [0]:
# Token -> number of occurrences in the tokenized training corpus
vocab = Counter(corpus)

In [0]:
# Show the five most frequent tokens together with their counts
vocab.most_common(5)

[('<s>', 132599),
 ('</s>', 132599),
 ('.', 116369),
 ('the', 81714),
 ('i', 73782)]

In [0]:
# Rare tokens (seen fewer than 3 times in train) that will be replaced by <UNK>
unk_vocab = {word for word, count in vocab.items() if count < 3}

In [0]:
# Number of distinct rare tokens collected in unk_vocab
len(unk_vocab)

61330

#### Replace infrequent terms (appearing only 1 or 2 times in the train data) by UNK tokens in both train and test data

In [0]:
# Replace each marked-up tweet string by its list of lower-cased tokens.
# NOTE: this overwrites 'text_proc' (str -> list), so the cell cannot be re-run as is.
test_app['text_proc'] = test_app['text_proc'].str.lower().map(tknzr.tokenize)
train_app['text_proc'] = train_app['text_proc'].str.lower().map(tknzr.tokenize)

In [0]:
def UNK_token(sent, token_set):
    """Return a copy of ``sent`` with every token found in ``token_set`` masked.

    Parameters
    ----------
    sent : iterable of str
        Tokens of one sentence.
    token_set : container of str
        Tokens to replace; membership is tested with ``in``, so a ``set``
        keeps the lookup cheap.

    Returns
    -------
    list of str
        A new list in which each token contained in ``token_set`` is replaced
        by the literal ``"<UNK>"``. The input is left unmodified, so calling
        the function does not silently alter lists stored elsewhere (for
        example inside a DataFrame column).
    """
    return ["<UNK>" if token in token_set else token for token in sent]

In [0]:
# Mask rare training tokens with <UNK>
train_app['text_proc'] = train_app['text_proc'].map(lambda tokens: UNK_token(tokens, unk_vocab))

In [0]:
# The rare-token set comes from the training data only; apply it to the test tweets
test_app['text_proc'] = test_app['text_proc'].map(lambda tokens: UNK_token(tokens, unk_vocab))

### Create a Bigram Dictionary

In [0]:
def bigrams_dict(sent, reverse=False):
    """Add the adjacent-token pairs of ``sent`` to a module-level count table.

    With the default ``reverse=False`` the pairs are counted in
    ``counts_train``; otherwise they are counted in ``counts_reverse``.
    Both tables are nested mappings indexed as ``table[w1][w2]``.
    The function works purely by side effect and returns ``None``.
    """
    # `== False` (not `not reverse`) is kept on purpose: it preserves which
    # table unusual values such as None select.
    table = counts_train if reverse == False else counts_reverse
    for first, second in nltk.bigrams(sent):
        table[first][second] += 1

In [0]:
# Forward model: counts_train[w1][w2] = number of times w2 directly follows w1
counts_train = defaultdict(lambda: defaultdict(int))
# bigrams_dict works by side effect; `tmp` only collects its None results
tmp = train_app['text_proc'].map(bigrams_dict)

### Use NLTK to build the bigrams

In [0]:
# Materialise the list of (w1, w2) pairs of every tweet
train_app["text_bigrams"] = train_app['text_proc'].map(lambda tokens: [*nltk.bigrams(tokens)])
test_app["text_bigrams"] = test_app['text_proc'].map(lambda tokens: [*nltk.bigrams(tokens)])

In [0]:
# Flat token list of the training tweets after <UNK> replacement
new_corpus = [
    tok
    for tokens in train_app['text_proc']
    for tok in tokens
    if tok != ''
]

In [0]:
# Token -> count over the processed training tokens in new_corpus
new_vocab = Counter(new_corpus)

In [0]:
# Number of distinct tokens in new_vocab; probabilities() adds this value to every denominator
len(new_vocab)

29052

#### Use Laplace smoothing 

In [0]:
def probabilities(sent, reverse=False, counts=None, vocab_size=None):
    """Laplace-smoothed conditional probability of every bigram in ``sent``.

    For each pair ``(w1, w2)`` the value is::

        (1 + c(w1, w2)) / (V + sum of c(w1, *))

    Parameters
    ----------
    sent : iterable of (str, str)
        Bigrams of one sentence.
    reverse : bool, default False
        When ``counts`` is not given, a truthy value selects the module-level
        ``counts_reverse`` table, a falsy one ``counts_train``.
    counts : mapping, optional
        Nested ``counts[w1][w2]`` table to use instead of the module-level one.
    vocab_size : int, optional
        ``V`` in the formula; defaults to ``len(new_vocab)``.

    Returns
    -------
    numpy.ndarray
        One probability per input bigram, in input order.
    """
    if counts is None:
        counts = counts_reverse if reverse else counts_train
    if vocab_size is None:
        vocab_size = len(new_vocab)

    numerators = []
    denominators = []
    for w1, w2 in sent:
        # .get() instead of [] so that looking up an unseen word or bigram
        # does not insert a zero entry into a defaultdict-based table.
        followers = counts.get(w1, {})
        numerators.append(1 + followers.get(w2, 0))
        denominators.append(vocab_size + sum(followers.values()))
    return np.array(numerators) / np.array(denominators)
    
def cross_entropy(probabilities):
    """Return the perplexity ``2 ** H`` of a sequence of probabilities.

    ``H`` is the average negative base-2 logarithm of the values. Despite the
    function's name, the returned number is the perplexity, not ``H`` itself.

    Parameters
    ----------
    probabilities : array-like of float
        Strictly positive probabilities (one-dimensional).

    Returns
    -------
    float
        ``2 ** (-(1/N) * sum(log2 p))``, or ``nan`` when the input is empty
        (no observation, hence no defined perplexity).

    Raises
    ------
    ValueError
        If a value is zero or negative (raised by ``math.log``).
    """
    probs = np.asarray(probabilities, dtype=float)
    n = probs.size
    if n == 0:
        # Avoid the ZeroDivisionError of -1/0; NaN is skipped by pandas' mean().
        return float("nan")
    entropy_bits = -1 / n * sum([math.log(p, 2) for p in probs])
    return math.pow(2, entropy_bits)

In [0]:
# tqdm.pandas() must run before progress_apply is used on a pandas object
tqdm.pandas()
# One array of smoothed forward-bigram probabilities per training tweet
train_app['probabilities'] = train_app['text_bigrams'].progress_apply(probabilities)

HBox(children=(FloatProgress(value=0.0, max=132599.0), HTML(value='')))




In [0]:
# Per-tweet perplexity; note that cross_entropy() returns 2**H rather than H
train_app["Perplexity"] = train_app['probabilities'].progress_apply(cross_entropy)

HBox(children=(FloatProgress(value=0.0, max=132599.0), HTML(value='')))




In [0]:
# tqdm.pandas() must run before progress_apply is used on a pandas object
tqdm.pandas()
# Score the test tweets with the counts gathered from the training data
test_app['probabilities'] = test_app['text_bigrams'].progress_apply(probabilities)

HBox(children=(FloatProgress(value=0.0, max=33137.0), HTML(value='')))




In [0]:
# Per-tweet perplexity; note that cross_entropy() returns 2**H rather than H
test_app["Perplexity"] = test_app['probabilities'].progress_apply(cross_entropy)

HBox(children=(FloatProgress(value=0.0, max=33137.0), HTML(value='')))




### Average perplexity for:
* Train data tweets

In [0]:
# Mean of the per-tweet perplexities of the training data
train_app["Perplexity"].mean()

1355.3697021213343

* Test data tweets

In [0]:
# Mean of the per-tweet perplexities of the test data
test_app["Perplexity"].mean()

2041.1361590471279

## Part 2

#### Build 2-gram forward model (identical to part 1)

### Build 2-gram backward model (identical to part 1, but in the opposite direction)

In [0]:
# Token lists of the training tweets in right-to-left order, for the backward model
train_app["text_reverse"] = train_app["text_proc"].map(lambda tokens: list(reversed(tokens)))

In [0]:
# Backward model: counts_reverse[w1][w2] = number of times w2 directly precedes w1
counts_reverse = defaultdict(lambda: defaultdict(int))
# bigrams_dict works by side effect; `tmp` only collects its None results
tmp = train_app['text_reverse'].apply(lambda tokens: bigrams_dict(tokens, True))

In [0]:
# (w1, w2) pairs of every reversed training tweet
train_app["text_reverse_bigrams"] = train_app['text_reverse'].map(lambda tokens: [*nltk.bigrams(tokens)])

In [0]:
# Smoothed probabilities under the backward counts, then the per-tweet perplexity
train_app['reverse_probabilities'] = train_app['text_reverse_bigrams'].progress_apply(
    lambda bigrams: probabilities(bigrams, True)
)
train_app["reverse_Perplexity"] = (
    train_app['reverse_probabilities'].progress_apply(cross_entropy)
)

HBox(children=(FloatProgress(value=0.0, max=132599.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=132599.0), HTML(value='')))




In [0]:
tqdm.pandas()
# Token lists of the test tweets in right-to-left order, for the backward model
test_app["text_reverse"] = test_app["text_proc"].map(lambda tokens: list(reversed(tokens)))

In [0]:
# (w1, w2) pairs of every reversed test tweet
test_app["text_reverse_bigrams"] = test_app['text_reverse'].map(lambda tokens: [*nltk.bigrams(tokens)])

In [0]:
# Smoothed probabilities under the backward counts, then the per-tweet perplexity
test_app['reverse_probabilities'] = test_app['text_reverse_bigrams'].progress_apply(
    lambda bigrams: probabilities(bigrams, True)
)
test_app["reverse_Perplexity"] = (
    test_app['reverse_probabilities'].progress_apply(cross_entropy)
)

HBox(children=(FloatProgress(value=0.0, max=33137.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=33137.0), HTML(value='')))




### Calculate prob & perplexity of the reverse bigrams

In [0]:
def bi_directional(probabilities, inv_probabilities, gamma):
    """Perplexity of each sentence under a forward/backward interpolated model.

    For a sentence with ``N`` forward bigram probabilities ``f`` and ``N``
    backward probabilities ``b`` (computed on the reversed sentence), the
    interpolated values are::

        gamma * f[i] + (1 - gamma) * b[N - i - 2]      for i = 0 .. N-2

    so the last forward entry and the last backward entry are not used.
    The interpolated values of one sentence are passed to ``cross_entropy``.

    Parameters
    ----------
    probabilities : iterable of array-like
        Forward probabilities, one array per sentence.
    inv_probabilities : iterable of array-like
        Backward probabilities, one array per sentence, in the same sentence
        order as ``probabilities``.
    gamma : float
        Weight of the forward model; ``1 - gamma`` weighs the backward model.

    Returns
    -------
    list
        One ``cross_entropy`` result per sentence, in input order.

    Raises
    ------
    ValueError
        If the forward and backward arrays of a sentence differ in length.
    """
    backward_weight = 1 - gamma
    result_perplexity = []
    # zip pairs the rows by position; indexing a pandas Series with the
    # enumerate counter would be a label lookup and depend on its index.
    for forward, backward in zip(probabilities, inv_probabilities):
        forward = np.asarray(forward)
        backward = np.asarray(backward)
        if len(forward) != len(backward):
            raise ValueError(
                "forward and backward probabilities differ in length: "
                "%d != %d" % (len(forward), len(backward))
            )
        n_inner = max(len(forward) - 1, 0)
        # backward[:n_inner][::-1] yields b[N-2], b[N-3], ..., b[0]
        mixed = gamma * forward[:n_inner] + backward_weight * backward[:n_inner][::-1]
        result_perplexity.append(cross_entropy(mixed))

    return result_perplexity

In [0]:
# Sweep the interpolation weight over 0.00, 0.05, ..., 1.00 and store the
# per-tweet perplexity of each setting in its own "Perplexity_<weight>" column.
for alpha in tqdm(np.linspace(0, 1, 21)):
    # Assign the list directly: it is matched to the rows by position, whereas a
    # pd.DataFrame wrapper would be aligned on the index.
    train_app["Perplexity_%3.2f" % alpha] = bi_directional(
        train_app['probabilities'], train_app["reverse_probabilities"], alpha
    )

HBox(children=(FloatProgress(value=0.0, max=21.0), HTML(value='')))




In [0]:
# Mean training perplexity for every weight of the sweep
for alpha in tqdm(np.linspace(0, 1, 21)):
    print("%3.2f:" % alpha, train_app["Perplexity_%3.2f" % alpha].mean())

HBox(children=(FloatProgress(value=0.0, max=21.0), HTML(value='')))

0.00: 1545.95720210846
0.05: 1276.2384229192257
0.10: 1179.72469722004
0.15: 1120.1225703087325
0.20: 1078.891885661185
0.25: 1049.06350171761
0.30: 1027.2394355292433
0.35: 1011.5384657885714
0.40: 1000.8551050368553
0.45: 994.5401030176818
0.50: 992.250241229899
0.55: 993.8799335097638
0.60: 999.5422427807257
0.65: 1009.5912957736026
0.70: 1024.6959002282638
0.75: 1046.0001223782247
0.80: 1075.4672333721296
0.85: 1116.6887326678861
0.90: 1177.177010398242
0.95: 1277.6357665933601
1.00: 1603.1526315522567



In [0]:
# Mean training perplexity per weight, in the order of the sweep grid
Perplexity_mean = []
for alpha in tqdm(np.linspace(0, 1, 21)):
    Perplexity_mean.append(train_app["Perplexity_%3.2f" % alpha].mean())

HBox(children=(FloatProgress(value=0.0, max=21.0), HTML(value='')))




In [0]:
# Grid value at the position of the (first) smallest mean training perplexity
np.linspace(0, 1, 21)[min(range(len(Perplexity_mean)), key=Perplexity_mean.__getitem__)]

0.5

### Print the 𝛾 that minimizes the perplexity of the Twitter test data

In [0]:
# Sweep the interpolation weight over 0.00, 0.05, ..., 1.00 and store the
# per-tweet perplexity of each setting in its own "Perplexity_<weight>" column.
for alpha in tqdm(np.linspace(0, 1, 21)):
    # Assign the list directly: it is matched to the rows by position, whereas a
    # pd.DataFrame wrapper would be aligned on the index.
    test_app["Perplexity_%3.2f" % alpha] = bi_directional(
        test_app['probabilities'], test_app["reverse_probabilities"], alpha
    )

HBox(children=(FloatProgress(value=0.0, max=21.0), HTML(value='')))




In [0]:
# Mean test perplexity for every weight of the sweep
for alpha in tqdm(np.linspace(0, 1, 21)):
    print("%3.2f:" % alpha, test_app["Perplexity_%3.2f" % alpha].mean())

HBox(children=(FloatProgress(value=0.0, max=21.0), HTML(value='')))

0.00: 2350.2065352065138
0.05: 1875.3105805579855
0.10: 1721.5539640314175
0.15: 1628.3063958473201
0.20: 1564.39639075968
0.25: 1518.3964463972432
0.30: 1484.8152561495144
0.35: 1460.6409345337725
0.40: 1444.1133387794969
0.45: 1434.2002415327931
0.50: 1430.35256132534
0.55: 1432.3928159659429
0.60: 1440.4830127083947
0.65: 1455.1582999120012
0.70: 1477.441539633898
0.75: 1509.0956558659052
0.80: 1553.1684830012514
0.85: 1615.2865752138116
0.90: 1707.369653000141
0.95: 1862.9690593130515
1.00: 2424.8250446421875



In [0]:
# Mean test perplexity per weight, in the order of the sweep grid
Perplexity_test_mean = []
for alpha in tqdm(np.linspace(0, 1, 21)):
    Perplexity_test_mean.append(test_app["Perplexity_%3.2f" % alpha].mean())

HBox(children=(FloatProgress(value=0.0, max=21.0), HTML(value='')))




## 𝛾

In [0]:
# Grid value at the position of the (first) smallest mean test perplexity
print(np.linspace(0, 1, 21)[min(range(len(Perplexity_test_mean)), key=Perplexity_test_mean.__getitem__)])

0.5


### Average perplexity (at the optimal 𝛾) for:

* Training data tweets

In [0]:
# "Perplexity_0.50" is the sweep column for weight 0.50; the name is hard-coded,
# so it has to be updated by hand if the selected weight changes.
print(train_app["Perplexity_0.50"].mean())

992.250241229899


* Testing data tweets

In [0]:
# "Perplexity_0.50" is the sweep column for weight 0.50; the name is hard-coded,
# so it has to be updated by hand if the selected weight changes.
print(test_app["Perplexity_0.50"].mean())

1430.35256132534
