<a href="https://colab.research.google.com/github/tsanzxc456/NLP/blob/master/lab3_0760054.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Part 1
Build a 2-gram model for the Twitter train data:

In [27]:
import re, math
from collections import Counter, defaultdict
import nltk
import numpy as np
import pandas as pd
from nltk.tokenize import TweetTokenizer

## Fetching train data
First download the train tweet data, then tokenize the data converted to lowercase.

Take frequent terms which appearing more than 3 times as vocabulary, and replace the infrequent term as 'UNK'.

In [28]:
tweet_tokenizer = TweetTokenizer()
corpus = pd.read_json("https://raw.githubusercontent.com/bshmueli/108-nlp/master/tweets_train.txt",lines=True).values.tolist()
corpus_vocab = Counter()
for i in range(0,len(corpus)):
  corpus[i] =  ['<s>'] + tweet_tokenizer.tokenize(corpus[i][0].lower()) + ['</s>']
  corpus_vocab.update(corpus[i])

for word in list(corpus_vocab):
  if corpus_vocab[word] < 2:
    del corpus_vocab[word]

for i in range(0,len(corpus)):
  corpus[i] = [(token if token in corpus_vocab else '<UNK>') for token in corpus[i]]

## Building bigrams for train data

In [29]:
corpus_bigrams_counts = defaultdict(lambda: defaultdict(lambda: 0))
corpus_bigrams = []
for i in range(0,len(corpus)):
  corpus_bigrams += list(nltk.bigrams(corpus[i]))
  for w1,w2 in nltk.bigrams(corpus[i]):
    corpus_bigrams_counts[w1][w2] += 1
# corpus_bigrams_counts

## Fetching test data

In [30]:
test_corpus = pd.read_json("https://raw.githubusercontent.com/bshmueli/108-nlp/master/tweets_test.txt",lines=True).values.tolist()
for i in range(0,len(test_corpus)):
  test_corpus[i] =  ['<s>'] + tweet_tokenizer.tokenize(test_corpus[i][0].lower()) + ['</s>']
  test_corpus[i] = [(token if token in corpus_vocab else '<UNK>') for token in test_corpus[i]]

## Building bigrams for test data

In [31]:
test_bigrams = []
for i in range(0,len(test_corpus)):
  test_bigrams += list(nltk.bigrams(test_corpus[i]))

## Computing the perplexity

In [32]:
probabilities_train = [(1 + corpus_bigrams_counts[w1][w2])/(len(corpus_vocab) + sum(corpus_bigrams_counts[w1].values())) for w1, w2 in corpus_bigrams]
N = len(corpus_bigrams)
cross_entropy_train =-1/N * sum([math.log(p, 2) for p in probabilities_train])
print('Average perplexity for train data tweet is {:.3f}'.format(math.pow(2, cross_entropy_train)))
print('---')
probabilities_test = [(1 + corpus_bigrams_counts[w1][w2])/(len(corpus_vocab) + sum(corpus_bigrams_counts[w1].values())) for w1, w2 in test_bigrams]
N = len(test_bigrams)
cross_entropy_test =-1/N * sum([math.log(p, 2) for p in probabilities_test])
print('Average perplexity for test data tweet is {:.3f}'.format(math.pow(2, cross_entropy_test)))

Average perplexity for train data tweet is 1387.291
---
Average perplexity for test data tweet is 1605.826


# Part 2
Build a bi-directional 2-gram model by training on the Twitter train data:

In [33]:
import re, math
from collections import Counter, defaultdict
import nltk
import numpy as np
import pandas as pd
from nltk.tokenize import TweetTokenizer

## Fetching train data
First download the train tweet data, then tokenize the data converted to lowercase.

Take frequent terms which appearing more than 3 times as vocabulary, and replace the infrequent term as 'UNK'.

In [34]:
tweet_tokenizer = TweetTokenizer()
forward_corpus = pd.read_json("https://raw.githubusercontent.com/bshmueli/108-nlp/master/tweets_train.txt",lines=True).values.tolist()
corpus_vocab = Counter()
for i in range(0,len(forward_corpus)):
  forward_corpus[i] =  ['<s>'] + tweet_tokenizer.tokenize(forward_corpus[i][0].lower()) + ['</s>']
  corpus_vocab.update(forward_corpus[i])

for word in list(corpus_vocab):
  if corpus_vocab[word] < 2:
    del corpus_vocab[word]

for i in range(0,len(forward_corpus)):
  forward_corpus[i] = [(token if token in corpus_vocab else '<UNK>') for token in forward_corpus[i]]

backward_corpus = [forward_corpus[i][::-1] for i in range(0,len(forward_corpus))]

## Building bigrams for train data with both directions.

In [35]:
forward_corpus_bigrams_counts = defaultdict(lambda: defaultdict(lambda: 0))
forward_corpus_bigrams = []
for i in range(0,len(forward_corpus)):
  forward_corpus_bigrams += list(nltk.bigrams(forward_corpus[i]))
  for w1,w2 in nltk.bigrams(forward_corpus[i]):
    forward_corpus_bigrams_counts[w1][w2] += 1

backward_corpus_bigrams_counts = defaultdict(lambda: defaultdict(lambda: 0))
backward_corpus_bigrams = []
for i in range(0,len(backward_corpus)):
  backward_corpus_bigrams += list(nltk.bigrams(backward_corpus[i]))
  for w1,w2 in nltk.bigrams(backward_corpus[i]):
    backward_corpus_bigrams_counts[w1][w2] += 1

## Fetching test data

In [36]:
test_corpus = pd.read_json("https://raw.githubusercontent.com/bshmueli/108-nlp/master/tweets_test.txt",lines=True).values.tolist()
for i in range(0,len(test_corpus)):
  test_corpus[i] =  ['<s>'] + tweet_tokenizer.tokenize(test_corpus[i][0].lower()) + ['</s>']
  test_corpus[i] = [(token if token in corpus_vocab else '<UNK>') for token in test_corpus[i]]

## Building bigrams for test data

In [37]:
test_bigrams = []
for i in range(0,len(test_corpus)):
  test_bigrams += list(nltk.bigrams(test_corpus[i]))

## Computing the perplexity

In [38]:
forward_probabilities_test = [(1 + forward_corpus_bigrams_counts[w1][w2])/(len(corpus_vocab) + sum(forward_corpus_bigrams_counts[w1].values())) for w1, w2 in test_bigrams]
backward_probabilities_test = [(1 + backward_corpus_bigrams_counts[w1][w2])/(len(corpus_vocab) + sum(backward_corpus_bigrams_counts[w1].values())) for w1, w2 in test_bigrams]
N = len(test_bigrams)
for r in range(0,105,5):
  r_factor = r/100
  mix_probabilities_test = [(forward_probabilities_test[i]*r_factor + backward_probabilities_test[i]*(1-r_factor)) for i in range(0,len(forward_probabilities_test))]
  cross_entropy_test =-1/N * sum([math.log(p, 2) for p in mix_probabilities_test])
  print('Average perplexity with r = {:.2f} is {:.3f}'.format(r_factor,math.pow(2, cross_entropy_test)))

Average perplexity with r = 0.00 is 14187.501
Average perplexity with r = 0.05 is 5985.706
Average perplexity with r = 0.10 is 4670.248
Average perplexity with r = 0.15 is 3960.347
Average perplexity with r = 0.20 is 3493.070
Average perplexity with r = 0.25 is 3154.291
Average perplexity with r = 0.30 is 2893.944
Average perplexity with r = 0.35 is 2685.874
Average perplexity with r = 0.40 is 2514.828
Average perplexity with r = 0.45 is 2371.202
Average perplexity with r = 0.50 is 2248.598
Average perplexity with r = 0.55 is 2142.569
Average perplexity with r = 0.60 is 2049.918
Average perplexity with r = 0.65 is 1968.292
Average perplexity with r = 0.70 is 1895.921
Average perplexity with r = 0.75 is 1831.472
Average perplexity with r = 0.80 is 1773.942
Average perplexity with r = 0.85 is 1722.617
Average perplexity with r = 0.90 is 1677.087
Average perplexity with r = 0.95 is 1637.421
Average perplexity with r = 1.00 is 1605.826


In [39]:
print('r = 1.00 minimize the perplexity of the Twitter test data')
r_factor = 1.00
mix_probabilities_test = [(forward_probabilities_test[i]*r_factor + backward_probabilities_test[i]*(1-r_factor)) for i in range(0,len(forward_probabilities_test))]
cross_entropy_test =-1/N * sum([math.log(p, 2) for p in mix_probabilities_test])
print('Average perplexity for test data tweet is {:.3f}'.format(math.pow(2, cross_entropy_test)))


r = 1.00 minimize the perplexity of the Twitter test data
Average perplexity for test data tweet is 1605.826


In [40]:
forward_probabilities_train = [(1 + forward_corpus_bigrams_counts[w1][w2])/(len(corpus_vocab) + sum(forward_corpus_bigrams_counts[w1].values())) for w1, w2 in forward_corpus_bigrams]
backward_probabilities_train = [(1 + backward_corpus_bigrams_counts[w1][w2])/(len(corpus_vocab) + sum(backward_corpus_bigrams_counts[w1].values())) for w1, w2 in forward_corpus_bigrams]
mix_probabilities_train = [(forward_probabilities_train[i]*r_factor + backward_probabilities_train[i]*(1-r_factor)) for i in range(0,len(forward_probabilities_train))]
N = len(forward_corpus_bigrams)
cross_entropy_train =-1/N * sum([math.log(p, 2) for p in mix_probabilities_train])
print('Average perplexity for train data tweet is {:.3f}'.format(math.pow(2, cross_entropy_train)))

Average perplexity for train data tweet is 1387.291
