# Lab 1 - Text Preprocessing and N-Gram LM

In [1]:
from tqdm import tqdm

In [2]:
text_1 = "That U.S.A. poster-print costs $12.40..."
text_2 = "Hope, is the only thing stronger than fear! Hunger Games #Hope"
tweet = "Don't take cryptocurrency advice from people on twitter. 😅👌"

## Text Tokenization

### NLTK

In [3]:
import nltk

In [4]:
from nltk.tokenize import RegexpTokenizer, TreebankWordTokenizer, WordPunctTokenizer 
from nltk.tokenize import WhitespaceTokenizer, TweetTokenizer, MWETokenizer, word_tokenize

In [5]:
# Alternatives are tried left to right at each position:
#   (?:[A-Z]\.)+    abbreviations such as "U.S.A."
#   \w+(?:-\w+)+    hyphenated words such as "poster-print"
#   \$[\d\.]+       dollar amounts such as "$12.40" (trailing dots are absorbed)
#   \S+             fallback: any other run of non-whitespace characters
# The raw string keeps the backslashes literal (no invalid-escape warnings), and
# the second alternative replaces the earlier `\\w-w+`, which only matched a
# single word character, a hyphen and then literal "w" characters.
tokenizer = RegexpTokenizer(r'(?:[A-Z]\.)+|\w+(?:-\w+)+|\$[\d\.]+|\S+')

In [6]:
print('Regular expression text_1: ',tokenizer.tokenize(text_1))
print('\nRegular expression: text_2',tokenizer.tokenize(text_2))
print('\nRegular expression tweet: ',tokenizer.tokenize(tweet))

Regular expression text_1:  ['That', 'U.S.A.', 'poster-print', 'costs', '$12.40...']

Regular expression: text_2 ['Hope,', 'is', 'the', 'only', 'thing', 'stronger', 'than', 'fear!', 'Hunger', 'Games', '#Hope']

Regular expression tweet:  ["Don't", 'take', 'cryptocurrency', 'advice', 'from', 'people', 'on', 'twitter.', '😅👌']


In [7]:
tk_TreeBank = TreebankWordTokenizer()

In [8]:
print('TreeBank text_1:\n',tk_TreeBank.tokenize(text_1))
print('\nTreeBank text_2:\n',tk_TreeBank.tokenize(text_2))
print('\nTreeBank tweet:\n',tk_TreeBank.tokenize(tweet))

TreeBank text_1:
 ['That', 'U.S.A.', 'poster-print', 'costs', '$', '12.40', '...']

TreeBank text_2:
 ['Hope', ',', 'is', 'the', 'only', 'thing', 'stronger', 'than', 'fear', '!', 'Hunger', 'Games', '#', 'Hope']

TreeBank tweet:
 ['Do', "n't", 'take', 'cryptocurrency', 'advice', 'from', 'people', 'on', 'twitter.', '😅👌']


In [9]:
tk_WordPunct = WordPunctTokenizer()

In [10]:
print('WordPunct text_1:\n',tk_WordPunct.tokenize(text_1))
print('\nWordPunct text_2:\n',tk_WordPunct.tokenize(text_2))
print('\nWordPunct tweet:\n',tk_WordPunct.tokenize(tweet))

WordPunct text_1:
 ['That', 'U', '.', 'S', '.', 'A', '.', 'poster', '-', 'print', 'costs', '$', '12', '.', '40', '...']

WordPunct text_2:
 ['Hope', ',', 'is', 'the', 'only', 'thing', 'stronger', 'than', 'fear', '!', 'Hunger', 'Games', '#', 'Hope']

WordPunct tweet:
 ['Don', "'", 't', 'take', 'cryptocurrency', 'advice', 'from', 'people', 'on', 'twitter', '.', '😅👌']


In [11]:
tk_WhiteSpace = WhitespaceTokenizer()

In [12]:
print('WhiteSpace text_1:\n',tk_WhiteSpace.tokenize(text_1))
print('\nWhiteSpace text_2:\n',tk_WhiteSpace.tokenize(text_2))
print('\nWhiteSpace tweet:\n',tk_WhiteSpace.tokenize(tweet))

WhiteSpace text_1:
 ['That', 'U.S.A.', 'poster-print', 'costs', '$12.40...']

WhiteSpace text_2:
 ['Hope,', 'is', 'the', 'only', 'thing', 'stronger', 'than', 'fear!', 'Hunger', 'Games', '#Hope']

WhiteSpace tweet:
 ["Don't", 'take', 'cryptocurrency', 'advice', 'from', 'people', 'on', 'twitter.', '😅👌']


In [13]:
tk_Tweet = TweetTokenizer()

In [14]:
print('Tweet text_1:\n',tk_Tweet.tokenize(text_1))
print('\nTweet text_2:\n',tk_Tweet.tokenize(text_2))
print('\nTweet tweet:\n',tk_Tweet.tokenize(tweet))

Tweet text_1:
 ['That', 'U', '.', 'S', '.', 'A', '.', 'poster-print', 'costs', '$', '12.40', '...']

Tweet text_2:
 ['Hope', ',', 'is', 'the', 'only', 'thing', 'stronger', 'than', 'fear', '!', 'Hunger', 'Games', '#Hope']

Tweet tweet:
 ["Don't", 'take', 'cryptocurrency', 'advice', 'from', 'people', 'on', 'twitter', '.', '😅', '👌']


In [15]:
tk_MWET = MWETokenizer()
tk_MWET.add_mwe(("Hunger","Games"))
tk_MWET.add_mwe(("$","12.40"))

In [16]:
print('MWET text_1:\n',tk_MWET.tokenize(word_tokenize(text_1)))
print('\nMWET text_2:\n',tk_MWET.tokenize(word_tokenize(text_2)))
print('\nMWET tweet:\n',tk_MWET.tokenize(word_tokenize(tweet)))

MWET text_1:
 ['That', 'U.S.A.', 'poster-print', 'costs', '$_12.40', '...']

MWET text_2:
 ['Hope', ',', 'is', 'the', 'only', 'thing', 'stronger', 'than', 'fear', '!', 'Hunger_Games', '#', 'Hope']

MWET tweet:
 ['Do', "n't", 'take', 'cryptocurrency', 'advice', 'from', 'people', 'on', 'twitter', '.', '😅👌']


### Spacy

In [17]:
import spacy

nlp = spacy.load("en_core_web_sm")

In [18]:
doc = nlp(text_1)
for token in doc:
    print(token.text)

That
U.S.A.
poster
-
print
costs
$
12.40
...


In [19]:
doc = nlp(text_2)
for token in doc:
    print(token.text)

Hope
,
is
the
only
thing
stronger
than
fear
!
Hunger
Games
#
Hope


In [20]:
doc = nlp(tweet)
for token in doc:
    print(token.text)

Do
n't
take
cryptocurrency
advice
from
people
on
twitter
.
😅
👌


In [21]:
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
nlp_special = English()

In [22]:
tokenizer = Tokenizer(nlp_special.vocab)
tokens = tokenizer(text_1)
print("Blank tokenizer",end=" : ")
for token in tokens:
    print(token, end=', ')

Blank tokenizer : That, U.S.A., poster-print, costs, $12.40..., 

In [23]:
from spacy.symbols import ORTH

In [24]:
special_case_1 = [{ORTH: "Hunger Games"}]
special_case_2 = [{ORTH: "#Hope"}]
nlp.tokenizer.add_special_case("Hunger Games", special_case_1)
nlp.tokenizer.add_special_case("#Hope", special_case_2)
doc = nlp(text_2)

print("\nSpecial case tokenization : ",end=' ')
for token in doc:      # Checking new tokenization
    print(token,end=', ')


Special case tokenization :  Hope, ,, is, the, only, thing, stronger, than, fear, !, Hunger Games, #Hope, 

## Text Normalization

In [25]:
word='Capital'
print('Original: ',word)
print('Transformed: ',word.lower())

Original:  Capital
Transformed:  capital


### Stemming

In [26]:
porter=nltk.PorterStemmer()
lancaster=nltk.LancasterStemmer()
poems=nltk.corpus.gutenberg.words('blake-poems.txt')[50:90]
print('Original: ',poems)
print('Stemmed by porter: ',[porter.stem(t) for t in poems])

Original:  ['Pipe', 'a', 'song', 'about', 'a', 'Lamb', '!"', 'So', 'I', 'piped', 'with', 'merry', 'cheer', '.', '"', 'Piper', ',', 'pipe', 'that', 'song', 'again', ';"', 'So', 'I', 'piped', ':', 'he', 'wept', 'to', 'hear', '.', '"', 'Drop', 'thy', 'pipe', ',', 'thy', 'happy', 'pipe', ';']
Stemmed by porter:  ['pipe', 'a', 'song', 'about', 'a', 'lamb', '!"', 'so', 'i', 'pipe', 'with', 'merri', 'cheer', '.', '"', 'piper', ',', 'pipe', 'that', 'song', 'again', ';"', 'so', 'i', 'pipe', ':', 'he', 'wept', 'to', 'hear', '.', '"', 'drop', 'thi', 'pipe', ',', 'thi', 'happi', 'pipe', ';']


In [27]:
print('Original: ',poems)
print('Stemmed by lancaster: ',[lancaster.stem(t) for t in poems])

Original:  ['Pipe', 'a', 'song', 'about', 'a', 'Lamb', '!"', 'So', 'I', 'piped', 'with', 'merry', 'cheer', '.', '"', 'Piper', ',', 'pipe', 'that', 'song', 'again', ';"', 'So', 'I', 'piped', ':', 'he', 'wept', 'to', 'hear', '.', '"', 'Drop', 'thy', 'pipe', ',', 'thy', 'happy', 'pipe', ';']
Stemmed by lancaster:  ['pip', 'a', 'song', 'about', 'a', 'lamb', '!"', 'so', 'i', 'pip', 'with', 'merry', 'che', '.', '"', 'pip', ',', 'pip', 'that', 'song', 'again', ';"', 'so', 'i', 'pip', ':', 'he', 'wept', 'to', 'hear', '.', '"', 'drop', 'thy', 'pip', ',', 'thy', 'happy', 'pip', ';']


In [28]:
words = ['dogs','having','lower','traditional']
print([porter.stem(w) for w in words])
print([lancaster.stem(w) for w in words])

['dog', 'have', 'lower', 'tradit']
['dog', 'hav', 'low', 'tradit']


In [29]:
wnl=nltk.WordNetLemmatizer()
print('Original: ',poems)
print('Lemmatized: ',[wnl.lemmatize(t) for t in poems])

Original:  ['Pipe', 'a', 'song', 'about', 'a', 'Lamb', '!"', 'So', 'I', 'piped', 'with', 'merry', 'cheer', '.', '"', 'Piper', ',', 'pipe', 'that', 'song', 'again', ';"', 'So', 'I', 'piped', ':', 'he', 'wept', 'to', 'hear', '.', '"', 'Drop', 'thy', 'pipe', ',', 'thy', 'happy', 'pipe', ';']
Lemmatized:  ['Pipe', 'a', 'song', 'about', 'a', 'Lamb', '!"', 'So', 'I', 'piped', 'with', 'merry', 'cheer', '.', '"', 'Piper', ',', 'pipe', 'that', 'song', 'again', ';"', 'So', 'I', 'piped', ':', 'he', 'wept', 'to', 'hear', '.', '"', 'Drop', 'thy', 'pipe', ',', 'thy', 'happy', 'pipe', ';']


In [30]:
print('Original: ',poems)
print('Lemmatized: ',[wnl.lemmatize(t,'v') for t in poems])

Original:  ['Pipe', 'a', 'song', 'about', 'a', 'Lamb', '!"', 'So', 'I', 'piped', 'with', 'merry', 'cheer', '.', '"', 'Piper', ',', 'pipe', 'that', 'song', 'again', ';"', 'So', 'I', 'piped', ':', 'he', 'wept', 'to', 'hear', '.', '"', 'Drop', 'thy', 'pipe', ',', 'thy', 'happy', 'pipe', ';']
Lemmatized:  ['Pipe', 'a', 'song', 'about', 'a', 'Lamb', '!"', 'So', 'I', 'pip', 'with', 'merry', 'cheer', '.', '"', 'Piper', ',', 'pipe', 'that', 'song', 'again', ';"', 'So', 'I', 'pip', ':', 'he', 'weep', 'to', 'hear', '.', '"', 'Drop', 'thy', 'pipe', ',', 'thy', 'happy', 'pipe', ';']


In [31]:
from nltk.corpus import wordnet as wn

In [32]:
print(wn.NOUN,wn.VERB,wn.ADJ,wn.ADV)

n v a r


In [33]:
(wnl.lemmatize('having','v'), wnl.lemmatize('lower','a'))

('have', 'low')

In [34]:
# Compare lemmatizing as a noun, as a verb, and as a noun followed by a verb.
words = ['studying', 'sciences']
as_nouns = [wnl.lemmatize(word, 'n') for word in words]
as_verbs = [wnl.lemmatize(word, 'v') for word in words]
noun_then_verb = [wnl.lemmatize(wnl.lemmatize(word, 'n'), 'v') for word in words]
print('(["studying","sciences"], "n") --> %s' % as_nouns)
print('(["studying","sciences"], "v") --> %s' % as_verbs)
print('(["studying","sciences"], both) --> %s' % noun_then_verb)

(["studying","sciences"], "n") --> ['studying', 'science']
(["studying","sciences"], "v") --> ['study', 'sciences']
(["studying","sciences"], both) --> ['study', 'science']


## Language Model

In [35]:
import string
import random
#nltk.download('punkt')
from nltk.corpus import reuters, stopwords
from nltk import FreqDist, ngrams
from collections import defaultdict, Counter

In [36]:
sents = reuters.sents()

In [37]:
print(sents[:10])



In [38]:
# Tokens to discard before building n-grams: English stop words, punctuation
# characters, and the Reuters markup leftovers 'lt' / 'rt'.
stop_words = set(stopwords.words('english'))
# Work on a local copy instead of reassigning string.punctuation: rebinding the
# stdlib attribute affects every other user of the module and makes the string
# grow again each time this cell is re-run.
# '"', '-' and '+' are already part of string.punctuation, so the em dash is the
# only character that actually needs to be added.
# NOTE(review): if the duplicated '"' was meant to be a pair of curly quotes,
# add them here explicitly.
punctuation_chars = string.punctuation + '—'
removal_list = list(stop_words) + list(punctuation_chars) + ['lt', 'rt']
print(removal_list)

['through', 'under', 'itself', 'can', 'y', 'his', 'further', "didn't", 'some', 'doing', 'above', 'he', 'that', 'am', "don't", 'isn', 'was', "haven't", 'most', 'there', 'of', 'between', 'wouldn', 'those', 'each', 'yourselves', 'shouldn', "wouldn't", 'm', "should've", "shouldn't", 'to', "you're", 'very', 'theirs', 'mightn', 'because', 'd', 'needn', 'weren', 'it', 'until', 'so', 'both', 'not', 've', 'about', 'no', 'himself', 'don', 'are', 'during', 'whom', 'were', 'this', 'own', 'will', 'did', 'now', "hasn't", "isn't", 'against', 'ain', 'only', 'any', 'yourself', 'didn', 'haven', 'what', 'couldn', 'herself', 'its', 'be', 'i', 'having', 'up', 'again', 'on', 'wasn', 'being', 'ourselves', 'before', "you'd", 'in', 'o', 'few', 'too', 'hadn', "mightn't", 'out', 'than', 'him', 'hers', 'is', "couldn't", 'such', 'how', "needn't", "you'll", 'they', 'all', 'myself', 'down', 'does', 'after', 'at', 'from', 'hasn', 'themselves', 'which', 'ours', 'a', 'we', 'then', 'once', 'more', 'their', 'same', 'into

In [39]:
# Build the unigram / bigram / trigram lists from the Reuters sentences.
# A set gives O(1) membership tests; scanning removal_list for every token
# is needlessly slow on a corpus of this size.
removal_set = set(removal_list)

unigram = []
bigram = []
trigram = []
tokenized_text = []
for sentence in sents:
    # lower-case every token, then drop stop words and punctuation
    sentence = [
        word
        for word in (token.lower() for token in sentence)
        if word not in removal_set
    ]
    tokenized_text.append(sentence)
    # extend() consumes the n-gram iterators directly; bigrams and trigrams are
    # padded with None on both sides so sentence boundaries are represented.
    unigram.extend(ngrams(sentence, 1))
    bigram.extend(ngrams(sentence, 2, pad_left=True, pad_right=True))
    trigram.extend(ngrams(sentence, 3, pad_left=True, pad_right=True))

In [40]:
print("Unigrams: \n",unigram[:10])
print("\nBigrams: \n",bigram[:10])
print("\nTrigrams: \n",trigram[:10])

Unigrams: 
 [('asian',), ('exporters',), ('fear',), ('damage',), ('u',), ('.-',), ('japan',), ('rift',), ('mounting',), ('trade',)]

Bigrams: 
 [(None, 'asian'), ('asian', 'exporters'), ('exporters', 'fear'), ('fear', 'damage'), ('damage', 'u'), ('u', '.-'), ('.-', 'japan'), ('japan', 'rift'), ('rift', 'mounting'), ('mounting', 'trade')]

Trigrams: 
 [(None, None, 'asian'), (None, 'asian', 'exporters'), ('asian', 'exporters', 'fear'), ('exporters', 'fear', 'damage'), ('fear', 'damage', 'u'), ('damage', 'u', '.-'), ('u', '.-', 'japan'), ('.-', 'japan', 'rift'), ('japan', 'rift', 'mounting'), ('rift', 'mounting', 'trade')]


In [41]:
freq_bi = FreqDist(bigram)
freq_tri = FreqDist(trigram)

In [42]:
print('Most common bigrams: \n',freq_bi.most_common(10))
print('\nMost common trigrams: \n',freq_tri.most_common(10))

Most common bigrams: 
 [(('said', None), 7926), (('mln', 'dlrs'), 4401), (('mln', 'vs'), 3921), (('cts', 'vs'), 3311), (('000', 'vs'), 2581), (('cts', 'net'), 2194), ((None, 'said'), 2163), (('vs', 'loss'), 1780), (('billion', 'dlrs'), 1663), (('."', None), 1618)]

Most common trigrams: 
 [(('said', None, None), 7926), ((None, None, 'said'), 2163), (('."', None, None), 1618), (('dlrs', None, None), 1464), ((None, None, 'company'), 1278), (('year', None, None), 1110), ((None, None, 'u'), 1086), (('pct', None, None), 871), (('mln', None, None), 869), (('1986', None, None), 806)]


In [43]:
freq_bi[('asian','exporters')],freq_tri[('asian','exporters','fear')]

(1, 1)

In [44]:
def next_word_probability(bi_g, w, bigram_freq=None, trigram_freq=None):
    """Estimate P(w | bi_g) as count(bi_g + w) / count(bi_g).

    Args:
        bi_g: Two-word context as a tuple, e.g. ``('asian', 'exporters')``.
        w: Candidate next word as a 1-tuple, e.g. ``('fear',)``.
        bigram_freq: Mapping from bigram tuples to counts. Defaults to the
            notebook-level ``freq_bi``, looked up at call time.
        trigram_freq: Mapping from trigram tuples to counts. Defaults to the
            notebook-level ``freq_tri``, looked up at call time.

    Returns:
        A single-entry dict ``{w: probability}``. The probability is 0 when
        the context was never observed.
    """
    # Passing the tables explicitly avoids depending on whichever corpus was
    # counted last; the globals remain the fallback so existing calls still work.
    if bigram_freq is None:
        bigram_freq = freq_bi
    if trigram_freq is None:
        trigram_freq = freq_tri

    # .get(..., 0) works for FreqDist/Counter and for plain dicts alike.
    context_count = bigram_freq.get(bi_g, 0)
    if context_count <= 0:
        return {w: 0}
    return {w: trigram_freq.get(bi_g + w, 0) / context_count}

In [45]:
print(next_word_probability(('asian', 'exporters'),('fear',)))
print(next_word_probability(('asian', 'exporters'),('from',)))
print(next_word_probability(('japan', 'rift'),('mounting',)))

{('fear',): 1.0}
{('from',): 0.0}
{('mounting',): 1.0}


In [46]:
def predict_next_word(context):
    """Return the most probable next word(s) after ``context``.

    Every distinct entry of the notebook-level ``unigram`` list is scored with
    ``next_word_probability``.

    Args:
        context: Two-word context as a tuple, e.g. ``('japan', 'rift')``.

    Returns:
        ``(words, probability)``: the 1-tuples that share the highest
        probability (in first-seen order) and that probability. If no
        candidate has a positive probability, every distinct word ties at 0
        and all of them are returned.

    Raises:
        ValueError: If ``unigram`` is empty.
    """
    # Score each distinct word once. `unigram` holds one entry per token
    # occurrence, so iterating it directly repeats the same lookups many times.
    # dict.fromkeys keeps first-seen order, so ties are returned in the same
    # order as before.
    all_scores = {}
    for word in dict.fromkeys(unigram):
        all_scores.update(next_word_probability(context, word))

    if not all_scores:
        raise ValueError("cannot predict the next word: the unigram list is empty")

    best_score = max(all_scores.values())
    best_words = [word for word, score in all_scores.items() if score == best_score]
    return best_words, best_score

In [47]:
likely_words, probability = predict_next_word(('japan','rift'))
print('Predicted words: \n',likely_words)
print('\nLikelihood: ',probability)

Predicted words: 
 [('mounting',)]

Likelihood:  1.0


In [48]:
def continue_context(context, n_steps, seed=None):
    """Extend ``context`` by ``n_steps`` words using the trigram model.

    At each step the last two words are passed to ``predict_next_word``; if
    several words tie for the highest probability one of them is picked at
    random. The starting context is printed before generation begins.

    Args:
        context: Sequence of at least two words, e.g. ``('asian', 'exporters')``.
        n_steps: Number of words to append.
        seed: Optional seed for the tie-breaking choice. When given, a private
            generator is used so the result is repeatable; when omitted, the
            shared ``random`` module state is used as before.

    Returns:
        A tuple holding the original context followed by the generated words.

    Raises:
        AssertionError: If ``context`` has fewer than two words.
    """
    # Frequency keys are tuples, so normalise list input; tuples pass through unchanged.
    context = tuple(context)
    context_len = len(context)
    assert context_len>=2, f"context longer than 1 expected, got: {context_len}"

    # `random` is imported once in the notebook's language-model import cell.
    rng = random if seed is None else random.Random(seed)

    print(context)
    for _ in range(n_steps):
        words, _likelihood = predict_next_word(context[-2:])
        # Only draw from the generator when there is a real tie, so a single
        # candidate does not consume any randomness.
        next_word = rng.choice(words) if len(words) > 1 else words[0]
        # each candidate is a 1-tuple, so += appends exactly one word
        context += next_word
    return context

In [49]:
generated = continue_context(('asian', 'exporters'), 20)

('asian', 'exporters')


In [50]:
print(generated)

('asian', 'exporters', 'fear', 'china', 'may', 'lack', 'supplies', 'many', 'farmers', 'south', 'put', 'soybean', 'land', 'accounted', 'large', 'portion', 'positive', 'effects', 'growth', 'policy', 'statement', 'commissioner')


## Hands On

Download the Amazon Fine Food Reviews dataset from Kaggle:
https://www.kaggle.com/datasets/snap/amazon-fine-food-reviews

Alternatively, pick a dataset that interests you from the list on this page:
https://github.com/niderhoff/nlp-datasets

In [51]:
# TODO: Load in the data
import os
from pathlib import Path

import pandas as pd

# Location of the Kaggle "Reviews.csv". Set the REVIEWS_CSV environment
# variable to point elsewhere instead of hard-coding a personal absolute path.
REVIEWS_CSV = Path(os.environ.get("REVIEWS_CSV", "data/Reviews.csv"))

df = pd.read_csv(REVIEWS_CSV)
# Drop missing review bodies and force str so .lower() cannot fail on NaN.
texts = df["Text"].dropna().astype(str).to_list()
# NOTE: each entry of `sents` is one whole tokenized review, not a single
# sentence; this rebinds the name that previously held the Reuters sentences.
sents = [word_tokenize(text.lower()) for text in texts]

In [52]:
# TODO: Normalize the text; Tokenize the texts into unigrams, bigrams and trigrams
# A set gives O(1) membership tests; scanning removal_list for every token of
# every review dominates the run time of this cell. "br" (HTML line-break
# residue) is folded into the same set.
removal_set = set(removal_list) | {"br"}

unigram = []
bigram = []
trigram = []
tokenized_text = []
for sentence in tqdm(sents):
    # lower-case, then keep only alphanumeric tokens that are not stop words,
    # punctuation or "br"
    sentence = [
        word
        for word in (token.lower() for token in sentence)
        if word not in removal_set and word.isalnum()
    ]
    tokenized_text.append(sentence)
    # extend() consumes the n-gram iterators directly; bigrams and trigrams are
    # padded with None on both sides so review boundaries are represented.
    unigram.extend(ngrams(sentence, 1))
    bigram.extend(ngrams(sentence, 2, pad_left=True, pad_right=True))
    trigram.extend(ngrams(sentence, 3, pad_left=True, pad_right=True))

100%|█████████████████████████████████████████████████████████████████| 568454/568454 [02:16<00:00, 4151.83it/s]


In [53]:
# TODO: Compute frequencies of bigrams and trigrams
freq_bi = FreqDist(bigram)
freq_tri = FreqDist(trigram)

In [54]:
print('Most common bigrams: \n',freq_bi.most_common(10))
print('\nMost common trigrams: \n',freq_tri.most_common(10))

Most common bigrams: 
 [((None, 'love'), 27889), ((None, 'great'), 15349), ((None, 'bought'), 15277), (('taste', 'like'), 14828), (('highly', 'recommend'), 14501), (('peanut', 'butter'), 13679), (('dog', 'food'), 12916), (('green', 'tea'), 12852), (('grocery', 'store'), 12368), (('tastes', 'like'), 11585)]

Most common trigrams: 
 [((None, None, 'love'), 27889), ((None, None, 'great'), 15349), ((None, None, 'bought'), 15277), (('product', None, None), 11257), ((None, None, 'tried'), 10106), ((None, None, 'like'), 10053), ((None, None, 'first'), 8930), ((None, None, 'really'), 8788), ((None, None, 'product'), 8694), (('buy', None, None), 8461)]


In [55]:
# TODO: Compute the most likely words for the following bigrams: white wine, None steak, red white
# NOTE(review): the task text says "red white", but the third context evaluated
# here is ('red', 'wine') — confirm which one was intended.
for context in [('white', 'wine'), (None, 'steak'), ('red', 'wine')]:
    likely_words, probability = predict_next_word(context)
    # Cap the printout for every context: when nothing has a positive
    # probability, predict_next_word returns every distinct word as a tie.
    print(f'Predicted words: {likely_words[:20]} \n')
    print(f'Likelihood: {probability} \n')

Predicted words: [('also',)] 

Likelihood: 0.291497975708502 

Predicted words: [('sauce',), ('mushrooms',), ('rub',)] 

Likelihood: 0.2222222222222222 

Predicted words: [('vinegar',)] 

Likelihood: 0.23741007194244604 



In [56]:
# TODO: Generate texts of length 20 for the following contexts: white wine, None steak, red white
# Same three starting contexts, in the same order, driven by a single loop.
for context in [('white', 'wine'), ('red', 'wine'), (None, 'steak')]:
    generated = continue_context(context, 20)
    print(generated)

('white', 'wine')
('white', 'wine', 'also', 'moderate', 'amount', 'acid', 'subdue', 'strong', 'flavors', 'ginger', 'lime', 'garlic', 'cilantro', 'without', 'classically', 'riesling', 'fits', 'parameters', 'perfectly', 'also', 'recommend', 'http')
('red', 'wine')
('red', 'wine', 'vinegar', 'help', 'ended', 'sitting', 'refrigerator', '6', 'months', 'ago', 'started', 'feeding', 'cat', 'food', 'protein', 'followed', 'carbohydrate', 'first', 'two', 'ingredients', 'science', 'diet')
(None, 'steak')
(None, 'steak', 'rub', 'answered', 'quickly', 'said', 'sending', 'bag', 'smaller', 'expected', 'price', 'also', 'great', 'price', 'amazon', 'much', 'better', 'price', 'amazon', 'much', 'better', 'price')
