# Train a simple n-gram language model
Based on the Twitter sentiment data (see [step 0](sentiment140_step0_parse_data.ipynb)).

In [2]:
import pandas as pd
# Load the training CSV and keep only the raw tweet strings from its
# "text" column as an array.
path = "sentiment140_train.csv"
df = pd.read_csv(path)
texts = df["text"].values

In [3]:
from nltk.tokenize import TweetTokenizer
# Tweet-aware tokenizer with default settings; reused for every tweet below.
tt = TweetTokenizer()

In [4]:
# One token list per tweet, in the same order as `texts`.
corpus = list(map(tt.tokenize, texts))

In [5]:
# Number of tokenized tweets (one entry per tweet).
len(corpus)  # 1.3 million tweets after filtering!

1305022

In [6]:
from nltk.lm.preprocessing import pad_both_ends
# Show the first tweet padded for bigrams: n=2 adds one '<s>' at the start
# and one '</s>' at the end.
list(pad_both_ends(corpus[0], n=2))

['<s>',
 'awww',
 "that's",
 'a',
 'bummer',
 'you',
 'shoulda',
 'got',
 'david',
 'carr',
 'of',
 'third',
 'day',
 'to',
 'do',
 'it',
 '</s>']

In [7]:
from nltk.util import everygrams
# Pad the first tweet for bigrams (a single '<s>' / '</s>'), then list every
# 1-, 2- and 3-gram over the padded tokens.
# NOTE(review): the padding uses n=2 while max_len=3, so trigrams here see only
# one start symbol; the training pipeline further down pads for max_n instead.
padded_bigrams = list(pad_both_ends(corpus[0], n=2))
list(everygrams(padded_bigrams, max_len=3))


[('<s>',),
 ('<s>', 'awww'),
 ('<s>', 'awww', "that's"),
 ('awww',),
 ('awww', "that's"),
 ('awww', "that's", 'a'),
 ("that's",),
 ("that's", 'a'),
 ("that's", 'a', 'bummer'),
 ('a',),
 ('a', 'bummer'),
 ('a', 'bummer', 'you'),
 ('bummer',),
 ('bummer', 'you'),
 ('bummer', 'you', 'shoulda'),
 ('you',),
 ('you', 'shoulda'),
 ('you', 'shoulda', 'got'),
 ('shoulda',),
 ('shoulda', 'got'),
 ('shoulda', 'got', 'david'),
 ('got',),
 ('got', 'david'),
 ('got', 'david', 'carr'),
 ('david',),
 ('david', 'carr'),
 ('david', 'carr', 'of'),
 ('carr',),
 ('carr', 'of'),
 ('carr', 'of', 'third'),
 ('of',),
 ('of', 'third'),
 ('of', 'third', 'day'),
 ('third',),
 ('third', 'day'),
 ('third', 'day', 'to'),
 ('day',),
 ('day', 'to'),
 ('day', 'to', 'do'),
 ('to',),
 ('to', 'do'),
 ('to', 'do', 'it'),
 ('do',),
 ('do', 'it'),
 ('do', 'it', '</s>'),
 ('it',),
 ('it', '</s>'),
 ('</s>',)]

In [8]:
from nltk.lm.preprocessing import padded_everygram_pipeline
# Highest n-gram order used for training (trigram model).
max_n = 3
# Build the padded everygram training stream and the vocabulary token stream.
# NOTE: per the NLTK docs both results are lazy, single-use iterators that are
# consumed by `lm.fit`; rerun this cell before fitting again.
train, vocab = padded_everygram_pipeline(max_n, corpus)

In [9]:
from nltk.lm import MLE, Laplace
# Maximum-likelihood n-gram model of order `max_n`.
# NOTE(review): MLE applies no smoothing, so unseen n-grams presumably score 0
# (the imported `Laplace` would be the smoothed alternative) — confirm if
# perplexity on held-out text is needed.
lm = MLE(max_n)

In [10]:
# Train the model: count n-grams from `train` and build the vocabulary from `vocab`.
lm.fit(train, vocab)

In [11]:
# Summary of the fitted vocabulary: cutoff, unknown-token label and size.
print(lm.vocab)

<Vocabulary with cutoff=1 unk_label='<UNK>' and 251406 items>


In [12]:
# All three tokens are in the vocabulary, so lookup returns them unchanged.
lm.vocab.lookup(["studying", "in", "trondheim"])

('studying', 'in', 'trondheim')

In [13]:
# "gjøvik" is not in the vocabulary, so lookup maps it to the unknown label.
lm.vocab.lookup(["studying", "in", "gjøvik"])

('studying', 'in', '<UNK>')

In [14]:
# String index: unigram count of "hey" in the training data.
lm.counts["hey"]

16148

In [15]:
# List index: treated as a context, giving the frequency distribution of
# tokens that follow "hey".
lm.counts[['hey']]

FreqDist({'i': 1253, 'you': 534, 'there': 439, 'hey': 367, 'guys': 343, "i'm": 309, 'thanks': 295, 'how': 274, 'girl': 217, 'just': 175, ...})

In [16]:
# Probability of "you" given the preceding context "are", i.e. P(you | are)
# for the bigram "are you".
lm.score("you", ["are"])

0.14400196254804154

In [17]:
# Two sample bigrams to evaluate; entropy is printed first, then perplexity.
test = [("hey", "you"), ("where", "are")]
for metric in (lm.entropy, lm.perplexity):
    print(metric(test))

4.193230105130112
18.293130828360948


In [34]:
# Frequency distribution of tokens that follow "what's".
lm.counts[["what's"]]

FreqDist({'up': 681, 'the': 531, 'wrong': 492, 'going': 381, 'your': 266, 'happening': 126, 'that': 119, 'with': 117, 'good': 89, 'a': 83, ...})

In [41]:
# Seed phrase whose continuation is generated by `pred` below.
text = "i'm going to"

def pred(text, n=2, model=None):
    """Sample a continuation of ``text`` from an n-gram language model.

    Args:
        text: Seed phrase. It is split on whitespace and the resulting
            tokens are passed to the model as ``text_seed``.
        n: Number of words to sample.
        model: Object exposing ``generate(num_words, text_seed=...)``.
            Defaults to the notebook-level ``lm``.

    Returns:
        The sampled words joined by single spaces. Tokens containing ">"
        are dropped, so the result may hold fewer than ``n`` words.
    """
    model = lm if model is None else model
    seed_tokens = text.split()
    generated = model.generate(n, text_seed=seed_tokens)
    # NLTK's generate() returns a bare string (not a list) when asked for a
    # single word; wrap it so the join below works on words, not characters.
    if isinstance(generated, str):
        generated = [generated]
    # Dropping every token that contains ">" removes the marker tokens
    # "<s>", "</s>" and "<UNK>" (and also any ordinary token such as "->").
    return " ".join(w for w in generated if ">" not in w)

# Sample three words after the seed phrase and show them in brackets.
# Generation is random, so the printed continuation differs between runs.
prediction = pred(text, n=3)
print(f"{text} [{prediction}]")

i'm going to [travel through germany]
