## Import and Clean DataSet

In [None]:
import pandas as pd
# Download the gzip-compressed StackExchange corpus straight into a DataFrame.
df = pd.read_csv(
    'https://alexip-ml.s3.amazonaws.com/stackexchange_812k.csv.gz',
    compression='gzip',
)

In [None]:
# Summarize the raw DataFrame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Number of rows per category; the 'title', 'post' and 'comment' values
# are used further down to build the testing and training sets.
df.category.value_counts()

### Clean Unwanted Text Patterns

In [None]:
import re
 
# rm some of the punctuation but keep ,.!? and -
punctuation = '"#$%&()*+/:;<=>@[\\]^_`{|}~”“'
# re.escape keeps every character literal inside the character class.
# Without it the backslash in `punctuation` merely escaped the following `]`
# and backslashes themselves were never removed.
pattern = r"[{}]".format(re.escape(punctuation))

# Ordered cleaning steps; each match is replaced by a single space.
# Raw strings avoid Python's "invalid escape sequence" warnings for \S, \d, \s.
_CLEANING_STEPS = [
    re.compile(r"<[^>]*>"),                   # html tags
    re.compile(r"[\r\n]+"),                   # line returns
    re.compile(r"http\S+"),                   # urls
    re.compile(r"@\S+"),                      # mentions
    # latex: match one $...$ or $$...$$ span at a time. The previous pattern
    # (\$[^>]*\$) was greedy and wiped all the text between the first and the
    # last dollar sign of a document.
    re.compile(r"\$\$[^$]*\$\$|\$[^$]*\$"),
    re.compile(r"\d+"),                       # digits
    re.compile(pattern),                      # unwanted punctuation
    re.compile(r"\s\s+"),                     # multiple spaces
]


def clean_text(t):
    """Return `t` with markup, urls, mentions, latex, digits and noisy punctuation removed.

    Every step in `_CLEANING_STEPS` is applied in order, replacing matches
    with one space; leading and trailing whitespace is stripped at the end.
    """
    for step in _CLEANING_STEPS:
        t = step.sub(' ', t)
    return t.strip()


df['text'] = df.text.apply(clean_text)

### Tokenize Datasets

In [None]:
from nltk.tokenize import WordPunctTokenizer

tokenizer = WordPunctTokenizer()

# Lower-case every text, then split it into word / punctuation tokens.
df['tokens'] = df['text'].str.lower().map(tokenizer.tokenize)

# Token count per document.
df['n_tokens'] = df['tokens'].map(len)

# Keep documents with 5 to 4999 tokens (drops texts that are too short or too long).
has_usable_length = df['n_tokens'].between(5, 4999)
cleanedTokenizedData = df[has_usable_length]
cleanedTokenizedData.head()

In [None]:
# Eyeball the token lists of five randomly drawn rows.
df.sample(5).tokens.values

## Load Tokenized Dataset

In [None]:
# Load tokenized dataset that's space separated
tokenizedDf = pd.read_csv('https://alexip-ml.s3.amazonaws.com/stackexchange_812k.tokenized.csv.gz', compression='gzip')
# Preview the frame that was just loaded (the original printed `df`,
# i.e. the dataset from the previous section, by mistake).
print(tokenizedDf.head())
# Get list of tokens
tokenizedDf['tokens'] = tokenizedDf.tokens.apply(lambda txt : txt.split())

In [None]:
# Eyeball the token lists of five randomly drawn rows of the pre-tokenized dataset.
tokenizedDf.sample(5).tokens.values

## Prepare Testing and Training DataSets

In [None]:
# Titles are held out as the testing set; copy so later edits don't touch the source frame.
is_title = cleanedTokenizedData['category'] == 'title'
testingDf = cleanedTokenizedData.loc[is_title].copy()

In [None]:
# Posts and comments form the training set; copy so later edits don't touch the source frame.
is_post_or_comment = cleanedTokenizedData['category'].isin(['post', 'comment'])
trainingDf = cleanedTokenizedData.loc[is_post_or_comment].copy()

In [None]:
# Report the size and first rows of each split.
# (The original referenced `trainingText` / `testingText`, which are never
# defined and raised a NameError; the splits are `trainingDf` / `testingDf`.)
print("-- Training set: {}\n".format(trainingDf.shape))
print(trainingDf.head())

print("\n-- Testing set {}\n".format(testingDf.shape))
print(testingDf.head())


### Generate Ngrams

In [None]:
from nltk.util import ngrams, trigrams


n = 3 # Trigrams

# Padding shared by every call below: sentence boundaries are marked so that
# the first and last tokens also get a full-length context.
pad_options = dict(pad_left = True, pad_right = True, left_pad_symbol = "<s>", right_pad_symbol="</s>")

# Token list of the first training document. `trainingDf` replaces the
# undefined `trainingText`, and `.tokens` is now used for both examples
# (the trigram example previously received the whole row).
first_tokens = trainingDf.iloc[0].tokens
exampleNgrams = ngrams(first_tokens, n, **pad_options)
exampleTrigrams = trigrams(first_tokens, **pad_options)

# Lazily yields the padded n-grams of every training document, one document
# after the other. The original passed the array of documents itself to
# `ngrams`, which produced n-grams of whole documents instead of tokens.
# Materializing this with list(...) walks the entire training set.
paddedNGrams = (
    gram
    for doc_tokens in trainingDf.tokens.values
    for gram in ngrams(doc_tokens, n, **pad_options)
)

list(exampleNgrams)


### Generate Prefix/Frequency Matrix

In [None]:
from nltk.util import ngrams
from collections import defaultdict, Counter

n = 3 # Trigrams

# counts[prefix][token] = number of times `token` follows the (n-1)-token `prefix`.
counts = defaultdict(Counter)

padding = {
    "pad_left": True,
    "pad_right": True,
    "left_pad_symbol": "<s>",
    "right_pad_symbol": "</s>",
}

for doc_tokens in trainingDf.tokens.values:
    for gram in ngrams(doc_tokens, n, **padding):
        # Split each n-gram into its leading context and its final token.
        *context, following = gram
        counts[tuple(context)][following] += 1

In [None]:
# Each key of `counts` is one distinct prefix.
n_prefixes = len(counts)
print("we have {} bigrams".format(n_prefixes))

In [None]:
import random
# Show the follower counts of five randomly picked prefixes.
known_prefixes = list(counts.keys())
for _ in range(5):
    prefix = random.choice(known_prefixes)
    print("{}: \t{}".format(prefix, counts[prefix]))

## Token / Prefix Probabilities

$$p(\text{token} \mid \text{prefix}) = \frac{\text{count}(\text{prefix} + \text{token})}{\text{count}(\text{prefix})}$$

In [None]:
# Turn raw follower counts into relative frequencies:
# frequencies[prefix][token] = count(prefix + token) / count(prefix)
frequencies = defaultdict(dict)

for prefix, followers in counts.items():
    prefix_total = sum(followers.values())
    frequencies[prefix] = {
        token: occurrences / prefix_total
        for token, occurrences in followers.items()
    }

In [None]:
# Show the follower probabilities of five randomly picked prefixes.
scored_prefixes = list(frequencies.keys())
for _ in range(5):
    prefix = random.choice(scored_prefixes)
    print("{}: \t{}".format(prefix, frequencies[prefix]))

## Text Generation

- takes a bigram (must exist in corpus) as input
- generates a new token by sampling the available tokens related to the bigram using the frequency object as distribution
- slides the bigram to include the new token
- generates a new token based on the new bigram
- stops when the text is N tokens long or the latest token is the end of string symbol

In [None]:
import numpy as np

def generate(text, n_tokens = 20):
    """Extend `text` by sampling follow-up tokens from the n-gram model.

    Reads the module-level `frequencies` mapping (prefix tuple -> {token:
    probability}) and the module-level n-gram order `n`.

    Args:
        text: Space-separated seed text; its last `n - 1` tokens form the
            first prefix.
        n_tokens: Maximum number of tokens to append.

    Returns:
        The seed text followed by the sampled tokens, separated by single
        spaces. Generation stops early when the current prefix has no known
        follower or when the end-of-string symbol '</s>' has been produced.
    """
    prefix_len = n - 1 # n = 3 since we're looking at trigrams
    for _ in range(n_tokens):
        prefix = tuple(text.split()[-prefix_len:])
        # .get() instead of indexing: `frequencies` is a defaultdict, so
        # indexing an unknown prefix would silently insert an empty entry
        # into the model every time generation hits a dead end.
        followers = frequencies.get(prefix)
        if not followers: # prefix is not loaded in the frequency dictionary
            break
        candidates = list(followers.keys()) # tokens that could follow that prefix
        probabilities = list(followers.values()) # their probabilities, in the same order
        # str(...) turns numpy's string scalar back into a plain Python str.
        text += ' ' + str(np.random.choice(candidates, p = probabilities))
        if text.endswith('</s>'):
            break

    return text

In [None]:
# Generate one continuation per seed bigram, each preceded by a blank line.
for text in ('the model', 'that distribution', 'to determine'):
    print()
    print(generate(text))

## Building an n-gram language model using NLTK
https://www.nltk.org/api/nltk.lm.html#module-nltk.lm

In [6]:
import nltk 
import pandas as pd

# requires v >= 3.4
# Compare (major, minor) as integers: comparing the raw strings is
# lexicographic, so e.g. '3.10' would wrongly sort below '3.4'.
import re

_nltk_major_minor = tuple(int(part) for part in re.findall(r"\d+", nltk.__version__)[:2])
assert _nltk_major_minor >= (3, 4), "nltk >= 3.4 is required, found {}".format(nltk.__version__)

In [20]:
# Read the pre-tokenized corpus, shuffle the rows reproducibly and renumber them.
df = (
    pd.read_csv(
        'https://alexip-ml.s3.amazonaws.com/stackexchange_812k.tokenized.csv.gz',
        compression='gzip',
    )
    .sample(frac = 1, random_state = 8)
    .reset_index(drop = True)
)

# The tokens column holds space-separated strings; turn each into a list of tokens.
df['tokens'] = df['tokens'].map(str.split)

# Posts and comments are used for training, titles for testing.
df_train = df.loc[df['category'].isin(['post', 'comment'])].copy()
df_test = df.loc[df['category'] == 'title'].copy()

In [21]:
# Prepare the training data
from nltk.lm.preprocessing import padded_everygram_pipeline 
from nltk.lm import MLE
from nltk.lm import Vocabulary
from nltk.util import ngrams


ngrams_degree = 3

# Let NLTK pad every training document and derive both the training n-grams
# and the vocabulary stream in one call.
# (A hand-rolled alternative - building the padded n-grams with `ngrams` and an
# explicit `Vocabulary(words, unk_cutoff = 20)` - was tried here and left out.)
train, vocab = padded_everygram_pipeline(order=ngrams_degree, text=df_train.tokens)

In [None]:
# Maximum-likelihood n-gram model of order `ngrams_degree`.
model = MLE(order=ngrams_degree)

# Train on the padded n-grams; the vocabulary is built from `vocab`.
# Use len(model.vocab) before / after fitting to inspect the vocabulary size.
model.fit(text=train, vocabulary_text=vocab)


### Out of Vocabulary (OOV) - Laplace smoothing
The vocabulary helps us handle words that did not occur during training.
To remedy that problem, we can artificially assign a probability (although a very low one) to missing n-grams and tokens.

In [None]:
# Look up a few words in the fitted vocabulary, then show the n-gram counter.
probe_words = ["aliens", "from", "Mars"]
print(model.vocab.lookup(probe_words))
print(model.counts)

### Scoring a Model
Indicates how probable words are in certain contexts. This being MLE, the model returns the item’s relative frequency as its score.

In [None]:
# Example: how likely is "has" to come right after the context "the model"?
model.score("has", context=["the", "model"])

### Perplexity
- Measures the quality of the model.

- The idea is to estimate the probability of a test sentence given the model, so an uncommon sentence should be less probable than a common one.


In [None]:
# `model.perplexity` expects a sequence of n-gram tuples, not a raw string, so
# each sentence is tokenized and turned into padded n-grams first.
# (The original assigned unquoted sentences to `test`, which is a SyntaxError.)
from nltk.tokenize import WordPunctTokenizer
from nltk.util import ngrams

test_sentences = [
    "the difference between the two approaches is discussed here.",
    "the difference between the two approaches is discussed here",
    "the difference between the two approaches",
]

# NOTE(review): assumes the training corpus was tokenized with
# WordPunctTokenizer on lower-cased text - confirm against the dataset.
sentence_tokenizer = WordPunctTokenizer()

for sentence in test_sentences:
    test = list(ngrams(
        sentence_tokenizer.tokenize(sentence.lower()),
        ngrams_degree,
        pad_left = True,
        pad_right = True,
        left_pad_symbol = "<s>",
        right_pad_symbol = "</s>"))
    # Print every result; a bare expression would only display the last one.
    # With an unsmoothed MLE model, a single unseen n-gram makes this infinite.
    print("{!r}: {}".format(sentence, model.perplexity(test)))

In [None]:
# Sample five tokens from the fitted model; the fixed seed makes the draw repeatable.
model.generate(num_words=5, random_seed=3)