## Import and Clean Dataset

In [None]:
import pandas as pd
# Download the gzip-compressed CSV from the URL below and parse it into a DataFrame.
# Later cells read its `text` and `category` columns.
df = pd.read_csv('https://alexip-ml.s3.amazonaws.com/stackexchange_812k.csv.gz', compression='gzip')

In [None]:
# Print the DataFrame summary (columns, non-null counts, dtypes, memory usage).
df.info()

In [None]:
# Number of rows per distinct value of the `category` column
# (later cells select the 'title', 'post' and 'comment' categories).
df.category.value_counts()

### Clean Unwanted Text Patterns

In [None]:
import re
 
# rm some of the punctuation but keep ,.!? and -
punctuation = '"#$%&()*+/:;<=>@[\\]^_`{|}~”“'
# re.escape keeps ']', '^' and the backslash literal inside the character
# class; without it the backslash only escapes ']' and is never removed itself.
pattern = "[{}]".format(re.escape(punctuation))

# Patterns are applied in this order and every match is replaced by one space.
# Order matters: URLs, mentions and LaTeX must be removed before the
# punctuation step strips the ':', '/', '@' and '$' characters they rely on.
_CLEANING_PATTERNS = [
    re.compile(r"<[^>]*>"),        # html tags
    re.compile(r"[\r\n]+"),        # line returns
    re.compile(r"http\S+"),        # urls
    re.compile(r"@\S+"),           # mentions
    # latex: $$...$$ or $...$. The body excludes '$' so that each formula is
    # matched on its own instead of swallowing the prose between two formulas.
    re.compile(r"\$\$[^$]*\$\$|\$[^$]*\$"),
    re.compile(r"\d+"),            # digits
    re.compile(pattern),           # unwanted punctuation
    re.compile(r"\s\s+"),          # multiple spaces
]


def clean_text(text):
    """Return `text` with markup, URLs, mentions, LaTeX, digits and noisy punctuation removed.

    Each pattern in `_CLEANING_PATTERNS` is applied in turn, replacing every
    match with a single space; leading and trailing whitespace is stripped
    from the final result.
    """
    for step in _CLEANING_PATTERNS:
        text = step.sub(' ', text)
    # trailing spaces
    return text.strip()


# One pass over the column instead of one pass per pattern.
df['text'] = df.text.apply(clean_text)

### Tokenize Datasets

In [None]:
from nltk.tokenize import WordPunctTokenizer

# Lowercase every cleaned text, then split it with NLTK's WordPunctTokenizer.
tokenizer = WordPunctTokenizer()
df['tokens'] = df['text'].str.lower().map(tokenizer.tokenize)

# Token count per row, used for the length filter below.
df['n_tokens'] = df['tokens'].map(len)

# Keep only rows with more than 4 and fewer than 5000 tokens
# (n_tokens is an integer, so that is the inclusive range 5..4999).
cleanedTokenizedData = df.loc[df['n_tokens'].between(5, 4999)]
cleanedTokenizedData.head()

### Prepare Testing and Training Datasets

In [None]:
# Test set: token lists of the rows whose category is 'title'.
testingText = cleanedTokenizedData.loc[cleanedTokenizedData['category'] == 'title', 'tokens']

In [None]:
# Training set: token lists of the rows whose category is 'post' or 'comment'.
trainingText = cleanedTokenizedData.loc[cleanedTokenizedData['category'].isin(['post', 'comment']), 'tokens']

### Generate Ngrams

In [None]:
from nltk.util import ngrams, trigrams


n = 3  # Trigrams

# Padding options shared by every n-gram call below: each token list is
# wrapped in "<s>" / "</s>" markers so that its first and last words also
# appear in full-length n-grams.
ngramPadding = dict(pad_left=True, pad_right=True, left_pad_symbol="<s>", right_pad_symbol="</s>")

# N-grams of the first training document only, for quick inspection.
exampleNgrams = ngrams(trainingText.iloc[0], n, **ngramPadding)
exampleTrigrams = trigrams(trainingText.iloc[0], **ngramPadding)

# N-grams of the whole training set. trainingText is a Series of token lists,
# so n-grams have to be built per document and then flattened; calling
# ngrams() on the Series itself would slide the window over documents and
# yield tuples of (unhashable) token lists instead of tuples of words.
paddedNGrams = (
    gram
    for tokens in trainingText
    for gram in ngrams(tokens, n, **ngramPadding)
)

# To inspect any of the above, wrap it in list(...), e.g. list(exampleTrigrams).
# paddedNGrams is a single-use generator: once consumed it is empty, so re-run
# this cell before building the model from it.


### Generate Prefix/Frequency Matrix

In [None]:
from collections import defaultdict, Counter

# Not referenced by the cells visible here; kept as defined in the notebook.
counts = defaultdict(Counter)

# model[(w1, w2)][w3] holds how many times w3 follows the two-word prefix
# (w1, w2); both levels default to 0 for unseen keys.
model = defaultdict(lambda: defaultdict(int))
# list(exampleTrigrams)
for first, second, third in paddedNGrams:
    model[first, second][third] += 1


In [None]:
# Display the model as a plain dict keyed by (w1, w2) prefix.
# Only the outer defaultdict is converted; the inner values stay defaultdicts.
dict(model)

In [None]:
# Convert the raw follower counts of every prefix into relative frequencies,
# i.e. an estimate of P(w3 | w1, w2). The model is updated in place.
for followers in model.values():
    prefix_total = float(sum(followers.values()))
    for word, count in followers.items():
        followers[word] = count / prefix_total

In [None]:
# Show the values stored for the words that follow the prefix ("when", "fitting").
# NOTE: model is a defaultdict, so looking up a prefix that was never seen
# inserts an empty entry for it and displays {}.
dict(model["when", "fitting"])