# Using NLTK for NGram Models

DS 5001 Text as Data

**Purpose**:  

Demonstrate the tools provided by NLTK to create ngram models.

# Set Up

In [1]:
import pandas as pd
import numpy as np

In [2]:
import nltk
from nltk.lm import MLE
from nltk.lm import Vocabulary
from nltk.lm import NgramCounter
from nltk.lm.preprocessing import padded_everygram_pipeline
from collections import Counter

# Configs

In [3]:
import configparser

# Read the shared project settings (data and output locations) from the ini file.
config_file = "../../../env.ini"
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so an empty result means the settings are missing.
if not config.read(config_file):
    raise FileNotFoundError(f"Could not read config file: {config_file}")
data_dir = config['DEFAULT']['data_home']
output_dir = config['DEFAULT']['output_dir']

# Pipeline

In [4]:
# Sentence-boundary padding symbols: [left, right]
pads = ["<s>", "</s>"]
# Names of the sentence-position and token-position levels
ohco = ['sent_num', 'token_num']
# Highest ngram order to build (3 means trigrams)
ngram_order = 3
# One label per word position inside an ngram: w0, w1, ...
widx = []
for pos in range(ngram_order):
    widx.append(f"w{pos}")

In [5]:
widx

['w0', 'w1', 'w2']

In [None]:
# Index levels of the training token table, from book down to token
train_ohco = "book_id  chap_num  para_num  sent_num  token_num".split()
# Token table for the training corpus, stored under the configured output directory
train_file = f"{output_dir}/austen-combo-TOKENS.csv"
train_df = pd.read_csv(train_file)
train_df = train_df.set_index(train_ohco)

In [16]:
train_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,token_str,term_str
book_id,chap_num,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1,0,0,0,Sir,sir
1,1,0,0,1,Walter,walter
1,1,0,0,2,Elliot,elliot
1,1,0,0,3,of,of
1,1,0,0,4,Kellynch,kellynch


In [17]:
# Path to the test sentences file inside the configured data directory (data_dir)
test_file = f"{data_dir}/misc/test_sentences.txt"

## Convert tokens to lists of sentence tokens

In [35]:
# Group on every index level except token_num (book, chapter, paragraph,
# sentence) so that each group is one sentence, then collect its terms.
train_tokens = (
    train_df
    .groupby(train_ohco[:4])["term_str"]
    .apply(list)
    .tolist()
)

In [36]:
train_tokens[5]

['this',
 'was',
 'the',
 'page',
 'at',
 'which',
 'the',
 'favourite',
 'volume',
 'always',
 'opened']

Here we convert each sentence into ngrams using `nltk.ngrams()`.

In [47]:
# Padding options shared by every nltk.ngrams() call
ngram_args = {
    "pad_left": True,
    "pad_right": True,
    "left_pad_symbol": pads[0],
    "right_pad_symbol": pads[1],
}

# train_ngrams[k] holds, for every sentence, the ngrams of order k + 1.
# nltk.ngrams() returns lazy iterators, so each entry can be walked only once.
train_ngrams = []
for order in range(1, ngram_order + 1):
    per_sentence = [nltk.ngrams(sent, n=order, **ngram_args) for sent in train_tokens]
    train_ngrams.append(per_sentence)

Now put the ngrams into data frames, one per ngram order.

In [None]:
# Long-format layout: every column but the last is used as the index
ng_cols = ['sent_num', 'token_num', 'word_pos', 'token']
# One (initially empty) record list per ngram order
ng_data = [list() for _ in range(ngram_order)]
# One slot per ngram order for the resulting data frame
ng_df = [None] * ngram_order

In [None]:
# Build one wide table per ngram order: rows are (sent_num, token_num),
# columns are the word positions w0, w1, ...
for n in range(ngram_order):
    # Collect this order's records in a fresh list instead of appending to
    # ng_data[n]: appending made the cell non-idempotent, and repeated
    # (sent_num, token_num, word_pos) keys make unstack() raise.
    records = [
        (sent_num, token_num, f"w{word_pos}", token)
        for sent_num, sent_ngrams in enumerate(train_ngrams[n])
        for token_num, ngram in enumerate(sent_ngrams)
        for word_pos, token in enumerate(ngram)
    ]
    # train_ngrams may hold single-use iterators; if they were already consumed
    # by an earlier run, keep the records gathered then rather than wiping them.
    if records:
        ng_data[n] = records
    ng_df[n] = pd.DataFrame(ng_data[n], columns=ng_cols).set_index(ng_cols[:-1]).unstack()
    # unstack() leaves a two-level column index (token, word_pos); keep only word_pos
    ng_df[n].columns = ng_df[n].columns.droplevel(0)

In [49]:
ng_df[2].loc[5]

word_pos,w0,w1,w2
token_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,<s>,<s>,this
1,<s>,this,was
2,this,was,the
3,was,the,page
4,the,page,at
5,page,at,which
6,at,which,the
7,which,the,favourite
8,the,favourite,volume
9,favourite,volume,always


In [11]:
# One table per ngram order (position n holds order n + 1)
ng_counts = []
for n in range(ngram_order):
    freq = ng_df[n].value_counts().to_frame('n')
    if n == 0:
        # Unigrams: keep the raw occurrence counts
        ng_counts.append(freq)
        continue
    # Higher orders: pivot the final word into columns, then scale each row
    # (the preceding words) so that it sums to 1
    pivot = freq['n'].unstack(fill_value=0)
    ng_counts.append(pivot.div(pivot.sum(axis=1), axis=0))

In [12]:
ng_counts[2]

Unnamed: 0_level_0,w2,1,15,16,1760,1784,1785,1787,1789,1791,1800,...,your,yours,yourself,yourselves,youth,youthful,z,zeal,zealous,zealously
w0,w1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,1760,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1785,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ends,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15,1784,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16,1810,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zealous,attention,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zealous,officer,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zealous,on,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zealously,active,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# Ten most probable words following 'she': each row of the bigram table was
# normalized to sum to 1, so the values are relative frequencies, not counts
ng_counts[1].loc['she'].sort_values(ascending=False).head(10)

w1
had      0.148967
was      0.135194
could    0.079739
would    0.040594
is       0.024647
did      0.021385
felt     0.018123
might    0.016673
saw      0.015948
must     0.015585
Name: she, dtype: float64

In [14]:
# Ten most probable words following the pair ('he', 'had') in the
# row-normalized trigram table
ng_counts[2].loc[('he','had')].sort_values(ascending=False).head(10)

w2
been     0.166113
not      0.066445
no       0.039867
a        0.026578
just     0.023256
the      0.019934
never    0.019934
left     0.019934
done     0.019934
seen     0.019934
Name: (he, had), dtype: float64