# Language Modeling

In [None]:
import numpy as np
import os

from nltk.util import pad_sequence, bigrams, ngrams, everygrams
from nltk.lm.preprocessing import pad_both_ends, flatten, padded_everygram_pipeline
from nltk import word_tokenize
from nltk.lm import MLE, KneserNeyInterpolated
from nltk import FreqDist
import itertools
import nltk
from sklearn.neural_network import BernoulliRBM

## Data Preprocessing

In [None]:
# Locate the corpus: text files are expected under ./Gutenberg/txt/
# relative to the current working directory.
directory = os.getcwd()

# List every file in the corpus directory.
# os.listdir() returns entries in arbitrary order, so sort them: later cells
# pick books by position (books[0:3], books[-1]) and should select the same
# files on every run and on every machine.
filepath = os.path.join(directory, 'Gutenberg/txt/')
files = sorted(os.listdir(filepath))

In [None]:
# Index the corpus by author:
#   titles[author]     -> list of book titles
#   text_files[author] -> list of the matching file names (same order)
titles = dict()
text_files = dict()

# File names are expected to look like "<author>___<title>.<ext>".
for file in files:
    # Split the author and title
    split = file.split('___')
    if len(split) < 2:
        # Not a valid title file: skip it. (Falling through here would
        # re-register the file under the previous iteration's author/title,
        # or raise NameError if the very first file is invalid.)
        continue
    author = split[0]
    title = split[1].split('.')[0]

    text_files.setdefault(author, []).append(file)
    titles.setdefault(author, []).append(title)

In [None]:
import re

# Read in the first three books of one author.
# Each book is considered a document now.
documents = []
books = text_files['Nathaniel Hawthorne']
for book in books[0:3]:
    file = os.path.join(filepath, book)
    with open(file) as f:
        data = f.read()
    # str.replace() treats its first argument as literal text, so passing a
    # character class to it never matched anything. Use a real regex
    # substitution, and replace with a space (not "") so that words separated
    # only by punctuation or newlines are not glued together.
    data = re.sub(r"[^a-zA-Z#]", " ", data)
    data = data.lower()
    documents.append(data)

In [None]:
# Word-level tokenization: build one token list per document with gensim's
# simple_preprocess.
# Sentence splitting with NLTK's punkt tokenizer (roughly what
# nltk.sent_tokenize does) was considered as an alternative but is not used.
from gensim.utils import simple_preprocess

tokenized = [simple_preprocess(text) for text in documents]

## n-gram

In [None]:
# Highest n-gram order used for the model.
# NOTE(review): this cell was originally labelled "Tri-gram (3 words)", but
# n = 2 is a bigram order -- confirm which one is intended.
n = 2
# Prepare the training n-grams and the padded word stream from the tokenized
# documents; both are passed to model.fit() in the next cell.
train_data, padded_sents = padded_everygram_pipeline(n, tokenized)


In [None]:
# Train an n-gram language model of order n (n is set in the previous cell)
# using maximum-likelihood estimation (MLE, i.e. no smoothing).
model = MLE(n)
model.fit(train_data, padded_sents)
# Show the size of the fitted vocabulary.
len(model.vocab)

In [None]:
# Print out some test results.
# The commented lines below are further checks that can be run by hand; the
# counts quoted are the values observed when they were first run.

# print(model.vocab.lookup(tokenized[0]))
# model.counts['woman'] # Unigram count: 43 instances of 'woman'
# model.counts[['a']]['woman'] # Bigram count: 'a woman' occurs 19 times (count of 'woman' given 'a')

# Look up every token of the first document in the model vocabulary.
model.vocab.lookup(tokenized[0])

In [None]:
import nltk
from nltk.lm import Lidstone
from nltk.lm.preprocessing import pad_both_ends, padded_everygram_pipeline
from nltk.util import ngrams

# 5-gram language model on the Brown corpus with Lidstone (add-0.2) smoothing.
# Requires the Brown corpus to be available locally (nltk.download('brown')).
print("... build")
brown = nltk.corpus.brown
corpus = [word.lower() for word in brown.words()]

# Train on 95% of the corpus and test on the rest
spl = round(95*len(corpus)/100)
print(spl)
train = corpus[:spl]
test = corpus[spl:]

# Remove rare words from the corpus: anything seen fewer than 5 times in the
# training split is mapped to a single "*unknown*" token.
fdist = nltk.FreqDist(train)
vocabulary = {word for word, count in fdist.items() if count >= 5}

# Materialise as lists (not lazy map objects) because len() is taken below and
# the data is traversed more than once.
train = [word if word in vocabulary else "*unknown*" for word in train]
test = [word if word in vocabulary else "*unknown*" for word in test]

print("... train")
# The NgramModel / LidstoneProbDist names this cell originally called are not
# imported anywhere in this file, so the model is built with nltk.lm instead
# (the package the rest of this notebook already uses).
order = 5
# The whole training split is treated as one long "sentence", matching the
# flat word list used above.
train_ngrams, padded_words = padded_everygram_pipeline(order, [train])
lm = Lidstone(0.2, order)
lm.fit(train_ngrams, padded_words)

print ("len(corpus) = %s, len(vocabulary) = %s, len(train) = %s, len(test) = %s" % ( len(corpus), len(vocabulary), len(train), len(test) ))
# nltk.lm perplexity is computed over n-grams of the model's order, not over
# a plain word list.
test_ngrams = list(ngrams(pad_both_ends(test, n=order), order))
print ("perplexity(test) =", lm.perplexity(test_ngrams))

In [None]:
# Download the Brown corpus data used by nltk.corpus.brown.
# NOTE: this has to be run once before the cell above that reads brown.words().
nltk.download('brown')

## RBM

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable

In [None]:
# Build the RBM input: a binary document-word matrix with one row per
# document and one column per vocabulary word, where entry (i, j) is 1 when
# word j occurs in document i.
docs = list(range(len(documents)))

# Vocabulary in first-seen order. dict.fromkeys() de-duplicates in linear time
# while preserving order. The previous loop appended the whole token list
# (`token`) instead of the individual `word`, so the vocabulary held lists and
# no matrix entry was ever set.
words = list(dict.fromkeys(word for token in tokenized for word in token))

doc_word = np.zeros((len(docs), len(words)), dtype=int)
print(doc_word.shape)
row, col = doc_word.shape
for i in range(row):
    # A set gives O(1) membership tests instead of scanning the document
    # once per vocabulary word.
    present = set(tokenized[i])
    for j in range(col):
        if words[j] in present:
            doc_word[i][j] = 1

training_set = doc_word

In [None]:
# # Load dataset here

# # Generate matrix for RBM

# # Convert dataset into a matrix then FloatTensor
# # Create a document-word matrix
# training_set = np.load('hawthorne.dat')
# training_set = training_set[:5]
# # Remove all columns of 0s

# filt = np.where(training_set.sum(axis=0)==0)
# training_set = np.delete(training_set, filt, 1)

# print((training_set).shape)
# training_set

In [None]:
# RBM Architecture
class RBM():
    """Restricted Boltzmann machine with Bernoulli-sampled units.

    Attributes:
        W: weight tensor of shape (nh, nv).
        a: hidden-unit bias of shape (1, nh).
        b: visible-unit bias of shape (1, nv).
    """

    def __init__(self, nv, nh):
        """Randomly initialise the parameters.

        Args:
            nv: number of visible units.
            nh: number of hidden units.
        """
        self.W = torch.randn(nh, nv)
        self.a = torch.randn(1, nh)
        self.b = torch.randn(1, nv)

    def sample_h(self, x):
        """Sample hidden units given visible units.

        Args:
            x: tensor of visible values, shape (batch, nv).

        Returns:
            Tuple (p_h_given_v, sample): the activation probabilities of the
            hidden units and a Bernoulli sample drawn from them, both of
            shape (batch, nh).
        """
        wx = torch.mm(x, self.W.t())
        activation = wx + self.a.expand_as(wx)
        p_h_given_v = torch.sigmoid(activation)
        return p_h_given_v, torch.bernoulli(p_h_given_v)

    def sample_v(self, y):
        """Sample visible units given hidden units.

        Args:
            y: tensor of hidden values, shape (batch, nh).

        Returns:
            Tuple (p_v_given_h, sample): the activation probabilities of the
            visible units and a Bernoulli sample drawn from them, both of
            shape (batch, nv).
        """
        wy = torch.mm(y, self.W)
        activation = wy + self.b.expand_as(wy)
        p_v_given_h = torch.sigmoid(activation)
        return p_v_given_h, torch.bernoulli(p_v_given_h)

    def train(self, v0, vk, ph0, phk):
        """Update W, a and b in place from one contrastive-divergence step.

        Args:
            v0: visible batch at the start of the chain, shape (batch, nv).
            vk: visible batch after k sampling steps, shape (batch, nv).
            ph0: hidden probabilities given v0, shape (batch, nh).
            phk: hidden probabilities given vk, shape (batch, nh).
        """
        # v.t() @ ph has shape (nv, nh) while W is stored as (nh, nv), so the
        # update must be transposed. Without it the in-place add fails for
        # nv != nh and applies the transposed gradient when nv == nh.
        self.W += (torch.mm(v0.t(), ph0) - torch.mm(vk.t(), phk)).t()
        self.b += torch.sum((v0 - vk), 0)
        self.a += torch.sum((ph0 - phk), 0)

    def perplexity(self, R, vk, hk):
        """Evaluate an exponentiated score from R, vk, hk and the parameters.

        All inputs are converted to NumPy arrays, so every `*` below is an
        element-wise (broadcast) product, not a matrix product.

        NOTE(review): the expression only evaluates when the operand shapes
        happen to broadcast, and it uses the visible bias `b` in both terms.
        It is presumably meant to be a perplexity/energy-style measure --
        confirm the intended formula before relying on the result.
        """
        R = R.numpy()
        vk = vk.numpy()
        hk = hk.numpy()
        w = self.W.numpy()
        b = self.b.numpy()

        return np.exp((hk.T*w.T + b.T)*R.T*vk + b.T*vk)

In [None]:
# Number of visible nodes: one per column (vocabulary word) of the
# document-word matrix.
nv = len(training_set[0])
# Number of hidden nodes.
# NOTE(review): hard-coded; presumably chosen to match the vocabulary size of
# the original run -- confirm whether it should be derived from nv instead.
nh = 7451
# Batch size (number of documents per parameter update)
batch_size = 2
# Initialize the model
rbm = RBM(nv, nh)

In [None]:
# Training the model with 10-step contrastive divergence.
# Take the row count from training_set (not from `tokenized`, which is
# rebound to a tensor on the next line) so the cell can be re-run safely.
nb_users = len(training_set)
tokenized = torch.FloatTensor(training_set)
print (nb_users)
nb_epoch = 7
# x / y_loss collect the epoch numbers and mean losses for plotting.
x = []
y_loss = []
for epoch in range(1, nb_epoch + 1):
    train_loss = 0
    s = 0.
    # Step through all rows. The previous upper bound (nb_users - batch_size)
    # skipped the trailing rows and produced no batch at all -- hence a
    # division by zero below -- when nb_users == batch_size. The last batch
    # may now be smaller than batch_size.
    for id_user in range(0, nb_users, batch_size):
        v0 = tokenized[id_user:id_user+batch_size]
        vk = v0
        ph0,_ = rbm.sample_h(v0)
        # Run the sampling chain starting from the data.
        for k in range(10):
            _,hk = rbm.sample_h(vk)
            _,vk = rbm.sample_v(hk)
            # Keep entries marked as missing (negative) fixed; the binary
            # matrix built in this notebook has none, so this is a no-op here.
            vk[v0<0] = v0[v0<0]
        phk,_ = rbm.sample_h(vk)
        rbm.train(v0, vk, ph0, phk)
        # Mean absolute reconstruction error for this batch.
        train_loss += torch.mean(torch.abs(v0[v0>=0] - vk[v0>=0]))
        s += 1.
        # NOTE(review): depends on RBM.perplexity, whose expression only
        # evaluates for broadcast-compatible shapes -- confirm it is intended.
        print(rbm.perplexity(tokenized, vk, hk))
    print('epoch: '+str(epoch)+' loss: '+str(train_loss/s))
    x.append(epoch)
    y_loss.append(train_loss/s)

In [None]:
import matplotlib.pyplot as plt

# Plot the mean training loss per epoch collected by the training loop.
plt.title('Loss function over epochs')
plt.ylabel('L1 Loss')
plt.xlabel('Epoch')
plt.plot(x, y_loss)
# Save before show(): once show() returns, the current figure may already
# have been released, and savefig() would then write an empty image.
plt.savefig('rbm_loss.png')
plt.show()

In [None]:
import re

# Evaluate on a held-out book: the last book of the selected author.
test = books[-1]
# Open the held-out book itself (previously this opened `book`, the variable
# left over from the training-data loop).
file = os.path.join(filepath, test)
with open(file) as f:
    data = f.read()
# str.replace() matches literal text only, so use a real regex substitution;
# replace with a space so that neighbouring words are not merged.
data = re.sub(r"[^a-zA-Z#]", " ", data)
data = data.lower()

# Word tokenization
test_tokens = word_tokenize(data)

# Encode the test book as a single binary row over the training vocabulary.
row, col = training_set.shape
test_set = np.zeros((1, col), dtype=int)

row, col = test_set.shape
for j in range(col):
    if words[j] in test_tokens:
        # Set the indicator in the matrix (previously this wrote into the
        # token list, leaving test_set all zeros).
        test_set[0][j] = 1
test_tokenized = torch.FloatTensor(test_set)

# NOTE(review): the sampling chain below starts from the first training rows
# (`tokenized` is the training tensor at this point), not from the test row --
# confirm that this is intended.
vk = tokenized[:batch_size]
v0 = tokenized[:batch_size]
ph0,_ = rbm.sample_h(v0)

for k in range(10):
    _,hk = rbm.sample_h(vk)
    _,vk = rbm.sample_v(hk)
    vk[v0<0] = v0[v0<0]
rbm.perplexity(test_tokenized, vk, hk)

## Log Bilinear Language Model

See the lplmodel.py file for the log-bilinear language model implementation.