In [1]:
import numpy as np
from numpy import random
import re
import requests
from bs4 import BeautifulSoup
import json
import csv
import csv
import os
import pandas as pd
import operator
import openpyxl
import xlrd
import string
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import xlsxwriter
import urllib.request
from nltk.tokenize.regexp import (
    RegexpTokenizer,
    WhitespaceTokenizer,
    BlanklineTokenizer,
    WordPunctTokenizer,
    wordpunct_tokenize,
    regexp_tokenize,
    blankline_tokenize,
)

from nltk.util import ngrams, pad_sequence
from nltk.tokenize import TreebankWordTokenizer
from tqdm import tqdm_notebook as tqdm

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('./data/nlp_finalproj_data_withids.csv').iloc[:,0:11]

In [3]:

def get_text(df,cuisine):
    
    df.drop_duplicates(subset = ['r_id'],keep = 'first',inplace = True)
    df = df[df['cuisine']==cuisine]
    docs = [str(instruction).replace('\n','').replace('\xa0','') for instruction in df['instructions']]
    text = ''.join(docs)
    
    return text

### create N-gram Model

In [4]:
class NgramModel():
    def __init__(self, n):
        """
        n is the order of the ngram model
        """
        self.n = n
        self.nm = n-1
        
        self.ngrams = []
        self.tokens = []     
        self.nmgrams = []
        
        # Initialize empty vocab
        self.vocab = set([])
        
    def update(self, text): 
        """
        computes the n-grams for the input text and updates
        the internal information. The input text is padded
        with '.' as the prefix
        """
        tokens = WordPunctTokenizer().tokenize(text.lower()) 
        
        ngram = ngrams(tokens,self.n,pad_left=True, pad_right=False, left_pad_symbol='.')
        nmgram = ngrams(tokens,self.n -1,pad_left=True, pad_right=False, left_pad_symbol='.')
        
        self.ngrams.extend(ngram)
        self.nmgrams.extend(nmgram)
        self.tokens.extend(tokens)
        self.vocab = set(self.tokens)
    
    def get_vocab(self):
        """
        get all vocab used by this model
        """
        return self.vocab
    
    def size_vocab(self):
        """
        reture size of the vocab
        
        """
        # doesn't include padding
        return len(self.vocab)-self.n+1
               
    def prob(self,context,word):
        """
        accepts an (n-1)-length word string representing
        a context,a word and returns the probability of that
        word occurring, given the preceding context. it address
        unseen problem
        """   
        context = tuple(WordPunctTokenizer().tokenize(context.lower()))
        word = word.lower()
        vocal = self.get_vocab()
        vocal_size = self.size_vocab()
        # unigram 
        if self.n == 1:
            word_count_in_tokens = self.tokens.count(word)
            # if new word
            if word_count_in_tokens == 0:        
                prob = 1/(vocal_size + 1)
            else:
                prob = (word_count_in_tokens + 1) / (len(self.ngrams) + vocal_size )  
        #n-gram for n > 1
        else: 
            # if context is new
            if context not in self.nmgrams:           
                prob = 1/ vocal_size
            # if context is old, but word is new
            elif word not in vocal: 
                prob = 1/(vocal_size + 1)
            # context is old and word is old
            else: 
                temp = list(context)
                temp.append(word)  
                temp2 = list(temp)
                context_word = tuple(temp2)     
                prob = (self.ngrams.count(context_word)+1)/(self.nmgrams.count(context)+ vocal_size)
                 
        return prob
        
    def len_text(self):
        return len(self.tokens)
    
    def len_ngram(self):
        return len(self.ngrams)
    
    def word_freq(self, word):
        cnt = 0
        vocal_size = self.size_vocab()
        for token in self.tokens:
            if word == token:
                cnt = cnt + 1
        freq = (cnt +1)/(len(self.tokens) + vocal_size)
        
        if word not in set(self.tokens):
            freq = 1/(1 + vocal_size)
        
        return freq
        
    def ngram_freq(self, gram):
        cnt = 0
        vocal_size = self.size_vocab()
        for ngram in self.ngrams:
            if gram == ngram:
                cnt = cnt + 1
        freq = (cnt + 1)/(len(self.ngrams) + vocal_size)
        
        if gram not in set(self.ngrams):
           
            freq = 1/vocal_size
        return freq
    
    def generate_text(self, context, min_length, max_length):
        """
        This function utlize n-gram model to generate sentences in a way that the probability of each n-gram
        is according to the n-gram frequency in the n-gram model
        
        """
        random.seed(42)
         
        sentence_tokens = WordPunctTokenizer().tokenize(context.lower()) # tokens
        sentence_length = len(sentence_tokens)
        
        while (sentence_length <= max_length): 
            if sentence_length < self.n-1:
                selected_word = random.choice(self.tokens)
            else:
                # ngram tokens
                cur_context = sentence_tokens[-self.n+1:]
  
                # convert to string
                cur_context_str = "".join([" "+i if not i.startswith("'") and i not in string.punctuation else i for i in cur_context]).strip()

                freq_table = {}
                for word in self.get_vocab():
                    frequency = self.prob(cur_context_str, word)
                    freq_table[word] = frequency

                sorted_freq_table =  sorted(freq_table.items(), key=operator.itemgetter(1, 0), reverse=True)
               # check--in case the generator gets trapped in a loop
                selected_word = None
                for word_with_freq in sorted_freq_table:
                    if word_with_freq[0] not in cur_context:
                        selected_word = word_with_freq[0]
                        break
                if selected_word is None:
                    selected_word = random.choice(word_with_freq)[0][0]

            sentence_tokens.append(selected_word)
            sentence_length = sentence_length + 1
        
        string_full_text = "".join([" "+i if not i.startswith("'") and i not in string.punctuation else i for i in sentence_tokens]).strip()      
        return string_full_text

    def perplexity(self, text):
        """
        calculate perplexity of a given text with  n-gram model
        
        """      
        text = '. '*(self.n-1) + text
        text_tokens = WordPunctTokenizer().tokenize(text.lower())
    
        inverse_prob_chain = 1
        for i in range(1,len(text_tokens)-self.n+2):
            word = text_tokens[-i]   
            context_token = text_tokens[-self.n-i+1:-i]
            context_str = "".join([" "+i if not i.startswith("'") and i not in string.punctuation else i for i in context_token]).strip()
                    
            prob = self.prob(context_str, word)
            inverse_prob_chain = inverse_prob_chain * (1/prob)        
       
        perplexity = (inverse_prob_chain)**(1/(len(text_tokens)-self.n+1))

        return perplexity
        

In [5]:
#cuisine_lst = ['Chinese','Thai','American','Italian','Indian','Mediterranean','French']#

#for cuisine in cuisine_lst:

model = NgramModel(3)
model.update(get_text(df,'Chinese'))
print('vocab size',len(model.vocab))
print(model.generate_text('Heat the water',0,50))

vocab size 2605
heat the water and moisten all four edges of the wonton wrapper. fold the wrapper. fold the wrapper. fold the wrapper. fold the wrapper. fold the wrapper. fold the wrapper. fold the wrapper. fold the wrapper. fold the wrapper. fold the
