In [33]:
import pandas as pd
import numpy as np
import re
import collections
import math

from collections import defaultdict
from nltk.util import ngrams
import itertools


In [34]:
val_file_path='NLP-01-2-HW1-Data/valid.txt'
test_file_path='NLP-01-2-HW1-Data/test.txt'

mask_file_path='NLP-01-2-HW1-Data/mask.txt'
mask_gold_file_path='NLP-01-2-HW1-Data/mask_gold.txt'

incomplete_file_path='NLP-01-2-HW1-Data/incomplete.txt'

# read  and preprocess functions

In [35]:
def load_dataset(path):
    # load the dataset and return the txt as string
    with open(path, 'r') as file:
        txt=file.read()
    return txt

In [36]:
def preprocess_txt(txt):
    '''Preprocess the text by removing any unwanted characters,
     converting all the text to lowercase'''


    # remove any unwanted characters
    txt=txt.replace('\u200c',' ')
    txt=txt.replace('\n',' ')

    txt= re.sub(r'[^\w\s]','',txt)
    # convert all the english text to lowercase
    txt=txt.lower()
    
    
    return txt

In [37]:
def txt2word(txt):
    # split the text into individual words.
    words= txt.split(' ')
    # remove '_' in case it appears individually as a word
    if "_" in words:
        words.remove("_")
    # delete empty strings
    if "" in words:
        words.remove("")
        
    return words

In [38]:
def txt2sentence(txt):
    # split the text into individual sentence based on punctuation
    sentences = re.split(';|\.|\n|\?|\؟',txt)
    # delete empty strings
    if '' in sentences:
        sentences.remove('')
    
    return sentences

In [39]:
txt2sentence(' سلام? بچه.حالت خوبه;چه خبر ؟سالمی')

[' سلام', ' بچه', 'حالت خوبه', 'چه خبر ', 'سالمی']

# N grams functions

In [40]:
def generate_N_grams(words,ngram=2):
    # get list of words and create list of ngrams.
    temp=zip(*[words[i:] for i in range(0,ngram)])
    ans=[' '.join(ngram) for ngram in temp]
    return ans

In [41]:
words=txt2word('The above function inputs two parameters')
generate_N_grams(words,2)

['The above',
 'above function',
 'function inputs',
 'inputs two',
 'two parameters']

### laplce smoothing

In [42]:
def create_Ngram_counts(words):
    # Creating N_gram dictionary {word: count}
    Ngram_counts={}
    for word in words:
        if word not in Ngram_counts:
            Ngram_counts[word]=0
        Ngram_counts[word] += 1
        
    return Ngram_counts


In [43]:
def laplace_smoothing_counts(Ngram_counts,V,k=1):
    # Normalizing counts in Ngram dictionary to get probabilities and add lablce smoothing
    '''k is smoothing parameter 
    V is the vocabulary size
    (when k==1 it's laplace smoothing 
    otherwise it's add-k smoothing)'''
     
    smooth_Ngram=Ngram_counts.copy()
    total_grams=sum(Ngram_counts.values()) # 
    for word,value in Ngram_counts.items():
        if word not in Ngram_counts:
            smooth_Ngram[word]=k/(total_grams+(k*V))
        else:
            smooth_Ngram[word]=(value+k)/(total_grams+(k*V))
    return smooth_Ngram

In [44]:
def laplace_smoothing(trigram,tri_counts,V,k=1):
    """
    calculate add one smoothing for single Ngram
    if Ngram exist in dictionary --> retrun normalized count
    if Ngram does not exist --> return constant(k/ncount of all n-grams in courpus+(k*V)) 
    trigram: string of N words
        tri_count: N_gram dictionary
        V: length of vocab
    """
    total_grams=sum(tri_counts.values())
    if trigram  not in tri_counts:
        P_laplace=k/(total_grams+(k*V))
    else:
        P_laplace=(tri_counts[trigram]+k)/(total_grams+(k*V))
    return P_laplace

In [45]:
def score_sentence(sentence,n_counts,V,n):
    # calculate Laplace smoothing and score of sentence
    score=0.0
    
    for i in range(n-1, len(sentence)):
        ngram=' '.join(sentence[i-(n-1):i+1])
        
        p_laplace=laplace_smoothing(ngram,n_counts,V)
        

        score += math.log( p_laplace, 2)
    return score

In [46]:
def get_perplexity(corpus,n_counts,vocab_size,n):
    # calculate the perplexity using score of each sentence
    logprob=0
    word_count=0
    for sentence in corpus:
        logprob += score_sentence(sentence,n_counts,vocab_size,n)
        word_count += len(sentence) - (n-1)
    logprob /= word_count
    perplexity=2**(-logprob)
    return perplexity

# laod and process train and validation data

In [47]:
#load and preprocess train data
train_txt=load_dataset("NLP-01-2-HW1-Data/train.txt")
train_txt=preprocess_txt(train_txt)
train_words=txt2word(train_txt)


In [48]:
test_courpus=load_dataset("NLP-01-2-HW1-Data/test.txt")
preprocess_test_courpus=[preprocess_txt(sentence) for sentence in txt2sentence(test_courpus)]
test_words=[txt2word(sentence) for sentence in preprocess_test_courpus]

# load masked and incomplete dataset

In [49]:
mask_sentence=[]
with open(mask_file_path) as file_in:
    for line in file_in:
       
        words=(re.split('\u200c| ',(line.split('\t')[1])))
        for j in range(words.count('#')):
            if j==0:
                idx=0
                pre_idx=0
            else:
                pre_idx=idx+1
            idx=words.index('#',idx+1)
            # print(words[pre_idx:idx],j)
            mask_sentence.append(' '.join(words[pre_idx:idx]))
        

In [19]:
incomplete_data=[]
with open(incomplete_file_path) as file_in:
    for line in file_in:
        words=(re.split('\u200c| ',preprocess_txt(line.split('\t')[1])))
        incomplete_data.append(' '.join(words))
        

In [18]:
vocab_size=len(set(train_words))

#### print charchters in text

In [87]:
chars=(set([*train_txt]))
print(''.join(sorted(chars)))


 0123456789_abcdefghijklmnopqrstuvwxyzæðıɑɒɔəɛɨɪɾʁʃʊʏˈːαβγδεηθικλμνορςστυωабгдеиклнопрстучшъяіաբեէթիլխկհճմյնոչպստրւքօאבדהוחיכלמןנערשתءآأئابةتثجحخدذرزسشصضطظعغـفقكلمنهوىي١٢٥٧٨٩ٱپچژکگڵہۆیێۏە۰۱۲۳۴۵۶۷۸۹ईएखगदबमयरलशसকঠথদনবরசனᄀᄃᄋᄌᄎ하ᅢᅧᅩᅵᆫᆼⴰⴽⵎⵓⵔⵛ三中丹产佛党共勝北吉同唐国國土子字学官家振敏文方普朝李柴榴汉漢犬狮猪獅琵琶產田石秀羽胡臣船芸菩萄萨葡薩藩蜜術話語话语豊豬通郎酪長院馬马鵝鹅黨𐬀𐬎𐬙𐬚𐬭𐬰𐬱𐭥𐭧𐭩𐭯𐭱


# unigram

In [19]:
# words=txt2word(train_txt)
unigram_words=generate_N_grams(train_words,ngram=1)
uni_counts=create_Ngram_counts(unigram_words)
uni_smooth=laplace_smoothing_counts(uni_counts,vocab_size)

In [53]:
vocab=list(set(unigram_words))
# vocab.remove('')


In [57]:
uni_counts['ساخت'],uni_smooth['ساخت']

(227, 0.0005628573403509465)

In [58]:
 1/(sum(uni_smooth.values()) + len(uni_smooth))

3.671880737313652e-05

In [21]:
get_perplexity(test_words, uni_counts, vocab_size,1)

1998.5418528868581

In [201]:
def predict_next_word(sentence, uni_smooth, vocab_size):
    words = sentence.split()
    if len(words) < 1:
        return None
    else:
        uni_probs = [uni_smooth.get(word, 0.0) for word in vocab]
        pred = vocab[uni_probs.index(max(uni_probs))]
    return pred


In [202]:
for sentence in mask_sentence:
    predicted_word = predict_next_word(sentence, uni_smooth, vocab_size)
    if predicted_word=='':
        print('predicted_word:','Null')
    else:
        print('predicted_word:',predicted_word)
    print(' '.join([sentence,predicted_word]))
    print("_______________________")

predicted_word: و
سارتر به عنوان روشن فکری فعال از نظر و
_______________________
predicted_word: و
این تیم در سال ۱۸۹۹ و
_______________________
predicted_word: و
شد و تا به حال موفق به کسب یک عنوان و
_______________________
predicted_word: و
بزرگترین کلیسای و
_______________________
predicted_word: و
کریم خان پس از و
_______________________
predicted_word: و
بر دشمنان خود و
_______________________
predicted_word: و
عمومی ترین و
_______________________
predicted_word: و
جانوران که طی مراحل و
_______________________
predicted_word: و
نیز پیش از همه و
_______________________


In [203]:
for sentence in incomplete_data:
    predicted_word = predict_next_word(sentence,uni_smooth, vocab_size)
    if predicted_word=='':
        print('predicted_word:','Null')
    else:
        print('predicted_word:',predicted_word)
    print(' '.join([sentence,predicted_word]))
    print("_______________________")

predicted_word: و
در جریان انقلاب مشروطه ابتدا به عنوان یکی از نیروهای محمدعلی شاه با   و
_______________________
predicted_word: و
شرکت خدمات مالی و  و
_______________________
predicted_word: و
شخص موسی تورات را   و
_______________________
predicted_word: و
نام امازون را از یک لغت نامه   و
_______________________
predicted_word: و
تیم سپاهان اصفهان که در ابتدا شعبه  و
_______________________


# bigram

In [23]:
# words=txt2word(train_txt)
bigram_words=generate_N_grams(train_words,ngram=2)
bi_counts=create_Ngram_counts(bigram_words)
bi_smooth=laplace_smoothing_counts(bi_counts,vocab_size)

In [195]:
bi_smooth['به این'],bi_conts['به این']

(0.0005727334444238721, 231)

In [25]:
get_perplexity(test_words, bi_counts, vocab_size,2)

105707.71662553419

### predict next word using bigram model

In [196]:
def predict_next_word(sentence, bi_smooth, vocab_size):
    words = sentence.split()
    if len(words) < 2:
        return None
    elif len(words) == 2:
        bi = (words[0], words[1])
        bi_probs = [bi_smooth.get(' '.join([bi[0], word]),0.0) for word in vocab]
        pred = vocab[bi_probs.index(max(bi_probs))]
    else:
        bi = (words[-2], words[-1])
        bi_probs = [bi_smooth.get(' '.join([bi[0], word]), 0.0) for word in vocab]
        pred = vocab[bi_probs.index(max(bi_probs))]
    return pred


In [197]:
for sentence in mask_sentence:
    predicted_word = predict_next_word(sentence, bi_smooth, vocab_size)
    if predicted_word=='':
        print('predicted_word:','Null')
    else:
        print('predicted_word:',predicted_word)
    print(' '.join([sentence,predicted_word]))
    print("_______________________")

predicted_word: ان
سارتر به عنوان روشن فکری فعال از نظر ان
_______________________
predicted_word: های
این تیم در سال ۱۸۹۹ های
_______________________
predicted_word: سال
شد و تا به حال موفق به کسب یک عنوان سال
_______________________
predicted_word: شهر
بزرگترین کلیسای شهر
_______________________
predicted_word: از
کریم خان پس از از
_______________________
predicted_word: محمدشاه
بر دشمنان خود محمدشاه
_______________________
predicted_word: و
عمومی ترین و
_______________________
predicted_word: ان
جانوران که طی مراحل ان
_______________________
predicted_word: ان
نیز پیش از همه ان
_______________________


In [199]:
for sentence in incomplete_data:
    predicted_word = predict_next_word(sentence,bi_smooth, vocab_size)
    if predicted_word=='':
        print('predicted_word:','Null')
    else:
        print('predicted_word:',predicted_word)
    print(' '.join([sentence,predicted_word]))
    print("_______________________")

predicted_word: عباس
در جریان انقلاب مشروطه ابتدا به عنوان یکی از نیروهای محمدعلی شاه با   عباس
_______________________
predicted_word: و
شرکت خدمات مالی و  و
_______________________
predicted_word: به
شخص موسی تورات را   به
_______________________
predicted_word: ایتالیایی
نام امازون را از یک لغت نامه   ایتالیایی
_______________________
predicted_word: به
تیم سپاهان اصفهان که در ابتدا شعبه  به
_______________________


# Trigram

In [50]:

trigram_words=generate_N_grams(train_words,ngram=3)
tri_counts=create_Ngram_counts(trigram_words)
tri_smooth=laplace_smoothing_counts(tri_counts,vocab_size)

In [93]:
tri_smooth['شهر به این'],tri_counts['شهر به این']

(4.937369468294682e-06, 1)

# predict masked word trigram

In [51]:
def predict_next_word(sentence, tri_smooth, vocab_size):
    words = sentence.split()
    if len(words) < 2:
        # print('small')
        return None
    elif len(words) == 2:
        bi = (words[0], words[1])
        tri_probs = [tri_smooth.get(' '.join([bi[0], bi[1], word]),0.0) for word in vocab]
        pred= vocab[tri_probs.index(max(tri_probs))]
    else:
        tri = (words[-3], words[-2], words[-1])
        bi = (words[-2], words[-1])
        tri_probs = [tri_smooth.get(' '.join([tri[0], tri[1], word]),0.0) for word in vocab]
        bi_probs = [tri_smooth.get(' '.join([bi[0], bi[1], word]), 0.0) for word in vocab]
        probs = [p_tri * p_bi for p_tri, p_bi in zip(tri_probs, bi_probs)]
        pred= vocab[probs.index(max(probs))]
    # if pred=='':
    #     idx=np.array(probs).argsort()[-2]
    #     pred=vocab[idx]
        # print(max(probs))

    return pred



In [54]:
for sentence in mask_sentence:
    predicted_word = predict_next_word(sentence,tri_smooth, vocab_size)
    if predicted_word=='':
        print('predicted_word:','Null')
    else:
        print('predicted_word:',predicted_word)
    print(' '.join([sentence,predicted_word]))
    print("_______________________")

predicted_word: Null
سارتر به عنوان روشن فکری فعال از نظر 
_______________________
predicted_word: Null
این تیم در سال ۱۸۹۹ 
_______________________
predicted_word: Null
شد و تا به حال موفق به کسب یک عنوان 
_______________________
predicted_word: Null
بزرگترین کلیسای 
_______________________
predicted_word: Null
کریم خان پس از 
_______________________
predicted_word: Null
بر دشمنان خود 
_______________________
predicted_word: نشان
عمومی ترین نشان
_______________________
predicted_word: Null
جانوران که طی مراحل 
_______________________
predicted_word: این
نیز پیش از همه این
_______________________


# predict next word trigram

In [181]:
for sentence in incomplete_data:
    predicted_word = predict_next_word(sentence,tri_smooth, vocab_size)
    if predicted_word=='':
        print('predicted_word:','Null')
    else:
        print('predicted_word:',predicted_word)
    print(' '.join([sentence,predicted_word]))
    print("_______________________")

predicted_word: Null
در جریان انقلاب مشروطه ابتدا به عنوان یکی از نیروهای محمدعلی شاه با   
_______________________
predicted_word: Null
شرکت خدمات مالی و  
_______________________
predicted_word: Null
شخص موسی تورات را   
_______________________
predicted_word: Null
نام امازون را از یک لغت نامه   
_______________________
predicted_word: Null
تیم سپاهان اصفهان که در ابتدا شعبه  
_______________________


# testing trigram

In [29]:


perplexity=get_perplexity(test_words,tri_counts,vocab_size,3)
print(f"Perplexity: {perplexity}")

Perplexity: 383253.2208318584
