## Importing libraries

In [651]:
import pandas as pd
import random
import nltk
from nltk.util import bigrams, trigrams
from nltk.tokenize import RegexpTokenizer
from nltk.lm.preprocessing import pad_both_ends, padded_everygram_pipeline
from nltk.lm import KneserNeyInterpolated,Laplace

## Preprocessing the data

In [652]:
english = pd.read_csv("CONcreTEXT_trial_EN.tsv", sep='\t')
tokenizer = RegexpTokenizer(r'\w+')
englishText = []
for i in english["TEXT"]:
    englishText += [j.lower() for j in tokenizer.tokenize(i)]
italianText = []
for i in italian["TEXT"]:
    italianText += [j.lower() for j in tokenizer.tokenize(i)]
random.shuffle(englishText)
random.shuffle(italianText)

## Question1: Bigram

### Laplace

In [653]:
model = Laplace(2)

### Generating Bigram

In [654]:
english_bigram_list, eng_pad_list = padded_everygram_pipeline(2, englishText)

In [655]:
model.fit(english_bigram_list, eng_pad_list)

In [656]:
print(model.vocab)

<Vocabulary with cutoff=1 unk_label='<UNK>' and 32 items>


In [657]:
model.score('h',['o'])

0.0017857142857142857

In [658]:
model.score('b',['e'])

0.004178272980501393

In [659]:
model.score('n',['o'])

0.10357142857142858

### Linear Interpolation

In [660]:
train_data, padded = padded_everygram_pipeline(2, englishText)
kney_model = KneserNeyInterpolated(2)
kney_model.fit(train_data,padded)

In [661]:
kney_model.score('x','p'.split())

0.00027985074626865673

In [662]:
kney_model.score('p','d'.split())

0.00020727040816326532

In [663]:
kney_model.score('h','t'.split())

0.2697482638888889

## Question 2: Text Generation

In [664]:
# function to generate a sentence
def generate(c):
    for t in model.generate(num_words=100,text_seed=[c],random_seed=None):
        if t == '</s>':
            t = ' '
        elif t == '<s>':
            t = ''
        else:
            t = t
        c+=t
    return c

In [665]:
print('Sentence1:', generate('m'))
print('Sentence2:', generate('b'))
print('Sentence3:', generate('t'))
print('Sentence4:', generate('d'))
print('Sentence5:', generate('o'))

Sentence1: mperte ous nde sece re mpersisisoubl ng almp g  henyour ectlyoff vatonveld nthan ore mathie t nto
Sentence2: bokatorinu oufofruthad a vemorst  llpa es cour a rin oveeancak oy  exa ucur tcut epar nencth ur  ren
Sentence3: t e lkit l wingery s cll ind anvetroo s ath s stestar  nd rth sirmeng tothor urk oartsopou youre in
Sentence4: d y  ute tr to nde rss ancachoonghas u pend ares it aifrexistin entsthe u f  uinghenstsiny s  om g
Sentence5: oooper  vayp e ime isispeses de ea odspareacen  o a halout esutarssourr icr encomarg as il  izza


## Question3: Trigram 

### Laplace

In [666]:
english_trigram_list, eng_pad_list = padded_everygram_pipeline(3, englishText)

In [667]:
model = Laplace(3)

In [668]:
model.fit(english_trigram_list, eng_pad_list)

In [669]:
print(model.vocab)

<Vocabulary with cutoff=1 unk_label='<UNK>' and 32 items>


In [670]:
model.score('*','o n'.split())

0.011235955056179775

In [671]:
model.score('c','p q'.split())

0.03125

In [672]:
model.score('l','e h'.split())

0.030303030303030304

### Linear Interpolation

In [673]:
train_data, padded = padded_everygram_pipeline(3, englishText)
kney_model = KneserNeyInterpolated(3)
kney_model.fit(train_data,padded)

In [674]:
kney_model.score('*','o n'.split())

2.617252931323283e-06

In [696]:
kney_model.score('l','k h'.split())

ZeroDivisionError: float division by zero

In [697]:
kney_model.score('z','i t'.split())

ZeroDivisionError: float division by zero

### Text Generation

In [677]:
def generateTri(str):
    ch = [char for char in str]
    for token in model.generate(num_words=100,text_seed=ch,random_seed=None):
        #if token == '<s>':
        #    continue
        if token == '</s>':
            token = ' '
        elif token == '<s>':
            token = ''
        else:
            token= token
        str+=token
    return str

In [678]:
print('Sentence1: ', generateTri('th'))
print('Sentence2: ', generateTri('it'))
print('Sentence3: ', generateTri('on'))
print('Sentence4: ', generateTri('wo'))
print('Sentence5: ', generateTri('qu'))

Sentence1:  thonsibitheralf                                                                                       
Sentence2:  ith                                                                                                   
Sentence3:  on                                                                                                    
Sentence4:  world                                                                                                 
Sentence5:  qual                                                                                                  


# Extra Credits - Italian

## Question1: Bigram

### Laplace

In [679]:
model = Laplace(2)

### Generating Bigram

In [680]:
ita_bigram_list, ita_pad_list = padded_everygram_pipeline(2, italianText)

In [681]:
model.fit(ita_bigram_list, ita_pad_list)

In [682]:
print(model.vocab)

<Vocabulary with cutoff=1 unk_label='<UNK>' and 36 items>


In [698]:
model.score('h','k'.split())

0.02702702702702703

In [699]:
model.score('*','b'.split())

0.010638297872340425

In [700]:
model.score('w','a'.split())

0.0013351134846461949

### Linear Interpolation

In [686]:
kney_model = KneserNeyInterpolated(2)
kney_model.fit(ita_bigram_list,ita_pad_list)

In [689]:
kney_model.score('q','k'.split())

ZeroDivisionError: float division by zero

In [701]:
kney_model.score('w','d'.split())

ZeroDivisionError: float division by zero

In [702]:
kney_model.score('p','x'.split())

ZeroDivisionError: float division by zero

## Question 2: Text Generation

In [None]:
# function to generate a sentence
def generateItalian(c):
    for t in model.generate(num_words=100,text_seed=[c],random_seed=None):
        if t == '</s>':
            t = ' '
        elif t == '<s>':
            t = ''
        else:
            t = t
        c+=t
    return c

In [None]:
print('Sentence1:', generateItalian('n'))
print('Sentence2:', generateItalian('i'))
print('Sentence3:', generateItalian('t'))
print('Sentence4:', generateItalian('o'))
print('Sentence5:', generateItalian('p'))

## Question3: Trigram 

### Laplace

In [None]:
ita_trigram_list, ita_pad_list = padded_everygram_pipeline(3, italianText)

In [None]:
model = Laplace(3)

In [None]:
model.fit(ita_trigram_list, ita_pad_list)

In [None]:
print(model.vocab)

In [703]:
model.score('é','o h'.split())

0.027777777777777776

In [710]:
model.score('*','o n'.split())

0.027777777777777776

In [707]:
model.score('l','h e'.split())

0.027777777777777776

### Linear Interpolation

In [None]:
kney_model = KneserNeyInterpolated(3)
kney_model.fit(ita_trigram_list,ita_pad_list)

In [None]:
kney_model.score('n',['a','n'])

In [None]:
kney_model.score('*',['q','m'])

In [None]:
kney_model.score('p',['e','x'])

### Text Generation

In [None]:
def generateTriIta(str):
    ch = [char for char in str]
    for token in model.generate(num_words=100,text_seed=ch,random_seed=None):
        #if token == '<s>':
        #    continue
        if token == '</s>':
            token = ' '
        elif token == '<s>':
            token = ''
        else:
            token= token
        str+=token
    return str

In [None]:
print('Sentence1: ', generateTriIta('lu'))
print('Sentence2: ', generateTriIta('po'))
print('Sentence3: ', generateTriIta('it'))
print('Sentence4: ', generateTriIta('ci'))
print('Sentence5: ', generateTriIta('os'))