# Plan: write some very simple probabilistic autocompletion machinery.
# 1) Grab a text
# 2) Split it into contexts -- (N-gram model)  (character level)
# 3) Train the model on the data
# 4) For the first try, we will do it for the "unigram" model, i.e. a context of one character (each character is predicted from the previous one)

In [1]:
#### Some necessary libraries to be imported, 
import requests
import os
import math
import typing
import random

In [7]:
#### ---- Exercise 2 ---- #####
#### Some NLP stuff we will be doing now: the following function will download the Tiny Shakespeare corpus.
### Run the following function, do not change any line of it.
def download_text():
    """Download the Tiny Shakespeare corpus, cache it in ``input.txt`` and return it.

    Returns:
        The downloaded text as a ``str``.

    Raises:
        requests.RequestException: on a network failure, a timeout, or when
            the server answers with an HTTP error status.
        OSError: if ``input.txt`` cannot be written.
    """
    data_url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
    # Without a timeout a stalled connection would block the call forever.
    response = requests.get(data_url, timeout=30)
    # Fail loudly on 4xx/5xx instead of caching an error page as the corpus.
    response.raise_for_status()
    # Explicit encoding so the cached file does not depend on the platform locale.
    with open("input.txt", mode="w", encoding="utf-8") as file:
        file.write(response.text)
    return response.text
# Fetch the corpus into the module-level name `text`.
# Any failure is printed instead of being propagated.
try:
    text = download_text()
    print("Text downloaded succesfully!!")
except Exception as download_error:
    print(download_error)

Text downloaded succesfully!!


In [138]:
class LLM:
    """Character-level Markov model: each character is predicted from the previous one.

    Typical use::

        llm = LLM(text_file="input.txt")
        llm.fit()
        print(llm.generate(prompt="You shall not"))
    """

    def __init__(self, text_file = "input.txt", smoothing = 0.0):
        """Load the training text and collect its alphabet.

        Args:
            text_file: path of the training text.
            smoothing: additive constant added to every transition count
                before normalising; ``0.0`` (the default) disables smoothing.

        Raises:
            ValueError: if ``smoothing`` is negative.
            OSError: if ``text_file`` cannot be read.
        """
        if smoothing < 0:
            raise ValueError("smoothing must be non-negative")
        self.smoothing = smoothing
        self.text = self.__read__(text_file)
        self.unique_chars = self.__get_unigrams__()

    def fit(self):
        """Estimate the transition table and store it in ``self.prob_dict``.

        Errors are reported on stdout rather than raised; in that case
        ``self.prob_dict`` is not (re)assigned.
        """
        try:
            self.prob_dict = self.get_prob_dist()
            print("Let's create some bizzare stuff!!!!")
        except Exception as e:
            print(f"Something is wrong with {e}")

    def __get_unigrams__(self):
        """Return the set of distinct characters of the text (also stored on ``self.unique_chars``)."""
        self.unique_chars = set(self.text)
        return self.unique_chars

    def __read__(self, text_file)->str:
        """Return the whole content of ``text_file`` as one string."""
        with open(text_file, mode="r", encoding="utf-8") as file:
            return file.read()

    def get_prob_dist(self):
        """Count character-to-next-character transitions and normalise them.

        Returns:
            ``{char: {next_char: probability}}`` with one row per character of
            the alphabet and one column per character of the alphabet.
        """
        # Sorted so the table layout does not depend on set iteration order,
        # which makes sampling reproducible under a fixed random seed.
        alphabet = sorted(self.unique_chars)
        freq_ = {char: dict.fromkeys(alphabet, 0) for char in alphabet}

        # Walk over every adjacent pair (previous character, next character).
        for prev_char, next_char in zip(self.text, self.text[1:]):
            freq_[prev_char][next_char] += 1

        return {char: self.__normalize__(counts) for char, counts in freq_.items()}

    def __normalize__(self, freq_count:dict)->dict:
        """Turn a row of counts into probabilities (additive smoothing applied).

        The input mapping is left unmodified; a new dict is returned.
        A row with no observations at all (e.g. a character that only occurs
        as the very last character of the text) becomes a uniform
        distribution instead of triggering a division by zero.
        """
        if not freq_count:
            return {}
        eps = self.smoothing
        denominator = sum(freq_count.values()) + len(freq_count) * eps
        if denominator == 0:
            return dict.fromkeys(freq_count, 1.0 / len(freq_count))
        return {key: (count + eps) / denominator for key, count in freq_count.items()}

    def generate(self, prompt = "a", max_len = 1000):
        """Extend ``prompt`` one sampled character at a time.

        Args:
            prompt: starting text; its last character is the first context.
            max_len: maximum length of the returned string. A prompt that is
                already at least this long is returned unchanged.

        Returns:
            The prompt followed by sampled characters, ``max_len`` characters
            in total when the prompt is shorter than ``max_len``.

        Raises:
            AttributeError: if ``fit()`` has not produced ``self.prob_dict``.
            IndexError: if characters must be generated from an empty prompt.
            KeyError: if the current character never occurs in the training text.
        """
        pieces = [prompt]          # joined once at the end (no repeated concatenation)
        current = prompt[-1:]
        length = len(prompt)

        while length < max_len:
            if not current:
                raise IndexError("prompt must contain at least one character")
            try:
                row = self.prob_dict[current]
            except KeyError:
                raise KeyError(
                    f"character {current!r} does not occur in the training text"
                ) from None
            current = random.choices(list(row), weights=list(row.values()))[0]
            pieces.append(current)
            length += 1
        return "".join(pieces)

In [139]:
# Build the model from the cached corpus file, then estimate its transition table.
llm = LLM(text_file = "input.txt")
llm.fit()

Let's create some bizzare stuff!!!!


In [142]:
# Sample a continuation of the given prompt (default max_len) and print it.
print(llm.generate(prompt = "You shall not"))

You shall not y bue, mailepr, d! win be ke out fonwr;

KICE CANTy moio ffe.
N y haneeey, lad'd VICot quser'st but; t:
YComng tho can owine irand sar t t wnf d g denot bla pa he d atoucet y: teengro-trs, to d ce povintiod bruard cor we bonou totlofat wicheleasind us ond d thecak ol:

As ut owanoubeak bls
T:
Bus, fotothirer y iledorise th, sm,
Whe mad wiche thomsid eng I EONThef aurr m ty s t fe
Lot thick ne de blucall les hit;
Met, t it!
Bank ns oure far tst:

Whaner lopometoure be m acthe mamughin

Anteod s,
Papastofou chere n s
Sorou ise thead O:
ANTuleat a n honthaliteand hor t frsthe touch ms l.
Oro t-s t oing wel, acothinthen,
Wherevime thes bl Gou s
Jumis bak co f h dooo, thefourd hathe, n h prely favener aresthe oto:
Ay.
SE:
Sond t
QULe g.
ICENG t.
AMar
RI be m hach akea me tak isheerut t berup itat an, been.
F al ve baknon:
Gevinginiterdl ted lise fon ty,
Is thenderbly tisu howheach It rr----meno son y wavearsthore wertaurithas ppase t brs my s fo g wnoll s
Goreleed yer IAno d t

In [197]:
class LLM:
    """Character-level n-gram model: the next character is sampled from the last ``n`` characters.

    Typical use::

        llm = LLM(look_behind_window=3)
        llm.fit()
        print(llm.generate(prompt="First"))
    """

    def __init__(self,
                 look_behind_window = 2,
                 text_file = "input.txt",
                 smoothing = 0.5):
        """Load the training text and collect its distinct n-grams.

        Args:
            look_behind_window: number of characters used as context (``n``).
            text_file: path of the training text.
            smoothing: additive constant added to every observed successor
                count before normalising (default ``0.5``).

        Raises:
            ValueError: if ``look_behind_window`` is smaller than 1 or
                ``smoothing`` is negative.
            OSError: if ``text_file`` cannot be read.
        """
        if look_behind_window < 1:
            raise ValueError("look_behind_window must be at least 1")
        if smoothing < 0:
            raise ValueError("smoothing must be non-negative")
        self.n_gram = look_behind_window
        self.smoothing = smoothing
        self.text = self.__read__(text_file)
        self.unique_ngrams = self.__get_ngrams__()

    def __get_ngrams__(self):
        """Return the set of distinct substrings of length ``self.n_gram`` in the text."""
        n = self.n_gram
        # Stop at len - n so that no truncated slice from the tail of the text
        # is recorded as if it were a full n-gram.
        return {self.text[i:i + n] for i in range(len(self.text) - n + 1)}

    def __read__(self, text_file)->str:
        """Return the whole content of ``text_file`` as one string."""
        with open(text_file, mode="r", encoding="utf-8") as file:
            return file.read()

    def fit(self):
        """Estimate the transition table and store it in ``self.prob_dict``.

        Errors are reported on stdout rather than raised; in that case
        ``self.prob_dict`` is not (re)assigned.
        """
        try:
            self.prob_dict = self.get_prob_dist()
            print("Let's create some bizzare stuff!!!!")
        except Exception as e:
            print(f"Something is wrong with {e}")

    def get_prob_dist(self):
        """Count n-gram to shifted-n-gram transitions in ``self.text`` and normalise them.

        Returns:
            ``{context: {successor: probability}}`` where ``successor`` is the
            n-gram that starts one character after ``context``. Every known
            n-gram has a row; the row is empty when the n-gram is never
            followed by another character.
        """
        corpus = self.text
        n = self.n_gram
        freq_ = {n_gram: {} for n_gram in self.unique_ngrams}

        # i runs over every position that still has one character after the
        # n-gram, so the last transition of the text is counted as well.
        for i in range(len(corpus) - n):
            context = corpus[i:i + n]
            successor = corpus[i + 1:i + n + 1]
            row = freq_.setdefault(context, {})
            row[successor] = row.get(successor, 0) + 1

        return {context: self.__normalize__(row) for context, row in freq_.items()}

    def __normalize__(self, freq_count:dict)->dict:
        """Turn a row of successor counts into probabilities (additive smoothing applied).

        The input mapping is left unmodified; a new dict is returned. An empty
        row yields an empty dict.
        """
        if not freq_count:
            return {}
        eps = self.smoothing
        denominator = sum(freq_count.values()) + len(freq_count) * eps
        if denominator == 0:
            return dict.fromkeys(freq_count, 1.0 / len(freq_count))
        return {key: (count + eps) / denominator for key, count in freq_count.items()}

    def generate(self, prompt = "Fir", max_len = 1000):
        """Extend ``prompt`` one sampled character at a time.

        Args:
            prompt: starting text; its last ``n`` characters are the first context.
            max_len: maximum length of the returned string. A prompt that is
                already at least this long is returned unchanged.

        Returns:
            The prompt followed by sampled characters. Generation stops at
            ``max_len`` characters, or earlier when the current context has no
            recorded successor.

        Raises:
            AttributeError: if ``fit()`` has not produced ``self.prob_dict``.
            KeyError: if the current context never occurs in the training text
                (for instance a prompt shorter than ``n`` characters).
        """
        n = self.n_gram
        pieces = [prompt]          # joined once at the end (no repeated concatenation)
        context = prompt[-n:]
        length = len(prompt)

        while length < max_len:
            try:
                successors = self.prob_dict[context]
            except KeyError:
                raise KeyError(
                    f"context {context!r} does not occur in the training text; "
                    f"the prompt must end with a known {n}-character sequence"
                ) from None
            if not successors:
                # Dead end: this n-gram was never followed by anything.
                break
            chosen = random.choices(list(successors), weights=list(successors.values()))[0]
            next_char = chosen[-1]
            pieces.append(next_char)
            context = (context + next_char)[-n:]
            length += 1
        return "".join(pieces)

In [198]:
# Rebuild the model with a context of three characters (text_file keeps its default).
llm = LLM(look_behind_window=3)

In [199]:
# Estimate the transition table; fit() stores it in llm.prob_dict.
llm.fit()

Let's create some bizzare stuff!!!!


In [202]:
# Inspect the learned table: {context: {successor n-gram: probability}}.
llm.prob_dict

{'Jun': {'uni': 0.3, 'uno': 0.7},
 'ail': {'ily': 0.055408970976253295,
  'il,': 0.11345646437994723,
  'ils': 0.12928759894459102,
  "il'": 0.06596306068601583,
  'ili': 0.044854881266490766,
  'il!': 0.03430079155672823,
  'il ': 0.2717678100263852,
  'ilo': 0.12928759894459102,
  'il;': 0.023746701846965697,
  'il.': 0.044854881266490766,
  'il-': 0.0079155672823219,
  'il\n': 0.018469656992084433,
  'il:': 0.0079155672823219,
  'ile': 0.018469656992084433,
  'ilt': 0.013192612137203167,
  'il?': 0.013192612137203167,
  'ilm': 0.0079155672823219},
 "'pa": {'par': 1.0},
 ', a': {' an': 0.6283837056504599,
  ' aw': 0.01445466491458607,
  ' as': 0.14323258869908015,
  ' a ': 0.06438896189224705,
  ' ar': 0.026544021024967147,
  ' at': 0.022864651773981604,
  ' ap': 0.0023653088042049934,
  ' al': 0.039684625492772664,
  ' am': 0.008672798948751642,
  ' ai': 0.0007884362680683311,
  ' af': 0.011826544021024968,
  ' ag': 0.004467805519053876,
  ' av': 0.001314060446780552,
  ' a\n': 0.00

In [201]:
# Use the first 10 characters of the downloaded corpus (`text`) as the prompt.
print(llm.generate(prompt = text[:10]))

First Citizen sationsion: bries and along'd him?

Seconding such I am
To me hear,
And they're charget hear breasy to you take on most fathe rull resh stong.

GONZALO:
Now, I don mysely me commercutor way thy then is effends such your could ever deed.

MERLAND:
He'll that int
My of Edwarranior I good used his ther solent.

MENES:
My eyes
Ere's resould
Or Did you her, betrothen,
Answelconce;
What purport's prink our love you takes!
Yet the a find;
And figuriolent you so be more to must a kings
Will bear. Thusband I pring,

KINGHAM:
He her. O, let the not fell, inderink meth slike frief still stancession my lord, I
hear,
Than with him!
Where beheart's cher, out
Of thou must count Hall being confess
Will minesses' excuse
Thou him, not so watch stive gentlemaid me less
That I businst ret counger duke me ack band cypretch you
A me upon!

ESCALUS:
Whichard for tonio.

CAPULET:
Wearcius parth,
Is clother's.
The cries and soluted and and by at a like dove me loves, fired me a hopention hear of 

In [184]:
# Check whether the literal string "TRANIO" occurs in the corpus.
"TRANIO" in text

True