In [1]:
import math
import random
import numpy as np
import pandas as pd
import nltk
import codecs
nltk.data.path.append('.')

In [2]:
with codecs.open("fiction.tokenized.shuffled.txt", "r",  "utf_8_sig") as f:
    data = f.read()
print("Data type:", type(data))
print("Number of letters:", len(data))
print("First 300 letters of the data")
print("-------")
display(data[0:300])
print("-------")

print("Last 300 letters of the data")
print("-------")
display(data[-300:])
print("-------")

Data type: <class 'str'>
Number of letters: 108768304
First 300 letters of the data
-------


"Отож всі й поспішають захопити в м'якому\nТак\nЖах не випускав їх з своїх пазурів ані на мить\nЯкось ще в сьомому класі були вони разом на дитячому сеансі в кіно і тоді Віталієві випало сидіти поруч із Тонею коли загасили світло вона сиділа незвично притихла й здається не дихала а тоді вже коли фільм і"

-------
Last 300 letters of the data
-------


'е дусі\nВін готував бомбу\nЕ що там врешті сон\nА може\nВ цей час тихо по-злодійськи відчинилися двері і в землянку навшпиньках увійшов Безбородько\nТарас прийшов як тільки стало темніти\nПостукаєте тричі в бічні двері лівого крила\nНу а ти ж звернувся до Оксена Ягнич колинебудь хоч ногою на палубу ступав\n'

-------


In [3]:
def split_to_sentences(data):

    sentences = data.split('\n')
   
    sentences = [s.strip() for s in sentences]
    sentences = [s for s in sentences if len(s) > 0]
    
    return sentences    

In [4]:
# test your code
x = """
Я маю ручку.\nЯ маю яблуко. \nAh\nЯблуко ручка.\n
"""
print(x)

split_to_sentences(x)


Я маю ручку.
Я маю яблуко. 
Ah
Яблуко ручка.




['Я маю ручку.', 'Я маю яблуко.', 'Ah', 'Яблуко ручка.']

In [5]:
def tokenize_sentences(sentences):
   
    tokenized_sentences = []

    for sentence in sentences:
        
        # Convert to lowercase letters
        sentence = sentence.lower()
        
        # Convert into a list of words
        tokenized = nltk.word_tokenize(sentence)
        
        # append the list of words to the list of lists
        tokenized_sentences.append(tokenized)
    
    
    return tokenized_sentences

In [6]:
sentences = ["Sky is blue.", "Leaves are green.", "Roses are red."]
tokenize_sentences(sentences)

[['sky', 'is', 'blue', '.'],
 ['leaves', 'are', 'green', '.'],
 ['roses', 'are', 'red', '.']]

In [7]:
def get_tokenized_data(data):
  
    sentences = split_to_sentences(data)
    
    # Get the list of lists of tokens by tokenizing the sentences
    tokenized_sentences = tokenize_sentences(sentences)
    
    
    return tokenized_sentences

In [8]:
x = "Небо зараз таке голубе.\nLeaves are green\nRoses are red."
get_tokenized_data(x)

[['небо', 'зараз', 'таке', 'голубе', '.'],
 ['leaves', 'are', 'green'],
 ['roses', 'are', 'red', '.']]

In [9]:
tokenized_data = get_tokenized_data(data)
random.seed(87)
random.shuffle(tokenized_data)

train_size = int(len(tokenized_data) * 0.8)
train_data = tokenized_data[0:train_size]
test_data = tokenized_data[train_size:]

In [10]:
print("{} data are split into {} train and {} test set".format(
    len(tokenized_data), len(train_data), len(test_data)))

print("First training sample:")
print(train_data[0])
      
print("First test sample")
print(test_data[0])

1811548 data are split into 1449238 train and 362310 test set
First training sample:
['у', 'світлиці', 'стало', 'видно', 'хоч', 'голки', 'збирай']
First test sample
['висока', 'на', 'зріст', 'рівна', 'станом', 'але', 'не', 'дуже', 'тонка', 'з', 'кремезними', 'ногами', 'з', 'рукавами', 'позакачуваними', 'по', 'лікті', 'з', 'чорними', 'косами', 'вона', 'була', 'ніби', 'намальована', 'на', 'білій', 'стіні']


In [11]:
def count_words(tokenized_sentences):
          
    word_counts = {}
   
    for sentence in tokenized_sentences: 
        
        for token in sentence: 
            if token not in word_counts:
                word_counts[token] = 1
            
            else:
                word_counts[token] += 1
    
    return word_counts

In [12]:
tokenized_sentences = [['sky', 'is', 'blue', '.'],
                       ['leaves', 'are', 'green', '.'],
                       ['roses', 'are', 'red', '.']]
count_words(tokenized_sentences)

{'sky': 1,
 'is': 1,
 'blue': 1,
 '.': 3,
 'leaves': 1,
 'are': 2,
 'green': 1,
 'roses': 1,
 'red': 1}

In [13]:
def get_words_with_nplus_frequency(tokenized_sentences, count_threshold):
    
    closed_vocab = []
    
    word_counts = count_words(tokenized_sentences)
    
    for word, cnt in word_counts.items(): 
        
        if cnt >= count_threshold:
        
            closed_vocab.append(word)
    
    return closed_vocab

In [14]:
tokenized_sentences = [['sky', 'is', 'blue', '.'],
                       ['leaves', 'are', 'green', '.'],
                       ['roses', 'are', 'red', '.']]
tmp_closed_vocab = get_words_with_nplus_frequency(tokenized_sentences, count_threshold=2)
print(f"Closed vocabulary:")
print(tmp_closed_vocab)

Closed vocabulary:
['.', 'are']


In [15]:
def replace_oov_words_by_unk(tokenized_sentences, vocabulary, unknown_token="<unk>"):

    vocabulary = set(vocabulary)
    
    replaced_tokenized_sentences = []

    for sentence in tokenized_sentences:
       
        replaced_sentence = []
       
        for token in sentence: 
            
            if token in vocabulary: 
                replaced_sentence.append(token)
            else:
                replaced_sentence.append(unknown_token)        
        replaced_tokenized_sentences.append(replaced_sentence)
        
    return replaced_tokenized_sentences

In [16]:
tokenized_sentences = [["dogs", "run"], ["cats", "sleep"]]
vocabulary = ["dogs", "sleep"]
tmp_replaced_tokenized_sentences = replace_oov_words_by_unk(tokenized_sentences, vocabulary)
print(f"Original sentence:")
print(tokenized_sentences)
print(f"tokenized_sentences with less frequent words converted to '<unk>':")
print(tmp_replaced_tokenized_sentences)

Original sentence:
[['dogs', 'run'], ['cats', 'sleep']]
tokenized_sentences with less frequent words converted to '<unk>':
[['dogs', '<unk>'], ['<unk>', 'sleep']]


In [17]:
def preprocess_data(train_data, test_data, count_threshold):
    
    vocabulary = get_words_with_nplus_frequency(train_data, count_threshold)
    
    train_data_replaced = replace_oov_words_by_unk(train_data, vocabulary)
    
    test_data_replaced = replace_oov_words_by_unk(test_data, vocabulary)

    return train_data_replaced, test_data_replaced, vocabulary

In [18]:
tmp_train = [['sky', 'is', 'blue', '.'],
     ['leaves', 'are', 'green']]
tmp_test = [['roses', 'are', 'red', '.']]

tmp_train_repl, tmp_test_repl, tmp_vocab = preprocess_data(tmp_train, 
                                                           tmp_test, 
                                                           count_threshold = 1)

print("tmp_train_repl")
print(tmp_train_repl)
print()
print("tmp_test_repl")
print(tmp_test_repl)
print()
print("tmp_vocab")
print(tmp_vocab)

tmp_train_repl
[['sky', 'is', 'blue', '.'], ['leaves', 'are', 'green']]

tmp_test_repl
[['<unk>', 'are', '<unk>', '.']]

tmp_vocab
['sky', 'is', 'blue', '.', 'leaves', 'are', 'green']


In [19]:
minimum_freq = 2
train_data_processed, test_data_processed, vocabulary = preprocess_data(train_data, 
                                                                        test_data, 
                                                                        minimum_freq)

In [20]:
print("First preprocessed training sample:")
print(train_data_processed[0])
print()
print("First preprocessed test sample:")
print(test_data_processed[0])
print()
print("First 10 vocabulary:")
print(vocabulary[0:10])
print()
print("Size of vocabulary:", len(vocabulary))

First preprocessed training sample:
['у', 'світлиці', 'стало', 'видно', 'хоч', 'голки', 'збирай']

First preprocessed test sample:
['висока', 'на', 'зріст', 'рівна', 'станом', 'але', 'не', 'дуже', 'тонка', 'з', 'кремезними', 'ногами', 'з', 'рукавами', 'позакачуваними', 'по', 'лікті', 'з', 'чорними', 'косами', 'вона', 'була', 'ніби', 'намальована', 'на', 'білій', 'стіні']

First 10 vocabulary:
['у', 'світлиці', 'стало', 'видно', 'хоч', 'голки', 'збирай', 'ще', 'одна', 'будь']

Size of vocabulary: 336327


In [21]:
def count_n_grams(data, n, start_token='<s>', end_token = '<e>'):
  
    n_grams = {}

    for sentence in data: 
        
        sentence =  [start_token] * n+ sentence + [end_token]
        
        sentence = tuple(sentence)
        
        m = len(sentence) if n==1 else len(sentence)-1
        for i in range(m): 

            n_gram = sentence[i:i+n]

            if n_gram in n_grams.keys(): 
            
                n_grams[n_gram] += 1
            else:
                n_grams[n_gram] = 1
    
    return n_grams

In [22]:
sentences = [['i', 'like', 'a', 'cat'],
             ['this', 'dog', 'is', 'like', 'a', 'cat']]
print("Uni-gram:")
print(count_n_grams(sentences, 1))
print("Bi-gram:")
print(count_n_grams(sentences, 2))

Uni-gram:
{('<s>',): 2, ('i',): 1, ('like',): 2, ('a',): 2, ('cat',): 2, ('<e>',): 2, ('this',): 1, ('dog',): 1, ('is',): 1}
Bi-gram:
{('<s>', '<s>'): 2, ('<s>', 'i'): 1, ('i', 'like'): 1, ('like', 'a'): 2, ('a', 'cat'): 2, ('cat', '<e>'): 2, ('<s>', 'this'): 1, ('this', 'dog'): 1, ('dog', 'is'): 1, ('is', 'like'): 1}


In [23]:
def estimate_probability(word, previous_n_gram, 
                         n_gram_counts, n_plus1_gram_counts, vocabulary_size, k=1.0):
   
    previous_n_gram = tuple(previous_n_gram)
  
    previous_n_gram_count =  n_gram_counts[previous_n_gram] if previous_n_gram in n_gram_counts  else 0

    denominator = previous_n_gram_count + k * vocabulary_size

    n_plus1_gram = previous_n_gram + (word,)
  
 
    n_plus1_gram_count = n_plus1_gram_counts[n_plus1_gram] if n_plus1_gram in n_plus1_gram_counts  else 0
        
  
    numerator = n_plus1_gram_count + k

    probability = numerator / denominator
    
    
    return probability

In [24]:
sentences = [['i', 'like', 'a', 'cat'],
             ['this', 'dog', 'is', 'like', 'a', 'cat']]
unique_words = list(set(sentences[0] + sentences[1]))

unigram_counts = count_n_grams(sentences, 1)
bigram_counts = count_n_grams(sentences, 2)
tmp_prob = estimate_probability("cat", "a", unigram_counts, bigram_counts, len(unique_words), k=1)

print(f"The estimated probability of word 'cat' given the previous n-gram 'a' is: {tmp_prob:.4f}")

The estimated probability of word 'cat' given the previous n-gram 'a' is: 0.3333


In [25]:
def estimate_probabilities(previous_n_gram, n_gram_counts, n_plus1_gram_counts, vocabulary, k=1.0):
    
    previous_n_gram = tuple(previous_n_gram)
    
    vocabulary = vocabulary + ["<e>", "<unk>"]
    vocabulary_size = len(vocabulary)
    
    probabilities = {}
    for word in vocabulary:
        probability = estimate_probability(word, previous_n_gram, 
                                           n_gram_counts, n_plus1_gram_counts, 
                                           vocabulary_size, k=k)
        probabilities[word] = probability

    return probabilities

In [26]:
sentences = [['i', 'like', 'a', 'cat'],
             ['this', 'dog', 'is', 'like', 'a', 'cat']]
unique_words = list(set(sentences[0] + sentences[1]))
unigram_counts = count_n_grams(sentences, 1)
bigram_counts = count_n_grams(sentences, 2)
estimate_probabilities("a", unigram_counts, bigram_counts, unique_words, k=1)

{'dog': 0.09090909090909091,
 'like': 0.09090909090909091,
 'a': 0.09090909090909091,
 'i': 0.09090909090909091,
 'this': 0.09090909090909091,
 'is': 0.09090909090909091,
 'cat': 0.2727272727272727,
 '<e>': 0.09090909090909091,
 '<unk>': 0.09090909090909091}

In [27]:
trigram_counts = count_n_grams(sentences, 3)
estimate_probabilities(["<s>", "<s>"], bigram_counts, trigram_counts, unique_words, k=1)

{'dog': 0.09090909090909091,
 'like': 0.09090909090909091,
 'a': 0.09090909090909091,
 'i': 0.18181818181818182,
 'this': 0.18181818181818182,
 'is': 0.09090909090909091,
 'cat': 0.09090909090909091,
 '<e>': 0.09090909090909091,
 '<unk>': 0.09090909090909091}

In [28]:
def make_count_matrix(n_plus1_gram_counts, vocabulary):
    vocabulary = vocabulary + ["<e>", "<unk>"]
    
    n_grams = []
    for n_plus1_gram in n_plus1_gram_counts.keys():
        n_gram = n_plus1_gram[0:-1]
        n_grams.append(n_gram)
    n_grams = list(set(n_grams))
    
    row_index = {n_gram:i for i, n_gram in enumerate(n_grams)}
    col_index = {word:j for j, word in enumerate(vocabulary)}
    
    nrow = len(n_grams)
    ncol = len(vocabulary)
    count_matrix = np.zeros((nrow, ncol))
    for n_plus1_gram, count in n_plus1_gram_counts.items():
        n_gram = n_plus1_gram[0:-1]
        word = n_plus1_gram[-1]
        if word not in vocabulary:
            continue
        i = row_index[n_gram]
        j = col_index[word]
        count_matrix[i, j] = count
    
    count_matrix = pd.DataFrame(count_matrix, index=n_grams, columns=vocabulary)
    return count_matrix

In [29]:
sentences = [['i', 'like', 'a', 'cat'],
                 ['this', 'dog', 'is', 'like', 'a', 'cat']]
unique_words = list(set(sentences[0] + sentences[1]))
bigram_counts = count_n_grams(sentences, 2)

print('bigram counts')
display(make_count_matrix(bigram_counts, unique_words))

bigram counts


Unnamed: 0,dog,like,a,i,this,is,cat,<e>,<unk>
"(like,)",0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
"(i,)",0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(this,)",1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(is,)",0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(dog,)",0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
"(cat,)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
"(<s>,)",0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
"(a,)",0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0


In [30]:
print('\ntrigram counts')
trigram_counts = count_n_grams(sentences, 3)
display(make_count_matrix(trigram_counts, unique_words))


trigram counts


Unnamed: 0,dog,like,a,i,this,is,cat,<e>,<unk>
"(dog, is)",0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(<s>, i)",0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(<s>, this)",1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(like, a)",0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
"(<s>, <s>)",0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
"(cat,)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
"(i, like)",0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
"(this, dog)",0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
"(a, cat)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
"(is, like)",0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
def make_probability_matrix(n_plus1_gram_counts, vocabulary, k):
    count_matrix = make_count_matrix(n_plus1_gram_counts, unique_words)
    count_matrix += k
    prob_matrix = count_matrix.div(count_matrix.sum(axis=1), axis=0)
    return prob_matrix

In [32]:
sentences = [['i', 'like', 'a', 'cat'],
                 ['this', 'dog', 'is', 'like', 'a', 'cat']]
unique_words = list(set(sentences[0] + sentences[1]))
bigram_counts = count_n_grams(sentences, 2)
print("bigram probabilities")
display(make_probability_matrix(bigram_counts, unique_words, k=1))

bigram probabilities


Unnamed: 0,dog,like,a,i,this,is,cat,<e>,<unk>
"(like,)",0.090909,0.090909,0.272727,0.090909,0.090909,0.090909,0.090909,0.090909,0.090909
"(i,)",0.1,0.2,0.1,0.1,0.1,0.1,0.1,0.1,0.1
"(this,)",0.2,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1
"(is,)",0.1,0.2,0.1,0.1,0.1,0.1,0.1,0.1,0.1
"(dog,)",0.1,0.1,0.1,0.1,0.1,0.2,0.1,0.1,0.1
"(cat,)",0.090909,0.090909,0.090909,0.090909,0.090909,0.090909,0.090909,0.272727,0.090909
"(<s>,)",0.090909,0.090909,0.090909,0.181818,0.181818,0.090909,0.090909,0.090909,0.090909
"(a,)",0.090909,0.090909,0.090909,0.090909,0.090909,0.090909,0.272727,0.090909,0.090909


In [33]:
print("trigram probabilities")
trigram_counts = count_n_grams(sentences, 3)
display(make_probability_matrix(trigram_counts, unique_words, k=1))

trigram probabilities


Unnamed: 0,dog,like,a,i,this,is,cat,<e>,<unk>
"(dog, is)",0.1,0.2,0.1,0.1,0.1,0.1,0.1,0.1,0.1
"(<s>, i)",0.1,0.2,0.1,0.1,0.1,0.1,0.1,0.1,0.1
"(<s>, this)",0.2,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1
"(like, a)",0.090909,0.090909,0.090909,0.090909,0.090909,0.090909,0.272727,0.090909,0.090909
"(<s>, <s>)",0.090909,0.090909,0.090909,0.181818,0.181818,0.090909,0.090909,0.090909,0.090909
"(cat,)",0.090909,0.090909,0.090909,0.090909,0.090909,0.090909,0.090909,0.272727,0.090909
"(i, like)",0.1,0.1,0.2,0.1,0.1,0.1,0.1,0.1,0.1
"(this, dog)",0.1,0.1,0.1,0.1,0.1,0.2,0.1,0.1,0.1
"(a, cat)",0.090909,0.090909,0.090909,0.090909,0.090909,0.090909,0.090909,0.272727,0.090909
"(is, like)",0.1,0.1,0.2,0.1,0.1,0.1,0.1,0.1,0.1


In [34]:
def calculate_perplexity(sentence, n_gram_counts, n_plus1_gram_counts, vocabulary_size, k=1.0):

    n = len(list(n_gram_counts.keys())[0]) 
    sentence = ["<s>"] * n + sentence + ["<e>"]
    
    sentence = tuple(sentence)
    
    N = len(sentence)
  
    product_pi = 1.0

    for t in range(n, N): # complete this line

        n_gram = sentence[t-n:t]
        
        word = sentence[t]
        
        probability = estimate_probability(word,n_gram, n_gram_counts, n_plus1_gram_counts, len(unique_words), k=1)
   
        product_pi *=  1 / probability

    perplexity = product_pi**(1/float(N))

    return perplexity

In [35]:
sentences = [['i', 'like', 'a', 'cat'],
                 ['this', 'dog', 'is', 'like', 'a', 'cat']]
unique_words = list(set(sentences[0] + sentences[1]))

unigram_counts = count_n_grams(sentences, 1)
bigram_counts = count_n_grams(sentences, 2)


perplexity_train1 = calculate_perplexity(sentences[0],
                                         unigram_counts, bigram_counts,
                                         len(unique_words), k=1.0)
print(f"Perplexity for first train sample: {perplexity_train1:.4f}")

test_sentence = ['i', 'like', 'a', 'dog']
perplexity_test = calculate_perplexity(test_sentence,
                                       unigram_counts, bigram_counts,
                                       len(unique_words), k=1.0)
print(f"Perplexity for test sample: {perplexity_test:.4f}")

Perplexity for first train sample: 2.8040
Perplexity for test sample: 3.9654


In [36]:
def suggest_a_word(previous_tokens, n_gram_counts, n_plus1_gram_counts, vocabulary, k=1.0, start_with=None):
  
    n = len(list(n_gram_counts.keys())[0]) 
    
    
    previous_n_gram = previous_tokens[-n:]

    probabilities = estimate_probabilities(previous_n_gram,
                                           n_gram_counts, n_plus1_gram_counts,
                                           vocabulary, k=k)
  
    suggestion = None
    

    max_prob = 0
    
  
    for word, prob in probabilities.items():
        
       
        if start_with != None: 
            
          
            if not word.startswith(start_with): 

                continue  
        
     
        if prob > max_prob:
            
            suggestion = word
            
            max_prob = prob

    
    return suggestion, max_prob

In [37]:
sentences = [['i', 'like', 'a', 'cat'],
             ['this', 'dog', 'is', 'like', 'a', 'cat']]
unique_words = list(set(sentences[0] + sentences[1]))

unigram_counts = count_n_grams(sentences, 1)
bigram_counts = count_n_grams(sentences, 2)

previous_tokens = ["i", "like"]
tmp_suggest1 = suggest_a_word(previous_tokens, unigram_counts, bigram_counts, unique_words, k=1.0)
print(f"The previous words are 'i like',\n\tand the suggested word is `{tmp_suggest1[0]}` with a probability of {tmp_suggest1[1]:.4f}")

print()
# test your code when setting the starts_with
tmp_starts_with = 'c'
tmp_suggest2 = suggest_a_word(previous_tokens, unigram_counts, bigram_counts, unique_words, k=1.0, start_with=tmp_starts_with)
print(f"The previous words are 'i like', the suggestion must start with `{tmp_starts_with}`\n\tand the suggested word is `{tmp_suggest2[0]}` with a probability of {tmp_suggest2[1]:.4f}")

The previous words are 'i like',
	and the suggested word is `a` with a probability of 0.2727

The previous words are 'i like', the suggestion must start with `c`
	and the suggested word is `cat` with a probability of 0.0909


In [38]:
def get_suggestions(previous_tokens, n_gram_counts_list, vocabulary, k=1.0, start_with=None):
    model_counts = len(n_gram_counts_list)
    suggestions = []
    for i in range(model_counts-1):
        n_gram_counts = n_gram_counts_list[i]
        n_plus1_gram_counts = n_gram_counts_list[i+1]
        
        suggestion = suggest_a_word(previous_tokens, n_gram_counts,
                                    n_plus1_gram_counts, vocabulary,
                                    k=k, start_with=start_with)
        suggestions.append(suggestion)
    return suggestions

In [39]:
sentences = [['i', 'like', 'a', 'cat'],
             ['this', 'dog', 'is', 'like', 'a', 'cat']]
unique_words = list(set(sentences[0] + sentences[1]))

unigram_counts = count_n_grams(sentences, 1)
bigram_counts = count_n_grams(sentences, 2)
trigram_counts = count_n_grams(sentences, 3)
quadgram_counts = count_n_grams(sentences, 4)
qintgram_counts = count_n_grams(sentences, 5)

n_gram_counts_list = [unigram_counts, bigram_counts, trigram_counts, quadgram_counts, qintgram_counts]
previous_tokens = ["i", "like"]
tmp_suggest3 = get_suggestions(previous_tokens, n_gram_counts_list, unique_words, k=1.0)

print(f"The previous words are 'i like', the suggestions are:")
display(tmp_suggest3)

The previous words are 'i like', the suggestions are:


[('a', 0.2727272727272727),
 ('a', 0.2),
 ('dog', 0.1111111111111111),
 ('dog', 0.1111111111111111)]

In [41]:
previous_tokens = ["він", "там", 'ваще', 'на']
tmp_suggest4 = get_suggestions(previous_tokens, n_gram_counts_list, vocabulary, k=1.0)

print(f"The previous words are {previous_tokens}, the suggestions are:")
display(tmp_suggest4)

The previous words are ['він', 'там', 'ваще', 'на'], the suggestions are:


[('<unk>', 0.009441485855896232),
 ('у', 2.973279140365535e-06),
 ('у', 2.973279140365535e-06),
 ('у', 2.973279140365535e-06)]

In [42]:
previous_tokens = ["пішов", "на"]
tmp_suggest5 = get_suggestions(previous_tokens, n_gram_counts_list, vocabulary, k=1.0, start_with='х')

print(f"The previous words are {previous_tokens}, the suggestions are:")
display(tmp_suggest5)

The previous words are ['пішов', 'на'], the suggestions are:


[('хвилину', 0.0008106825551659593),
 ('хитрощі', 1.1878530150679155e-05),
 ('хоч', 2.973279140365535e-06),
 ('хоч', 2.973279140365535e-06)]

In [60]:
def suggest(phrase):
    words = nltk.word_tokenize(phrase)
    tmp_suggest = get_suggestions(words[:-1], n_gram_counts_list, vocabulary, k=1.0)
    display(tmp_suggest)

In [58]:
def suggest_with_beginning(phrase):
    words = nltk.word_tokenize(phrase)
    tmp_suggest = get_suggestions(words[:-1], n_gram_counts_list, vocabulary, k=1.0, start_with=words[-1])
    display(tmp_suggest)

In [112]:
suggest("у марусі по господарству було")

[('небагато', 0.005308279208390283),
 ('роботи', 0.000730827873640267),
 ('б', 0.000712827835640267),
 ('аж', 2.973279140365535e-06)]


In [114]:
suggest_with_beginning("йдемо пасти худ")

[('худобу', 2.3776265343121228e-05),
 ('художній', 2.973279140365535e-06),
 ('художній', 2.973279140365535e-06),
 ('художній', 2.973279140365535e-06)]