In [43]:
import spacy
from spacy.tokens import Doc
from spacy.lang.en import English
nlp = English(pipeline=[], max_length=5000000)

We will train our model on two novels: Emma and Persuasion. We will need to a SpaCy doc for
each text and merge them using the Doc.from_docs() function:
```python
def make_doc(files):
    docs = []
    for f in files:
        with open(f,’r’, encoding=’latin1’) as fn:
            docs.append(nlp(fn.read()))
    return Doc.from_docs(docs)
```

In [44]:
def make_doc(files):
    docs = []
    for f in files:
        with open(f,'r', encoding='latin1') as fn:
            docs.append(nlp(fn.read()))
    return Doc.from_docs(docs)

In [45]:
docs = make_doc(['/content/drive/MyDrive/Colab Notebooks/NLP_Tuần 3/Emma.txt','/content/drive/MyDrive/Colab Notebooks/NLP_Tuần 3/Persuasion.txt'])

In [46]:
def n_gram(tokens,n):
    # Write your code here
    tokens = [token.text for token in tokens]

    ngrams = []

    for i in range(len(tokens) - n + 1):
        ngram = ' '.join(tokens[i:i+n])
        ngrams.append(ngram)

    return ngrams

## 3.1 N-gram counts

Now we will need to store counts for each of the bigrams and trigrams in our text. Write a function called **get_ngram_counts**(docs)
that takes your training doc and uses your n_gram functions to calculate the n-gram counts within the training text.

In [47]:
def get_ngram_counts(docs):
    # Write your code here  
    a = {}
    b = {}
    c = {}

    unigrams = n_gram(docs, 1)
    for unigram in unigrams:
        if unigram in a:
            a[unigram] += 1
        else:
            a[unigram] = 1

    bigrams = n_gram(docs, 2)
    for bigram in bigrams:
        if bigram in b:
            b[bigram] += 1
        else:
            b[bigram] = 1

    trigrams = n_gram(docs, 3)
    for trigram in trigrams:
        if trigram in c:
            c[trigram] += 1
        else:
            c[trigram] = 1

    return a, b, c


In [48]:
unigram_coll, bigram_coll, trigram_coll = get_ngram_counts(docs)

In [49]:
bigram_coll

{'[ Emma': 1,
 'Emma by': 1,
 'by Jane': 4,
 'Jane Austen': 3,
 'Austen 1816': 1,
 '1816 ]': 1,
 '] \n\n\n': 1,
 '\n\n\n VOLUME': 1,
 'VOLUME I': 1,
 'I \n\n\n\n': 1,
 '\n\n\n\n CHAPTER': 55,
 'CHAPTER I': 3,
 'I \n\n\n': 3,
 '\n\n\n Emma': 9,
 'Emma Woodhouse': 5,
 'Woodhouse ,': 118,
 ', handsome': 4,
 'handsome ,': 10,
 ', clever': 1,
 'clever ,': 10,
 ', and': 2666,
 'and rich': 1,
 'rich ,': 6,
 ', with': 238,
 'with a': 173,
 'a comfortable': 6,
 'comfortable home': 3,
 'home \n': 8,
 '\n and': 921,
 'and happy': 11,
 'happy disposition': 1,
 'disposition ,': 6,
 ', seemed': 23,
 'seemed to': 81,
 'to unite': 1,
 'unite some': 1,
 'some of': 26,
 'of the': 970,
 'the best': 58,
 'best blessings': 2,
 'blessings \n': 2,
 '\n of': 468,
 'of existence': 4,
 'existence ;': 2,
 '; and': 1135,
 'and had': 58,
 'had lived': 9,
 'lived nearly': 1,
 'nearly twenty': 1,
 'twenty -': 12,
 '- one': 1,
 'one years': 1,
 'years in': 2,
 'in the': 737,
 'the world': 104,
 'world \n': 2,
 '\n wi

## 3.2 Next word probability

We now have all the pieces to calculate the probability of a word given the two words that come
before it. Write a function called **calc_ngram_prob**. It should take a trigram, a bigram Collection,
and a trigram Collection, and it should return the probability of the trigram according to the MLE
n-gram formula:

$
p(w_n|w_{n−2}w_{n−1}) =
\frac{C(w_{n−2}w_{n−1}w_n)}{C(w_{n−2}w_{n−1})}
$

For numerical stability, we will work with log probabilities. Wrap the division term in a call to
**math.log** to convert it from a probability to a log probability.

You will also need to handle cases where the bigram or trigram is not in the given collection.
Return negative infinity for out of vocabulary n-grams.

In [50]:
import math
def calc_ngram_prob(trigram ,bigram_coll, trigram_coll):
    w_n, w_n_2,w_n_1 = trigram
    # Write your code here
    if (w_n_2, w_n_1) not in bigram_coll or (w_n_2, w_n_1, w_n) not in trigram_coll:
        return float('-inf')
    
    bigram_count = bigram_coll[(w_n_2, w_n_1)]
    trigram_count = trigram_coll[(w_n_2, w_n_1, w_n)]
    prob = math.log(trigram_count / bigram_count)
    
    return prob

In [51]:
calc_ngram_prob(('jane','emma','by'),bigram_coll, trigram_coll)

-inf

## 3.3 Possible next words

Write a function that takes a sequence of text and returns a list of possible next words along
with their probabilities. Call it **get_possible_next_words**. It should take as arguments the sequence of text as a string, a Collection of bigrams, and a Collection of trigrams. It should use
**calc_ngram_prob** to calculate the probabilities of each of the candidates and return a list.

In [52]:
def get_possible_next_words(sentence,bigram_coll, trigram_coll):
    candidate_list = []
    # Write your code here
    
    tokens = sentence.split()

    if len(tokens) >= 2:
        w_n_2, w_n_1 = tokens[-2], tokens[-1]
    else:
        return candidate_list
    
    for word in docs.vocab:
        w_n = word.text
        
        trigram = (w_n, w_n_2, w_n_1)
        
        prob = calc_ngram_prob(trigram, bigram_coll, trigram_coll)
        
        candidate_list.append((w_n, prob))
    
    candidate_list.sort(key=lambda x: x[1], reverse=True)
    
    return candidate_list

In [53]:
get_possible_next_words('an agreeable',bigram_coll, trigram_coll)

[('nuthin', -inf),
 ('convincing', -inf),
 ('verified', -inf),
 ('So', -inf),
 ('counsel', -inf),
 ('sloop', -inf),
 ('indelible', -inf),
 ('Perfection', -inf),
 ('softly', -inf),
 ('mud', -inf),
 ('miserable;--', -inf),
 ('reasons--', -inf),
 ('strength', -inf),
 ('reprobating', -inf),
 ('refused', -inf),
 ('somehow', -inf),
 ('interruption', -inf),
 ('full', -inf),
 ('it.--Harriet', -inf),
 ('accommodations', -inf),
 ('next', -inf),
 ('beautifying', -inf),
 ('invalids', -inf),
 ('advising', -inf),
 ('calculation', -inf),
 ('flow', -inf),
 ('him!--Assured', -inf),
 ('brain', -inf),
 ('unknowingly', -inf),
 ('bodies', -inf),
 ('lines--', -inf),
 ('retired', -inf),
 ('court', -inf),
 ('feared', -inf),
 ('enemy?--', -inf),
 ('n.', -inf),
 ('indecision', -inf),
 ('fetching', -inf),
 ('elegance;--ease', -inf),
 ('abode', -inf),
 ('controul', -inf),
 ('blind', -inf),
 ('plea', -inf),
 ('deposit', -inf),
 ('X', -inf),
 ('unsuccessfully', -inf),
 ('foot', -inf),
 ('Weston.--So', -inf),
 ('day

## 3.4 Most likely next word

Now write a function wrapper function that uses **get_possible_next_words** to find the most likely
next word for a sequence of text. Call this **predict_next_word**. It should take as arguments the
sequence of text as a string, a Collection of bigrams, and a Collection of trigrams. It should return
the most likely next word.

**Check in**

The most likely next word following "an agreeable" should be "manner", with log probability -1.5
(22%).


In [54]:
def predict_next_word(sentence,bigram_coll, trigram_coll):
    # Write your code here
    candidate_list = get_possible_next_words(sentence, bigram_coll, trigram_coll)
    
    # Check if there are any candidates
    if len(candidate_list) > 0:
        # Return the most likely next word (first candidate)
        return candidate_list[0][0]
    else:
        return None

In [55]:
predict_next_word('an agreeable',bigram_coll, trigram_coll)

'nuthin'

## 3.5 Generating text

We can now generate text! Write a function called **generate_text**. It should take as arguments a
string representing the text to complete; n, a number of words to generate; a Collection of bigrams;
and a Collection of trigrams.

In [56]:
def generate_text(sentence,n, bigram_coll, trigram_coll):
    result = sentence
    for i in range(n):
        # Write your code here
        tokens = result.split()

        if len(tokens) >= 2:
            w_n_2, w_n_1 = tokens[-2], tokens[-1]
        else:
            return result

        next_word = predict_next_word(result, bigram_coll, trigram_coll)

        if next_word is not None:
            result += " " + next_word
        else:
            return result

    return result

In [57]:
generate_text('an agreeable',4,bigram_coll, trigram_coll)

'an agreeable nuthin nuthin nuthin nuthin'

## 3.6 Generating more text

It’s boring to always generate the most likely completion. Write a function called **sample_next_word**.
It should be like **predict_next_word**, except that it returns a word from the set of candidates produces by **get_possible_next_words** sampled according to its probability. You can use the **random.choices** function to help you sample.

**Note**: **random.choices** expects non-negative weights. Before passing in the probabilities as
weights, you will need to them back to normal probabilities using the **math.exp** function.

Next, augment your **generate_text** with an optional mode parameter. We will use **mode** to signal
whether we want the most likely next word (mode="top") or a word sampled according to its
probability (mode="random").

In [58]:
import random

In [59]:
def sample_next_word(sentence,bigram_coll, trigram_coll):
    # Write your code here
    candidates = get_possible_next_words(sentence, bigram_coll, trigram_coll)
    words, probabilities = zip(*candidates)
    probabilities = [math.exp(prob) for prob in probabilities]
    
    if sum(probabilities) > 0:
        sampled_word = random.choices(words, weights=probabilities, k=1)[0]
        return sampled_word
    else:
        return None

In [60]:
sample_next_word('an agreeable',bigram_coll, trigram_coll)

In [61]:
def generate_text(sentence,n, bigram_coll, trigram_coll, mode = 'top'):
    result = sentence
    # Write your code here
    for i in range(n):
        tokens = result.split()

        if len(tokens) >= 2:
            w_n_2, w_n_1 = tokens[-2], tokens[-1]
        else:
            return result

        if mode == "top":
            next_word = predict_next_word(result, bigram_coll, trigram_coll)
        elif mode == "random":
            next_word = sample_next_word(result, bigram_coll, trigram_coll)
        else:
            return result

        if next_word is not None:
            result += " " + next_word
        else:
            return result

    return result

In [62]:
generate_text("An agreeable", 10, bigram_coll, trigram_coll, mode="random")

'An agreeable'