<a href="https://colab.research.google.com/github/Sylamsh/nlp-practice/blob/main/Lab_6_language_modelling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -U nltk

Collecting nltk
  Downloading nltk-3.7-py3-none-any.whl (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 12.1 MB/s 
[?25hCollecting regex>=2021.8.3
  Downloading regex-2022.3.15-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (749 kB)
[K     |████████████████████████████████| 749 kB 63.8 MB/s 
Installing collected packages: regex, nltk
  Attempting uninstall: regex
    Found existing installation: regex 2019.12.20
    Uninstalling regex-2019.12.20:
      Successfully uninstalled regex-2019.12.20
  Attempting uninstall: nltk
    Found existing installation: nltk 3.2.5
    Uninstalling nltk-3.2.5:
      Successfully uninstalled nltk-3.2.5
Successfully installed nltk-3.7 regex-2022.3.15


In [None]:
from nltk.corpus import reuters
from nltk import bigrams, trigrams
from collections import Counter, defaultdict
import nltk
nltk.download('reuters')
nltk.download('punkt')

[nltk_data] Downloading package reuters to /root/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
!unzip /root/nltk_data/corpora/reuters.zip -d /root/nltk_data/corpora

### Build a trigram language model using a basic MLE estimator

In [None]:
## code referred from https://nlpforhackers.io/language-models/ and https://www.analyticsvidhya.com/blog/2019/08/comprehensive-guide-language-model-nlp-python-code/

## The model is a two-level mapping: trigram_mle_model[(w1, w2)][w3].
## Both levels are defaultdicts, so an unseen context or word reads as 0.
trigram_mle_model = defaultdict(lambda: defaultdict(lambda: 0))

## Pass 1: count how often each word follows a two-word context.
## Sentences are padded on both sides (nltk's default pad symbol), so
## sentence-initial and sentence-final trigrams are counted as well.
for sent in reuters.sents():
    for first, second, third in trigrams(sent, pad_right=True, pad_left=True):
        trigram_mle_model[(first, second)][third] += 1

## Pass 2: turn counts into MLE probabilities by dividing every count by the
## total number of times its context was observed.
for followers in trigram_mle_model.values():
    context_total = float(sum(followers.values()))
    for word in followers:
        followers[word] /= context_total

In [None]:
## Probability distribution over the words that follow the two-word context ('he', 'will')
trigram_mle_model[("he", "will")]

defaultdict(<function __main__.<lambda>.<locals>.<lambda>>,
            {'also': 0.021739130434782608,
             'ask': 0.06521739130434782,
             'attempt': 0.021739130434782608,
             'attend': 0.021739130434782608,
             'be': 0.06521739130434782,
             'decide': 0.021739130434782608,
             'definitely': 0.021739130434782608,
             'deliver': 0.043478260869565216,
             'do': 0.021739130434782608,
             'face': 0.021739130434782608,
             'have': 0.021739130434782608,
             'immediately': 0.043478260869565216,
             'leave': 0.021739130434782608,
             'listen': 0.021739130434782608,
             'look': 0.021739130434782608,
             'make': 0.043478260869565216,
             'meet': 0.06521739130434782,
             'not': 0.10869565217391304,
             'press': 0.043478260869565216,
             'propose': 0.08695652173913043,
             'put': 0.021739130434782608,
             'retir

In [None]:
## Probability of the single word 'ask' following the two-word context ('he', 'will')
trigram_mle_model[("he", "will")]["ask"]

0.06521739130434782

### Todo #1: Build a bigram language model using a basic MLE estimator

In [None]:
## Two-level mapping bigram_mle_model[w1][w2]; unseen entries read as 0.
bigram_mle_model = defaultdict(lambda: defaultdict(lambda: 0))

## Pass 1: count how often each word follows the previous word
## (sentences padded on both sides with nltk's default pad symbol).
for sent in reuters.sents():
    for previous, current in bigrams(sent, pad_right=True, pad_left=True):
        bigram_mle_model[previous][current] += 1

## Pass 2: normalise the counts of every context word into probabilities.
for followers in bigram_mle_model.values():
    context_total = float(sum(followers.values()))
    for word in followers:
        followers[word] /= context_total

In [None]:
## One sample entry from each hand-built model:
## P('ask' | 'he', 'will') from the trigram model and P('will' | 'he') from the bigram model.
print(
    f"trigram mle score: {trigram_mle_model[('he', 'will')]['ask']}",
    "------------------------------------------------------------",
    f"bigram mle score: {bigram_mle_model['he']['will']}",
    sep="\n",
)



trigram mle score: 0.06521739130434782
------------------------------------------------------------
bigram mle score: 0.012921348314606741


### Language model estimation using nltk library

In [None]:
from nltk.lm import MLE, Laplace, KneserNeyInterpolated
from nltk.util import everygrams

import nltk
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
## MLE estimator
## CODE referred from https://github.com/murthyrudra/IIITL_NLP_Lab/blob/main/Lab05/Lab05.ipynb
# https://www.nltk.org/api/nltk.lm.html
def build_mle_estimator(n):
    """Fit an nltk `MLE` language model of order `n` on the Reuters corpus.

    The corpus is lowercased and handled as one continuous token stream
    (a single training "sentence"), so no sentence padding is applied.

    Args:
        n: highest n-gram order of the model.

    Returns:
        The fitted `MLE` model. Its vocabulary summary is printed as a side effect.
    """
    corpus_tokens = list(map(str.lower, reuters.words()))

    model = MLE(n)

    # everygrams(..., max_len=n) produces the n-grams up to length n that the
    # model is trained on; they are wrapped in a list to form the one sentence.
    training_ngrams = list(everygrams(corpus_tokens, max_len=n))
    model.fit([training_ngrams], vocabulary_text=corpus_tokens)

    print(model.vocab)
    return model

In [None]:
## Fit the bigram (n=2) MLE language model; the call also prints the vocabulary summary
lm_mle_bigram = build_mle_estimator(2)

<Vocabulary with cutoff=1 unk_label='<UNK>' and 31079 items>


In [None]:
## sample texts for testing the conditional probability of the next word given the context word/words
def print_lm_scores(lm, title=""):
    """Print the conditional probability of four fixed sample word pairs under `lm`.

    Args:
        lm: model exposing `score(word=..., context=[...])`.
        title: label shown in the header line.
    """
    print("------------------ " + title + " ------------------")

    # (context word, next word) pairs, in the order they are reported.
    sample_pairs = (
        ("he", "will"),
        ("america", "first"),
        ("you", "thank"),
        ("thank", "you"),
    )
    for previous, following in sample_pairs:
        probability = lm.score(word=following, context=[previous])
        print(f"Probability of '{previous}' followed by '{following}': {probability:.5f}")


In [None]:
## Report the four sample conditional probabilities under the bigram MLE model
print_lm_scores(lm_mle_bigram, title="MLE")

------------------ MLE ------------------
Probability of 'he' followed by 'will': 0.01055
Probability of 'america' followed by 'first': 0.00858
Probability of 'you' followed by 'thank': 0.00000
Probability of 'thank' followed by 'you': 0.00000


#### TODO-2: Build a Laplace estimator

Refer to the [nltk.lm API documentation](https://www.nltk.org/api/nltk.lm.html).

In [None]:

def build_laplace_estimator(n):
    """Fit an nltk `Laplace` language model of order `n` on the Reuters corpus.

    The corpus is lowercased and handled as one continuous token stream
    (a single training "sentence"), so no sentence padding is applied.

    Args:
        n: highest n-gram order of the model.

    Returns:
        The fitted `Laplace` model. Its vocabulary summary is printed as a side effect.
    """
    corpus_tokens = list(map(str.lower, reuters.words()))

    model = Laplace(n)

    # everygrams(..., max_len=n) produces the n-grams up to length n that the
    # model is trained on; they are wrapped in a list to form the one sentence.
    training_ngrams = list(everygrams(corpus_tokens, max_len=n))
    model.fit([training_ngrams], vocabulary_text=corpus_tokens)

    print(model.vocab)
    return model

In [None]:
## Fit the bigram (n=2) Laplace language model; the call also prints the vocabulary summary
lm_laplace_bigram = build_laplace_estimator(2)

<Vocabulary with cutoff=1 unk_label='<UNK>' and 31079 items>


In [None]:
## Compare the sample scores of the bigram MLE model and the bigram Laplace model
print_lm_scores(lm_mle_bigram, title="MLE")
print_lm_scores(lm_laplace_bigram, title="Laplace")

------------------ MLE ------------------
Probability of 'he' followed by 'will': 0.01055
Probability of 'america' followed by 'first': 0.00858
Probability of 'you' followed by 'thank': 0.00000
Probability of 'thank' followed by 'you': 0.00000
------------------ Laplace ------------------
Probability of 'he' followed by 'will': 0.00154
Probability of 'america' followed by 'first': 0.00010
Probability of 'you' followed by 'thank': 0.00003
Probability of 'thank' followed by 'you': 0.00003


In [None]:
## Fit a trigram (n=3) Laplace model and report the same sample scores.
## NOTE(review): print_lm_scores only queries one-word contexts, so these are
## still bigram-level probabilities -- presumably why the recorded numbers
## match the bigram Laplace output above; confirm with a two-word context.
lm_lap_tri = build_laplace_estimator(3)
print_lm_scores(lm_lap_tri, title="Laplace")

<Vocabulary with cutoff=1 unk_label='<UNK>' and 31079 items>
------------------ Laplace ------------------
Probability of 'he' followed by 'will': 0.00154
Probability of 'america' followed by 'first': 0.00010
Probability of 'you' followed by 'thank': 0.00003
Probability of 'thank' followed by 'you': 0.00003


#### TODO-3: Build an interpolated Kneser-Ney estimator

Refer to the [nltk.lm API documentation](https://www.nltk.org/api/nltk.lm.html).

In [None]:
def build_kneyser_estimator(n):
    """Fit an nltk `KneserNeyInterpolated` language model of order `n` on the Reuters corpus.

    The corpus is lowercased and handled as one continuous token stream
    (a single training "sentence"), so no sentence padding is applied.

    Args:
        n: highest n-gram order of the model.

    Returns:
        The fitted `KneserNeyInterpolated` model. Its vocabulary summary is
        printed as a side effect.
    """
    corpus_tokens = list(map(str.lower, reuters.words()))

    model = KneserNeyInterpolated(n)

    # everygrams(..., max_len=n) produces the n-grams up to length n that the
    # model is trained on; they are wrapped in a list to form the one sentence.
    training_ngrams = list(everygrams(corpus_tokens, max_len=n))
    model.fit([training_ngrams], vocabulary_text=corpus_tokens)

    print(model.vocab)
    return model

In [None]:
## Fit interpolated Kneser-Ney models of order 2 (bigram) and 3 (trigram);
## each call also prints the vocabulary summary
lm_kn_bi = build_kneyser_estimator(2)
lm_kn_tri = build_kneyser_estimator(3)

<Vocabulary with cutoff=1 unk_label='<UNK>' and 31079 items>
<Vocabulary with cutoff=1 unk_label='<UNK>' and 31079 items>


In [None]:
## Report the four sample conditional probabilities under both Kneser-Ney models
print_lm_scores(lm_kn_bi, title="KneserNeyInterpolated_bi")
print_lm_scores(lm_kn_tri, title="KneserNeyInterpolated_tri")

------------------ KneserNeyInterpolated_bi ------------------
Probability of 'he' followed by 'will': 0.01055
Probability of 'america' followed by 'first': 0.00818
Probability of 'you' followed by 'thank': 0.00000
Probability of 'thank' followed by 'you': 0.00017
------------------ KneserNeyInterpolated_tri ------------------
Probability of 'he' followed by 'will': 0.01706
Probability of 'america' followed by 'first': 0.01488
Probability of 'you' followed by 'thank': 0.00000
Probability of 'thank' followed by 'you': 0.00017


In [None]:
##### You can generate words from the trained language models using generate() in nltk.
##### Refer to https://www.nltk.org/api/nltk.lm.html and generate words using lm.generate() for all the language models.

#### Try to generate the next 10 words given the context/seed words ['he','will'] for all the models

In [None]:
def generate_words(lm, num_words, title, seed_words=("he", "will"), random_seed=None):
    """Print `num_words` words sampled one at a time from `lm`, starting from a seed.

    Every new word is drawn with `lm.generate`, conditioned on the last two
    words produced so far (or on fewer while fewer are available).

    Args:
        lm: fitted model exposing `generate(text_seed=..., random_seed=...)`,
            e.g. an nltk.lm model.
        num_words: number of words to generate after the seed.
        title: label shown in the header line.
        seed_words: words the generation starts from. Defaults to the seed
            that used to be hard-coded, ('he', 'will').
        random_seed: optional seed (or `random.Random` instance) that makes
            the sampled sequence reproducible. With None (default) the call to
            `lm.generate` is the same as before and sampling is unseeded.

    Returns:
        None. The seed followed by the generated words is printed as a list.
    """
    print("------------------ " + title + " ------------------")
    words = list(seed_words)

    # Only forward random_seed when one was requested, so the default call to
    # lm.generate stays identical to the original one.
    extra = {}
    if random_seed is not None:
        import random

        # One shared generator for all steps: re-seeding with the same integer
        # on every step would reuse the same random draw each time.
        if isinstance(random_seed, random.Random):
            extra["random_seed"] = random_seed
        else:
            extra["random_seed"] = random.Random(random_seed)

    for _ in range(num_words):
        # words[-2:] is the sliding two-word context (the original
        # words[i], words[i+1]) and also works for seeds shorter than two words.
        words.append(lm.generate(text_seed=words[-2:], **extra))
    print(words)

In [None]:
## Generate 10 words after the seed with the MLE bigram, Laplace bigram and Laplace trigram models
generate_words(lm_mle_bigram, 10, title="MLE_bigram")
generate_words(lm_laplace_bigram, 10, title="Laplace_bigram")
generate_words(lm_lap_tri, 10, title="Laplace_trigram")

------------------ MLE_bigram ------------------
['he', 'will', 'grow', 'by', 'half', 'of', '2', '.', '47', 'cts', 'net', 'shr']
------------------ Laplace_bigram ------------------
['he', 'will', 'go', 'mainly', 'to', 'look', 'forward', 'to', '4', 'mln', 'vs', '92']
------------------ Laplace_trigram ------------------
['he', 'will', 'propose', 'a', 'formula', 'linked', 'to', 'production', 'quickly', 'through', 'developments', 'such']


In [None]:
# Generate 10 words after the seed with both Kneser-Ney models.
# NOTE: per the original author this takes too much time; no output was
# recorded for this cell in the notebook.
generate_words(lm_kn_bi, 10, title="KN_bigram")
generate_words(lm_kn_tri, 10, title="KN_trigram")

**Submit the colab notebook link in this [form](https://forms.gle/1f5zLKen8s3PaivK6) on or before 19/04/2022**