<a href="https://colab.research.google.com/github/tcb7351/tcb7351/blob/20231206_1/%E3%80%8C%5BNLP_2023%5D_Language_Models_ipynb%E3%80%8D%E7%9A%84%E5%89%AF%E6%9C%AC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# N-Gram Language Models

## Language Model for Generation

In [None]:
import nltk
from nltk.corpus import brown
nltk.download('brown')

from collections import defaultdict
import random

def find_unigrams():
    """Return every token of the Brown corpus as one flat list.

    Sentences from ``brown.sents()`` are concatenated in iteration order and
    duplicates are kept, so a uniform draw from the list picks each word in
    proportion to how often it occurs.
    """
    return [token for sentence in brown.sents() for token in sentence]

def find_bigrams():
    """Collect, for each word, the words seen immediately after it.

    Returns:
        A ``defaultdict(list)`` mapping a word to the list of its successors
        across all sentences of ``brown.sents()``.  Successors are stored once
        per occurrence, so repeated entries encode frequency.
    """
    successors = defaultdict(list)
    for sentence in brown.sents():
        # Pair every token with its right-hand neighbour.
        for current, following in zip(sentence, sentence[1:]):
            successors[current].append(following)
    return successors

def find_trigrams():
    """Collect, for each pair of adjacent words, the words seen right after it.

    Returns:
        A ``defaultdict(list)`` keyed by ``(w1, w2)`` tuples whose values list
        every third word observed after that pair in ``brown.sents()``, one
        entry per occurrence.
    """
    continuations = defaultdict(list)
    for sentence in brown.sents():
        # Slide a three-token window over the sentence.
        for first, second, third in zip(sentence, sentence[1:], sentence[2:]):
            context = (first, second)
            continuations[context].append(third)
    return continuations

# Build the n-gram tables once at module level; the cells that follow read
# these globals when inspecting the tables and when generating text.
unigrams = find_unigrams()
bigrams = find_bigrams()
trigrams = find_trigrams()

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [None]:
# Peek at the first ten trigram contexts and the words recorded after each.
cnt = 10
for context in trigrams:
    w1, w2 = context
    print(w1, w2, trigrams[context])
    cnt -= 1
    if not cnt:
        break

The Fulton ['County']
Fulton County ['Grand', 'purchasing', 'general', ',', 'should', 'Jail']
County Grand ['Jury']
Grand Jury ['said', 'indictments']
Jury said ['Friday']
said Friday ['an', ',', 'that', '.']
Friday an ['investigation']
an investigation ['of', 'which', 'of', 'of', 'of', '.', 'with']
investigation of ["Atlanta's", 'the', 'the', "Lumumba's", 'a', 'the', 'the', 'respiratory', 'the', 'the', 'the', 'the', 'the', 'the', 'the']
of Atlanta's ['recent']


In [None]:
def generate_using_unigrams(unigrams, length=0):
    """Generate text by drawing tokens independently from ``unigrams``.

    Tokens are sampled with ``random.choice`` until at least ``length`` tokens
    exist and the most recent one is ``"."``.

    Args:
        unigrams: Sequence of tokens to sample from; repeated entries are
            proportionally more likely to be drawn.
        length: Minimum number of tokens to generate.

    Returns:
        The sampled tokens joined by single spaces.  When ``unigrams``
        contains no ``"."`` the text stops as soon as ``length`` tokens have
        been produced (an empty string for ``length=0``).

    Raises:
        IndexError: If a token has to be drawn from an empty ``unigrams``.
    """
    # A sentence can only end on "." if the vocabulary has one; otherwise stop
    # at `length` instead of looping forever.
    can_end = "." in unigrams
    results = []
    # `not results` keeps results[-1] from being evaluated on an empty list,
    # which the default length=0 would otherwise hit on the first check.
    while len(results) < length or (
        can_end and (not results or results[-1] != ".")
    ):
        results.append(random.choice(unigrams))
    return " ".join(results)

print(generate_using_unigrams(unigrams, 100))


funds requires time trip true and request seems name brought parties he You insults peddlers few two the teammates for aqueous seeing he'll . watery for leaped extravagant sequences most stumbled dominant the contrary collecting total the all , the their where had A. of publicly unforeseen for facts '' 1956 the 3 to consider teachers , refused , , be on Avoid it his came to of and quantity files governing a compact is one- of born in close ship believe What that rising using , his the on in by it They It Old Snelling . of this each , Vienna into that glass is an as given live of deserted so-called hundred whisky alarmed the literature roll by Lopez odd last in , .


In [None]:
def generate_using_bigrams(bigrams, prefix, length=0):
    """Extend ``prefix`` by repeatedly sampling a successor of the last word.

    Args:
        bigrams: Mapping from a word to the list of words that may follow it.
        prefix: Seed text; it is split on single spaces into the initial
            tokens.
        length: Minimum number of tokens (seed included) before a ``"."`` is
            allowed to end the text.

    Returns:
        The tokens joined by single spaces.  Generation ends when the last
        token is ``"."`` and at least ``length`` tokens exist, or earlier if
        the last token has no entry in ``bigrams``.
    """
    tokens = prefix.split(" ")
    while True:
        last = tokens[-1]
        finished = last == "." and len(tokens) >= length
        # Also stop when the model has never seen anything after `last`.
        if finished or last not in bigrams:
            break
        tokens.append(random.choice(bigrams[last]))
    return " ".join(tokens)

print(generate_using_bigrams(bigrams, "Today is not", 100))


Today is not only 1 , prominent ads wear pink and often more than 400 B.C. it .


In [None]:
def generate_using_trigrams(trigrams, prefix, length=0):
    """Extend ``prefix`` by sampling each next word from the last two words.

    Args:
        trigrams: Mapping from a ``(w1, w2)`` tuple to the list of words that
            may follow that pair.
        prefix: Seed text; it is split on single spaces into the initial
            tokens.
        length: Minimum number of tokens (seed included) before a ``"."`` is
            allowed to end the text.

    Returns:
        The tokens joined by single spaces.  Generation ends when the last
        token is ``"."`` and at least ``length`` tokens exist, or earlier if
        the trailing tokens form a context that is not a key of ``trigrams``
        (which includes a one-word seed, whose context is a 1-tuple).
    """
    tokens = prefix.split(" ")
    while not (tokens[-1] == "." and len(tokens) >= length):
        context = tuple(tokens[-2:])
        if context not in trigrams:
            break
        tokens.append(random.choice(trigrams[context]))
    return " ".join(tokens)

print(generate_using_trigrams(trigrams, "Today is not", 100))


Today is not nearly as easy for the short whiskers , shaven polls , and to obtain a licensed drugless healer -- in fact turning the earth will be able to read Parisina , which includes the solemn assurance that Bridget returned to London , he wrote : `` in case you hit trouble .


## Language Model for Classification

In [None]:
import nltk
from nltk.corpus import movie_reviews

nltk.download("movie_reviews")

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


True

In [None]:
# Per-polarity lists of tokenised reviews.
training_data = defaultdict(list)
test_data = defaultdict(list)

# Fixed seed so the split is the same on every run.
random.seed(0)
for polarity in movie_reviews.categories():
    for fid in movie_reviews.fileids(polarity):
        # One draw per review: a 0 out of {0..4} sends it to the test split,
        # anything else to the training split.
        bucket = test_data if random.randrange(5) == 0 else training_data
        bucket[polarity].append(list(movie_reviews.words(fid)))

neg
pos


In [None]:
from collections import Counter
import math

def build_lm(sents):
    """Count the n-grams needed for a trigram language model.

    Every token sequence is scanned with a three-token window.  For each
    window the full trigram, its leading pair and its leading token are
    counted, so the last pair and the last two tokens of a sequence are not
    counted on their own, and sequences shorter than three tokens add nothing.

    Args:
        sents: Iterable of token sequences that support slicing.

    Returns:
        A ``(unigrams, bigrams, trigrams)`` tuple of ``Counter`` objects keyed
        by token, ``(w1, w2)`` and ``(w1, w2, w3)`` respectively.
    """
    unigrams, bigrams, trigrams = Counter(), Counter(), Counter()
    for tokens in sents:
        for window in zip(tokens, tokens[1:], tokens[2:]):
            trigrams[window] += 1
            bigrams[window[:2]] += 1
            unigrams[window[0]] += 1
    return unigrams, bigrams, trigrams

def lm_log_pr(sent, unigrams, bigrams, trigrams, k=0.5):
    """Return the add-k smoothed trigram log-probability of ``sent``.

    Each three-token window ``(w1, w2, w3)`` contributes::

        log((count(w1, w2, w3) + k) / (count(w1, w2) + k * V))

    where ``V = len(unigrams)`` is the vocabulary size.  The denominator adds
    ``k`` once per possible next word (``k * V``), which is what add-k
    smoothing requires for the smoothed estimates to form a distribution over
    the vocabulary.

    Args:
        sent: Token sequence that supports slicing.
        unigrams: Unigram counts; only the number of distinct keys is used.
        bigrams: Counts keyed by ``(w1, w2)``; unseen keys must read as 0
            (as a ``Counter`` does).
        trigrams: Counts keyed by ``(w1, w2, w3)``; unseen keys must read as 0.
        k: Smoothing constant added to every trigram count.  It must be
            positive for unseen trigrams to get a finite log-probability.

    Returns:
        The sum of the per-window log-probabilities, or ``0`` when ``sent``
        has fewer than three tokens.

    Raises:
        ZeroDivisionError: If ``unigrams`` is empty and a window's context
            count is 0.
    """
    # Loop-invariant share of the denominator: k for each vocabulary entry.
    smoothing_mass = k * len(unigrams)
    return sum(
        math.log(
            (trigrams[(w1, w2, w3)] + k) / (bigrams[(w1, w2)] + smoothing_mass)
        )
        for w1, w2, w3 in zip(sent, sent[1:], sent[2:])
    )


In [None]:
# Fit one set of n-gram counts per sentiment class from the training split.
pos_unigrams, pos_bigrams, pos_trigrams = build_lm(training_data["pos"])
neg_unigrams, neg_bigrams, neg_trigrams = build_lm(training_data["neg"])

In [None]:
# Classify every held-out review by whichever class model scores it higher,
# and tally how often that matches the true label.
correct, total = 0, 0

for polarity in ['neg', 'pos']:
    for sent in test_data[polarity]:
        neg_score = lm_log_pr(sent, neg_unigrams, neg_bigrams, neg_trigrams)
        pos_score = lm_log_pr(sent, pos_unigrams, pos_bigrams, pos_trigrams)
        # Ties go to 'pos', since only a strictly higher negative score wins.
        prediction = 'neg' if neg_score > pos_score else 'pos'
        if prediction == polarity:
            correct += 1
        total += 1

print("Acc: %d / %d = %g" % (correct, total, correct / total))

Acc: 235 / 422 = 0.556872
