In [2]:
# imports
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import brown
# Fetch the NLTK resources this notebook relies on. Packages that are already
# installed are only reported as up-to-date. The value returned by the last
# call is what the cell displays as its result.
nltk.download('brown')  # corpus data behind the `brown` reader imported above
nltk.download('averaged_perceptron_tagger')  # presumably the model nltk.pos_tag uses further down -- confirm
nltk.download('universal_tagset')  # only referenced by a commented-out tagged_words(...) call below


[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\nikki\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\nikki\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\nikki\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

In [3]:
### frequency rankings: whole Brown corpus plus two of its categories ###

def _keep_wordlike(tokens):
    """Return the tokens that contain at least one alphabetic character."""
    return [tok for tok in tokens if any(ch.isalpha() for ch in tok)]


def _lowercase_freqdist(tokens):
    """Count how often each lower-cased token occurs."""
    return nltk.FreqDist(tok.lower() for tok in tokens)


def _by_descending_count(dist):
    """List the distinct words of `dist`, most frequent first."""
    return sorted(dist, key=lambda w: dist[w], reverse=True)


# Whole corpus: punctuation-only tokens are dropped, original casing is kept
# in `corpus`; the counts themselves are case-insensitive.
corpus = _keep_wordlike(brown.words())
fdist = _lowercase_freqdist(corpus)
sorted_whole = _by_descending_count(fdist)

# The same pipeline for the 'lore' and 'adventure' categories.
lore = _keep_wordlike(brown.words(categories='lore'))
adventure = _keep_wordlike(brown.words(categories='adventure'))

loredist = _lowercase_freqdist(lore)
advdist = _lowercase_freqdist(adventure)

sorted_lore = _by_descending_count(loredist)
sorted_adv = _by_descending_count(advdist)

In [25]:
# --- token counts ---------------------------------------------------------
# `whole` is every Brown token; `corpus` (built in an earlier cell) keeps only
# tokens that contain at least one letter.
whole = brown.words()
n_tokens_all = len(whole)
n_words = len(corpus)
print(f"nr of tokens, with punctuation: {n_tokens_all}")
print(f"nr of tokens, without punctuation: {n_words}")

# --- types and words ------------------------------------------------------
# Types are counted over the raw tokens, i.e. case-sensitively and including
# punctuation tokens.
n_types = len(set(whole))
print(f"nr of types: {n_types}")
print(f"nr of words: {n_words}")

# --- averages ---------------------------------------------------------------
n_sentences = len(brown.sents())
print(f"average nr of words per sentence: {n_words/n_sentences}")
count = sum(len(token) for token in corpus)  # total characters over all words
print(f"average word length: {count / n_words}")

# --- POS tags ---------------------------------------------------------------
# The filtered token list is tagged as one long sequence.
tagged = nltk.pos_tag(corpus)
tag_fd = nltk.FreqDist(pos for _, pos in tagged)
print(f"10 most frequent POS-tags: {tag_fd.most_common(10)}")




[('NN', 156580), ('IN', 136387), ('DT', 114735), ('JJ', 80467), ('NNP', 76598), ('NNS', 54215), ('VBD', 47175), ('RB', 45062), ('PRP', 44082), ('CC', 37957), ('VB', 34480), ('VBN', 27168), ('TO', 26158), ('VBZ', 21594), ('VBG', 17967), ('PRP$', 17306), ('VBP', 16516), ('MD', 12459), ('CD', 6879), ('WDT', 5633), ('WRB', 4385), ('WP', 4259), ('RP', 4173), ('JJR', 3055), ('EX', 2335), ('JJS', 1956), ('RBR', 1755), ('NNPS', 1709), ('PDT', 952), ('RBS', 661), ('WP$', 250), ('FW', 172), ('UH', 23), ('POS', 10), ('$', 3), ("''", 3)]


In [None]:
# NOTE: disabled -- slow, and it does not produce a rank/frequency curve.
# The x values passed below are the FreqDist keys (word strings), not numeric
# ranks, and neither keys nor values are sorted by frequency, even though the
# x-axis is labelled 'Rank'.
# NOTE(review): plotting numeric ranks against the counts in descending order
# (e.g. following sorted_whole) would probably fix both problems -- confirm.

# plt.plot(list(fdist.keys()), list(fdist.values()), label='corpus', color='black')  # plot the frequency curve for the corpus
# plt.xlabel('Rank')
# plt.ylabel('Frequency')
# plt.legend()
# plt.show()


II: UNIGRAM MODEL
1. Creating the word_to_index dictionary

In [13]:
#!/usr/bin/env python3

"""
NLP A2: N-Gram Language Models

@author: Klinton Bicknell, Harry Eldridge, Nathan Schneider, Lucia Donatelli, Alexander Koller

DO NOT SHARE/DISTRIBUTE SOLUTIONS WITHOUT THE INSTRUCTOR'S PERMISSION
"""

# Map every vocabulary entry (lower-cased, trailing whitespace stripped) to
# its zero-based line number in brown_vocab_100.txt. The `with` block makes
# sure the file handle is closed again, which the open()/for version never did.
with open("brown_vocab_100.txt", "r") as f:
    word_index_dict = {line.rstrip().lower(): index for index, line in enumerate(f)}

# Write the mapping out as one "<word> <index>" pair per line; `with` closes
# the output file even if a write fails part-way through.
with open('word_to_index_100.txt', 'w') as wf:
    for word, index in word_index_dict.items():
        wf.write(f"{word} {index}\n")

# Sanity checks: index of the first entry, index of the last entry, and the
# vocabulary size.
print(word_index_dict['all'])
print(word_index_dict['resolution'])
print(len(word_index_dict))

0
812
813


2. Building an MLE unigram model

In [None]:
#!/usr/bin/env python3

"""
NLP A2: N-Gram Language Models

@author: Klinton Bicknell, Harry Eldridge, Nathan Schneider, Lucia Donatelli, Alexander Koller

DO NOT SHARE/DISTRIBUTE SOLUTIONS WITHOUT THE INSTRUCTOR'S PERMISSION
"""

import numpy as np
from generate import GENERATE


# Load the word -> index mapping (zero-based line number in the vocabulary
# file). Keys are lower-cased so that they agree with the lower-cased lookups
# in the counting loop below; without this, any vocabulary entry containing an
# upper-case letter could never be found and would raise a KeyError.
# The `with` block closes the vocabulary file, which was previously opened
# twice and never closed.
with open("brown_vocab_100.txt", "r") as vocab:
    word_index_dict = {line.rstrip().lower(): index for index, line in enumerate(vocab)}

# One counter per vocabulary entry.
counts = np.zeros(len(word_index_dict))

# Count every whitespace-separated token of every line, case-insensitively.
# A token missing from the vocabulary raises KeyError.
with open("brown_100.txt") as f:
    for sentence in f:
        for word in sentence.lower().split():
            counts[word_index_dict[word]] += 1
print(counts)

# Maximum-likelihood estimate: relative frequency of each word.
probs = counts / np.sum(counts)

# One probability per line, in vocabulary-index order.
with open('unigram_probs.txt', 'w') as wf:
    for p in probs:
        wf.write(str(p) + '\n')

III. Bigram Models

3. Building an MLE bigram model 

In [46]:
#!/usr/bin/env python3

"""
NLP A2: N-Gram Language Models

@author: Klinton Bicknell, Harry Eldridge, Nathan Schneider, Lucia Donatelli, Alexander Koller

DO NOT SHARE/DISTRIBUTE SOLUTIONS WITHOUT THE INSTRUCTOR'S PERMISSION
"""

import numpy as np
from sklearn.preprocessing import normalize
from generate import GENERATE
import random
from sklearn.preprocessing import normalize


START_TOKEN = '<s>'  # sentence-start marker; must be present in the vocabulary

# Load the word -> index mapping (lower-cased word -> zero-based line number).
# The `with` block closes the vocabulary file, which was previously opened
# twice and never closed.
with open("brown_vocab_100.txt", "r") as vocab:
    word_index_dict = {line.rstrip().lower(): index for index, line in enumerate(vocab)}

# counts[i, j] = number of times word j directly follows word i.
vocab_size = len(word_index_dict)
counts = np.zeros((vocab_size, vocab_size))

# Count bigrams one line at a time so that no bigram spans two lines.
# Previously the "previous word" was initialised only once before the loop,
# which paired the last token of each line with the first token of the next
# line and, if the first line already begins with '<s>', counted a spurious
# ('<s>', '<s>') bigram.
# assumes brown_100.txt holds one sentence per line -- TODO confirm
with open("brown_100.txt") as f:
    for sentence in f:
        words = sentence.lower().split()
        if not words:
            continue  # blank line: nothing to count
        if words[0] != START_TOKEN:
            # Line carries no explicit start marker: supply it, so the first
            # real word is still counted as following '<s>'.
            words.insert(0, START_TOKEN)
        for previous, word in zip(words, words[1:]):
            counts[word_index_dict[previous], word_index_dict[word]] += 1

# Row-normalise: probs[i, j] = p(word j | previous word i).
# NOTE(review): rows without any observed bigram should stay all-zero under
# sklearn's normalize -- confirm against its documentation.
probs = normalize(counts, norm='l1', axis=1)
print(probs[word_index_dict['all'], word_index_dict['the']])
print(probs[word_index_dict['the'], word_index_dict['jury']])

1.0
0.08333333333333333


4. Add-α smoothing the bigram model

In [1]:
#!/usr/bin/env python3

"""
NLP A2: N-Gram Language Models

@author: Klinton Bicknell, Harry Eldridge, Nathan Schneider, Lucia Donatelli, Alexander Koller

DO NOT SHARE/DISTRIBUTE SOLUTIONS WITHOUT THE INSTRUCTOR'S PERMISSION
"""

import numpy as np
from sklearn.preprocessing import normalize
from generate import GENERATE
import random
from sklearn.preprocessing import normalize


START_TOKEN = '<s>'  # sentence-start marker; must be present in the vocabulary
ALPHA = 0.1          # add-alpha pseudo-count given to every possible bigram

# Load the word -> index mapping (lower-cased word -> zero-based line number).
# The `with` block closes the vocabulary file, which was previously opened
# twice and never closed.
with open("brown_vocab_100.txt", "r") as vocab:
    word_index_dict = {line.rstrip().lower(): index for index, line in enumerate(vocab)}

# counts[i, j] = ALPHA + number of times word j directly follows word i.
vocab_size = len(word_index_dict)
counts = np.full((vocab_size, vocab_size), ALPHA)

# Count bigrams one line at a time so that no bigram spans two lines.
# Previously the "previous word" was initialised only once before the loop,
# which paired the last token of each line with the first token of the next
# line and, if the first line already begins with '<s>', counted a spurious
# ('<s>', '<s>') bigram.
# assumes brown_100.txt holds one sentence per line -- TODO confirm
with open("brown_100.txt") as f:
    for sentence in f:
        words = sentence.lower().split()
        if not words:
            continue  # blank line: nothing to count
        if words[0] != START_TOKEN:
            # Line carries no explicit start marker: supply it, so the first
            # real word is still counted as following '<s>'.
            words.insert(0, START_TOKEN)
        for previous, word in zip(words, words[1:]):
            counts[word_index_dict[previous], word_index_dict[word]] += 1

# Row-normalise the smoothed counts: probs[i, j] = p(word j | previous word i).
# Every row sums to one because every cell holds at least ALPHA.
probs = normalize(counts, norm='l1', axis=1)
print(probs[word_index_dict['all'], word_index_dict['the']])
print(probs[word_index_dict['the'], word_index_dict['jury']])

0.01336573511543135
0.05520438263801095
