In [31]:
# ANALYSIS OF THE BROWN CORPUS
import nltk
from nltk.corpus import brown
from nltk import FreqDist
from collections import Counter
import matplotlib.pyplot as plt
import numpy as np

# nltk.download("brown")
# nltk.download("averaged_perceptron_tagger")

# Corpus views: tokenised sentences and the flat token stream.
sentences = brown.sents()
tokens = brown.words()

# Lower-case every token that contains at least one alphabetic character;
# tokens made up only of punctuation/digits are dropped.
words = [token.lower() for token in tokens if any(map(str.isalpha, token))]

# Corpus size, measured in retained (lower-cased) word tokens.
N = len(words)
print(N)

# Vocabulary: word types seen at least 10 times.
freq_dist = FreqDist(words)
words_10 = [word for word in freq_dist if freq_dist[word] >= 10]
words_10_unique = set(words_10)

# Give each vocabulary word a row/column index, following the iteration
# order of the set above.
word_index_dict = {word: i for i, word in enumerate(words_10_unique)}

1005119


In [32]:
# Bigram count matrix: counts[r, c] is the number of times the vocabulary
# word with index r is immediately followed, inside one sentence, by the
# vocabulary word with index c.
vocab_size = len(word_index_dict)
counts = np.zeros((vocab_size, vocab_size))

for sentence in sentences:
    # Lower-case once per sentence, then walk over adjacent token pairs.
    lowered = [token.lower() for token in sentence]
    for w1, w2 in zip(lowered, lowered[1:]):
        row = word_index_dict.get(w1)
        col = word_index_dict.get(w2)
        # A pair is only counted when both tokens are in the vocabulary.
        if row is None or col is None:
            continue
        counts[row, col] += 1


In [33]:
# Pointwise mutual information for every ordered pair of vocabulary words:
#
#     PMI(w1, w2) = log2( C(w1, w2) * N / (C(w1) * C(w2)) )
#
# where C(w1, w2) is the bigram count from `counts`, C(w) the unigram count
# from `freq_dist`, and N the number of word tokens in the corpus.

# Unigram counts laid out by the indices stored in word_index_dict, so that
# position k is guaranteed to describe the same word as row/column k of
# `counts`. float64 is used on purpose: the outer product below reaches
# C(w)^2 for the most frequent words, which does not fit in a 32-bit integer
# (numpy's default integer on some platforms).
word_counts = np.zeros(len(word_index_dict), dtype=np.float64)
for word, idx in word_index_dict.items():
    word_counts[idx] = freq_dist[word]

# Vectorised PMI. Pairs that never occur give log2(0) = -inf; the warnings
# for that are silenced and the entries are reset below.
with np.errstate(divide='ignore', invalid='ignore'):
    pmi = np.log2(counts * N / (word_counts[:, np.newaxis] * word_counts[np.newaxis, :]))
    pmi[np.isnan(pmi)] = 0.0
    # NOTE: unseen pairs are mapped to 0.0, i.e. they rank above every pair
    # with a genuinely negative PMI.
    pmi[np.isneginf(pmi)] = 0.0  # Set -inf values to zero


In [34]:
# Report the 20 word pairs with the highest and the 20 with the lowest PMI.

# Sort the flattened PMI matrix a single time and slice both ends of the
# ordering. The order is ascending, so within the "highest" list the
# strongest pair comes last.
sorted_flat_indices = np.argsort(pmi.ravel())
highest_pmi_indices = np.unravel_index(sorted_flat_indices[-20:], pmi.shape)
lowest_pmi_indices = np.unravel_index(sorted_flat_indices[:20], pmi.shape)
del sorted_flat_indices  # one entry per matrix cell; release it right away

# Inverse of word_index_dict (index -> word), built once instead of
# searching the dictionary's values for every printed word.
index_word_dict = {index: word for word, index in word_index_dict.items()}


def print_pmi_pairs(header, pair_indices):
    """Print a header line followed by one "w1, w2, PMI: value" line per pair.

    Parameters
    ----------
    header : str
        Text printed before the list of pairs.
    pair_indices : tuple of two index arrays
        (row_indices, col_indices) into `pmi`, as returned by
        np.unravel_index; pairs are printed in the given order.
    """
    print(header)
    for i, j in zip(*pair_indices):
        w1 = index_word_dict[int(i)]
        w2 = index_word_dict[int(j)]
        print(f"{w1}, {w2}, PMI: {pmi[i, j]}")


print_pmi_pairs("20 word pairs with the highest PMI values:", highest_pmi_indices)
print_pmi_pairs("\n20 word pairs with the lowest PMI values:", lowest_pmi_indices)

20 word pairs with the highest PMI values:
nineteenth-century, immigration, PMI: 14.617006792305181
presiding, elder, PMI: 14.710116196696664
computing, allotments, PMI: 14.742537674389041
willie, mays, PMI: 14.779063550414154
peaceful, coexistence, PMI: 14.779063550414154
phonologic, subsystems, PMI: 14.880041198138976
carbon, tetrachloride, PMI: 14.954041779582752
kohnstamm, reactivity, PMI: 14.990567655607867
unwed, mothers, PMI: 15.032044291584025
anionic, binding, PMI: 15.102433619475423
drainage, ditch, PMI: 15.238495169051452
puerto, rico, PMI: 15.353972386471387
wtv, antigen, PMI: 15.4608875903879
el, paso, PMI: 15.479503268555247
lo, shu, PMI: 15.479503268555247
herald, tribune, PMI: 15.586418472471758
pathet, lao, PMI: 15.851472045942204
simms, purdew, PMI: 15.851472045942204
viet, nam, PMI: 15.938934887192543
hong, kong, PMI: 16.479503268555245

20 word pairs with the lowest PMI values:
the, a, PMI: -10.657060828365204
of, of, PMI: -10.365317843492496
the, and, PMI: -9.97196