In [19]:
#Step 1. Import NLTK in Python: http://www.nltk.org/. Download the Brown Corpus
#http://www.nltk.org/book/ch02.html for analyses below

import nltk
nltk.download('brown')
from nltk.corpus import brown
import csv 
from scipy import sparse, spatial, stats
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial import distance

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [20]:
from google.colab import drive

drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [21]:
#Step 2. Extract the 5000 most common English words (denoted by W) based on unigram
#frequencies in the Brown corpus. Report the 5 most and least common words you have found
#in the 5000 words. Update W by adding n words where n is the set of words in Table 1
#of RG65 that were not included in the top 5000 words from the Brown corpus. Denote the
#total number of words in W as |W|.

w_freq = nltk.FreqDist(word.lower() for word in brown.words())
w = [word[0] for word in w_freq.most_common(5000)]

top_5 = w[:5]
bottom_5 = w[-5:]

print("Top 5 common words: {0}".format(str(top_5)))
print("Bottom 5 common words: {0}".format(str(bottom_5)))

with open("/content/gdrive/My Drive/RG65.csv", newline="") as f:
    reader = csv.reader(f)
    rg65 = [(row[0].split()[1].split('_')[0], row[1].split()[1].split('_')[0]) for row in reader]

for pair in rg65:
  if pair[0] not in w: w.append(pair[0])
  if pair[1] not in w: w.append(pair[1])

n = len(w)
print(w[:5])

Top 5 common words: ['the', ',', '.', 'of', 'and']
Bottom 5 common words: ['cheek', 'awake', 'pursue', 'peered', 'crawled']
['the', ',', '.', 'of', 'and']


In [23]:
with open('/content/gdrive/My Drive/brown-RG65.csv', 'w') as f: 
    # using csv.writer method from CSV package 
    write = csv.writer(f) 
    write.writerow(w) 

In [22]:
#Step 3. Construct a word-context vector model (denoted by M1) by collecting bigram counts
#for words in W. The output should be a |W|×|W| matrix (consider using sparse matrices
#for better efficiency), where each row is a word in W, and each column is a context in W
#that precedes row words in sentences. For example, if the phrase taxi driver appears 5 times
#in the entire corpus, then row taxi and column driver should have a value of 5 in the matrix.

m1_freq = nltk.ConditionalFreqDist(nltk.bigrams(word.lower() for word in brown.words()))
bigram_probdist = nltk.ConditionalProbDist(m1_freq, nltk.DictionaryProbDist)

m1 = np.zeros((n,n))

for i, word1 in enumerate(w):
    for j, word2 in enumerate(w):
        m1[i][j] = m1_freq[word1][word2]

In [7]:
m1_plus = np.zeros((n,n))
for i, word1 in enumerate(w):
  for j, word2 in enumerate(w):
    word1_prob = w_freq.freq(word1)
    word2_prob = w_freq.freq(word2)
    if word1_prob == 0 or word2_prob == 0 : m1_plus[i][j] = 0
    else: m1_plus[i][j] = bigram_probdist[word1].prob(word2) / ( word1_prob* word2_prob)

In [8]:
from sklearn.decomposition import PCA

pca10 = PCA(n_components=10)
pca100 = PCA(n_components=100)
pca300 = PCA(n_components=300)

embedding10 = pca10.fit_transform(m1_plus)
embedding100 = pca100.fit_transform(m1_plus)
embedding300 = pca300.fit_transform(m1_plus)

In [9]:
from scipy import spatial

with open("/content/gdrive/My Drive/RG65.csv", newline="") as f:
    reader = csv.reader(f)
    rg65 = [(row[0].split()[1].split('_')[0], row[1].split()[1].split('_')[0], row[2]) for row in reader]

human_similarity = [float(row[2]) for row in rg65]

cos_similarities = []
for i in range(len(rg65)):
  word1 = rg65[i][0]
  word2 = rg65[i][1]
  idx_word1 = w.index(word1)
  idx_word2 = w.index(word2)
  word1_lsavec = embedding300[idx_word1]
  word2_lsavec = embedding300[idx_word2]

  similarity = 1 - distance.cdist([word1_lsavec], [word2_lsavec], metric='cosine')[0][0]
  cos_similarities.append(similarity)

lst = [(rg65[i][0], rg65[i][1], rg65[i][2], cos_similarities[i]) for i in range(len(rg65))]
df = pd.DataFrame(lst, columns=['word1', 'word2', 'human similarity', 'cosine similarity'])

print(df)

         word1      word2 human similarity  cosine similarity
0         cord      smile             0.02          -0.006084
1      rooster     voyage             0.04           0.919071
2         noon     string             0.04           0.001595
3        fruit    furnace             0.05          -0.002557
4    autograph      shore             0.06           0.149784
..         ...        ...              ...                ...
60     cushion     pillow             3.84          -0.047482
61    cemetery  graveyard             3.88          -0.032566
62  automobile        car             3.92           0.006757
63      midday       noon             3.94           0.093577
64         gem      jewel             3.94           0.999739

[65 rows x 4 columns]


In [10]:
# Pearson correlation between LSA vectors and human judgement of similarities. 

pearsonr = stats.pearsonr(cos_similarities, human_similarity)
print("Pearson Correlation of LSA is : {}".format(pearsonr))

Pearson Correlation of LSA is : (0.14298801225562885, 0.2558358862360781)


In [24]:
# Semantic and Syntactic analogy
analogy_file = '/content/gdrive/My Drive/word-test.v1.txt'
with open(analogy_file) as f:
    content = f.readlines()

analogy_pairs = []
semantic_pairs = []
syntactic_pairs = []

syntactic = False
for line in content:
  items = line.split()
  items = [item.lower() for item in items]
  if items[0] == "amazing": syntactic = True # start of syntactic pairs
  result = all(elem in w  for elem in items)
  if result and not syntactic: semantic_pairs.append(items)
  if result and syntactic: syntactic_pairs.append(items)

print("Total semantic pairs: ", len(semantic_pairs))
print("Total syntactic pairs: ", len(syntactic_pairs))

semantic_pairs = [pair for pair in semantic_pairs if len(pair) == 4]
syntactic_pairs = [pair for pair in syntactic_pairs if len(pair) == 4]

def get_accuracy(analogy_pairs):
  n = len(analogy_pairs)

  word1_vec = [embedding300[w.index(pair[0])] for pair in analogy_pairs]
  word2_vec = [embedding300[w.index(pair[1])] for pair in analogy_pairs]
  word3_vec = [embedding300[w.index(pair[2])] for pair in analogy_pairs]

  predict_vecs = [word2_vec[i] - word1_vec[i] + word3_vec[i] for i in range(n)]

  def closest_node(node, nodes):
      closest_index = distance.cdist([node], nodes, metric='cosine').argsort()
      return closest_index[0][1]

  count = 0
  for i in range(n):
    c = closest_node(predict_vecs[i], embedding300)
    if analogy_pairs[i][3] == w[c]: count = count + 1
  
  return count / n

Total semantic pairs:  163
Total syntactic pairs:  2045


In [25]:
semantic_accuracy = get_accuracy(semantic_pairs)
syntactic_accuracy = get_accuracy(syntactic_pairs)

print("Semantic Accuracy of LSA: ", semantic_accuracy)
print("Syntactic Accuracy of LSA: ", syntactic_accuracy)

Semantic Accuracy of LSA:  0.018518518518518517
Syntactic Accuracy of LSA:  0.006356968215158924
