In [49]:
import nltk
import pandas as pd
import numpy as np
import math
import random
import pickle
from scipy import stats
from collections import Counter

Read Files

In [2]:
# Read the five input files into memory. Every file is opened with an explicit
# UTF-8 encoding so decoding does not depend on the platform's locale default.
with open('vocab-wordsim.txt', encoding="utf8") as f1:
    vocab_wordsim = f1.read().splitlines()

with open('vocab-25k.txt', encoding="utf8") as f2:
    vocab_25k = f2.read().splitlines()

# The wiki file is kept as a single string; it is tokenized later.
with open('wiki-1percent.txt', encoding="utf8") as f3:
    wiki = f3.read()

with open('men.txt', encoding="utf8") as f4:
    men = f4.read().splitlines()

with open('simlex-999.txt', encoding="utf8") as f5:
    simlex = f5.read().splitlines()

Helper Functions

In [3]:
def pad_and_tokenize(wiki):
    """Split ``wiki`` into sentences and flatten them into one token list.

    Each sentence returned by ``nltk.sent_tokenize`` is tokenized with
    ``nltk.word_tokenize`` and wrapped in ``'<s>'`` / ``'</s>'`` markers.

    Returns:
        A flat list of tokens with the sentence markers included.
    """
    padded_tokens = []
    for sentence in nltk.sent_tokenize(wiki):
        padded_tokens.append('<s>')
        padded_tokens.extend(nltk.word_tokenize(sentence))
        padded_tokens.append('</s>')
    return padded_tokens

In [4]:
def counting(center, context, corpus, w):
    """Count co-occurrences of center words with context words in a window.

    Args:
        center: iterable of words whose co-occurrences are counted.
        context: iterable of words that are counted as context words.
        corpus: sequence of tokens.
        w: window size; up to ``w`` tokens on each side of a center token
            are inspected (the center position itself is excluded).

    Returns:
        A tuple ``(c, center_count, context_count)`` where ``c[token][word]``
        is the number of times ``word`` appeared in the window of ``token``,
        ``center_count[token]`` is the sum of row ``token`` and
        ``context_count[word]`` is the sum of column ``word``. Only tokens and
        words with at least one counted co-occurrence appear as keys.
    """
    # Membership is tested once per corpus token and once per window slot;
    # sets make each test O(1) instead of a scan over the vocabulary list,
    # and also make one-shot iterables safe to pass in.
    center_vocab = set(center)
    context_vocab = set(context)

    n = len(corpus)
    c = {}
    center_count = {}
    context_count = {}

    # iterate over the corpus once
    for index, token in enumerate(corpus):
        if token not in center_vocab:
            continue
        window_start = max(0, index - w)
        window_end = min(n, index + w + 1)
        for i in range(window_start, window_end):
            if i == index:
                continue  # the center token is not its own context
            word = corpus[i]
            if word not in context_vocab:
                continue
            row = c.get(token)
            if row is None:
                row = c[token] = {}
                center_count[token] = 0
            row[word] = row.get(word, 0) + 1
            center_count[token] += 1
            context_count[word] = context_count.get(word, 0) + 1

    return c, center_count, context_count

In [5]:
def preprocess_ws(ws):
    """Parse human-annotated word-similarity lines into pairs and scores.

    The first element of ``ws`` is skipped (treated as a header). Every other
    line is split on whitespace; the first two fields form the word pair and
    the third field is converted to ``float``.

    Returns:
        A tuple ``(pairs, scores)`` of two parallel lists.
    """
    word_pairs, ratings = [], []
    for row in ws[1:]:
        fields = row.split()
        word_pairs.append((fields[0], fields[1]))
        ratings.append(float(fields[2]))
    return word_pairs, ratings

In [6]:
# Replace the raw line lists with parsed (pairs, scores) tuples.
# The isinstance guards keep this cell idempotent: after the first run `men`
# and `simlex` are tuples, and parsing them a second time would raise.
if isinstance(men, list):
    men = preprocess_ws(men)
if isinstance(simlex, list):
    simlex = preprocess_ws(simlex)

In [7]:
def calc_vector_length(d):
    """Return the L2 (Euclidean) norm of the values stored in dict ``d``.

    ``d`` maps context words to numeric weights; only the values are used.
    """
    return np.linalg.norm(list(d.values()))

In [8]:
def calc_cossim(d1, d2):
    """Cosine similarity of two sparse row vectors represented as dicts.

    Args:
        d1, d2: dicts mapping context words to numeric weights. Keys missing
            from a dict are treated as zero.

    Returns:
        ``dot(d1, d2) / (|d1| * |d2|)``, or ``0.0`` when either vector has
        zero length (empty or all-zero), so that no NaN or division error
        leaks into later sorting or correlation steps.
    """
    # Iterate over the shorter dict; only shared keys contribute to the dot product.
    if len(d1) < len(d2):
        shorter, longer = d1, d2
    else:
        shorter, longer = d2, d1

    dot_product = 0
    for key, value in shorter.items():
        if key in longer:
            dot_product += value * longer[key]

    denominator = calc_vector_length(d1) * calc_vector_length(d2)
    if denominator == 0:
        # Similarity is undefined for a zero vector; treat it as "not similar".
        return 0.0
    return dot_product / denominator

In [9]:
def eval_ws(ws, c):
    """Spearman's rho between human similarity scores and matrix-based scores.

    Args:
        ws: tuple ``(pairs, scores)`` of word pairs and their human ratings.
        c: dict mapping a word to its row vector (itself a dict).

    Returns:
        The result of ``scipy.stats.spearmanr`` on the human scores and the
        cosine similarities computed from ``c``.
    """
    pairs, gold_scores = ws[0], ws[1]
    model_scores = []

    for pair in pairs:
        first, second = pair[0], pair[1]
        if first in c and second in c:
            model_scores.append(calc_cossim(c[first], c[second]))
        else:
            # A pair with a word missing from the matrix is scored as 0.
            model_scores.append(0)

    return stats.spearmanr(gold_scores, model_scores)

In [10]:
def calc_pmi(c, center_count, context_count):
    """Build a PMI matrix from a co-occurrence count matrix.

    Args:
        c: nested dict ``c[token][word] -> count``.
        center_count: dict of row totals per center token.
        context_count: dict of column totals per context word.

    Returns:
        A nested dict with the same keys as ``c`` whose values are
        ``log2(n * count / (center_count[token] * context_count[word]))``,
        where ``n`` is the sum of all row totals.
    """
    total = sum(center_count.values())
    return {
        token: {
            word: math.log(total * count / (center_count[token] * context_count[word]), 2)
            for word, count in row.items()
        }
        for token, row in c.items()
    }

In [11]:
def get_nearest_neighbors(cpmi, keyword, n):
    """Return the ``n`` words most similar to ``keyword``.

    Similarity is the cosine similarity between the row of ``keyword`` and
    every other row of ``cpmi``. Ties are broken by the word itself, in
    descending order.

    Args:
        cpmi: dict mapping each word to its row vector (a dict).
        keyword: query word.
        n: number of neighbors to return.

    Returns:
        A list of at most ``n`` words, most similar first. The query word is
        never part of the result. If ``keyword`` is not a key of ``cpmi``, a
        message is printed and an empty list is returned.
    """
    if keyword not in cpmi:
        print("Invalid query word.")
        return []

    d_key = cpmi[keyword]

    # Leave the query word out explicitly instead of assuming it sorts first:
    # another word with similarity 1.0 can outrank it on the tie-break.
    scores = [
        (calc_cossim(d_key, d), token)
        for token, d in cpmi.items()
        if token != keyword
    ]
    scores.sort(reverse=True)

    return [token for _, token in scores[:max(n, 0)]]

### 1.1 Distributional Counting

In [12]:
# Tokenize the raw wiki text into one flat token list with sentence markers.
corpus = pad_and_tokenize(wiki)

In [13]:
# Co-occurrence counts with vocab_wordsim words as centers and vocab_25k words as contexts, window w=3.
c_3, center_3, context_3 = counting(vocab_wordsim, vocab_25k, corpus, 3)

In [14]:
# Spearman correlation: MEN human scores vs. cosine similarity of raw count vectors (w=3).
eval_ws(men, c_3)

SpearmanrResult(correlation=0.22814701095809897, pvalue=1.0076496657048044e-36)

In [15]:
# Spearman correlation: SimLex-999 human scores vs. cosine similarity of raw count vectors (w=3).
eval_ws(simlex, c_3)

SpearmanrResult(correlation=0.056106496974349045, pvalue=0.07630628171308901)

### 1.2 Computing PMIs

In [16]:
# Convert the w=3 counts into a PMI matrix.
cpmi_3 = calc_pmi(c_3, center_3, context_3)

In [17]:
# Spearman correlation: MEN human scores vs. cosine similarity of PMI vectors (w=3).
eval_ws(men, cpmi_3)

SpearmanrResult(correlation=0.5331472836039355, pvalue=5.335378800701052e-220)

In [18]:
# Spearman correlation: SimLex-999 human scores vs. cosine similarity of PMI vectors (w=3).
eval_ws(simlex, cpmi_3)

SpearmanrResult(correlation=0.2237608234397637, pvalue=8.400859146196789e-13)

### 1.3 Experimentation

In [19]:
# Same counting as for w=3, now with the narrowest window (w=1).
c_1, center_1, context_1 = counting(vocab_wordsim, vocab_25k, corpus, 1)

In [20]:
# Convert the w=1 counts into a PMI matrix.
cpmi_1 = calc_pmi(c_1, center_1, context_1)

In [21]:
# Spearman correlation: MEN human scores vs. raw count vectors (w=1).
eval_ws(men, c_1)

SpearmanrResult(correlation=0.2061997405743782, pvalue=3.6355429834129334e-30)

In [22]:
# Spearman correlation: SimLex-999 human scores vs. raw count vectors (w=1).
eval_ws(simlex, c_1)

SpearmanrResult(correlation=0.075084457147452, pvalue=0.017616981223936762)

In [23]:
# Spearman correlation: MEN human scores vs. PMI vectors (w=1).
eval_ws(men, cpmi_1)

SpearmanrResult(correlation=0.46398049004307396, pvalue=4.737791891909269e-160)

In [24]:
# Spearman correlation: SimLex-999 human scores vs. PMI vectors (w=1).
eval_ws(simlex, cpmi_1)

SpearmanrResult(correlation=0.2679049972472839, pvalue=6.992752723932315e-18)

In [25]:
# Same counting again with the widest window used in this notebook (w=6).
c_6, center_6, context_6 = counting(vocab_wordsim, vocab_25k, corpus, 6)

In [26]:
# Convert the w=6 counts into a PMI matrix.
cpmi_6 = calc_pmi(c_6, center_6, context_6)

In [27]:
# Spearman correlation: MEN human scores vs. raw count vectors (w=6).
eval_ws(men, c_6)

SpearmanrResult(correlation=0.23017586636479326, pvalue=2.294068212791827e-37)

In [28]:
# Spearman correlation: SimLex-999 human scores vs. raw count vectors (w=6).
eval_ws(simlex, c_6)

SpearmanrResult(correlation=0.03323812528777947, pvalue=0.2939338586562712)

In [29]:
# Spearman correlation: MEN human scores vs. PMI vectors (w=6).
eval_ws(men, cpmi_6)

SpearmanrResult(correlation=0.5266732025165936, pvalue=8.77862271056346e-214)

In [30]:
# Spearman correlation: SimLex-999 human scores vs. PMI vectors (w=6).
eval_ws(simlex, cpmi_6)

SpearmanrResult(correlation=0.17511437335867402, pvalue=2.5327475474192802e-08)

### 1.4 Analysis

In [31]:
# Full-vocabulary counts: vocab_25k serves as both center and context vocabulary, w=1.
c_1_, center_1_, context_1_ = counting(vocab_25k, vocab_25k, corpus, 1)

In [32]:
# PMI matrix over the full 25k vocabulary, w=1 (used for the nearest-neighbor analysis).
cpmi_1_ = calc_pmi(c_1_, center_1_, context_1_)

In [33]:
# Full-vocabulary counts: vocab_25k serves as both center and context vocabulary, w=6.
c_6_, center_6_, context_6_ = counting(vocab_25k, vocab_25k, corpus, 6)

In [34]:
# PMI matrix over the full 25k vocabulary, w=6 (used for the nearest-neighbor analysis).
cpmi_6_ = calc_pmi(c_6_, center_6_, context_6_)

### 1.4.1 Printing nearest neighbors

In [38]:
# Ten nearest neighbors of "monster" in the w=1 PMI space.
monster_1 = get_nearest_neighbors(cpmi_1_, "monster", 10)
print(monster_1)

['dragon', 'tyrant', 'creatures', 'monsters', 'jar', 'hornet', 'invaders', 'rhinoceros', 'robot', 'gangster']


In [39]:
# Ten nearest neighbors of "monster" in the w=6 PMI space.
monster_6 = get_nearest_neighbors(cpmi_6_, "monster", 10)
print(monster_6)

['evil', 'giant', 'creature', 'monsters', 'godzilla', 'dragon', 'dog', 'ghost', 'girl', 'horror']


In [36]:
# Cache the full-vocabulary PMI matrix for w=6 on disk.
# NOTE(review): the cpmi_1_ dump is commented out, yet a later cell loads
# 'cpmi_1.pkl' — that file must already exist from an earlier session, so a
# fresh Restart & Run All may fail there. Confirm which dumps are wanted.
# with open('cpmi_1.pkl', 'wb') as f1:
#     pickle.dump(cpmi_1_, f1)

with open('cpmi_6.pkl', 'wb') as f2:
    pickle.dump(cpmi_6_, f2)

In [37]:
# Restore cpmi_1_ from its on-disk cache; this rebinds the in-memory cpmi_1_.
# NOTE(review): pickle.load can execute arbitrary code from the file — only
# load caches that this notebook wrote itself.
with open('cpmi_1.pkl', 'rb') as f1:
     cpmi_1_ = pickle.load(f1)
    
# with open('cpmi_6.pkl', 'rb') as f2:
#     cpmi_6_ = pickle.load(f2)

### 1.4.2 POS Tag Similarity

In [40]:
# Each vocabulary word is tagged in isolation (as a one-word "sentence"),
# because the vocabulary order carries no syntactic information.
tokens = []
for word in vocab_25k:
    tokens.extend(nltk.pos_tag([word]))

In [41]:
def get_category(token):
    """Map a ``(word, tag)`` pair to a coarse part-of-speech category.

    The category is chosen from the first letter of the tag: ``N`` -> noun,
    ``V`` -> verb, ``J`` -> adjective, ``I`` -> preposition; any other tag
    yields ``'others'``.
    """
    tag = token[1]
    prefix_to_category = (
        ('N', 'noun'),
        ('V', 'verb'),
        ('J', 'adjective'),
        ('I', 'preposition'),
    )
    for prefix, category in prefix_to_category:
        if tag.startswith(prefix):
            return category
    return 'others'

# Replace each fine-grained tag with its coarse category: list of (word, category) pairs.
tokens_tagged = [(token[0], get_category(token)) for token in tokens]

In [42]:
# Two views of the same data: a word -> category lookup (a repeated word keeps
# its last category) and a DataFrame with column 0 = word, column 1 = category.
tokens_tagged_dict = dict(tokens_tagged)
tokens_tagged_df = pd.DataFrame(tokens_tagged)

In [43]:
# Number of vocabulary words per coarse category (column 1 holds the category).
tokens_tagged_df.groupby([1]).count()

Unnamed: 0_level_0,0
1,Unnamed: 1_level_1
adjective,1606
noun,18991
others,1550
preposition,58
verb,2795


In [62]:
# Randomly select 50 words each from nouns, verbs, adjectives, and prepositions.
# The generator is seeded first so that the sample — and every table and
# printout derived from it below — is identical on each Restart & Run All.
random.seed(0)
query_words = {}
for tag in ["noun", "adjective", "verb", "preposition"]:
    # Column 1 holds the category, column 0 the word.
    candidates = tokens_tagged_df[tokens_tagged_df[1] == tag][0].tolist()
    query_words[tag] = random.sample(candidates, 50)

In [45]:
def tag_neighbors(s):
    """Return the coarse category of every word in ``s``, in iteration order.

    Categories are looked up in the module-level ``tokens_tagged_dict``; a
    word that is not a key there raises ``KeyError``.
    """
    categories = []
    for neighbor in s:
        categories.append(tokens_tagged_dict[neighbor])
    return categories

In [56]:
def neighbor_tag_counter(d, tag, w, c):
    """Add the per-category counts in ``c`` to the running totals ``d[tag][w]``.

    Args:
        d: nested dict ``d[tag][w][category] -> total``; updated in place.
        tag: category of the query word.
        w: window size key.
        c: mapping from category to count for one query word; it is indexed
            with all four categories.

    Returns:
        The same ``d`` object that was passed in.
    """
    totals = d[tag][w]
    for category in ("noun", "adjective", "verb", "preposition"):
        totals[category] = totals.get(category, 0) + c[category]
    return d

In [64]:
# For every sampled query word, fetch its 10 nearest neighbors under both
# window sizes, tally the neighbors' categories into `counter`, and report how
# many neighbors share the query's category and how many neighbors overlap.
counter = {}

for tag in ["noun", "adjective", "verb", "preposition"]:
    counter[tag] = {1: {}, 6: {}}
    words = query_words[tag]
    print(tag)
    for word in words:
        # window w=1
        s1 = set(get_nearest_neighbors(cpmi_1_, word, 10))
        c1 = Counter(tag_neighbors(s1))
        counter = neighbor_tag_counter(counter, tag, 1, c1)
        # window w=6
        s6 = set(get_nearest_neighbors(cpmi_6_, word, 10))
        c6 = Counter(tag_neighbors(s6))
        counter = neighbor_tag_counter(counter, tag, 6, c6)
        common = len(s1.intersection(s6))
        print("\t{}".format(word))
        print("\t\tw=1: {}/10 nearest neighbors are {}".format(c1[tag], tag))
        print("\t\tw=6: {}/10 nearest neighbors are {}".format(c6[tag], tag))
        print("\t\tCommon neighbors: {}".format(common))

noun
	hardness
		w=1: 9/10 nearest neighbors are noun
		w=6: 10/10 nearest neighbors are noun
		Common neighbors: 0
	cum
		w=1: 8/10 nearest neighbors are noun
		w=6: 8/10 nearest neighbors are noun
		Common neighbors: 4
	cinemas
		w=1: 10/10 nearest neighbors are noun
		w=6: 9/10 nearest neighbors are noun
		Common neighbors: 2
	county
		w=1: 8/10 nearest neighbors are noun
		w=6: 9/10 nearest neighbors are noun
		Common neighbors: 4
	robbie
		w=1: 10/10 nearest neighbors are noun
		w=6: 9/10 nearest neighbors are noun
		Common neighbors: 0
	jerome
		w=1: 10/10 nearest neighbors are noun
		w=6: 10/10 nearest neighbors are noun
		Common neighbors: 2
	chose
		w=1: 4/10 nearest neighbors are noun
		w=6: 3/10 nearest neighbors are noun
		Common neighbors: 4
	resonance
		w=1: 9/10 nearest neighbors are noun
		w=6: 6/10 nearest neighbors are noun
		Common neighbors: 1
	drinks
		w=1: 10/10 nearest neighbors are noun
		w=6: 8/10 nearest neighbors are noun
		Common neighbors: 3
	kia
		w=1: 8/1

	arable
		w=1: 1/10 nearest neighbors are adjective
		w=6: 0/10 nearest neighbors are adjective
		Common neighbors: 0
	unorganized
		w=1: 3/10 nearest neighbors are adjective
		w=6: 0/10 nearest neighbors are adjective
		Common neighbors: 0
	identical
		w=1: 5/10 nearest neighbors are adjective
		w=6: 2/10 nearest neighbors are adjective
		Common neighbors: 1
	allied
		w=1: 6/10 nearest neighbors are adjective
		w=6: 0/10 nearest neighbors are adjective
		Common neighbors: 2
	twenty-first
		w=1: 1/10 nearest neighbors are adjective
		w=6: 0/10 nearest neighbors are adjective
		Common neighbors: 6
	african-american
		w=1: 5/10 nearest neighbors are adjective
		w=6: 1/10 nearest neighbors are adjective
		Common neighbors: 2
	lower
		w=1: 7/10 nearest neighbors are adjective
		w=6: 5/10 nearest neighbors are adjective
		Common neighbors: 4
	great
		w=1: 9/10 nearest neighbors are adjective
		w=6: 2/10 nearest neighbors are adjective
		Common neighbors: 2
	maltese
		w=1: 5/10 nearest neigh

	eating
		w=1: 6/10 nearest neighbors are verb
		w=6: 1/10 nearest neighbors are verb
		Common neighbors: 2
	outgoing
		w=1: 1/10 nearest neighbors are verb
		w=6: 3/10 nearest neighbors are verb
		Common neighbors: 1
	kicking
		w=1: 5/10 nearest neighbors are verb
		w=6: 2/10 nearest neighbors are verb
		Common neighbors: 1
	obsessed
		w=1: 6/10 nearest neighbors are verb
		w=6: 2/10 nearest neighbors are verb
		Common neighbors: 0
	encoded
		w=1: 5/10 nearest neighbors are verb
		w=6: 1/10 nearest neighbors are verb
		Common neighbors: 0
preposition
	on
		w=1: 8/10 nearest neighbors are preposition
		w=6: 3/10 nearest neighbors are preposition
		Common neighbors: 4
	by
		w=1: 2/10 nearest neighbors are preposition
		w=6: 0/10 nearest neighbors are preposition
		Common neighbors: 3
	into
		w=1: 3/10 nearest neighbors are preposition
		w=6: 4/10 nearest neighbors are preposition
		Common neighbors: 5
	with
		w=1: 3/10 nearest neighbors are preposition
		w=6: 2/10 nearest neighbors are 

In [65]:
# Show the accumulated neighbor-category totals: counter[query category][window][neighbor category].
counter

{'adjective': {1: {'adjective': 165,
   'noun': 243,
   'preposition': 1,
   'verb': 72},
  6: {'adjective': 92, 'noun': 339, 'preposition': 2, 'verb': 41}},
 'noun': {1: {'adjective': 18, 'noun': 444, 'preposition': 0, 'verb': 26},
  6: {'adjective': 25, 'noun': 432, 'preposition': 1, 'verb': 37}},
 'preposition': {1: {'adjective': 21,
   'noun': 60,
   'preposition': 187,
   'verb': 58},
  6: {'adjective': 35, 'noun': 159, 'preposition': 93, 'verb': 46}},
 'verb': {1: {'adjective': 30, 'noun': 176, 'preposition': 3, 'verb': 278},
  6: {'adjective': 36, 'noun': 343, 'preposition': 6, 'verb': 96}}}

In [74]:
# "consequence"

consequence
	w=1: ['mistake', 'tendency', 'interruption', 'devastation', 'outcome', 'auspices', 'implications', 'continuation', 'consequences', 'conclusion']
	w=6: ['risk', 'behaviour', 'affect', 'measures', 'cause', 'depends', 'regarding', 'outcomes', 'circumstances', 'patients']
	Common neighbors: 0


In [73]:
# "questionable", "modifying", "during"

questionable
	w=1: ['dubious', 'doubtful', 'problematic', 'biased', 'controversial', 'debatable', 'tendentious', 'unverified', 'unclear', 'encyclopedic']
	w=6: ['facts', 'citations', 'verify', 'verifiable', 'dubious', 'unsourced', 'questioned', 'opinion', 'npov', 'sourced']
	Common neighbors: 1
modifying
	w=1: ['referring', 'masculine', '1695', 'deleting', 'vandalizing', 'redirecting', 'entirety', 'restoring', 'recreating', 'synthesized']
	w=6: ['firmware', 'indefinite', 'emissions', 'wikilink', 'specifies', 'update', 'ethernet', 'hardware', 'reverting', 'vandals']
	Common neighbors: 0
during
	w=1: ['and', 'were', 'after', 'from', 'when', 'at', 'between', 'until', 'had', 'before']
	w=6: ['war', 'after', 'early', 'years', 'until', 'year', 'following', 'before', 'world', 'later']
	Common neighbors: 3


In [70]:
# "until", "southeastern"

until
	w=1: ['december', 'october', 'november', 'january', 'april', 'june', 'february', 'july', 'september', 'august']
	w=6: ['january', 'march', 'june', 'july', 'september', 'august', 'october', 'december', 'april', 'november']
	Common neighbors: 9
southeastern
	w=1: ['northeastern', 'northwestern', 'southwestern', 'eastern', 'southern', 'northern', 'western', 'southwest', 'south-central', 'southeast']
	w=6: ['northeastern', 'southwestern', 'southern', 'northeast', 'northwestern', 'southwest', 'eastern', 'western', 'northern', 'northwest']
	Common neighbors: 8


### 1.4.3 Words with Multiple Senses

In [68]:
# Query words that have more than one common sense.
multiple = [
    "foot", "bill", "bit", "bat", "star", "seal",
    "can", "club", "bank", "bear", "pool", "pound",
    "head", "bore", "current", "custom", "doctor", "channel",
    "novel", "patient", "plane", "strike", "like", "charge",
    "minor", "suit", "trace", "chair", "company", "date",
]

In [69]:
# Compare the 10 nearest neighbors of each multi-sense word under w=1 and w=6.
for word in multiple:
    print(word)
    l1 = get_nearest_neighbors(cpmi_1_, word, 10)
    l6 = get_nearest_neighbors(cpmi_6_, word, 10)
    common = len(set(l1).intersection(l6))
    print("\tw=1: {}".format(l1))
    print("\tw=6: {}".format(l6))
    print("\tCommon neighbors: {}".format(common))

foot
	w=1: ['feet', 'meters', 'metres', 'inches', 'kilometers', 'mile', 'infantry', 'kilometres', 'minute', 'leg']
	w=6: ['feet', 'regiment', 'front', 'floor', 'infantry', 'down', 'battalion', 'wall', 'leg', 'rear']
	Common neighbors: 3
bill
	w=1: ['john', 'william', 'david', 'jim', 'james', 'george', 'tom', 'mike', 'robert', 'bob']
	w=6: ['david', 'james', 'john', 'bob', 'william', 'michael', 'robert', 'george', 'smith', 'paul']
	Common neighbors: 7
bit
	w=1: ['very', 'somewhat', 'too', 'pretty', 'little', 'extremely', 'rather', 'slightly', 'quite', 'much']
	w=6: ['really', 'me', 'something', 'too', 'seems', "'m", 'think', 'someone', 'need', 'little']
	Common neighbors: 2
bat
	w=1: ['bats', 'crabs', 'rodents', 'jharkhand', 'equator', 'pitching', 'swallow', 'shortstop', 'fish', 'slayer']
	w=6: ['bats', 'innings', 'batting', 'wickets', 'bowler', 'ball', 'birds', 'bird', 'pitcher', 'softball']
	Common neighbors: 1
star
	w=1: ['stars', "'s", 'tv', "'", 'actor', 'radio', 'coach', 'sun', '(