# Word2Vec Unigram Testing

This Python notebook evaluates the Word2Vec unigram model. It is organized into the following sections:

- Find the words most similar to a selected word
- Perform Syntactic Analysis
- Perform Semantic Analysis
- Find the word that does not belong in a list of words
- Find the cosine similarity between two words
- Find the frequency count of a word
- Check if a word is in the model
- Preview a list of words in the model
- Evaluation on a K-Means + Word2Vec model

In [1]:
from gensim.models import Word2Vec as w2v



In [2]:
# Load Unigram model
# FILE points at the pre-trained vectors on disk; binary=True asks the loader to
# read the file as binary word2vec data rather than the text format.
# NOTE(review): newer gensim releases expose this loader as
# KeyedVectors.load_word2vec_format -- confirm against the installed version.
FILE = "C:/Users/MyPC/Desktop/FYP/W2V Models/w2v_reddit_unigram_300d.bin"
model = w2v.load_word2vec_format(FILE, binary=True)

In [3]:
# Nearest neighbours: list the 20 vocabulary words closest to the query word.
# Unigram model -> single-word queries, e.g. dragon, bleach, tottenham
# Bigram model  -> join the two words with an underscore, e.g. dragon_ball,
#                  barack_obama (the bigram model must be loaded for these)
model.most_similar(positive=["neuropsychopharmacology"], topn=20)

[('biopsychology', 0.740115225315094),
 ('astrochemistry', 0.7391058206558228),
 ('neuroendocrinologist', 0.7296165227890015),
 ('nanoscience', 0.7265405058860779),
 ('neuropharmacology', 0.7247588038444519),
 ('saltzberg', 0.7157706618309021),
 ('ethnomusicology', 0.7156946659088135),
 ('psychobiology', 0.7154250144958496),
 ('nueroscience', 0.7147186994552612),
 ('neuropsychiatry', 0.7140935659408569),
 ('ichthyology', 0.710540235042572),
 ('molbio', 0.7056220769882202),
 ('oenology', 0.7056138515472412),
 ('antropology', 0.7041956186294556),
 ('biopsych', 0.7037904858589172),
 ('neuroengineering', 0.7037561535835266),
 ('nanoengineering', 0.7024978995323181),
 ('psycholinguistics', 0.7002543210983276),
 ('bioanthropology', 0.6995773315429688),
 ('christmann', 0.698868989944458)]

In [4]:
# Semantic analogy test (e.g. king - man + woman is approximately queen).
# This query computes tokyo - japan + malaysia, i.e. "tokyo is to japan as
# ? is to malaysia".
model.most_similar(
    negative=["japan"],
    positive=["tokyo", "malaysia"],
)

[('lumpur', 0.6737101674079895),
 ('kuala', 0.6668090224266052),
 ('taipei', 0.6401477456092834),
 ('bangkok', 0.6113026142120361),
 ('penang', 0.5809809565544128),
 ('lampur', 0.5752942562103271),
 ('toyko', 0.5550657510757446),
 ('selangor', 0.5511509776115417),
 ('singapore', 0.5502724647521973),
 ('mumbai', 0.5481346249580383)]

In [5]:
# Syntactic analogy test (e.g. walking - walk + swim is approximately swimming).
# This query computes greenish - green + blue, i.e. "greenish is to green as
# ? is to blue".
model.most_similar(
    negative=["green"],
    positive=["greenish", "blue"],
)

[('blueish', 0.7298511266708374),
 ('greyish', 0.7232707738876343),
 ('bluish', 0.7149738669395447),
 ('pinkish', 0.705883264541626),
 ('purplish', 0.7028074264526367),
 ('brownish', 0.6946163773536682),
 ('grayish', 0.6922476887702942),
 ('reddish', 0.6911346316337585),
 ('yellowish', 0.6770833134651184),
 ('whitish', 0.6669460535049438)]

In [6]:
# Odd-one-out test: ask the model which word fits least with the others
model.doesnt_match(["blue", "green", "yellow", "apple"])

'apple'

In [7]:
# Cell to check the similarity between two words
# (the score is the cosine similarity of the two word vectors)
model.similarity("titanic","rose")

0.24046405589195533

In [8]:
# Look up the vocabulary entry for a word and inspect its `count` attribute
# (intended: the number of times the word occurred in the 2015 dataset).
# NOTE(review): this cell displays the *type* of the count, not the count
# itself -- evaluate `word.count` to see the value.
# NOTE(review): if the vectors were loaded without a vocabulary/frequency file,
# `count` may hold placeholder values rather than corpus frequencies --
# TODO confirm against the gensim loader documentation.
word = model.vocab['difu']
type(word.count)

int

In [9]:
# Check if a word (unigram) is in the model. The lookup is case-sensitive, so
# a capitalised spelling is treated as a different word from its lowercase form.
# NOTE(review): presumably the vocabulary is all lowercase -- verify.
'Dragon' in model

False

In [10]:
# A brief preview of words in the model: print the first `count` vocabulary
# entries as "<index> <word> <count attribute>".
count = 70

for index, word in enumerate(model.vocab):
    # Check the limit BEFORE printing so exactly `count` entries are shown
    # (indices 0 .. count-1). Using >= also makes count <= 0 print nothing
    # instead of walking the entire vocabulary.
    if index >= count:
        break
    print(index, word, model.vocab[word].count)

0 pindle 898980
1 gargants 739315
2 macromanaging 555703
3 ovechkins 976103
4 kwinana 914788
5 anesthesiologists 1059677
6 bakari 827309
7 lfd 1024609
8 attachtments 245850
9 humu 736076
10 petsitting 963812
11 chauvinistically 616659
12 threesoms 370288
13 friskie 398853
14 multigender 768048
15 melbz 529290
16 factoms 419914
17 seehr 392454
18 shushonova 165411
19 licanius 57613
20 preoperatively 204742
21 deinonychosauria 441678
22 dunwell 717708
23 darigans 388814
24 dueber 716938
25 cellules 171131
26 indiependence 194463
27 consumatorii 620487
28 kadmon 841884
29 shute 1032438
30 awft 269069
31 gwyndolins 393590
32 thuocnet 159589
33 succedendo 469452
34 subimgurs 167544
35 repat 840645
36 shahbandar 910035
37 mutinied 1007821
38 snivey 816848
39 perddy 307267
40 folkekirken 471786
41 humaniteit 50606
42 pavelectricity 247640
43 skadoodles 826554
44 higgenbotham 69005
45 uniwatch 446270
46 fuzziest 970457
47 alonf 275203
48 funzioni 816166
49 lovies 1007015
50 antonidus 942063
51

In [11]:
# Load the K-Means model
import pickle

# Specify the file
FILE = "C:/Users/MyPC/Desktop/FYP/K-Means Models/dict_500.pk"

# Load using pickle. The context manager guarantees the file handle is closed
# once loading finishes (or fails), instead of leaking it.
# NOTE: unpickling can execute arbitrary code -- only load files you created
# yourself or otherwise trust.
with open(FILE, "rb") as pickle_file:
    word_centroid_map = pickle.load(pickle_file)

# Display the type of the loaded object as the cell result
type(word_centroid_map)

dict

In [12]:
# Print the words assigned to each of the first N clusters

N = 100

# Group the words by cluster number in a single pass over the mapping, rather
# than rescanning the whole dictionary once per cluster. Words keep the
# dictionary's iteration order within each cluster.
words_by_cluster = {}
for word, cluster_num in word_centroid_map.items():
    words_by_cluster.setdefault(cluster_num, []).append(word)

for cluster in range(0, N):
    # A cluster with no assigned words is reported with an empty list
    words = words_by_cluster.get(cluster, [])

    print("CLUSTER NUMBER: %i" % (cluster))
    print("WORDS: ", words)
    print("\n\n")
        

CLUSTER NUMBER: 0
WORDS:  ['lockout', 'remainder', 'following', 'prep', 'stride', 'normal', 'intial', 'forth', 'yearlong', 'subsequent', 'phase', 'btwn', 'occasional', 'timespan', 'cyclical', 'preparation', 'consecutively', 'successive', 'longs', 'inbetween', 'interruptions', 'progression', 'onwards', 'rotation', 'washout', 'gradual', 'punctuated', 'lulls', 'swaps', 'blip', 'fastforward', 'incoming', 'speedy', 'long', 'throughs', 'heavy', 'initial', 'sustained', 'untimed', 'drastic', 'thoughout', 'immediate', 'uninterrupted', 'sporadic', 'reactivation', 'breakout', 'alternated', 'strokes', 'recent', 'short', 'timing', 'continuous', 'hiatus', 'brief', 'stints', 'simultaneous', 'turnaround', 'longish', 'occassional', 'tentative', 'buildups', 'interruption', 'current', 'incremental', 'intermittent', 'motions', 'ocassional', 'adjustment', 'readjustment', 'ebb', 'movements', 'preceeding', 'durning', 'permanant', 'aways', 'continous', 'inital', 'regular', 'permanent', 'recovery', 'reoccuring

In [15]:
# Find cluster number for a specific word
# WORD is the word to look up; edit it to query a different word.
WORD = "fucker"

# Look up through WORD so the constant above actually drives the query.
# Raises KeyError if the word is not a key of word_centroid_map.
word_centroid_map[WORD]

326