In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import pairwise_distances
import pandas as pd
import math
import nltk
from nltk.corpus import stopwords
import string
from nltk import word_tokenize
nltk.download('words')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm,tqdm_pandas
from gensim.models import KeyedVectors

[nltk_data] Downloading package words to /home/yui/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to /home/yui/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/yui/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


#### GloVe
- http://nlp.stanford.edu/data/glove.6B.zip

#### Word2Vec
- https://drive.google.com/u/0/uc?id=0B7XkCwpI5KDYNlNUTTlSS21pQmM&export=download
- Requires the `gensim` package: `pip install gensim`.

In [2]:
def loadGlove(PATH):
    """Load word embeddings stored in the GloVe text format.

    Every non-blank line is expected to contain a word followed by its
    space-separated vector components. The dimensionality ``D`` is taken
    from the first non-blank line.

    Parameters
    ----------
    PATH : str
        Path of the embedding text file. It is read as UTF-8.

    Returns
    -------
    d : dict
        Maps row id (int) to that word's vector (1-D ``np.ndarray``).
    w2id : dict
        Maps word (str) to row id (int).
    id2w : dict
        Maps row id (int) to word (str).
    wmat : np.ndarray
        Array of shape ``(n_words, D)``; row ``i`` is the vector of
        ``id2w[i]``.
    D : int
        Number of components per vector.

    Raises
    ------
    ValueError
        If the file holds no embeddings, or an entry does not have exactly
        ``D`` numeric components.
    """
    d,w2id,id2w = {},{},{}
    # Explicit UTF-8 so non-ASCII tokens load the same way on every
    # platform instead of depending on the locale's default encoding.
    with open(PATH,'r',encoding='utf-8') as f:
        # Skip blank lines (e.g. an extra newline at end of file); they
        # would otherwise become an empty "word" with a zero-length vector.
        lines = [line.rstrip() for line in f if line.strip()]
    if not lines:
        raise ValueError("no embeddings found in {!r}".format(PATH))
    D = len(lines[0].split(" ")) - 1
    wmat = np.zeros((len(lines),D))
    for i,line in enumerate(tqdm(lines)):
        # Split from the right so the last D fields are always the vector,
        # even if the word itself contains a space.
        tokens = line.rsplit(" ",D)
        if len(tokens) != D+1:
            raise ValueError(
                "entry {} has {} vector components, expected {}".format(
                    i,len(tokens)-1,D))
        word = tokens[0]
        w2id[word]=i
        id2w[i]=word
        vec = np.array([float(t) for t in tokens[1:]])
        d[i]=vec
        wmat[i]=vec
    return d,w2id,id2w,wmat,D

#### Word Analogies with Embeddings
$$\begin{align*}
v-v_a &\approx v_b-v_c\\
\min_v d &= \min_v \lVert v-v_a-v_b+v_c \rVert_2
\end{align*}$$

In [3]:
class wordAnalogies:
    """Solve word analogies by vector arithmetic on loaded embeddings.

    Attributes set by ``__init__``:
        PATH   -- path the embeddings were loaded from
        dGlove -- row id -> vector
        w2id   -- word -> row id
        id2w   -- row id -> word
        wmat   -- (n_words, D) matrix of all vectors
        D      -- vector dimensionality
    """
    def __init__(self,PATH):
        """Load the embeddings at ``PATH`` with ``loadGlove``."""
        self.PATH = PATH
        self.dGlove,self.w2id,self.id2w,\
            self.wmat,self.D = loadGlove(PATH)
    def find(self,a,b,c,exclude_inputs=True):
        """Print and return the word whose vector is closest to ``a + b - c``.

        Parameters
        ----------
        a, b, c : str
            Query words; each must be in the vocabulary.
        exclude_inputs : bool, default True
            When True, ``a``, ``b`` and ``c`` are not eligible as the
            answer. Without this the nearest neighbour of ``a + b - c`` can
            be one of the query words themselves, which is never a useful
            analogy result. Pass False for the raw nearest neighbour.

        Returns
        -------
        str
            The closest word (the same word that is printed).

        Raises
        ------
        KeyError
            If any query word is missing from the vocabulary.
        ValueError
            If excluding the query words leaves no candidate word.
        """
        words = (a,b,c)
        missing = [w for w in words if w not in self.w2id]
        if missing:
            raise KeyError("word(s) not in vocabulary: {}".format(
                ", ".join(repr(w) for w in missing)))
        ids = [self.w2id[w] for w in words]
        v_ = self.dGlove[ids[0]]+\
            self.dGlove[ids[1]]-\
            self.dGlove[ids[2]]
        v_ = v_.reshape(1,self.D)
        # Copy into a fresh float array so masking below never writes into
        # a buffer owned by the distance routine.
        distances = np.array(pairwise_distances(v_,
            self.wmat),dtype=float).reshape(-1)
        if exclude_inputs:
            distances[ids] = np.inf
            if not np.isfinite(distances).any():
                raise ValueError(
                    "no candidate words left after excluding the inputs")
        closestId = int(np.argmin(distances))
        closestW = self.id2w[closestId]
        print("Closest word to {}+{}-{}: {}".format(\
                a,b,c,closestW))
        return closestW

In [4]:
# Location of the GloVe vectors. Set the GLOVE_PATH environment variable to
# point at your own copy; the original author's path is kept as the default.
import os
PATH = os.environ.get("GLOVE_PATH",
    "/home/yui/Documents/data/nlp/glove.6B/glove.6B.50d.txt")
wa = wordAnalogies(PATH)

100%|██████████| 400000/400000 [00:03<00:00, 126119.32it/s]


In [5]:
# Try the analogy solver on a couple of (a, b, c) triples; each call prints
# the word closest to a + b - c.
for triple in (("woman","boy","girl"),
               ("korean","japan","japanese")):
    wa.find(*triple)

Closest word to woman+boy-girl: man
Closest word to korean+japan-japanese: korea
