## Finding entity classes in embeddings

In this notebook we use word embeddings to find an entity class (here: countries) and to explore how strongly its members are associated with other terms.

In [None]:
%matplotlib inline
from sklearn import svm
from keras.utils import get_file
import os
import gensim
import numpy as np
import random
import requests
import geopandas as gpd
from IPython.core.pylabtools import figsize
figsize(12, 8)
import csv

In [None]:
MODEL    = 'GoogleNews-vectors-negative300.bin'
# Directory that holds the word2vec download. Set the WORD2VEC_DATA_DIR
# environment variable to point somewhere else instead of editing this cell;
# the fallback is the original author's local folder (WS: not backed up).
data_loc = os.environ.get('WORD2VEC_DATA_DIR', '/home/smithw/Downloads/deep_learning')
zipped   = os.path.join(data_loc, MODEL + '.gz')  # path of the compressed download
unzipped = os.path.join(data_loc, MODEL)          # path of the extracted binary
zipped, unzipped

In [None]:
# Load the binary word2vec vectors from the `unzipped` path into a KeyedVectors
# object. NOTE(review): presumably slow and memory-hungry for this file - a later
# cell remarks that the loaded word2vec data already takes up a lot of RAM.
model = gensim.models.KeyedVectors.load_word2vec_format(unzipped, binary=True)

In [None]:
# Sanity check: ask the model for the entries most similar to 'Germany'.
model.most_similar(positive=['Germany'])

In [None]:
# Same query for another vocabulary entry, for comparison.
model.most_similar(positive=['Annita_Kirsten'])

Now we'll create a training set with countries and non-countries and get a support vector machine to learn the difference.

In [None]:
# Read the whole country table into a list of dicts (one per CSV row). The
# context manager closes the file as soon as the rows are read; the previous
# bare open() never closed it. newline='' is what the csv module asks for.
with open('data/countries.csv', newline='') as countries_file:
    countries = list(csv.DictReader(countries_file))
len(countries), countries[:10]

In [None]:
# Positive examples: the 'name' field of 40 randomly drawn country rows.
positive = [row['name'] for row in random.sample(countries, 40)]
positive[:10]

In [None]:
# Examples of not-country names: 5000 vocabulary entries drawn at random.
# The draw is not filtered, so a country name could end up in here by chance.
# WS: `model.vocab` is OBE, so sample from `model.index_to_key` instead.
negative = random.sample(model.index_to_key, 5000)  # WS this works
negative[:10]

In [None]:
# Size of both vocabulary views; the two numbers are expected to match.
len(model.key_to_index), len(model.index_to_key)

In [None]:
# Label countries 1 and random vocabulary words 0, then shuffle so that a
# prefix of the list can later serve as the training split.
labelled = [(word, 1) for word in positive]
labelled += [(word, 0) for word in negative]
random.shuffle(labelled)

# X: one embedding per labelled word; y: the matching 0/1 labels.
X = np.asarray([model[word] for word, _ in labelled])
y = np.asarray([label for _, label in labelled])
X.shape, y.shape

In [None]:
# Peek at the first ten shuffled (word, label) pairs.
labelled[:10]

In [None]:
# Share of the shuffled examples used for fitting; the rest is held out.
TRAINING_FRACTION = 0.3
cut_off = int(len(labelled) * TRAINING_FRACTION)

# Linear-kernel SVM trained on the first `cut_off` examples.
# fit() returns the estimator itself, so `clf` is the fitted classifier.
clf = svm.SVC(kernel='linear').fit(X[:cut_off], y[:cut_off])
clf

In [None]:
# Predict labels for the held-out part of the data.
res = clf.predict(X[cut_off:])

# Held-out (word, label) pairs whose prediction disagrees with the true label.
missed = [
    pair
    for pair, truth, pred in zip(labelled[cut_off:], y[cut_off:], res)
    if pred != truth
]

# (accuracy on the held-out set in percent, the misclassified pairs)
100 - 100 * len(missed) / len(res), missed

In [None]:
# Mean and standard deviation over all values of the embedding matrix X.
X.mean(), X.std()

In [None]:
# Classify only the first million vocabulary vectors (about 30 s to run).
# Scoring all 3,000,000 is too much: RAM may fill up, because the loaded
# word2vec data already occupies a lot of it.
all_predictions = clf.predict(model.vectors[:1_000_000])

In [None]:
# Shape of the prediction array computed in the previous cell.
all_predictions.shape

In [None]:
# Every scored word the classifier flags as a country. WS: no early stop,
# so the total number of hits can be inspected (index_to_key replaces index2word).
res = [word for word, pred in zip(model.index_to_key, all_predictions) if pred]
random.sample(res, 20)  # random peek: the false alarms show up among the hits

In [None]:
# How many of the scored words were flagged as countries.
len(res)

# START HERE

In [None]:
# Country name -> position of that country in `countries`.
country_to_idx = {entry['name']: pos for pos, entry in enumerate(countries)}
# One embedding per country, in the same order as `countries`.
country_vecs = np.asarray([model[entry['name']] for entry in countries])
country_vecs.shape

Quick sanity check to see what is similar to Canada:

In [None]:
# Dot product of Canada's vector with every country vector
# (despite the name `dists`, a larger value means more similar).
dists = np.dot(country_vecs, country_vecs[country_to_idx['Canada']])
# Print the ten highest-scoring countries, best first.
for idx in np.argsort(dists)[-10:][::-1]:
    print(countries[idx]['name'], dists[idx])

Ranking countries for a specific term:

In [None]:
def rank_countries(term, topn=10, field='name'):
    """Rank countries by the dot product of their embedding with that of `term`.

    Args:
        term: key to look up in `model`.
        topn: number of best-scoring countries to return. ``0`` or ``None``
            returns every country (``0`` used to work only through the
            ``[-0:]`` slicing quirk; it is now handled explicitly). A negative
            value drops that many of the lowest-scoring countries, as before.
        field: key of each `countries` row to report (e.g. 'name' or 'cc3').

    Returns:
        A list of ``(countries[i][field], score)`` tuples ordered from the
        highest score down, or an empty list when `term` is not in `model`.
    """
    if term not in model:
        return []
    dists = np.dot(country_vecs, model[term])
    # argsort is ascending, so reverse it to get best-first order.
    order = np.argsort(dists)[::-1]
    if topn:
        order = order[:topn]
    return [(countries[idx][field], float(dists[idx])) for idx in order]

In [None]:
# Countries scoring highest against 'cricket' (default: top 10, by name).
rank_countries('cricket')

Now let's visualize this on a world map:

In [None]:
# Load the 'naturalearth_lowres' dataset that ships with geopandas.
# NOTE(review): `gpd.datasets` is deprecated in recent geopandas releases and
# appears to be removed in 1.0 - confirm the installed version still has it.
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
world.head()

We can now plot some maps!

In [None]:
def map_term(term):
    """Colour a world map by how strongly each country scores for `term`.

    Scores come from `rank_countries`, keyed by the upper-cased 'cc3' field so
    they can be matched against `world['iso_a3']`. They are divided by the
    largest score and stored in a `world[term]` column, which is then plotted.

    If no country receives a score (for instance because `rank_countries`
    returned nothing for the term), a message is printed and `world` is left
    untouched. Previously such a call wrote an all-NaN column into `world`,
    and the unrestricted `world.dropna()` then removed every row for all
    later maps too.
    """
    # topn=0 asks rank_countries for every country rather than only the top few.
    d = {k.upper(): v for k, v in rank_countries(term, topn=0, field='cc3')}
    scores = world['iso_a3'].map(d)
    if not scores.notna().any():
        print('No country scores for %r - nothing to plot' % (term,))
        return
    world[term] = scores / scores.max()
    # Drop only the rows lacking a score for this term; gaps in unrelated
    # columns must not hide countries.
    world.dropna(subset=[term]).plot(term, cmap='OrRd')

map_term('coffee')

In [None]:
# Same map for the term 'cricket'.
map_term('cricket')

In [None]:
# Using a country name as the term: which countries score highest against 'China'.
map_term('China')

In [None]:
# Same map for the term 'vodka'.
map_term('vodka')