In [1]:
import numpy as np

def load_embs(fname):
    """Load word embeddings from a whitespace-separated text file.

    Each non-header line is read as ``<word> <float> <float> ...``. Any line
    with exactly two tokens is treated as a header of the form
    ``<vocabulary size> <embedding size>``; only the embedding size is kept,
    and it is used solely for the summary message.

    Blank lines are ignored. Vectors whose components are all zero (and lines
    carrying a word but no components) are skipped.

    Args:
        fname: Path of the embeddings file to read.

    Returns:
        A tuple ``(i2w, embs)``: ``i2w`` maps a row index (int) to its word,
        and ``embs`` is a NumPy array built from the kept vectors, in file
        order, so that row ``k`` belongs to ``i2w[k]``.

    Raises:
        ValueError: If a header field is not an integer or a vector component
            cannot be parsed as a float.
    """
    i2w = dict()
    embs = []
    s = 0
    with open(fname, 'r') as f:
        for line in f:
            p = line.strip().split()
            if not p:
                # Blank line (e.g. a trailing newline at end of file):
                # nothing to parse, and p[0] below would raise IndexError.
                continue
            if len(p) == 2:
                int(p[0])      # vocabulary size: parsed only to validate the header
                s = int(p[1])  # embeddings size
            else:
                w = p[0]
                e = [float(i) for i in p[1:]]
                # Skip only genuinely all-zero vectors. Testing sum(e) == 0
                # would also discard non-zero vectors whose components cancel
                # out, such as [1.0, -1.0].
                if not any(e):
                    continue
                i2w[len(i2w)] = w
                embs.append(e)
    print("Load pre-trained word vectors from file %s:" % fname)
    print("\t%d vocabulary, %d dimension" % (len(i2w), s))
    return i2w, np.array(embs)

In [2]:
# Location of the pre-trained embeddings file, relative to the notebook's
# working directory.
emb_file = "extracted_data/extracted_data/w2v_yelp100.pro.vec"
# i2w: row index -> word; embs: one embedding row per kept word.
i2w, embs = load_embs(fname=emb_file)

Load pre-trained word vectors from file extracted_data/extracted_data/w2v_yelp100.pro.vec:
	107906 vocabulary, 100 dimension


In [3]:
# Cluster the word embeddings into NUM_CLUSTERS topics with scikit-learn's
# k-means; each cluster centroid is later used as a topic vector.
from sklearn import cluster
NUM_CLUSTERS = 10
# k-means starts from randomly chosen centroids, so without a fixed seed the
# clusters (and their numbering) change on every run of the notebook.
RANDOM_SEED = 42
# n_init is given explicitly because its default differs between
# scikit-learn versions.
kmeans = cluster.KMeans(
    n_clusters=NUM_CLUSTERS,
    n_init=10,
    random_state=RANDOM_SEED,
)
kmeans.fit(embs)
centroids = kmeans.cluster_centers_
print("Centroids data")
print(centroids)

Centroids data
[[ 2.19043346e-02 -8.93139826e-02  2.17867010e-03 -8.77815894e-02
   9.72647739e-02  1.24524213e-01  1.11711879e-01  8.55922673e-02
   2.57387762e-02 -2.61264692e-02  3.20670004e-03  1.09941663e-01
   1.30834658e-01 -5.41213837e-02  3.15994140e-02  1.97418326e-03
  -1.32628777e-01  8.52273941e-02  8.26246463e-03 -5.81393140e-02
  -3.79049997e-02  1.46832088e-02 -4.89206352e-02  1.90164738e-01
   7.74297170e-02  9.82939276e-02 -7.05609447e-02 -1.40221378e-02
   7.16747230e-02 -8.47427771e-02 -9.92229052e-03  1.99011757e-01
  -1.45954124e-01  1.80451148e-01 -1.74129293e-01 -1.43031760e-02
  -4.36893240e-02  1.51244929e-01 -2.24697268e-02  3.75274980e-02
   1.01560299e-01  1.46247872e-02  1.60739606e-02 -5.17841803e-02
   1.07765099e-02 -6.29199219e-02 -8.15316028e-03 -1.03218608e-01
   1.23910052e-01  1.88625023e-01 -4.44014567e-03 -4.38455131e-03
  -1.87196312e-01 -9.12953863e-03  9.05287644e-02 -5.54353826e-02
   7.52839644e-02 -8.02369802e-04 -1.61723446e-01  1.21411415

In [4]:
def cosine_distance(W, ivocab, topic_embs, N=100):
    """Print the N words most similar to each topic vector.

    Despite the name, words are ranked by cosine *similarity* (the dot product
    of L2-normalised vectors), highest first. For every topic one line of the
    form ``topic_<k>: <word> (<score>); ...`` is printed, followed by a blank
    line; topics are numbered from 1 in iteration order.

    Zero-norm rows of ``W`` and zero-norm topic vectors are left as zero
    vectors instead of being divided by zero, so they score 0 rather than NaN.

    Args:
        W: 2-D array-like of word vectors, one row per word.
        ivocab: Mapping from a row index of ``W`` to the word it represents.
        topic_embs: Iterable of topic vectors with the same dimensionality as
            the rows of ``W``.
        N: Maximum number of neighbours printed per topic.

    Returns:
        None. The result is written to standard output only.
    """
    W = np.asarray(W)
    # Normalise every word vector to unit L2 norm:
    # vec(a) <--- vec(a)/||vec(a)||
    row_norms = np.sqrt(np.sum(W ** 2, axis=1, keepdims=True))
    # A zero row has norm 0; dividing by 1 instead keeps it a zero vector and
    # avoids NaN scores and divide-by-zero warnings.
    row_norms[row_norms == 0] = 1
    W_norm = W / row_norms
    for topic_id, topic in enumerate(topic_embs, start=1):
        topic_vec = np.asarray(topic)
        topic_norm = np.sqrt(np.sum(topic_vec ** 2))
        # vec(b) <--- vec(b)/||vec(b)||  (skipped for an all-zero topic)
        if topic_norm > 0:
            topic_vec = topic_vec / topic_norm
        dist = np.dot(W_norm, topic_vec)
        # Negate so that argsort's ascending order yields highest score first.
        a = np.argsort(-dist)[:N]
        neighbors = " ".join(["%s (%.4f);" % (ivocab[x], dist[x]) for x in a])
        print("topic_%d: %s\n" % (topic_id, neighbors))

In [9]:
# Print, for every k-means centroid, the 100 vocabulary words scoring highest
# against it.
cosine_distance(W=embs, ivocab=i2w, topic_embs=centroids, N=100)

topic_1: recommanderai (0.7205); nde (0.7183); feeney (0.7159); messer (0.7069); sympathie (0.7007); vielf (0.6890); oni (0.6884); prudence (0.6850); charlton (0.6728); ance (0.6714); regretterez (0.6710); dmb (0.6710); kubrick (0.6659); aufdringlich (0.6656); doyle (0.6638); chexican (0.6604); krabs (0.6600); servieren (0.6596); herm (0.6567); ponce (0.6553); zelda (0.6527); schlie (0.6524); kennt (0.6521); mooking (0.6521); wolverine (0.6511); illustrious (0.6508); amn (0.6505); vicino (0.6503); nev (0.6500); sahen (0.6500); teint (0.6488); vivek (0.6486); kitsun (0.6468); eugine (0.6468); outkast (0.6458); simpsons (0.6455); kritiker (0.6454); ltig (0.6443); danza (0.6441); riches (0.6439); braucht (0.6437); cartman (0.6431); ulysses (0.6426); gbc (0.6423); gretzkys (0.6422); goulet (0.6421); ub40 (0.6416); wacko (0.6405); kramer (0.6400); marquis (0.6393); nas (0.6383); governor (0.6379); kram (0.6367); efter (0.6364); turrets (0.6343); louper (0.6330); degrassi (0.6325); camaro (0