In [48]:
import numpy as np

def load_embs(fname):
    """Load word vectors from a whitespace-separated text file.

    Each data line is ``<word> <v1> <v2> ...``. A line with exactly two
    tokens is treated as the ``<vocab_size> <dim>`` header. Blank lines are
    ignored and all-zero vectors are dropped.

    Args:
        fname: path of the text file to read.

    Returns:
        A tuple ``(i2w, embs)``: ``i2w`` maps the row index to its word and
        ``embs`` is a float32 array with one row per kept word.

    Raises:
        ValueError: if a header or vector component cannot be parsed as a number.
    """
    i2w = dict()
    embs = []
    s = 0  # embedding size announced by the header (0 if the file has no header)
    with open(fname, 'r') as f:
        for line in f:
            p = line.strip().split()
            if not p:
                # Blank line (e.g. a trailing newline at EOF): nothing to parse.
                continue
            if len(p) == 2:
                # Header line; the vocabulary size is parsed only to validate it,
                # the real count is reported from the vectors actually kept.
                int(p[0])
                s = int(p[1])
                continue
            w = p[0]
            e = [float(i) for i in p[1:]]
            # Drop all-zero vectors only. Testing sum(e) == 0 would also drop
            # non-zero vectors whose components happen to cancel out.
            if not any(e):
                continue
            i2w[len(i2w)] = w
            embs.append(e)
    print("Load pre-trained word vectors from file %s:" % fname)
    print("\t%d vocabulary, %d dimension" % (len(i2w), s))
    return i2w, np.array(embs, dtype="float32")

In [49]:
# Path to the pre-trained word-vector text file read by load_embs.
emb_file = "extracted_data/w2v_yelp100.pro.vec"
# i2w maps row index -> word; embs is the float32 matrix with one row per kept word.
i2w, embs = load_embs(emb_file)

Load pre-trained word vectors from file extracted_data/w2v_yelp100.pro.vec:
	107906 vocabulary, 100 dimension


In [50]:
# K-means clustering of the word embeddings (scikit-learn).
from sklearn import cluster
NUM_CLUSTERS = 10
# Fixed seed: centroid initialisation is randomised, so without it the clusters
# (and everything derived from them below) change on every run.
KMEANS_SEED = 0
kmeans = cluster.KMeans(n_clusters=NUM_CLUSTERS, random_state=KMEANS_SEED)
kmeans.fit(embs)
# One centroid per row, in the same vector space as the rows of embs.
centroids = kmeans.cluster_centers_
print ("Centroids data")
print (centroids)

Centroids data
[[ 2.19306648e-02 -8.93415436e-02  2.22318619e-03 -8.77253860e-02
   9.72690433e-02  1.24623373e-01  1.11759856e-01  8.55648220e-02
   2.56669484e-02 -2.61704810e-02  3.15125287e-03  1.09941684e-01
   1.30727902e-01 -5.40892407e-02  3.16179991e-02  1.93239748e-03
  -1.32611424e-01  8.52152035e-02  8.31503421e-03 -5.81852794e-02
  -3.79400998e-02  1.47331692e-02 -4.89456058e-02  1.90182433e-01
   7.74918720e-02  9.82238650e-02 -7.05692470e-02 -1.39888413e-02
   7.16589838e-02 -8.47664699e-02 -9.95395705e-03  1.98989034e-01
  -1.45956993e-01  1.80505991e-01 -1.74119979e-01 -1.43202059e-02
  -4.37328331e-02  1.51269630e-01 -2.25278996e-02  3.75058651e-02
   1.01611681e-01  1.46824569e-02  1.60583705e-02 -5.17443568e-02
   1.08173862e-02 -6.29077628e-02 -8.19425285e-03 -1.03113428e-01
   1.23976976e-01  1.88611865e-01 -4.43019345e-03 -4.35511395e-03
  -1.87213659e-01 -9.13026184e-03  9.05444250e-02 -5.54074943e-02
   7.53278434e-02 -8.10310245e-04 -1.61729693e-01  1.21424414

In [65]:
# Persist the fitted centroids as plain text.
# NOTE(review): this writes to the working directory, while the load cell reads
# './extracted_data/kmean_10centroids.txt' -- confirm which location is intended.
np.savetxt('./kmean_10centroids.txt', centroids)

In [56]:
# Reload saved centroids from disk as float32.
# NOTE(review): this path differs from the './kmean_10centroids.txt' written by the
# save cell, so a fresh run may read a stale or missing file -- confirm the path.
centroids_2 = np.loadtxt('./extracted_data/kmean_10centroids.txt', dtype="float32")

In [66]:
def cosine_distance(W, ivocab, topic_embs, N=50):
    """Print, for each topic vector, the N rows of W most similar to it.

    Despite the name, the printed score is the cosine *similarity* (dot
    product of unit-length vectors), so larger means closer. Zero-length
    vectors get a similarity of 0 instead of producing NaN.

    Args:
        W: 2-D array with one vector per row.
        ivocab: mapping from a row index of W to the label printed for it.
        topic_embs: iterable of 1-D vectors with the same dimension as W's rows.
        N: maximum number of neighbours printed per topic.

    Returns:
        None. One line per topic is printed, numbered from 1.
    """
    # Scale every row of W to unit L2 length: vec(a) <--- vec(a)/||vec(a)||
    row_norms = np.sum(W ** 2, 1) ** 0.5
    # A zero row would otherwise divide by zero; dividing by 1 leaves it all-zero.
    row_norms[row_norms == 0] = 1
    W_norm = (W.T / row_norms).T
    for i, topic in enumerate(topic_embs, start=1):
        vec_result = np.array(topic)
        d = np.sum(vec_result ** 2) ** 0.5
        # vec(b) <--- vec(b)/||vec(b)||, skipped for a zero-length topic vector
        vec_norm = vec_result / d if d > 0 else vec_result
        dist = np.dot(W_norm, vec_norm.T)
        # Negate so that argsort yields the highest similarities first.
        a = np.argsort(-dist)[:N]
        neighbors = " ".join(["%s (%.4f);" % (ivocab[x], dist[x]) for x in a])
        print("topic_%d: %s\n" % (i, neighbors))

In [67]:
# For each k-means centroid ("topic"), print its 100 most similar vocabulary words.
cosine_distance(embs, i2w, centroids, N=100)

topic_1: recommanderai (0.7205); nde (0.7183); feeney (0.7160); messer (0.7070); sympathie (0.7008); vielf (0.6890); oni (0.6884); prudence (0.6850); charlton (0.6727); ance (0.6714); regretterez (0.6711); dmb (0.6709); kubrick (0.6658); aufdringlich (0.6656); doyle (0.6638); chexican (0.6604); krabs (0.6600); servieren (0.6596); herm (0.6567); ponce (0.6553); zelda (0.6526); schlie (0.6524); mooking (0.6521); kennt (0.6521); wolverine (0.6511); illustrious (0.6508); amn (0.6506); vicino (0.6503); sahen (0.6501); nev (0.6500); teint (0.6488); vivek (0.6487); kitsun (0.6469); eugine (0.6468); outkast (0.6458); simpsons (0.6455); kritiker (0.6454); ltig (0.6443); danza (0.6441); riches (0.6438); braucht (0.6437); cartman (0.6431); ulysses (0.6425); gbc (0.6422); gretzkys (0.6421); goulet (0.6421); ub40 (0.6416); wacko (0.6405); kramer (0.6400); marquis (0.6393); nas (0.6384); governor (0.6379); kram (0.6368); efter (0.6364); turrets (0.6343); louper (0.6330); degrassi (0.6325); camaro (0

In [12]:
# English stop-word list, kept as one space-separated string.
# source: https://www.ranks.nl/stopwords
sws = "a able about above abst accordance according accordingly across act actually added adj affected affecting " \
      "affects after afterwards again against ah all almost alone along already also although always am among " \
      "amongst an and announce another any anybody anyhow anymore anyone anything anyway anyways anywhere " \
      "apparently approximately are aren arent arise around as aside ask asking at auth available away awfully b " \
      "back be became because become becomes becoming been before beforehand begin beginning beginnings begins " \
      "behind being believe below beside besides between beyond biol both brief briefly but by c ca came can cannot " \
      "can't cause causes certain certainly co com come comes contain containing contains could couldnt d date did " \
      "didn't different do does doesn't doing done don't down downwards due during e each ed edu effect eg eight " \
      "eighty either else elsewhere end ending enough especially et et-al etc even ever every everybody everyone " \
      "everything everywhere ex except f far few ff fifth first five fix followed following follows for former " \
      "formerly forth found four from further furthermore g gave get gets getting give given gives giving go goes " \
      "gone got gotten h had happens hardly has hasn't have haven't having he hed hence her here hereafter hereby " \
      "herein heres hereupon hers herself hes hi hid him himself his hither home how howbeit however hundred i id ie " \
      "if i'll im immediate immediately importance important in inc indeed index information instead into invention " \
      "inward is isn't it itd it'll its itself i've j just k keep keeps kept kg km know known knows l largely last " \
      "lately later latter latterly least less lest let lets like liked likely line little 'll look looking looks " \
      "ltd m made mainly make makes many may maybe me mean means meantime meanwhile merely mg might million miss " \
      "ml more moreover most mostly mr mrs much mug must my myself n na name namely nay nd near nearly necessarily " \
      "necessary need needs neither never nevertheless new next nine ninety no nobody non none nonetheless noone " \
      "nor normally nos not noted nothing now nowhere o obtain obtained obviously of off often oh ok okay old " \
      "omitted on once one ones only onto or ord other others otherwise ought our ours ourselves out outside " \
      "over overall owing own p page pages part particular particularly past per perhaps placed please plus " \
      "poorly possible possibly potentially pp predominantly present previously primarily probably promptly proud " \
      "provides put q que quickly quite qv r ran rather rd re readily really recent recently ref refs regarding " \
      "regardless regards related relatively research respectively resulted resulting results right run s said " \
      "same saw say saying says sec section see seeing seem seemed seeming seems seen self selves sent seven " \
      "several shall she shed she'll shes should shouldn't show showed shown showns shows significant " \
      "significantly similar similarly since six slightly so some somebody somehow someone somethan something " \
      "sometime sometimes somewhat somewhere soon sorry specifically specified specify specifying still stop " \
      "strongly sub substantially successfully such sufficiently suggest sup sure t take taken taking tell tends " \
      "th than thank thanks thanx that that'll thats that've the their theirs them themselves then thence there " \
      "thereafter thereby thered therefore therein there'll thereof therere theres thereto thereupon there've " \
      "these they theyd they'll theyre they've think this those thou though thoughh thousand throug through " \
      "throughout thru thus til tip to together too took toward towards tried tries truly try trying ts twice " \
      "two u un under unfortunately unless unlike unlikely until unto up upon ups us use used useful usefully " \
      "usefulness uses using usually v value various 've very via viz vol vols vs w want wants was wasnt way we " \
      "wed welcome we'll went were werent we've what whatever what'll whats when whence whenever where whereafter " \
      "whereas whereby wherein wheres whereupon wherever whether which while whim whither who whod whoever " \
      "whole who'll whom whomever whos whose why widely willing wish with within without wont words world " \
      "would wouldnt www x y yes yet you youd you'll your youre yours yourself yourselves you've z zero"
# NOTE: despite its name, sw_list is a set (duplicates collapse, membership tests are cheap).
sw_list = set(sws.split())

In [22]:
def load_idf(fname):
    """Load a word -> IDF table from a whitespace-separated text file.

    The first line is skipped unconditionally (presumably a header -- confirm
    against the file format). Every following non-blank line is read as
    ``<word> <value> ...``; only the first two tokens are used.

    Args:
        fname: path of the text file to read.

    Returns:
        dict mapping each word to its value as a float. Empty if the file has
        no data lines.

    Raises:
        IndexError: if a non-blank line has fewer than two tokens.
        ValueError: if the second token is not a valid float.
    """
    w2idf = dict()

    with open(fname, 'r') as f:
        # Discard the first line; the default keeps an empty file from
        # raising StopIteration.
        next(f, None)
        for line in f:
            p = line.strip().split()
            if not p:
                # Blank line (e.g. a trailing newline at EOF): nothing to parse.
                continue
            w2idf[p[0]] = float(p[1])
    print("Load tfidf from file %s: %d vocabulary" % (fname, len(w2idf)))
    return w2idf

# Path to the IDF table read by load_idf.
idf_file = "extracted_data/idf.txt"
# Despite its name, idf2w maps word -> IDF value (the dict returned by load_idf).
idf2w = load_idf(idf_file)

Load tfidf from file extracted_data/idf.txt: 408120 vocabulary


In [27]:
import operator
# Rank the (word, idf) pairs by their IDF value, lowest first.
sorted_by_value = sorted(idf2w.items(), key=operator.itemgetter(1))
# Display the 100 entries with the highest IDF.
sorted_by_value[-100:]

[('bankrupts', 14.991641),
 ('oooooooookay', 14.991641),
 ('lalalooove', 14.991641),
 ('dunaway', 14.991641),
 ('aldrichs', 14.991641),
 ('veganites', 14.991641),
 ('softtalkers', 14.991641),
 ('bernhard', 14.991641),
 ('sppose', 14.991641),
 ('kickboxers', 14.991641),
 ('hazier', 14.991641),
 ('firpo', 14.991641),
 ('wadd', 14.991641),
 ('seekong', 14.991641),
 ('neighborhhods', 14.991641),
 ('lathed', 14.991641),
 ('texured', 14.991641),
 ('shadegg', 14.991641),
 ('fradaviolo', 14.991641),
 ('envisoned', 14.991641),
 ('bioswitch', 14.991641),
 ('unflanked', 14.991641),
 ('steamtrays', 14.991641),
 ('subsequentyly', 14.991641),
 ('chelseaphiles', 14.991641),
 ('guanoed', 14.991641),
 ('seftelian', 14.991641),
 ('lemongrasshopper', 14.991641),
 ('pittypats', 14.991641),
 ('multiculturalist', 14.991641),
 ('bajafresh', 14.991641),
 ('plumwine', 14.991641),
 ('shoftshell', 14.991641),
 ('shimmi', 14.991641),
 ('myrself', 14.991641),
 ('bestwingsintheworld', 14.991641),
 ('untll', 14.9916