In [1]:
import numpy as np

def load_embs(fname):
    """Load pre-trained word vectors from a whitespace-separated text file.

    Expected layout: an optional header line with exactly two integer fields
    (vocabulary size, embedding size), followed by one line per word of the
    form ``word v1 v2 ... vD``.

    Blank lines are ignored. Rows whose vector is entirely zero are skipped.

    Args:
        fname: Path of the embeddings text file.

    Returns:
        A tuple ``(i2w, embs)`` where ``i2w`` maps the row index to the word
        and ``embs`` is a numpy array holding the kept vectors, in file order.

    Raises:
        ValueError: If a numeric field cannot be parsed.
    """
    i2w = dict()
    embs = []
    s = 0  # embedding size announced by the header (0 until one is seen)
    with open(fname, 'r') as f:
        for line in f:
            p = line.split()
            if not p:
                # Blank line (e.g. a trailing newline at end of file).
                continue
            if len(p) == 2:
                # Header line: "<vocabulary size> <embedding size>". The
                # vocabulary size is validated but not stored, because the
                # real count is derived from the rows that are kept.
                int(p[0])
                s = int(p[1])
                continue
            w = p[0]
            e = [float(i) for i in p[1:]]
            # Skip all-zero vectors only. Testing sum(e) == 0 would also drop
            # valid vectors whose components happen to cancel out.
            if not any(e):
                continue
            i2w[len(i2w)] = w
            embs.append(e)
    if s == 0 and embs:
        # No header in the file: report the dimension of the loaded rows.
        s = len(embs[0])
    print("Load pre-trained word vectors from file %s:" % fname)
    print("\t%d vocabulary, %d dimension" % (len(i2w), s))
    return i2w, np.array(embs)

In [2]:
# Relative path of the pre-trained word-embedding text file.
emb_file = "extracted_data/word_emb.txt"
# load_embs returns (index -> word dict, numpy array of embedding rows).
i2w, embs = load_embs(emb_file)

Load pre-trained word vectors from file extracted_data/word_emb.txt:
	74357 vocabulary, 100 dimension


In [3]:
# K-means clustering (scikit-learn) over the loaded word embeddings.
from sklearn import cluster

NUM_CLUSTERS = 10
# K-means initialisation is random; a fixed seed keeps the centroids (and
# everything derived from them further down) reproducible across
# Restart Kernel -> Run All.
KMEANS_SEED = 42
# n_init is passed explicitly because its default is not the same in every
# scikit-learn release; 10 restarts is the long-standing classic setting.
kmeans = cluster.KMeans(n_clusters=NUM_CLUSTERS, n_init=10, random_state=KMEANS_SEED)
kmeans.fit(embs)
# One centroid row per cluster, in the same space as the embeddings.
centroids = kmeans.cluster_centers_
print("Centroids data")
print(centroids)

Centroids data
[[-7.75369912e-01 -3.92898376e-01 -6.95324714e-01 -1.48136649e+00
   8.14631409e-01  1.56629229e-01 -8.16234236e-02 -5.30130919e-01
   1.83198397e-01  9.77367223e-01  5.84722374e-01  1.07992957e+00
   1.04397729e+00 -3.98138278e-01  3.25519428e-01  8.61297762e-03
  -2.23373575e+00 -2.86637066e-01  1.25375184e+00 -2.83829121e-01
  -4.43563139e-01 -1.92911975e-01  5.31355156e-01  5.42422788e-01
  -5.82493948e-01  6.55893426e-01 -1.69830276e-01 -4.86491541e-02
   1.14308497e+00 -1.66375610e-01  5.29579224e-01  1.01465381e+00
  -1.24697264e+00  1.70431163e+00  4.69809545e-02  1.07752417e+00
  -2.21154267e-01  1.16900708e+00 -6.43362118e-01 -4.20988776e-01
  -1.97409619e-01  4.54881033e-01  2.87102533e-01 -8.17907868e-01
   1.76698290e-01  2.59371150e-01 -1.14025810e+00 -2.88853290e-01
   2.07304850e-01  1.38468031e+00  6.10079612e-01  1.99108695e-01
  -3.89555781e-01 -3.57251247e-01  2.00977032e+00 -1.93239403e+00
   1.18293103e+00 -3.49974683e-01 -6.69649876e-01 -2.90541083

In [4]:
def cosine_distance(W, ivocab, topic_embs, N=50):
    """Print, for every topic vector, its N most cosine-similar words.

    Args:
        W: 2-D array with one word vector per row.
        ivocab: Mapping from a row index of ``W`` to the word it represents.
        topic_embs: Iterable of topic vectors (e.g. cluster centroids).
        N: Number of nearest words listed per topic.

    Returns:
        None. One line per topic is printed as
        ``topic_<k>: word (score); word (score); ...`` with k starting at 1.
    """
    # Scale every row to unit Euclidean length: vec(a) <- vec(a) / ||vec(a)||
    row_lengths = np.sum(W ** 2, 1) ** 0.5
    unit_rows = W / row_lengths[:, None]

    for topic_no, topic in enumerate(topic_embs, start=1):
        query = np.array(topic)
        # Same scaling for the topic vector: vec(b) <- vec(b) / ||vec(b)||
        query_unit = query / (np.sum(query ** 2) ** 0.5)
        # With both sides at unit length, the dot product is the cosine similarity.
        sims = np.dot(unit_rows, query_unit.T)
        # Negating sorts in descending similarity; keep the first N indices.
        top_ids = np.argsort(-sims)[:N]
        parts = []
        for idx in top_ids:
            parts.append("%s (%.4f);" % (ivocab[idx], sims[idx]))
        print("topic_%d: %s\n" % (topic_no, " ".join(parts)))

In [5]:
# Print the 100 vocabulary words closest (by cosine similarity) to each K-means centroid.
cosine_distance(embs, i2w, centroids, N=100)

topic_1: melrose (0.7741); remedys (0.7677); edmonton (0.7622); pinkys (0.7586); boulders (0.7577); fitzgeralds (0.7553); creekside (0.7513); courthouse (0.7475); hamilton (0.7467); northside (0.7433); hawthorne (0.7416); atrias (0.7392); cg (0.7384); parkers (0.7383); jts (0.7376); crossroads (0.7361); harrys (0.7356); georgetown (0.7350); chuys (0.7347); coronado (0.7328); hodges (0.7313); bambinos (0.7306); riverside (0.7305); huntridge (0.7296); redbeards (0.7284); rivergate (0.7273); borgata (0.7270); oshawa (0.7258); symposium (0.7246); dillons (0.7242); jbs (0.7240); providence (0.7240); lgo (0.7240); sammys (0.7237); oakwood (0.7227); euclid (0.7222); cityscape (0.7222); vincents (0.7213); asheville (0.7208); bootleggers (0.7198); acadia (0.7197); somerset (0.7194); waterfront (0.7192); gvr (0.7189); kincaids (0.7184); steiners (0.7176); carnegie (0.7170); cp (0.7167); gamekeepers (0.7163); gramercy (0.7161); biltmore (0.7152); tjs (0.7129); fairview (0.7126); vig (0.7125); hen

In [12]:
# English stop words as one space-separated string literal.
# source: https://www.ranks.nl/stopwords
# NOTE(review): some entries look like typos (e.g. "thoughh", "throug",
# "somethan", "showns"); presumably they are verbatim from the source list,
# so they are left untouched - confirm before cleaning up.
sws = "a able about above abst accordance according accordingly across act actually added adj affected affecting " \
      "affects after afterwards again against ah all almost alone along already also although always am among " \
      "amongst an and announce another any anybody anyhow anymore anyone anything anyway anyways anywhere " \
      "apparently approximately are aren arent arise around as aside ask asking at auth available away awfully b " \
      "back be became because become becomes becoming been before beforehand begin beginning beginnings begins " \
      "behind being believe below beside besides between beyond biol both brief briefly but by c ca came can cannot " \
      "can't cause causes certain certainly co com come comes contain containing contains could couldnt d date did " \
      "didn't different do does doesn't doing done don't down downwards due during e each ed edu effect eg eight " \
      "eighty either else elsewhere end ending enough especially et et-al etc even ever every everybody everyone " \
      "everything everywhere ex except f far few ff fifth first five fix followed following follows for former " \
      "formerly forth found four from further furthermore g gave get gets getting give given gives giving go goes " \
      "gone got gotten h had happens hardly has hasn't have haven't having he hed hence her here hereafter hereby " \
      "herein heres hereupon hers herself hes hi hid him himself his hither home how howbeit however hundred i id ie " \
      "if i'll im immediate immediately importance important in inc indeed index information instead into invention " \
      "inward is isn't it itd it'll its itself i've j just k keep keeps kept kg km know known knows l largely last " \
      "lately later latter latterly least less lest let lets like liked likely line little 'll look looking looks " \
      "ltd m made mainly make makes many may maybe me mean means meantime meanwhile merely mg might million miss " \
      "ml more moreover most mostly mr mrs much mug must my myself n na name namely nay nd near nearly necessarily " \
      "necessary need needs neither never nevertheless new next nine ninety no nobody non none nonetheless noone " \
      "nor normally nos not noted nothing now nowhere o obtain obtained obviously of off often oh ok okay old " \
      "omitted on once one ones only onto or ord other others otherwise ought our ours ourselves out outside " \
      "over overall owing own p page pages part particular particularly past per perhaps placed please plus " \
      "poorly possible possibly potentially pp predominantly present previously primarily probably promptly proud " \
      "provides put q que quickly quite qv r ran rather rd re readily really recent recently ref refs regarding " \
      "regardless regards related relatively research respectively resulted resulting results right run s said " \
      "same saw say saying says sec section see seeing seem seemed seeming seems seen self selves sent seven " \
      "several shall she shed she'll shes should shouldn't show showed shown showns shows significant " \
      "significantly similar similarly since six slightly so some somebody somehow someone somethan something " \
      "sometime sometimes somewhat somewhere soon sorry specifically specified specify specifying still stop " \
      "strongly sub substantially successfully such sufficiently suggest sup sure t take taken taking tell tends " \
      "th than thank thanks thanx that that'll thats that've the their theirs them themselves then thence there " \
      "thereafter thereby thered therefore therein there'll thereof therere theres thereto thereupon there've " \
      "these they theyd they'll theyre they've think this those thou though thoughh thousand throug through " \
      "throughout thru thus til tip to together too took toward towards tried tries truly try trying ts twice " \
      "two u un under unfortunately unless unlike unlikely until unto up upon ups us use used useful usefully " \
      "usefulness uses using usually v value various 've very via viz vol vols vs w want wants was wasnt way we " \
      "wed welcome we'll went were werent we've what whatever what'll whats when whence whenever where whereafter " \
      "whereas whereby wherein wheres whereupon wherever whether which while whim whither who whod whoever " \
      "whole who'll whom whomever whos whose why widely willing wish with within without wont words world " \
      "would wouldnt www x y yes yet you youd you'll your youre yours yourself yourselves you've z zero"
# Despite the "_list" suffix this is a set: duplicates collapse and
# membership tests (`word in sw_list`) are hash lookups.
sw_list = set(sws.split())

In [22]:
def load_idf(fname):
    """Load a ``word -> score`` table from a whitespace-separated text file.

    The first line of the file is always discarded (treated as a header).
    Every following non-blank line must start with ``word value``; any
    further fields on the line are ignored. If a word occurs more than once,
    the last value wins.

    Args:
        fname: Path of the table file.

    Returns:
        dict mapping each word to its value as a float. An empty or
        header-only file yields an empty dict.

    Raises:
        IndexError: If a non-blank data line has fewer than two fields.
        ValueError: If the value field is not a valid float.
    """
    word2score = dict()

    with open(fname, 'r') as f:
        # Discard the header line; the None default keeps an empty file from
        # raising StopIteration.
        next(f, None)
        for line in f:
            p = line.split()
            if not p:
                # Blank line (e.g. a trailing newline at end of file).
                continue
            word2score[p[0]] = float(p[1])
    print("Load tfidf from file %s: %d vocabulary" % (fname, len(word2score)))
    return word2score

# Relative path of the per-word score table (presumably IDF values, going by the file name).
idf_file = "extracted_data/idf.txt"
# Despite the name, idf2w maps word -> float value (that is what load_idf returns).
idf2w = load_idf(idf_file)

Load tfidf from file extracted_data/idf.txt: 408120 vocabulary


In [27]:
import operator

# Order the (word, value) pairs by ascending value, then display the last
# 100 entries, i.e. the pairs with the largest values.
sorted_by_value = sorted(idf2w.items(), key=operator.itemgetter(1))
sorted_by_value[-100:]

[('bankrupts', 14.991641),
 ('oooooooookay', 14.991641),
 ('lalalooove', 14.991641),
 ('dunaway', 14.991641),
 ('aldrichs', 14.991641),
 ('veganites', 14.991641),
 ('softtalkers', 14.991641),
 ('bernhard', 14.991641),
 ('sppose', 14.991641),
 ('kickboxers', 14.991641),
 ('hazier', 14.991641),
 ('firpo', 14.991641),
 ('wadd', 14.991641),
 ('seekong', 14.991641),
 ('neighborhhods', 14.991641),
 ('lathed', 14.991641),
 ('texured', 14.991641),
 ('shadegg', 14.991641),
 ('fradaviolo', 14.991641),
 ('envisoned', 14.991641),
 ('bioswitch', 14.991641),
 ('unflanked', 14.991641),
 ('steamtrays', 14.991641),
 ('subsequentyly', 14.991641),
 ('chelseaphiles', 14.991641),
 ('guanoed', 14.991641),
 ('seftelian', 14.991641),
 ('lemongrasshopper', 14.991641),
 ('pittypats', 14.991641),
 ('multiculturalist', 14.991641),
 ('bajafresh', 14.991641),
 ('plumwine', 14.991641),
 ('shoftshell', 14.991641),
 ('shimmi', 14.991641),
 ('myrself', 14.991641),
 ('bestwingsintheworld', 14.991641),
 ('untll', 14.9916