In [2]:
import os
import numpy as np
import pandas as pd
import sys
import heapq

In [3]:
def ugly_normalize(vecs):
    """Scale every row of ``vecs`` to unit Euclidean length.

    Rows whose norm is exactly zero are divided by 1 instead, so they are
    returned unchanged rather than turning into NaN/inf.

    Args:
        vecs: numpy array with one vector per row (the sum of squares is
            taken along ``axis=1``).

    Returns:
        An array of the same shape containing the row-normalised vectors.
        The input array is not modified.
    """
    row_norms = np.sqrt(np.sum(vecs * vecs, axis=1))
    # Guard against division by zero for all-zero rows.
    zero_rows = row_norms == 0
    row_norms[zero_rows] = 1
    # Transpose so the per-row norms broadcast along the last axis, then undo it.
    scaled_t = vecs.T / row_norms
    return scaled_t.T

class Embeddings:
    """Embedding matrix plus vocabulary with dot-product nearest-neighbour lookups.

    Attributes:
        _vecs: array loaded from ``vecsfile``; when ``normalize`` is true the
            rows are passed through ``ugly_normalize`` (unit-length rows), so
            dot products behave like cosine similarities.
        _vocab: whitespace-separated tokens read from ``vocabfile``; token
            ``i`` labels row ``i`` of ``_vecs``.
        _w2v: token -> row index (for a repeated token the last index wins).
        _df_shoptag: tab-separated shop metadata joined onto the results of
            ``most_similar`` via its ``shopid`` column.
    """

    # Default location of the tab-separated shop metadata table.
    DEFAULT_SHOPTAG_FILE = './data/shopdata_geohash.csv'

    def __init__(self, vecsfile, vocabfile=None, normalize=True, shoptag_file=None):
        """Load vectors, vocabulary and shop metadata from disk.

        Args:
            vecsfile: path to a ``.npy`` file readable by ``np.load``.
            vocabfile: path to the vocabulary file; defaults to ``vecsfile``
                with its ``.npy`` extension replaced by ``.vocab``.
            normalize: when true, rows are normalised with ``ugly_normalize``.
            shoptag_file: path to the tab-separated shop metadata; defaults
                to ``DEFAULT_SHOPTAG_FILE``.
        """
        if vocabfile is None:
            vocabfile = self._default_vocabfile(vecsfile)
        if shoptag_file is None:
            shoptag_file = self.DEFAULT_SHOPTAG_FILE
        self._vecs = np.load(vecsfile)
        # Context manager so the vocabulary file handle is closed promptly.
        with open(vocabfile) as vocab_handle:
            self._vocab = vocab_handle.read().split()
        if normalize:
            self._vecs = ugly_normalize(self._vecs)
        self._w2v = {w: i for i, w in enumerate(self._vocab)}
        self._df_shoptag = pd.read_csv(shoptag_file, sep='\t')

    @staticmethod
    def _default_vocabfile(vecsfile):
        """Derive the vocabulary path that sits next to ``vecsfile``."""
        root, ext = os.path.splitext(vecsfile)
        if ext == ".npy":
            # Swap only the extension; a blanket str.replace would also
            # rewrite "npy" occurring in directory names.
            return root + ".vocab"
        # Legacy behaviour for paths that do not end in ".npy".
        return vecsfile.replace("npy", "vocab")

    @classmethod
    def load(cls, vecsfile, vocabfile=None, normalize=True, shoptag_file=None):
        """Alternate constructor; builds ``cls`` so subclasses get their own type."""
        return cls(vecsfile, vocabfile, normalize, shoptag_file)

    def word2vec(self, w):
        """Return the vector stored for token ``w``.

        Raises:
            KeyError: if ``w`` is not in the vocabulary.
        """
        return self._vecs[self._w2v[w]]

    def similar_to_vec(self, v, N=10):
        """Return the ``N`` rows with the largest dot product against ``v``.

        Returns:
            A list of ``(similarity, token, vector)`` tuples, best first.
            Ties on similarity are broken by token (larger token first).
        """
        sims = self._vecs.dot(v)
        # Rank on (similarity, token) only: letting tuple comparison reach the
        # numpy row raises "truth value of an array is ambiguous" when both
        # similarity and token are equal.
        return heapq.nlargest(
            N,
            zip(sims, self._vocab, self._vecs),
            key=lambda entry: (entry[0], entry[1]),
        )

    def most_similar(self, word, N=10):
        """Return the ``N`` tokens most similar to ``word`` with shop metadata.

        The result normally includes ``word`` itself, since its own row is
        part of the ranking.

        Returns:
            A DataFrame with columns ``sims`` and ``shopid`` (the token
            converted with ``int``), left-joined with the shop metadata on
            ``shopid``; shops without metadata keep NaN in the joined columns.

        Raises:
            ValueError: if ``word`` is not in the vocabulary, or if a ranked
                token cannot be converted to ``int``.
        """
        w = self._vocab.index(word)
        sims = self._vecs.dot(self._vecs[w])
        top = heapq.nlargest(N, zip(sims, self._vocab))
        df_sims = pd.DataFrame(
            [[sim, int(token)] for sim, token in top],
            columns=["sims", "shopid"],
        )
        return pd.merge(df_sims, self._df_shoptag, on=["shopid"], how='left')

In [4]:
# Embedding files to compare: the geohash_0 directory plus geohash_5 .. geohash_9.
vecs_path = ['./data/geohash_0//vecs.npy'] + [
    './data/geohash_' + str(level) + '/vecs.npy' for level in range(5, 10)
]
# One Embeddings instance per path, in the same order as vecs_path.
eb_list = list(map(Embeddings.load, vecs_path))

In [5]:
def evaluate(e, code, N=10):
    """Fetch the neighbours of ``code`` and keep only the reporting columns.

    Args:
        e: object exposing ``most_similar(word, N)`` that returns a DataFrame.
        code: token to look up.
        N: number of neighbours wanted; ``N + 1`` rows are requested
            (presumably because the query itself is among the results).

    Returns:
        The neighbour DataFrame restricted to the similarity, shop id,
        product tags, label and geohash_5..geohash_9 columns.
    """
    report_columns = [
        'sims', 'shopid', 'product_tags', 'label',
        'geohash_5', 'geohash_6', 'geohash_7', 'geohash_8', 'geohash_9',
    ]
    neighbours = e.most_similar(code, N + 1)
    return neighbours[report_columns]

def precision(e, df, K = 10):
    """Print and return label precision over the neighbours of every shop in ``df``.

    For each shop id, ``evaluate`` is asked for its ``K`` neighbours (the
    result is expected to contain the shop itself). Every row sharing the
    shop's label counts as a hit, minus one for the shop's own row.

    Args:
        e: embeddings object forwarded to ``evaluate``.
        df: DataFrame with a ``shopid`` column listing the shops to score.
        K: number of neighbours scored per shop.

    Returns:
        ``hit / (len(df) * K) * 100`` as a percentage; ``0.0`` when ``df`` is
        empty. The same figures are printed.

    Notes:
        A shop that is absent from its own neighbour table, or whose label is
        missing, contributes no hits but still counts towards the total.
    """
    total = len(df) * K
    hit = 0
    for n, shop_id in enumerate(df['shopid'], start=1):
        df_eva = evaluate(e, str(shop_id), K)[['shopid', 'label']]
        own_labels = df_eva.loc[df_eva['shopid'] == shop_id, 'label']
        # Positional access: the query shop is not guaranteed to sit in row 0
        # (e.g. tied similarities), so a label-based [0] lookup can KeyError.
        if not own_labels.empty and pd.notna(own_labels.iloc[0]):
            same_label = int((df_eva['label'] == own_labels.iloc[0]).sum())
            hit += same_label - 1  # exclude the query shop's own row
        if n % 100 == 0:
            print("\r calculated: %d " % n, end="")
    # Avoid ZeroDivisionError when there is nothing to score.
    score = hit / total * 100 if total else 0.0
    print('\n hit = %d' % hit)
    print(' total = %d' % total)
    print(' precision = %.2f' % score)
    return score

In [6]:
# Shops to score, read from a space-separated file. header=0 combined with
# names= discards the file's first line and labels the columns explicitly.
df_wv = pd.read_csv(
    './data/geohash_0/wv',
    sep=' ',
    header=0,
    names=['shopid', 'count'],
)

In [None]:
# Precision with K=10 for every embedding set. Paths and loaded embeddings are
# paired directly, so the loop follows vecs_path instead of a hard-coded count.
for path, emb in zip(vecs_path, eb_list):
    print(path, 10)
    precision(emb, df_wv, 10)
    print('------------------------')

./data/geohash_0//vecs.npy 10
 calculated: 400 

In [None]:
# Precision with K=20 for every embedding set. Paths and loaded embeddings are
# paired directly, so the loop follows vecs_path instead of a hard-coded count.
for path, emb in zip(vecs_path, eb_list):
    print(path, 20)
    precision(emb, df_wv, 20)
    print('------------------------')

In [None]:
# Precision with K=50 for every embedding set. Paths and loaded embeddings are
# paired directly, so the loop follows vecs_path instead of a hard-coded count.
for path, emb in zip(vecs_path, eb_list):
    # Print K next to the path, matching the K=10 and K=20 runs.
    print(path, 50)
    precision(emb, df_wv, 50)
    print('------------------------')