In [70]:
import numpy as np
import pandas as pd
import scipy
import json
import pickle
from wikipedia2vec import Wikipedia2Vec

In [3]:
# Load the two pretrained Wikipedia2Vec models (English and Japanese) and the
# pickled object whose .predict() is applied to English vectors in w2v().
# NOTE(review): the "300d" in the file names presumably matches the
# embedding_dim=300 default used by w2v() — confirm if the models change.

en_w2v = Wikipedia2Vec.load("../model/enwiki_20180420_300d.pkl")
ja_w2v = Wikipedia2Vec.load("../model/jawiki_20180420_300d.pkl")

# NOTE(review): pickle.load can execute arbitrary code stored in the file;
# only load this mapping from a trusted location.
# Presumably an English -> Japanese vector-space mapping (per the file name) — TODO confirm.
with open("../model/wikipedia2vec_en2ja_mapping.pkl", 'rb') as f:
    transformer = pickle.load(f)

In [11]:
def flatten(x):
    """Flatten one level of nesting.

    Args:
        x: Iterable whose elements are themselves iterable.

    Returns:
        list: The items of every inner iterable, in encounter order.
    """
    # A real `def` (instead of a lambda bound to a name) gives the function a
    # proper __name__ in tracebacks and a place for this docstring.
    return [item for inner in x for item in inner]

def get_vectors(words, lang='ja', embedding_dim=300):
    """Embed a sequence of words as a 2-D array, one row per word.

    Each word is lower-cased and looked up through ``w2v``.

    Args:
        words: Iterable of strings.
        lang: Language code forwarded to ``w2v`` ('ja' or 'en').
        embedding_dim: Width of each vector; forwarded to ``w2v`` and used
            for the shape of the empty result.

    Returns:
        numpy.ndarray: Shape ``(len(words), embedding_dim)``. For empty input
        the shape is ``(0, embedding_dim)``.
    """
    vectors = [w2v(w.lower(), lang=lang, embedding_dim=embedding_dim) for w in words]
    if not vectors:
        # np.array([]) would be 1-D with shape (0,); consumers that require a
        # 2-D matrix (e.g. scipy's cdist) reject that, so keep the column axis.
        return np.empty((0, embedding_dim))
    return np.array(vectors)
        
def w2v(w, lang, embedding_dim=300):
    """Return the embedding of a single word as a plain list of floats.

    Japanese words are read directly from ``ja_w2v``. English words are read
    from ``en_w2v`` and then passed through ``transformer.predict``.

    Args:
        w: The word to look up.
        lang: 'ja' or 'en'. Any other value prints a message and yields zeros.
        embedding_dim: Length of the all-zero fallback vector.

    Returns:
        list: The word vector, or ``[0.0] * embedding_dim`` when the language
        is unknown or the lookup raises ``KeyError``.
    """
    zeros = [0.0] * embedding_dim

    if lang == 'ja':
        model = ja_w2v
    elif lang == 'en':
        model = en_w2v
    else:
        print("Undefined language.")
        return zeros

    # Only the vocabulary lookup is guarded, so a KeyError raised elsewhere
    # (e.g. inside the transformer) is not silently turned into a zero vector.
    try:
        vector = model.get_word_vector(w)
    except KeyError:
        return zeros

    if lang == 'en':
        vector = transformer.predict([vector])[0]

    # Always hand back a list so every branch has the same return type
    # (the fallback and the 'ja' branch already returned lists).
    return np.asarray(vector).tolist()

In [5]:
# Load the manufacturing-method table. Later cells group it by `_id` and
# eval() the string stored in its `manufacturing_words` column.
methods_df = pd.read_csv("../data/manufacturing_words.csv")

In [16]:
# Build {str(_id): matrix of English word vectors} with one entry per `_id` group.
# NOTE(review): eval() executes whatever text is stored in the CSV column;
# only run this on trusted data.
method_word_vec = {}
for _id, group in methods_df.groupby('_id'):
    parsed_lists = group.manufacturing_words.apply(lambda x: eval(x)).tolist()
    # De-duplicate across all rows of the group before embedding.
    unique_words = set(flatten(parsed_lists))
    words = list(unique_words)
    method_word_vec[str(_id)] = get_vectors(words, lang='en')

In [128]:
# Build {str(_id): de-duplicated list of manufacturing words} per `_id` group.
# NOTE(review): eval() executes whatever text is stored in the CSV column;
# only run this on trusted data.
method_words = {}
for _id, group in methods_df.groupby('_id'):
    parsed_lists = group.manufacturing_words.apply(lambda x: eval(x)).tolist()
    unique_words = set(flatten(parsed_lists))
    method_words[str(_id)] = list(unique_words)

In [32]:
# Stack the train and validation splits into one frame.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0; with default arguments it keeps each frame's original
# index, exactly as append did.
wiki_df = pd.concat(
    [
        pd.read_csv("../data/train_split_words.csv"),
        pd.read_csv("../data/valid_split_words.csv"),
    ]
).assign(
    # Ids are compared against the str(_id) keys of method_word_vec later on.
    _id=lambda df: df['_id'].astype(str)
)

In [33]:
# Attach a `wiki_wv` column holding, for each row, the Japanese word-vector
# matrix of its `words` entry, then renumber the rows from 0.
# NOTE(review): eval() executes whatever text is stored in the CSV column;
# only run this on trusted data.
wiki_df = (
    wiki_df
    .assign(wiki_wv=lambda df: df.words.apply(lambda x: get_vectors(eval(x), lang='ja')))
    .reset_index(drop=True)
)

In [248]:
def most_similarly(doc, kb):
    """Apply ``_most_similarly`` to every sentence of a document.

    Args:
        doc: Iterable of per-sentence inputs, each passed on unchanged.
        kb: Reference vectors, passed on unchanged for every sentence.

    Returns:
        list: One ``_most_similarly`` result per sentence, in order.
    """
    per_sentence = []
    for sentence in doc:
        per_sentence.append(_most_similarly(sentence, kb))
    return per_sentence

def _most_similarly(s, kb):
    mat_sim = scipy.spatial.distance.cdist(s, kb, 'cosine')
    mat_sim[np.isnan(mat_sim)] = 1.0
    most_sim = np.min(mat_sim, axis=1)
    
    return most_sim

def visualizer(title, doc, labels, mat_sim, fp):
    """Write one document as HTML, shading each word by its score.

    Each sentence becomes one line of ``<span>`` elements with a red
    background whose alpha is ``1.0 - sim - 0.5``. Sentences with a truthy
    label are bold; the others are grey.

    Args:
        title: Heading text for the document.
        doc: Iterable of sentences, each an iterable of words.
        labels: One truthy/falsy flag per sentence.
        mat_sim: One iterable of per-word scores per sentence, aligned with
            ``doc``.
        fp: Writable text stream; only ``fp.write`` is used.
    """
    from html import escape

    bold_tpl = "<b><span style=\"background-color: rgba(255,0,0,{alpha})\">{w}</span></b> "
    gray_tpl = "<font color=gray><span style=\"background-color: rgba(255,0,0,{alpha})\">{w}</span></font> "

    # Title and words are arbitrary text; escape them so characters such as
    # '<' or '&' cannot break the generated markup.
    fp.write("<h2>{title}</h2><br>\n".format(title=escape(str(title))))
    for s, label, a_sim in zip(doc, labels, mat_sim):
        template = bold_tpl if label else gray_tpl
        for w, sim in zip(s, a_sim):
            # Goes negative once sim exceeds 0.5.
            alpha = 1.0 - sim - 0.5
            fp.write(template.format(alpha=alpha, w=escape(str(w))))
        fp.write('<br>\n')
    fp.write('<br>\n')

In [249]:
# Render every document (one per `_id` group of wiki_df) into a single HTML file.
# The `with` block guarantees the file is closed even if a group raises midway.
with open("../dump/visualize.html", 'w', encoding='utf-8') as fp:
    for _id, group in wiki_df.groupby('_id'):
        p_wv = method_word_vec.get(_id)
        if p_wv is None or len(p_wv) == 0:
            # No method vectors for this id: compare against a single
            # all-zero 300-d vector instead of failing.
            p_wv = [[0.0] * 300]

        doc = group.wiki_wv.values
        # NOTE(review): eval() executes whatever text is stored in the CSV
        # column; only run this on trusted data.
        raw_doc = group.words.apply(lambda x: eval(x)).values
        title = group.title.values[0]
        labels = group.label.values
        mat_most_sim = most_similarly(doc, p_wv)
        visualizer(title, raw_doc, labels, mat_most_sim, fp)