In [2]:
import numpy as np
import pandas as pd
import scipy
import json
import pickle
from wikipedia2vec import Wikipedia2Vec
from gensim.models import TfidfModel
from gensim.corpora import Dictionary

In [3]:
def flatten(x):
    """Flatten one level of nesting.

    Args:
        x: An iterable of iterables (e.g. a list of lists).

    Returns:
        A new list holding the items of every inner iterable, in order.
    """
    return [i for list_ in x for i in list_]

def get_vectors(words, lang='ja', return_words=False):
    """Look up the embedding of every word and stack the results.

    Each word is lower-cased and passed to ``w2v`` together with ``lang``.

    Args:
        words: Iterable of strings to embed.
        lang: Language code forwarded to ``w2v``.
        return_words: When exactly ``True``, the input ``words`` is returned
            alongside the vectors.

    Returns:
        ``np.array`` built from the per-word vectors, or the tuple
        ``(words, vectors)`` when ``return_words is True``.
    """
    vectors = np.array([w2v(token.lower(), lang=lang) for token in words])

    if return_words is not True:
        return vectors

    return words, vectors
        
def w2v(w, lang, embedding_dim=300):
    """Return the embedding of a single word for the given language.

    Args:
        w: The word to look up.
        lang: ``'ja'`` reads the vector from the global ``ja_w2v`` model;
            ``'en'`` reads it from ``en_w2v`` and passes it through the global
            ``transformer``'s ``predict``.
        embedding_dim: Length of the all-zero fallback vector.

    Returns:
        The word vector (a list for ``'ja'``, the first row of the
        transformer's prediction for ``'en'``). A list of ``embedding_dim``
        zeros is returned when the lookup raises ``KeyError`` (presumably an
        out-of-vocabulary word) or when ``lang`` is not supported, in which
        case a message is printed as well.
    """
    # Guard clause: anything other than the two known languages gets zeros.
    if lang != 'ja' and lang != 'en':
        print("Undefined language.")
        return [0.0] * embedding_dim

    try:
        if lang == 'ja':
            return ja_w2v.get_word_vector(w).tolist()
        return transformer.predict([en_w2v.get_word_vector(w)])[0]
    except KeyError:
        return [0.0] * embedding_dim

In [4]:
# Load the pretrained Wikipedia2Vec models for English and Japanese (the file
# names indicate 300-dimensional vectors from the 2018-04-20 dumps). Paths are
# relative to the notebook's working directory.
en_w2v = Wikipedia2Vec.load("../model/enwiki_20180420_300d.pkl")
ja_w2v = Wikipedia2Vec.load("../model/jawiki_20180420_300d.pkl")

# Load the pickled object whose .predict() w2v() applies to English vectors.
# The file name suggests it maps the English embedding space onto the Japanese
# one -- TODO confirm how it was fitted.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open("../model/wikipedia2vec_en2ja_mapping.pkl", 'rb') as f:
    transformer = pickle.load(f)

In [5]:
# Load data & vectorize.
# NOTE(review): read_pickle unpickles the file -- only load trusted data.
methods_df = pd.read_pickle("../data/manufacturing_words_using_wikidata.pkl")

# Stack the train and validation splits into one frame. pd.concat is used
# because DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
wiki_df = pd.concat([
    pd.read_csv("../data/train_split_words.csv"),
    pd.read_csv("../data/valid_split_words.csv"),
])
# Ids are compared against string keys later on, so normalise them to str.
wiki_df['_id'] = wiki_df['_id'].astype(str)

# Add one array of Japanese word vectors per row. `words` is parsed with eval,
# so each cell presumably holds the repr of a token list -- TODO confirm.
# NOTE(review): eval executes whatever the CSV contains; ast.literal_eval would
# be the safer choice if the cells are plain list literals.
wiki_df = wiki_df.assign(
    wiki_wv=wiki_df.words.apply(lambda x: get_vectors(eval(x), lang='ja'))
).reset_index(drop=True)

In [6]:
# Calculate TF-IDF weights over the documents in methods_df.
# One document per `_id`: summing the `words` column concatenates the rows'
# values (assumes every cell holds a list of tokens -- TODO confirm).
docs = methods_df.groupby('_id').apply(lambda x: x.words.sum())
# Build the token <-> integer id dictionary, then encode each document as a
# bag of words. `corpus` keeps the order of `docs`, i.e. the order in which
# groupby('_id') yields its groups.
dct = Dictionary(docs)
corpus = [dct.doc2bow(article) for article in docs]
model = TfidfModel(corpus)
print("Documents:", len(corpus))

Documents: 1560


In [13]:
# Build a long-format table with one row per (document, token) pair and its
# TF-IDF weight.
#
# The rows are collected in a plain list and turned into a DataFrame once:
#   * growing a frame with DataFrame.append inside a loop is quadratic, and the
#     method no longer exists in pandas >= 2.0;
#   * packing tokens and weights into a single np.array would coerce the
#     weights to strings, which makes any later sort on `TFIDF` lexicographic;
#   * documents that produce no weighted tokens are simply skipped instead of
#     failing on a 2-D slice of an empty array.
tfidf_records = []
for i, (_id, _rows) in enumerate(methods_df.groupby('_id')):
    # corpus[i] is the bag of words of the i-th group: `corpus` was built by
    # iterating the same groupby('_id') result.
    for token_id, tfidf in model[corpus[i]]:
        tfidf_records.append({'_id': _id, 'token': dct[token_id], 'TFIDF': float(tfidf)})

# Explicit columns keep the schema stable even when no record was produced.
TFIDF_df = pd.DataFrame(tfidf_records, columns=['_id', 'token', 'TFIDF'])

In [None]:
# Keep the five highest-TF-IDF tokens of every document.
# Sorting once and taking GroupBy.head replaces groupby().apply(): apply's
# implicit inclusion of the grouping column in the applied frame is deprecated
# (pandas 2.2), and losing `_id` would break the later groupby('_id').
# The result is ordered by `_id`, then by descending TF-IDF, as before.
TFIDF_top5_df = (
    TFIDF_df
    .sort_values(['_id', 'TFIDF'], ascending=[True, False])
    .groupby('_id', sort=False)
    .head(5)
    .reset_index(drop=True)
)

In [19]:
# Map every document id (as a string) to the stacked English-side vectors of
# its top TF-IDF tokens.
method_word_vec = {
    str(doc_id): get_vectors(top_rows.token.tolist(), lang='en')
    for doc_id, top_rows in TFIDF_top5_df.groupby('_id')
}

In [44]:
def most_similarly(doc, kb, tfidf=None):
    """Score every sentence of a document against the knowledge-base vectors.

    Args:
        doc: Iterable of sentences, each one a collection of word vectors.
        kb: Knowledge-base vectors handed to ``_most_similarly`` unchanged.
        tfidf: Forwarded to ``_most_similarly`` unchanged.

    Returns:
        A list with one ``_most_similarly`` result per sentence, in order.
    """
    per_sentence = []
    for sentence_vectors in doc:
        per_sentence.append(_most_similarly(sentence_vectors, kb, tfidf))
    return per_sentence

def _most_similarly(s, kb, tfidf=None):
    mat_sim = 1.0 - scipy.spatial.distance.cdist(s, kb, 'cosine')
    mat_sim[np.isnan(mat_sim)] = 0.0
    most_sim = np.min(mat_sim, axis=1)
    
    return most_sim

def visualizer(title, doc, labels, mat_sim, fp, threshold=0.5):
    """Write one article as HTML with words shaded by similarity.

    Every sentence becomes one line. Each word is wrapped in a ``<span>`` whose
    red background opacity equals the word's similarity; similarities below
    ``threshold`` are rendered fully transparent. Sentences with a truthy label
    are bold, the others are gray.

    Args:
        title: Heading written in an ``<h2>`` element.
        doc: Iterable of sentences, each an iterable of words.
        labels: One truthy/falsy flag per sentence.
        mat_sim: One sequence of similarities per sentence, aligned with the
            words of that sentence. It is not modified.
        fp: Writable text file object.
        threshold: Similarities strictly below this value are shown as 0.0.

    Returns:
        None. The markup is written to ``fp``.
    """
    import html

    bold_tpl = "<b><span style=\"background-color: rgba(255,0,0,{alpha})\">{w}</span></b> "
    gray_tpl = "<font color=gray><span style=\"background-color: rgba(255,0,0,{alpha})\">{w}</span></font> "

    # Escape text taken from the data so that '<', '>' or '&' inside a title or
    # a word cannot break the generated markup.
    fp.write("<h2>{title}</h2><br>\n".format(title=html.escape(str(title), quote=False)))
    for s, label, a_sim in zip(doc, labels, mat_sim):
        # Work on a copy: thresholding in place would silently alter the
        # caller's similarity arrays.
        shown = np.array(a_sim)
        shown[shown < threshold] = 0.0
        template = bold_tpl if label else gray_tpl
        for w, alpha in zip(s, shown):
            fp.write(template.format(alpha=alpha, w=html.escape(str(w), quote=False)))
        fp.write('<br>\n')
    fp.write('<br>\n')

In [46]:
# Render one highlighted HTML section per wiki article.
# The context manager closes the file even when an article fails part-way,
# which the previous open()/close() pair did not guarantee.
with open("../dump/visualize_using_TFIDF.html", 'w', encoding='utf-8') as fp:
    for _id, group in wiki_df.groupby('_id'):
        # Articles without TF-IDF keywords are compared against a single
        # all-zero vector instead.
        method_wvs = method_word_vec.get(_id)
        if method_wvs is None:
            method_wvs = np.zeros((1, 300))

        wiki_wvs = group.wiki_wv.values
        # NOTE(review): eval executes whatever the CSV contains;
        # ast.literal_eval would be safer if the cells are plain list literals.
        raw_doc = group.words.apply(lambda x: eval(x)).values
        title = group.title.values[0]
        labels = group.label.values
        mat_most_sim = most_similarly(wiki_wvs, method_wvs)
        visualizer(title, raw_doc, labels, mat_most_sim, fp)