In [1]:
import json
import numpy as np
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import pickle
from bs4 import BeautifulSoup
import re

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
import torch
import torch.nn as nn
from torch.utils import data
from torch import optim
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import  pack_padded_sequence, pad_packed_sequence

from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator
import random

In [3]:
import gensim
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
from rouge import Rouge 
# Module-level ROUGE scorer; GraphLoader.compute_rouge calls rouge.get_scores on it.
rouge = Rouge()

In [4]:
# with open("product_desc_dict.pickle", 'rb') as f:
#     prod_desc = pickle.load(f)
# with open("product_intro_dict.pickle", 'rb') as f:
#     prod_intro = pickle.load(f)
# with open("product_name_dict.pickle", 'rb') as f:
#     prod_name = pickle.load(f)

In [5]:
# with open('com_product_intro_word_articles_train.json', 'r') as f:
#     intro = json.load(f)
# with open('com_product_name_word_flatten_intro_train.json', 'r') as f:
#     name = json.load(f)

In [6]:
# def filter_tags(htmlstr):
#     re_cdata=re.compile('//<!\[CDATA\[[^>]*//\]\]>',re.I)
#     re_script=re.compile('<\s*script[^>]*>[^<]*<\s*/\s*script\s*>',re.I)
#     re_style=re.compile('<\s*style[^>]*>[^<]*<\s*/\s*style\s*>',re.I)
#     re_br=re.compile('<br\s*?/?>')
#     re_h=re.compile('</?\w+[^>]*>')
#     re_comment=re.compile('<!--[^>]*-->')
#     s=re_cdata.sub('',htmlstr)
#     s=re_script.sub('',s)
#     s=re_style.sub('',s)
#     s=re_br.sub('',s)
#     s=re_h.sub('',s) 
#     s=re_comment.sub('',s)
#     blank_line=re.compile('\n+')
#     s=blank_line.sub('\n',s)
#     s=replaceCharEntity(s)
#     return s

# def replaceCharEntity(htmlstr):
#     CHAR_ENTITIES={'nbsp':' ','160':' ',
#                 'lt':'<','60':'<',
#                 'gt':'>','62':'>',
#                 'amp':'&','38':'&',
#                 'quot':'"','34':'"',}
#     re_charEntity=re.compile(r'&#?(?P<name>\w+);')
#     sz=re_charEntity.search(htmlstr)
#     while sz:
#         entity=sz.group()
#         key=sz.group('name')
#         try:
#             htmlstr=re_charEntity.sub(CHAR_ENTITIES[key],htmlstr,1)
#             sz=re_charEntity.search(htmlstr)
#         except KeyError:
#             htmlstr=re_charEntity.sub('',htmlstr,1)
#             sz=re_charEntity.search(htmlstr)
#     return htmlstr

In [7]:
# def data_cleaner(data) -> dict:
#     new_data = {}
#     for key in data:
#         new_value = filter_tags(data[key])
#         new_data[key] = new_value
#     return new_data

In [8]:
# prod_desc = data_cleaner(prod_desc)
# prod_intro = data_cleaner(prod_intro)
# prod_name = data_cleaner(prod_name)

In [9]:
# with open("clear_product_desc_dict.pickle", 'wb') as f:
#     pickle.dump(prod_desc, f)
# with open("clear_product_intro_dict.pickle", 'wb') as f:
#     pickle.dump(prod_intro, f)
# with open("clear_product_name_dict.pickle", 'wb') as f:
#     pickle.dump(prod_name, f)

In [10]:
# with open("seg_desc.pickle", 'rb') as f:
#     seg_desc = pickle.load(f)
# with open("seg_intro.pickle", 'rb') as f:
#     seg_intro = pickle.load(f)
# with open("seg_name.pickle", 'rb') as f:
#     seg_name = pickle.load(f)

In [11]:
# seg_desc

In [12]:
# Doc2Vec model loaded from disk; GraphLoader.create_graph uses its infer_vector
# to turn each (whitespace-tokenised) sentence into a vector.
doc2Vec_model = gensim.models.Doc2Vec.load("./doc2vec_model/doc2vec.model")

In [13]:
def load_json(file):
    """Read a JSON document from disk and return the decoded Python object.

    Parameters
    ----------
    file : str or path-like
        Path of the JSON file to read.

    Returns
    -------
    object
        Whatever ``json.load`` decodes from the file (dict, list, ...).
    """
    # JSON text is UTF-8 by specification; stating it explicitly keeps the
    # result independent of the platform's default locale encoding.
    with open(file, encoding="utf-8") as jsonfile:
        return json.load(jsonfile)

In [14]:
class Embeddings(torch.nn.Module):
    """Frozen word2vec lookup table plus one trainable out-of-vocabulary vector.

    Parameters
    ----------
    w2vmodel
        Object exposing ``w2vmodel.wv.vectors`` as a 2-D array of shape
        ``(vocab_size, dim)``.

    Attributes
    ----------
    weights : torch.FloatTensor
        Copy of the pretrained vectors.
    embedding : nn.Embedding
        Lookup table built from ``weights``; its weight is not trainable.
    oov : nn.Parameter
        Trainable ``(1, dim)`` vector substituted for every id equal to
        ``oov_index``.
    oov_index : int
        Sentinel id (-1) that marks an out-of-vocabulary token.
    dim : int
        Width of the embedding vectors, taken from ``weights``.
    """

    def __init__(self, w2vmodel):
        super().__init__()
        self.weights = torch.FloatTensor(w2vmodel.wv.vectors)
        # Take the width from the loaded vectors rather than assuming 250, so
        # the module works with a word2vec model of any size.
        self.dim = int(self.weights.shape[1])
        self.oov_index = -1

        # freeze=True is what actually keeps the table fixed (setting
        # `requires_grad` on the Module object itself has no effect).
        # NOTE(review): padding_idx=-1 is resolved by PyTorch to the last row;
        # with a frozen, pretrained table it should not alter lookups - confirm
        # whether it is needed at all.
        self.embedding = nn.Embedding.from_pretrained(
            self.weights, freeze=True, padding_idx=-1
        )

        # vector for oov
        self.oov = torch.nn.Parameter(data=torch.rand(1, self.dim))

    def create_embedding(self, arr):
        """Embed a 1-D tensor of word ids, using ``self.oov`` for OOV ids.

        Parameters
        ----------
        arr : torch.Tensor
            1-D tensor of word ids; entries equal to ``self.oov_index`` are
            treated as out-of-vocabulary. May be empty.

        Returns
        -------
        torch.Tensor
            Tensor of shape ``(len(arr), self.dim)``.
        """
        # An empty python list becomes a float tensor; nn.Embedding needs long.
        ids = arr.long()
        mask = (ids == self.oov_index).long()
        mask_ = mask.unsqueeze(dim=1).float()
        # OOV positions are redirected to row 0 for the lookup (any valid row
        # would do) and then zeroed out by (1 - mask_) before the OOV vector,
        # broadcast over the rows, is added in their place.
        known = (1 - mask_) * self.embedding((1 - mask) * ids)
        return known + mask_ * self.oov

In [15]:
# Word2Vec model loaded from disk. LoadDocSentEmb uses its vocabulary to map
# words to ids, and Embeddings copies its vectors into the lookup table.
w2vmodel = gensim.models.Word2Vec.load('./word2vec/word2vec.model')
embedding_model = Embeddings(w2vmodel)

In [16]:
def LoadDocSentEmb(doc_list, doc_idx, sent_num):
    """Return per-sentence word embeddings for one document, padded to ``sent_num``.

    Each sentence is split on whitespace, every word is mapped to its id in
    the module-level ``w2vmodel`` vocabulary (unknown words get the OOV id of
    ``embedding_model``) and the ids are embedded with
    ``embedding_model.create_embedding``.

    Parameters
    ----------
    doc_list : sequence
        Every product intro; each entry is a sequence of sentence strings.
    doc_idx : int
        Which product intro to load.
    sent_num : int
        Number of entries to return. Sentences beyond ``sent_num`` are
        dropped; missing sentences are filled with a single all-zero vector.

    Returns
    -------
    list of torch.Tensor
        ``sent_num`` tensors. A real sentence has shape
        ``(n_words, 1, dim)``; a padding entry has shape ``(1, 1, dim)``.
    """
    doc_sent = doc_list[doc_idx]
    dim = embedding_model.dim
    oov_id = embedding_model.oov_index
    vocab = w2vmodel.wv.vocab
    sent_emb_array = []

    for i in range(sent_num):
        if i < len(doc_sent):
            id_seq = [
                vocab[word].index if word in vocab else oov_id  # deal with OOV
                for word in doc_sent[i].split()
            ]
            # dtype is forced to long: an empty sentence would otherwise give a
            # float tensor, which the embedding lookup rejects.
            sent_emb = embedding_model.create_embedding(
                torch.tensor(id_seq, dtype=torch.long)
            )
        else:
            sent_emb = torch.zeros(1, dim)
        sent_emb_array.append(sent_emb.view(len(sent_emb), 1, dim))

    return sent_emb_array

In [17]:
class GraphLoader(data.Dataset):
    """Dataset yielding ``(graph, embedding, gold)`` for one document per index.

    * ``graph``: ``(sent_num, sent_num)`` numpy array of doc2vec cosine
      similarities between sentences, with ones on the diagonal.
    * ``embedding``: list of ``sent_num`` LSTM hidden states, one per sentence.
    * ``gold``: ``(sent_num, 1)`` FloatTensor of ROUGE-1 recall scores.

    Documents shorter than ``sent_num`` sentences are padded, longer ones are
    truncated.

    Parameters
    ----------
    mode : str
        Split name inserted into the two JSON file names (e.g. ``'train'``).
    root : str
        Prefix prepended verbatim to the file names (so it needs its own
        trailing separator, e.g. ``'./'``).
    sent_num : int
        Number of sentences kept per document.
    sent_len : int
        Stored on the instance; not used by any method in this class.
    hidden_size : int
        Input and hidden width of the GRU/LSTM encoders. It has to match the
        width of the vectors returned by ``LoadDocSentEmb``.
    """

    def __init__(self, mode, root, sent_num=20, sent_len=30, hidden_size=250):
        self.root = root
        self.inputdata = load_json(root+'com_product_intro_word_articles_'+mode+'.json')
        self.golddata = load_json(root+'com_product_name_word_flatten_intro_'+mode+'.json')
        self.golddata = self.trans_data(self.golddata)
        self.sent_len = sent_len
        self.sent_num = sent_num
        self.hidden_size = hidden_size
        self.gru = nn.GRU(hidden_size, hidden_size)
        # Sized from hidden_size (as the GRU is) instead of a literal 250, so
        # the constructor argument is actually honoured.
        self.lstm = nn.LSTM(input_size=hidden_size, hidden_size=hidden_size, batch_first=True)

    def __len__(self):
        """return the size of dataset"""
        return len(self.inputdata)

    def __getitem__(self, index):
        """Build the ``(graph, embedding, gold)`` triple for document ``index``."""
        # training
        graph = self.create_graph(index)
        embedding = self.create_lstm_embedding(self.inputdata, index)

        # ground truth
        gold = self.compute_rouge(self.inputdata, self.golddata, index)
        gold = torch.FloatTensor(gold)

        return graph, embedding, gold

    def create_graph(self, idx):
        """Sentence-similarity matrix of document ``idx`` using doc2vec cosine similarity.

        Returns a float64 ``(sent_num, sent_num)`` array. Padding rows are
        zero vectors, so their similarity to every other sentence is 0; the
        diagonal is always 1.
        """
        if self.sent_num == 0:
            return np.eye(0)

        sentences_list = self.inputdata[idx]
        inferred = [
            doc2Vec_model.infer_vector(sentence.split())
            for sentence in sentences_list[:self.sent_num]
        ]
        # Width comes from the inferred vectors; hidden_size is only the
        # fallback for a document without any sentence.
        width = len(inferred[0]) if inferred else self.hidden_size
        sentence_vectors = np.zeros((self.sent_num, width))
        if inferred:
            sentence_vectors[:len(inferred)] = np.asarray(inferred)

        # One pairwise call replaces sent_num**2 single-pair calls.
        sentence_similarity_matrix = cosine_similarity(sentence_vectors)
        np.fill_diagonal(sentence_similarity_matrix, 1.0)
        return sentence_similarity_matrix

    def create_lstm_embedding(self, data, idx):
        """Encode every sentence of document ``idx`` with the LSTM.

        Tokens are fed one at a time. The hidden state starts from random
        noise and is carried over from one sentence to the next; after each
        sentence the current ``h`` (shape ``(1, 1, hidden_size)``) is recorded.

        NOTE(review): the returned tensors are produced with autograd enabled
        and from a random initial state, so repeated calls differ - confirm
        this is intended.
        """
        doc_emb_array = LoadDocSentEmb(data, idx, self.sent_num)
        hidden = (torch.randn(1, 1, self.hidden_size), torch.randn(1, 1, self.hidden_size))
        doc_hidden = []
        for sent_emb in doc_emb_array:
            for token_emb in sent_emb:
                _, hidden = self.lstm(token_emb.view(1, 1, -1), hidden)
            doc_hidden.append(hidden[0])
        return doc_hidden

    def create_gru_embedding(self, data, idx):
        """GRU counterpart of ``create_lstm_embedding``.

        Here ``hidden`` is a single tensor, so ``hidden[0]`` drops its leading
        axis: each entry has shape ``(1, hidden_size)``, unlike the
        ``(1, 1, hidden_size)`` entries of the LSTM variant. The shape is kept
        as-is for backward compatibility.
        """
        doc_emb_array = LoadDocSentEmb(data, idx, self.sent_num)
        hidden = torch.randn(1, 1, self.hidden_size)
        doc_hidden = []
        for sent_emb in doc_emb_array:
            for token_emb in sent_emb:
                _, hidden = self.gru(token_emb.view(1, 1, -1), hidden)
            doc_hidden.append(hidden[0])
        return doc_hidden

    def compute_rouge(self, input, gold, idx):
        """ROUGE-1 recall of each of the first ``sent_num`` sentences of document ``idx``.

        Returns a list of exactly ``sent_num`` one-element lists; positions
        without a sentence, and blank sentences, score ``0.0``.

        NOTE(review): ``gold[idx]`` is passed as the first argument of
        ``get_scores`` and the sentence as the second; if the scorer's
        signature is ``(hyps, refs)`` the recall is normalised by the sentence
        rather than by the gold text - confirm this is intended.
        """
        score_list = []
        for sentence in input[idx][:self.sent_num]:
            if not sentence.strip():
                # The scorer rejects empty text; a blank sentence overlaps
                # with nothing, so it gets the same score as padding.
                score_list.append([0.0])
                continue
            score = rouge.get_scores(gold[idx], [sentence])
            score_list.append([score[0]['rouge-1']['r']])
        score_list.extend([0.0] for _ in range(self.sent_num - len(score_list)))
        return score_list

    def trans_data(self, data):
        """Join each word list into a one-element list holding a single string.

        Every word is followed by one space, so a non-empty result ends with
        a trailing space and an empty word list yields ``['']``.
        """
        return [["".join(w + ' ' for w in sentences)] for sentences in data]

In [18]:
# First training sample as a (graph, embedding, gold) tuple.
train_set = GraphLoader('train', './')
A = train_set[0]

In [19]:
# Sentence-similarity matrix returned by GraphLoader.create_graph.
A[0]

array([[ 1.        ,  0.28785527,  0.19549409,  0.44937891,  0.41897935,
         0.30145866,  0.51115197, -0.06817409,  0.41364396,  0.07488795,
         0.32580221,  0.18923962,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.28785527,  1.        ,  0.30073968,  0.39978355,  0.32332748,
         0.36612588,  0.49933702, -0.13397253,  0.44358569,  0.05810773,
         0.27528608,  0.09012038,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.19549409,  0.30073968,  1.        ,  0.37444597,  0.19789618,
         0.36755693,  0.29125834,  0.18727028,  0.3235667 ,  0.04949016,
         0.13395831,  0.1334742 ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.44937891,  0.39978355,  0.37444597,  1.        ,  0.49535012,
         0.38115534,  0.57205284, -0.10435088,  

In [20]:
# Per-sentence LSTM hidden states returned by GraphLoader.create_lstm_embedding.
A[1]

[tensor([[[-2.3633e-02, -2.3752e-01, -1.5863e-01,  2.2650e-01,  1.1861e-01,
            9.5543e-02, -3.0530e-01,  1.3300e-01, -2.6690e-01,  1.5606e-01,
            3.1622e-01,  2.1206e-01, -1.2966e-02,  2.5236e-02, -1.2698e-01,
           -1.8631e-01,  8.1974e-02, -3.7372e-01,  6.3274e-02,  2.7741e-01,
            1.6722e-01, -5.1829e-03,  1.7185e-01,  2.1024e-01, -3.2208e-01,
            3.2513e-02, -4.1368e-02,  9.3692e-02, -4.7133e-02, -1.2692e-01,
           -6.1206e-02, -2.5556e-01, -3.6455e-02,  6.4819e-02,  5.9680e-03,
            1.6167e-01,  1.3276e-01, -1.2413e-01, -7.9466e-02,  2.7083e-01,
            1.4911e-01,  2.2303e-01,  6.6630e-02,  3.1581e-02, -1.2163e-01,
            1.4637e-01,  1.5481e-01, -3.8461e-02, -2.8159e-02,  1.2362e-02,
            6.8382e-02,  1.1791e-01,  2.4305e-01, -1.0490e-01, -2.7647e-02,
            2.4406e-01, -1.6569e-01,  2.9942e-02, -1.0270e-01,  1.6917e-01,
            4.6042e-02,  1.0967e-01,  3.5491e-01,  1.2662e-01,  7.2565e-02,
           -

In [21]:
# ROUGE-1 recall targets, one row per sentence slot (0.0 for padding).
A[2]

tensor([[0.0476],
        [0.1000],
        [0.1538],
        [0.1852],
        [0.1860],
        [0.2632],
        [0.4667],
        [0.0588],
        [0.3750],
        [0.4000],
        [0.4054],
        [0.2093],
        [0.0000],
        [0.0000],
        [0.0000],
        [0.0000],
        [0.0000],
        [0.0000],
        [0.0000],
        [0.0000]])