In [None]:
import numpy as np
import random
from random import shuffle
from numpy import array
import math
import torch
import torch.optim as optim
import torch.nn as nn
from torch.autograd import Variable
import nltk
#nltk.download('stopwords')
import time
import pickle

# Select CUDA device index 3 as the current device for this process.
# NOTE(review): the index is hard-coded; presumably this fails on hosts that
# expose fewer than four GPUs (or no CUDA at all) -- confirm before reuse.
torch.cuda.set_device(3) 


In [None]:
# Sanity check: print the index of the CUDA device that is currently selected.
print(torch.cuda.current_device())

In [None]:
# Seed the three global random number generators used in this notebook
# (Python's `random`, PyTorch and NumPy) with the same constant.
random.seed(1)
torch.manual_seed(1)
np.random.seed(1)

In [None]:
class NodeInputData:

    def __init__(self, batch_size=16):

        with open('data/graph_embd_yoga/data_des_net.pickle', 'rb') as in_file:
            [self.graph, self.id2name, self.name2id, self.type_start, self.type_end, \
             self.user_start, self.user_end, self.des_start, self.des_end, \
             self.annotated_des, self.annotated_user, \
             self.twt2tokenized_text, self.weights_matrix, self.name2text, self.gt_user]=pickle.load(in_file, encoding="bytes")
        
        ## get the indecies of stances, issues, ....etc from data.pickle file also in graph.mapping file
        self.all_types=set([i for i in range(self.type_start, self.type_end+1)])
        #print("self.all_types", self.all_types) #{0, 1, 2}
        self.all_users=set([i for i in range(self.user_start, self.user_end+1)])
        #print("self.all_users", len(self.all_users)) #13301
        self.all_des=set([i for i in range(self.des_start, self.des_end+1)])
        #print("self.all_des", len(self.all_des)) #13178
        #self.all_loc=set([i for i in range(self.loc_start, self.loc_end+1)])
        #print("self.all_loc", len(self.all_loc)) #7302
#         self.all_issue_bigrams=set([i for i in range(self.topic_bigram_start, self.topic_bigram_end+1)])
#         #print("self.all_issue_bigrams", len(self.all_issue_bigrams)) #{2000000, 2000001, 2000002,...}, len = 2039
      
#         self.all_indicator_labels=set([i for i in range(self.indicator_label_start, self.indicator_label_end+1)])
#         #print('self.all_indicator_labels', self.all_indicator_labels) #{10}
        
        ###for positive example
        self.user2type_adj_pos={}
        self.user2des_adj_pos={}
        self.user2loc_adj_pos={}
        self.des2type_adj_pos={}
        self.loc2type_adj_pos={}
        self.user2mention_adj_pos={}
        #self.issue2indicator_label_adj_pos={}
        
        ###for negative example
        self.user2type_adj_neg={}
        self.user2des_adj_neg={}
        self.user2loc_adj_neg={}
        self.des2type_adj_neg={}
        self.loc2type_adj_neg={}
        self.user2mention_adj_neg={}
        #self.issue2indicator_label_adj_neg = {}


        self.all_nodes_to_train=None

        self.batch_size=batch_size
        
    '''
    Get nodes from graph
    '''
    def get_nodes(self, training_graph):

        #print ("training_graph", len(training_graph)) #43301
        '''
        for each user we will create positive and negative examples
        '''
        for usr in self.all_users:
            #print(usr) #100000..
            adjacency_list=training_graph[usr]
            #print("adjacency_list", adjacency_list) #{1012088, 2002249}...
            '''
            pos/neg for user2type
            '''
            #print(adjacency_list & self.all_types, len(adjacency_list & self.all_types)) #set() len=0, {1} len = 1,... 
            if len(adjacency_list & self.all_types)>0:
                self.user2type_adj_pos[usr]=adjacency_list & self.all_types
                #print("pos", self.user2type_adj_pos[usr]) #{1}
                self.user2type_adj_neg[usr] = self.all_types - self.user2type_adj_pos[usr]
                #print("neg", self.user2type_adj_neg[usr]) #neg {0, 2}
            '''
            pos/neg for user2des
            '''
            ##each user connected to a description
            #print(adjacency_list & self.all_des, len(adjacency_list & self.all_des)) #{1002822} len =1,
            if len(adjacency_list & self.all_des)>0:
                self.user2des_adj_pos[usr]=adjacency_list & self.all_des
                #print('pos', self.user2des_adj_pos[usr])  #{1002822}
                self.user2des_adj_neg[usr] = self.all_des - self.user2des_adj_pos[usr]
                #print('neg', self.user2des_adj_neg[usr]) #rest of the funding entities are negative example for this particular ad
                #sys.exit()

            '''
            pos/neg for user2mention
            '''
            if len(adjacency_list & self.all_users)>0:
                self.user2mention_adj_pos[usr]=adjacency_list & self.all_users
                #print('pos', self.user2mention_adj_pos[usr]) #{2000448, 2001843, 2000180, 2000725, 2001366, 2000991}
                self.user2mention_adj_neg[usr] = self.all_users - self.user2mention_adj_pos[usr]
                #print('neg', self.user2mention_adj_neg[usr]) #rest of issue bigrams are negative example
        
        
        '''
        for each description we will create positive and negative examples
        '''        
        for des in self.all_des:
            adjacency_list=training_graph[des]
            '''
            pos/neg for des2type
            '''
            if len(adjacency_list & self.all_types)>0:
                self.des2type_adj_pos[des]=adjacency_list & self.all_types
                self.des2type_adj_neg[des] = self.all_types - self.des2type_adj_pos[des]
        
         

        self.all_nodes_to_train=[i for i in training_graph]

        batches=[]
        
        discarded_nodes=0
        shuffle(self.all_nodes_to_train)
        print ("all_nodes_to_train", len(self.all_nodes_to_train)) #33784
        j=0
        while j<len(self.all_nodes_to_train):
            batch_input = {'user2type': [], 'user2des': [],  'user2mention':[], \
                           'des2type': [] }
            ##batch for positive example
            batch_gold = {'user2type': [], 'user2des': [],  'user2mention':[],\
                           'des2type': [] }
            ##batch for positive example
            batch_neg = {'user2type': [], 'user2des': [], 'user2mention':[],\
                           'des2type': [] }

            text_nodes=[]
            
            if j+self.batch_size<=len(self.all_nodes_to_train)-1:
                nodes_to_train=self.all_nodes_to_train[j:j+self.batch_size]
            else:
                nodes_to_train = self.all_nodes_to_train[j:]

            if len(nodes_to_train)==0:
                continue
                
            for node in nodes_to_train:
                '''
                for each user to type/des/loc, we select positive and negative examples
                '''
                if node in self.all_users: 
                    text_nodes.append(node)
                    
                    '''
                    user2type
                    '''
                    if node in self.user2type_adj_pos:
                        #print ("node", node) #106497
                        #print(self.user2type_adj_pos[node], len(self.user2type_adj_pos[node])) #{0}, len=1
                        #print('prev b ', batch_input['ad2stance'])
                        batch_input['user2type']+=[node for i in range(0,len(self.user2type_adj_pos[node]))]
                        #print('later b ', batch_input['user2type']) #[106497]
                        #print('prev p', batch_gold['ad2stance'])
                        batch_gold['user2type']+=list(self.user2type_adj_pos[node])
                        #print('later p', batch_gold['ad2stance']) #[0], multi-label [0, 0, 3]
                        #print('prev n', batch_neg['ad2stance'])
                        #print('random ', random.sample(self.ad2stance_adj_neg[node], 2))
                        #print('len pos', len(self.user2type_adj_pos[node])) #1
                        ##randomly select two negative example for each positive example
                        batch_neg['user2type']+=[random.sample(self.user2type_adj_neg[node], 2) for i in range(0, len(self.user2type_adj_pos[node]))]
                        #print('later n', batch_neg['user2type']) #[[4, 1]], multilabel [[4, 1], [4, 2], [2, 1]]
                    
                    '''
                    user2des
                    '''
                    if node in self.user2des_adj_pos:
                        #print ('ad2fe', node)
                        batch_input['user2des']+=[node for i in range(0,len(self.user2des_adj_pos[node]))]
                        batch_gold['user2des']+=list(self.user2des_adj_pos[node])
                        batch_neg['user2des']+=[random.sample(self.user2des_adj_neg[node], 5) for i in range(0, len(self.user2des_adj_pos[node]))]


                    '''
                    user2mention
                    '''
                    if node in self.user2mention_adj_pos:
                        
                        batch_input['user2mention'] += [node for i in range(0, len(self.user2mention_adj_pos[node]))]
                        batch_gold['user2mention'] += list(self.user2mention_adj_pos[node])
                        batch_neg['user2mention'] += [random.sample(self.user2mention_adj_neg[node], 5) for i in range(0, len(self.user2mention_adj_pos[node]))]

                '''
                for each description to type, we select positive and negative examples
                '''
                if node in self.all_des:
                    if node in self.des2type_adj_pos:
                        #print ('fe2st', node)
                        batch_input['des2type']+=[node for i in range(0,len(self.des2type_adj_pos[node]))]
                        batch_gold['des2type']+=list(self.des2type_adj_pos[node])
                        batch_neg['des2type']+=[random.sample(self.des2type_adj_neg[node], 2) for i in range(0, len(self.des2type_adj_pos[node]))]

                        
            #print("j, self.batch_size", j, self.batch_size) #100
            for k in batch_input:
                #print("k", k) #ad2stance, ad2funding_entity
                batch_input[k] = array(batch_input[k])
                #print("batch_input[k]", batch_input[k]) #[104586 135113 119709 127739 130641 113722 122613 123000 123000 123074 100555 100555 100165 102666 135542 127619 118428]
            for k in batch_gold:
                batch_gold[k] = array([batch_gold[k]]).transpose()
                #print('batch_gold[k]', batch_gold[k])
            for k in batch_neg:
                batch_neg[k] = array(batch_neg[k])
                #print('batch_neg[k]', batch_neg[k])

            j=j+self.batch_size
            batches.append([batch_input, batch_gold, batch_neg, text_nodes])
            #print("batches", batches)
            
        
        self.batches=batches
        
        print ("Discarded: %d nodes."%(discarded_nodes))
    
    def save_embeddings(self, model):

        #output_dir='../../scratch/fb_ads_data/output/embeddings/'
        
        #f=open(output_dir+"stance.embeddings","w")
        f=open("data/graph_embd_yoga/output/des_net/utype.embeddings","w")
        utype_embd=model.utype_embeddings.weight.cpu().data.numpy()
        utype_embd=utype_embd.tolist()
        for i in range(0,len(utype_embd)):
            id=i+self.type_start
            name=self.id2name[id]
            f.write(str(name))
            embd=utype_embd[i]
            for j in range(len(embd)):
                if j==0:
                    f.write("\t"+str(embd[j]))
                else:
                    f.write(" "+str(embd[j]))
            f.write("\n")



        f = open("data/graph_embd_yoga/output/des_net/user.embeddings", "w")
        users = [k for k in range(self.user_start, self.user_end + 1)]
        for usr in users:
            embd = model.user_embeddings([usr])
            embd = embd[0]
            embd = embd.cpu().data.numpy()
            embd = embd.tolist()
            name = self.id2name[usr]
            f.write(str(name))
            for j in range(len(embd)):
                if j == 0:
                    f.write("\t" + str(embd[j]))
                else:
                    f.write(" " + str(embd[j]))
            f.write("\n")
            
        f = open("data/graph_embd_yoga/output/des_net/description.embeddings", "w")          
        des_embd = model.des_embeddings.weight.cpu().data.numpy()
        des_embd = des_embd.tolist()
        for i in range(0, len(des_embd)):
            id = i + self.des_start
            name = self.id2name[id]
            f.write(str(name))
            embd = des_embd[i]
            for j in range(len(embd)):
                if j == 0:
                    f.write("\t" + str(embd[j]))
                else:
                    f.write(" " + str(embd[j]))
            f.write("\n")
        
        f = open("data/graph_embd_yoga/output/des_net/network.embeddings", "w")          
        net_embd = model.net_embeddings.weight.cpu().data.numpy()
        net_embd = net_embd.tolist()
        for i in range(0, len(net_embd)):
            id = i + self.user_start
            name = self.id2name[id]
            f.write(str(name))
            embd = net_embd[i]
            for j in range(len(embd)):
                if j == 0:
                    f.write("\t" + str(embd[j]))
                else:
                    f.write(" " + str(embd[j]))
            f.write("\n")
        f.close()
        
        

In [None]:
###Bi-LSTM 
class BLSTM(nn.Module):
    def __init__(self, non_trainable=True, cuda_available=True):
        super(BLSTM, self).__init__()

        with open('data/graph_embd_yoga/data_des_net.pickle', 'rb') as in_file:
            [self.graph, self.id2name, self.name2id, self.type_start, self.type_end, \
             self.user_start, self.user_end, self.des_start, self.des_end, \
             self.annotated_des,  self.annotated_user, \
             self.twt2tokenized_text, self.weights_matrix, self.name2text, self.gt_user]=pickle.load(in_file, encoding="bytes")
        

        self.hidden_dim = 150
        num_embeddings = len(self.weights_matrix)
        embedding_dim = len(self.weights_matrix[0])
        self.cuda_available=cuda_available
        #if torch.cuda.is_available():
        if self.cuda_available:
            self.word_embeddings = nn.Embedding(num_embeddings, embedding_dim, padding_idx=0).cuda()
        else:
            self.word_embeddings = nn.Embedding(num_embeddings, embedding_dim, padding_idx=0)

        self.word_embeddings.weight.data.copy_(torch.from_numpy(self.weights_matrix))

        if non_trainable:
            self.word_embeddings.weight.requires_grad = False

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        #if torch.cuda.is_available():
        if self.cuda_available:
            #self.lstm = nn.LSTM(300, self.hidden_dim, bidirectional=True, batch_first=True, dropout=0.5)
            self.lstm = nn.LSTM(300, self.hidden_dim, num_layers = 1, bidirectional=True, batch_first=True, dropout = 0).cuda()
            
        else:
            #self.lstm = nn.LSTM(300, self.hidden_dim, bidirectional=True, batch_first=True, dropout=0.5)
            self.lstm = nn.LSTM(300, self.hidden_dim, num_layers = 1, bidirectional=True, batch_first=True, dropout = 0)
        
    def init_hidden(self, batch_size):
        # The axes semantics are (num_layers, minibatch_size, hidden_dim)
        #if torch.cuda.is_available():
        if self.cuda_available:
            return (torch.zeros(2, batch_size, self.hidden_dim).cuda(),
                    torch.zeros(2, batch_size, self.hidden_dim).cuda())
        return (torch.zeros(2, batch_size, self.hidden_dim),
                torch.zeros(2, batch_size, self.hidden_dim))
    

    ###to get rid of empty X
    def get_padded(self, X):
        """Drop empty sequences and right-pad the rest with 0 to equal length.

        Args:
            X: iterable of token-id sequences.

        Returns:
            ``(padded_X, X_lengths)`` where ``padded_X`` is a float array with
            one row per non-empty sequence (input order preserved) and
            ``X_lengths`` lists the original length of each kept sequence.

        Note:
            Empty sequences are removed silently, so the output can have fewer
            rows than ``X``. ``max`` raises ``ValueError`` when every sequence
            is empty.
        """
        kept = [sentence for sentence in X if len(sentence) > 0]
        X_lengths = [len(sentence) for sentence in kept]

        widest = max(X_lengths)
        # 0 is the padding token id.
        padded_X = np.zeros((len(kept), widest))
        for row, sentence in enumerate(kept):
            padded_X[row, :len(sentence)] = sentence
        return padded_X, X_lengths
    

    def forward(self, X):
        #print ("inside forward Bi-LSTM ", X, len(X)) #[122211, 105694, 128324, 123265, 123692, 110528, 104453..
        # ---------------------
        # 1. embed the input
        # Dim transformation: (batch_size, seq_len, 1) -> (batch_size, seq_len, embedding_dim)
        X = [self.twt2tokenized_text[seg_id] for seg_id in X]
        #print (X, len(X)) 
        X, X_lengths=self.get_padded(X)
        # reset the LSTM hidden state. Must be done before you run a new batch. Otherwise the LSTM will treat
        # a new batch as a continuation of a sequence
        self.hidden = self.init_hidden(len(X))
        # batch_size, seq_len = len(X), len(X[0])
        #print ("X, X_lengths", X, X_lengths)

        #if torch.cuda.is_available():
        if self.cuda_available:
            X = self.word_embeddings(Variable(torch.cuda.LongTensor(np.array(X))))
        else:
            X = self.word_embeddings(Variable(torch.LongTensor(np.array(X))))

        seg_sort_index = sorted(range(len(X_lengths)), key=lambda k: X_lengths[k], reverse=True)
        seg_sort_index_map = {old: new for new, old in enumerate(seg_sort_index)}
        reverse_seg_index = [seg_sort_index_map[i] for i in range(len(seg_sort_index))]
        reverse_seg_index_var = torch.LongTensor(reverse_seg_index)

        #if torch.cuda.is_available():
        if self.cuda_available:
            #X_lengths = X_lengths.cuda()
            reverse_seg_index_var = reverse_seg_index_var.cuda()

        seg_lengths_sort = sorted(X_lengths, reverse=True)
        # de-concat the document sentences in the whole batch
        #print X
        X_sort = torch.cat([X[i].unsqueeze(0) for i in seg_sort_index], 0)
        #print (X_sort, seg_lengths_sort)
        #print ("X_sort, seg_lengths_sort", len(X_sort), len(seg_lengths_sort))
        X = torch.nn.utils.rnn.pack_padded_sequence(X_sort, seg_lengths_sort, batch_first=True)

        # now run through LSTM
        X, self.hidden = self.lstm(X, self.hidden)

        # undo the packing operation
        X, _ = torch.nn.utils.rnn.pad_packed_sequence(X, batch_first=True)

        
        #print list(X.size()), reverse_seg_index_var

        #if torch.cuda.is_available():
        if self.cuda_available:
            seg_embeds = torch.index_select(X, 0, reverse_seg_index_var).sum(1) / torch.cuda.FloatTensor(array(X_lengths)).view((-1, 1))
        else:
            seg_embeds = torch.index_select(X, 0, reverse_seg_index_var).sum(1) / torch.FloatTensor(array(X_lengths)).view((-1, 1))

        return seg_embeds


In [None]:
#######This is our model for embedding

class Embedder(torch.nn.Module):
    def __init__(self, cuda_available=True):
        """Create the lookup tables, the text encoder and the loss.

        The pickled bundle at ``data/graph_embd_yoga/data_des_net.pickle``
        supplies the inclusive id ranges that size the three lookup tables
        (user types, descriptions, network/users). The tables are initialised
        from a normal distribution with standard deviation
        ``1 / sqrt(embedding_size)`` drawn from a dedicated
        ``RandomState(1)``. User text is encoded by a ``BLSTM``.

        Args:
            cuda_available: when True all sub-modules are placed on the
                current CUDA device; when False everything stays on the CPU.
        """
        super(Embedder, self).__init__()

        # NOTE(review): unpickling runs arbitrary code embedded in the file;
        # only load bundles that come from a trusted source.
        with open('data/graph_embd_yoga/data_des_net.pickle', 'rb') as in_file:
            [self.graph, self.id2name, self.name2id, self.type_start, self.type_end, \
             self.user_start, self.user_end, self.des_start, self.des_end, \
             self.annotated_des,  self.annotated_user, \
             self.twt2tokenized_text, self.weights_matrix, self.name2text, self.gt_user]=pickle.load(in_file, encoding="bytes")

        # Id ranges are inclusive, hence the +1.
        self.type_count = self.type_end - self.type_start + 1
        self.user_count = self.user_end - self.user_start + 1
        self.des_count = self.des_end - self.des_start + 1

        self.cuda_available = cuda_available
        self.embedding_size = 300

        self.scale = 1.0 / math.sqrt(self.embedding_size)

        self.embed_rng = np.random.RandomState(1)

        def make_table(row_count):
            # One embedding row per node id of the family.
            table = nn.Embedding(row_count, self.embedding_size)
            return table.cuda() if self.cuda_available else table

        self.utype_embeddings = make_table(self.type_count)
        self.des_embeddings = make_table(self.des_count)
        self.net_embeddings = make_table(self.user_count)

        # Draw order (type, description, network) is kept so seeded runs
        # reproduce the same initial weights.
        type_embed_values = self.embed_rng.normal(scale=self.scale, size=(self.type_count, self.embedding_size))
        des_embed_values = self.embed_rng.normal(scale=self.scale, size=(self.des_count, self.embedding_size))
        net_embed_values = self.embed_rng.normal(scale=self.scale, size=(self.user_count, self.embedding_size))

        self.utype_embeddings.weight.data.copy_(torch.from_numpy(type_embed_values))
        self.des_embeddings.weight.data.copy_(torch.from_numpy(des_embed_values))
        self.net_embeddings.weight.data.copy_(torch.from_numpy(net_embed_values))

        # Pass the flag by keyword: BLSTM's first positional parameter is
        # `non_trainable`, so the old positional call left BLSTM's own
        # `cuda_available` at its default (True) and made a CPU-only Embedder
        # build a CUDA encoder with trainable word vectors.
        self.user_embeddings = BLSTM(cuda_available=self.cuda_available)
        if self.cuda_available:
            self.user_embeddings = self.user_embeddings.cuda()

        self.CrossEntropyLoss = nn.CrossEntropyLoss(reduction='mean')

        # Relations scored by forward(), in processing order.
        self.aspects = ['user2type', 'user2des', 'user2mention',\
                        'des2type']

                        
        #print('self.aspects ', self.aspects )                
    def decision(self, probability):
        """Return True with the given probability (one draw from Python's global ``random``)."""
        draw = random.random()
        return draw < probability

    def forward(self, batch):
        #print("batch", batch)
        batch_input_index = batch[0]
        #print("batch_input_index", batch_input_index) #len = 6, dictionary
        batch_gold_index = batch[1]
        #print("batch_gold_index", batch_gold_index)
        batch_negs_index = batch[2]
        #print("batch_negs_index", batch_negs_index)
        text_nodes = batch[3]
        #print('text_nodes', text_nodes, type(text_nodes), len(text_nodes)) #list #30
#         text_ele = [self.twt2tokenized_text[seg_id] for seg_id in text_nodes_orig]
#         #print('text_ele', text_ele, len(text_ele))
# #         for sentence in text_ele:
# #             print('embed sent', sentence)
#         text_nodes = []
#         for i in range (0, len(text_nodes_orig)):
#             for sentence in text_ele:
#                 if len(sentence) > 0: 
#                     text_nodes.append(text_nodes_orig[i])
#         print('text_nodes', text_nodes, type(text_nodes), len(text_nodes))
        
        
       
        if len(text_nodes) > 0:
            textid2index = {}
            
#             for i in range(0, len(text_nodes)):
#                 textid2index[text_nodes[i]] = i
            if self.cuda_available:
                '''
                hlstm only
                '''
                output_doc_embeddings = self.user_embeddings(text_nodes).cuda()
                #print("output_doc_embeddings", output_doc_embeddings)
                #print("size output_doc_embeddings", output_doc_embeddings.size())

            else:
                '''
                hlstm only
                '''
                output_doc_embeddings = self.user_embeddings(text_nodes)
                #print("size output_doc_embeddings", output_doc_embeddings.size())
            
            for i in range(0, output_doc_embeddings.size(0)):
                textid2index[text_nodes[i]] = i
                


        embedding_size = self.embedding_size
        batch_target_index = {}
        batch_input = {}
        batch_target = {}
        input_embed = {}
        target_embed = {}
        sim_score = {}
        loss_all = {}


        loss = 0
        
        for aspect in self.aspects:
            #print('loop aspect', aspect)
            #print("len issue2indicator_label : ", len(batch_gold_index['issue2indicator_label'])) # 0 why??
            #print("len funding_entity2stance : ", len(batch_gold_index['funding_entity2stance'])) #0
            if (len(batch_gold_index[aspect]) == 0):
                continue


            if (aspect == 'user2type'):
                target_embeddings = self.utype_embeddings
            elif (aspect == 'user2des'):
                target_embeddings = self.des_embeddings
#             elif (aspect == 'user2loc'):
#                 target_embeddings = self.loc_embeddings
            elif (aspect == 'user2mention'):
                target_embeddings = self.net_embeddings
            elif (aspect == 'des2type'):
                target_embeddings = self.utype_embeddings
                #print("target_embeddings funding_entity2stance", target_embeddings) #(5, 300)
#             elif (aspect == 'loc2type'):
#                 target_embeddings = self.utype_embeddings 
            else:
                continue


            if (aspect == 'user2type'):
                if self.cuda_available:
                    #index = [textid2index[idx] for idx in batch_input_index[aspect]]
                    index = []
                    for idx in batch_input_index[aspect]:
                        if idx in textid2index:
                            index.append(textid2index[idx])
                    batch_input[aspect] = output_doc_embeddings[index]
                    #print("ad2stance", batch_gold_index[aspect])
                    #print("self.stance_start", self.stance_start)
                    batch_target_index[aspect] = torch.cat((Variable(
                        torch.cuda.LongTensor(batch_gold_index[aspect] - self.type_start)), Variable(
                        torch.cuda.LongTensor(batch_negs_index[aspect] - self.type_start))), 1)
                else:
                    #index = [textid2index[idx] for idx in batch_input_index[aspect]]
                    index = []
                    for idx in batch_input_index[aspect]:
                        if idx in textid2index:
                            index.append(textid2index[idx])
                    batch_input[aspect] = output_doc_embeddings[index]
                    batch_target_index[aspect] = torch.cat((Variable(
                        torch.LongTensor(batch_gold_index[aspect] - self.type_start)), Variable(
                        torch.LongTensor(batch_negs_index[aspect] - self.type_start))), 1)

            elif (aspect == 'user2des'):
                if self.cuda_available:
                    #print('batch_input_index[aspect]', batch_input_index[aspect], len(batch_input_index[aspect]))
                    #print('textid2index', textid2index, len(textid2index))
                    #index = [textid2index[idx] for idx in batch_input_index[aspect]]
                    index = []
                    for idx in batch_input_index[aspect]:
                        if idx in textid2index:
                            index.append(textid2index[idx])
                    #print('index', index, len(index)) ## 0...25
                    #print('output_doc_embeddings', output_doc_embeddings.size()) # 25*300
                    batch_input[aspect] = output_doc_embeddings[index]
                    #print('batch_input ', batch_input[aspect])
                    #print("ad2funding_entity", batch_gold_index[aspect])
                    #print("self.funding_entity_start", self.funding_entity_start)
                    batch_target_index[aspect] = torch.cat((Variable(
                        torch.cuda.LongTensor(batch_gold_index[aspect] - self.des_start)), Variable(
                        torch.cuda.LongTensor(batch_negs_index[aspect] - self.des_start))), 1)
                else:
                    #print('batch_input_index ', batch_input_index[aspect], len(batch_input_index[aspect]))
                    #print('textid2index', textid2index, len(textid2index))
                    #index = [textid2index[idx] for idx in batch_input_index[aspect]]
                    index = []
                    for idx in batch_input_index[aspect]:
                        if idx in textid2index:
                            index.append(textid2index[idx])
                    #print('index', index, len(index)) ## 0...25
                    #print('output_doc_embeddings', output_doc_embeddings.size()) # 25*300
                    batch_input[aspect] = output_doc_embeddings[index]
                    #print('batch_input ', batch_input[aspect])
#                     print("ad2funding_entity", batch_gold_index[aspect])
#                     print("self.funding_entity_start", self.funding_entity_start)
                    batch_target_index[aspect] = torch.cat((Variable(
                        torch.LongTensor(batch_gold_index[aspect] - self.des_start)), Variable(
                        torch.LongTensor(batch_negs_index[aspect] - self.des_start))), 1)



            elif (aspect == 'user2mention'):
                if self.cuda_available:
                    #index = [textid2index[idx] for idx in batch_input_index[aspect]]
                    index = []
                    for idx in batch_input_index[aspect]:
                        if idx in textid2index:
                            index.append(textid2index[idx])
                    batch_input[aspect] = output_doc_embeddings[index]
                    batch_target_index[aspect] = torch.cat((Variable(
                        torch.cuda.LongTensor(batch_gold_index[aspect] - self.user_start)), Variable(
                        torch.cuda.LongTensor(batch_negs_index[aspect] - self.user_start))), 1)
                else:
                    #index = [textid2index[idx] for idx in batch_input_index[aspect]]
                    index = []
                    for idx in batch_input_index[aspect]:
                        if idx in textid2index:
                            index.append(textid2index[idx])
                    batch_input[aspect] = output_doc_embeddings[index]
                    batch_target_index[aspect] = torch.cat((Variable(
                        torch.LongTensor(batch_gold_index[aspect] - self.user_start)), Variable(
                        torch.LongTensor(batch_negs_index[aspect] - self.user_start))), 1)
                    
            elif (aspect == 'des2type'):
                if self.cuda_available:
                    batch_input[aspect] = self.des_embeddings(
                        Variable(torch.cuda.LongTensor(batch_input_index[aspect] - self.des_start))).view(
                        (-1, self.embedding_size))
                    #print("funding_entity2stance", batch_gold_index[aspect])
                    #print("self.stance_start", self.stance_start)
                    batch_target_index[aspect] = torch.cat((Variable(
                        torch.cuda.LongTensor(batch_gold_index[aspect] - self.type_start)), Variable(
                        torch.cuda.LongTensor(batch_negs_index[aspect] - self.type_start))), 1)
                else:
                    batch_input[aspect] = self.des_embeddings(
                        Variable(torch.LongTensor(batch_input_index[aspect] - self.des_start))).view(
                        (-1, self.embedding_size))
                    batch_target_index[aspect] = torch.cat((Variable(
                        torch.LongTensor(batch_gold_index[aspect] - self.type_start)), Variable(
                        torch.LongTensor(batch_negs_index[aspect] - self.type_start))), 1)


            if aspect in ['user2des', 'user2mention']:
                #print("aspect", aspect)
                example_size = 5 + 1
                #print("batch_target_index funding_entity2stance", len(batch_target_index) )
                #print("batch_target_index[funding_entity2stance]", batch_target_index[aspect])

            else:
                #print("else ", aspect)
                example_size = 2 + 1

            #if aspect in batch_target_index :
            batch_target[aspect] = target_embeddings(batch_target_index[aspect]).view((-1, example_size, self.embedding_size))
            #print("batch_target ", batch_target[aspect], batch_target[aspect].size())
            dropout = nn.Dropout(p=0.7)
            input_layers = [dropout(batch_input[aspect])]
            input_embed[aspect] = input_layers[-1]

            target_layers = [dropout(batch_target[aspect])]
            target_embed[aspect] = target_layers[-1]
            
#             print("\n * target_embed[aspect] \n ")
            #print('target_embed', target_embed[aspect].size()) #torch.Size([13, 6, 300]), torch.Size([3, 3, 300]), torch.Size([15, 6, 300])

#             print(target_embed[aspect].device)

#             print("\n * input_embed[aspect] \n ")
#             print('input_embed', input_embed[aspect].size()) #torch.Size([13, 300]), torch.Size([3, 300]), , torch.Size([15, 300])
#             print(input_embed[aspect].device)
            #print("input_embed[aspect].view ", input_embed[aspect].view(-1, embedding_size, 1).size()) #torch.Size([1, 300, 1]), torch.Size([12, 300, 1]), torch.Size([14, 300, 1]) ??

            #print('bmm', torch.bmm(target_embed[aspect],input_embed[aspect].view(-1, embedding_size, 1)).size())
            #print('example_size', example_size)
            #print('bmm view', torch.bmm(target_embed[aspect],input_embed[aspect].view(-1, embedding_size, 1)).view(-1, example_size).size())
            sim_score[aspect] = torch.bmm(
                target_embed[aspect],
                input_embed[aspect].view(-1, embedding_size, 1)).view(-1, example_size)

            if self.cuda_available:
                target = Variable(torch.cuda.LongTensor(batch_input[aspect].size(0)).zero_())
            else:
                target = Variable(torch.LongTensor(batch_input[aspect].size(0)).zero_())

            self.CrossEntropyLoss = nn.CrossEntropyLoss(reduction='mean')
            
            #print('sim_score ', sim_score[aspect], sim_score[aspect].size())
            #print('target ', target, target.size())
            loss_all[aspect] = self.CrossEntropyLoss(sim_score[aspect], target)

            loss_all[aspect] = torch.sum(loss_all[aspect])

            rate = 1.0

            loss += loss_all[aspect] * rate
            #print("loop loss", loss)
        
        return loss, loss_all
    
    def chunks(self, l, n):
        """Split ``l`` into consecutive slices of at most ``n`` items.

        A chunk size below 1 is clamped to 1. The result is a lazy generator;
        the final slice may be shorter than ``n``.
        """
        step = n if n > 1 else 1
        starts = range(0, len(l), step)
        return (l[begin:begin + step] for begin in starts)

    def dot_product(self, x, y):
        """Return ``np.dot`` of ``x`` and ``y`` after converting both to NumPy arrays."""
        return np.dot(np.array(x), np.array(y))

    def predict_utype(self):
        """Evaluate user-type prediction against the gold labels in ``self.gt_user``.

        Every annotated user id is embedded with ``self.user_embeddings`` (in
        chunks of 30 ids), scored against each row of
        ``self.utype_embeddings.weight`` by dot product, and assigned the index
        of the highest-scoring row. Accuracy and macro-F1 are printed.

        Returns:
            tuple: ``(accuracy, macro_f1)``. ``(0.0, 0.0)`` is returned when
            ``self.gt_user`` is empty, since neither metric is defined then.
        """
        # Imported first so a missing scikit-learn fails before any embedding work.
        from sklearn.metrics import f1_score

        user_ids = list(self.gt_user)
        gold_labels = [self.gt_user[user_id] for user_id in user_ids]
        if not user_ids:
            # Guard: accuracy below divides by the number of predictions.
            print ("accuracy: undefined (no annotated users)")
            return 0.0, 0.0

        # float64 keeps the precision of a dot product over Python float lists.
        type_matrix = np.asarray(
            self.utype_embeddings.weight.cpu().data.numpy(), dtype=np.float64)

        user_rows = []
        with torch.no_grad():  # inference only, no autograd graph is needed
            for chunk in self.chunks(user_ids, 30):
                embd = self.user_embeddings(chunk)
                user_rows.extend(embd.cpu().data.numpy().tolist())
        user_matrix = np.asarray(user_rows, dtype=np.float64)

        # One (users x types) product instead of nested Python loops; argmax
        # returns the first maximum, i.e. ties resolve to the lowest type index.
        predictions = np.dot(user_matrix, type_matrix.T).argmax(axis=1).tolist()

        correct = sum(1 for predicted, gold in zip(predictions, gold_labels)
                      if predicted == gold)
        accuracy = correct / float(len(predictions))

        print ("accuracy:", accuracy)
        macro_f1 = f1_score(gold_labels, predictions, average='macro')
        print("macro_f1", macro_f1)
        return accuracy, macro_f1
        
    def predict_utype_all_des(self):
        """Rank every user type for every row of the description embedding table.

        Each row of ``self.des_embeddings.weight`` is scored against each row
        of ``self.utype_embeddings.weight`` with ``self.dot_product``.

        Returns:
            dict: ``self.id2name[row + self.des_start]`` -> dict mapping
            ``self.id2name[type_row + self.type_start]`` to its score, with
            entries inserted in descending score order.
        """
        des_vectors = self.des_embeddings.weight.cpu().data.numpy().tolist()
        type_vectors = self.utype_embeddings.weight.cpu().data.numpy().tolist()

        ranking_by_des = {}
        for des_row, des_vector in enumerate(des_vectors):
            # Score against all types first, then attach the type names.
            scores = [self.dot_product(des_vector, type_vector)
                      for type_vector in type_vectors]
            named_scores = {self.id2name[offset + self.type_start]: score
                            for offset, score in enumerate(scores)}

            # Stable sort: equal scores keep their type-index order.
            ordered = sorted(named_scores.items(), key=lambda pair: pair[1], reverse=True)
            ranking_by_des[self.id2name[des_row + self.des_start]] = dict(ordered)

        return ranking_by_des

    def predict_utype_all_users(self):
        """Rank every user type for every user id in ``[user_start, user_end]``.

        User ids are embedded with ``self.user_embeddings`` in chunks of 30 and
        each resulting vector is scored against every row of
        ``self.utype_embeddings.weight`` with ``self.dot_product``.

        Returns:
            dict: ``self.id2name[user_id]`` -> dict mapping
            ``self.id2name[type_row + self.type_start]`` to its score, with
            entries inserted in descending score order.
        """
        user_ids = list(range(self.user_start, self.user_end + 1))
        type_vectors = self.utype_embeddings.weight.cpu().data.numpy().tolist()

        user_vectors = []
        for id_chunk in self.chunks(user_ids, 30):
            chunk_embd = self.user_embeddings(id_chunk)
            user_vectors.extend(chunk_embd.cpu().data.numpy().tolist())

        ranking_by_user = {}
        for position, user_vector in enumerate(user_vectors):
            # Score against all types first, then attach the type names.
            scores = [self.dot_product(user_vector, type_vector)
                      for type_vector in type_vectors]
            named_scores = {self.id2name[offset + self.type_start]: score
                            for offset, score in enumerate(scores)}

            # Stable sort: equal scores keep their type-index order.
            ordered = sorted(named_scores.items(), key=lambda pair: pair[1], reverse=True)
            # Position i of the embedding list corresponds to the i-th user id.
            ranking_by_user[self.id2name[user_ids[position]]] = dict(ordered)

        return ranking_by_user



In [None]:
# Train the embedder with early stopping, keep the lowest-loss weights,
# then evaluate them and export embeddings and predicted user types.

MAX_EPOCHS = 100       # upper bound on training epochs
PATIENCE = 10          # stop after this many consecutive epochs without a new best loss
LEARNING_RATE = 0.001  # Adam step size

NID=NodeInputData(batch_size=16)
print ("Creating batches...")
NID.get_nodes(NID.graph)
print ("Batches Created!")

# Detect the device once and pass the result to every model instead of hard-coding True.
cuda_available = torch.cuda.is_available()

print ("Initializing the model...")

model = Embedder(cuda_available=cuda_available)
# best_model receives a copy of model's weights whenever an epoch sets a new lowest loss.
best_model = Embedder(cuda_available=cuda_available)
# NOTE(review): model2 is not used anywhere in this cell. It is kept so kernel state
# (and any RNG consumed by its construction) stays as before -- confirm it can be dropped.
model2 = Embedder(cuda_available=cuda_available)

print ("Model Initiated!")
optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=LEARNING_RATE)

print ("Total Batches: %d"%len(NID.batches))

min_loss=10000000
no_up=0
model.train()
for epoch in range(0, MAX_EPOCHS):
    start=time.time()
    # Despite the name this is the SUM of the batch losses of the epoch.
    average_loss=0.0
    for batch in NID.batches:
        optimizer.zero_grad()
        l,loss=model(batch)
        # Skip batches whose total loss is zero: there is nothing to backpropagate.
        if l==0:
            continue

        l.backward()
        optimizer.step()
        # .item() yields a plain Python float, so the running total (and min_loss)
        # never become tensors.
        average_loss+=l.item()

    end=time.time()
    print ("Total Time for epoch: %d: %lf"%(epoch, float(end-start)))
    print ("Loss at epoch %d = %f"%(epoch, average_loss))

    if average_loss<min_loss:
        min_loss=average_loss
        best_model.load_state_dict(model.state_dict())
        no_up=0
    else:
        no_up+=1
    if no_up>=PATIENCE:
        break

torch.save(best_model.state_dict(), "data/graph_embd_yoga/output/des_net/b_des_net_model.m")
NID.save_embeddings(best_model)

# Switch to evaluation mode once; it stays in effect for all prediction calls below.
best_model.eval()
best_model.predict_utype()

user2utype=best_model.predict_utype_all_users()
des2utype=best_model.predict_utype_all_des()

with open('data/graph_embd_yoga/output/des_net/predicted_utypes_des_net.pickle', 'wb') as out_file:
    pickle.dump([user2utype, des2utype], out_file)



In [None]:
### Load only best model (saved)
# Rebuilds the data and model, restores the saved checkpoint, then re-runs
# the evaluation and export steps without training.
NID=NodeInputData(batch_size=16)
print ("Creating batches...")
NID.get_nodes(NID.graph)
print ("Batches Created!")

print ("Initializing the model...")
# Use the detected device flag instead of hard-coding True.
cuda_available = torch.cuda.is_available()
best_model = Embedder(cuda_available=cuda_available)
# map_location="cpu" makes the checkpoint loadable regardless of which GPU index it was
# saved from; load_state_dict then copies the values into the model's own parameters.
best_model.load_state_dict(
    torch.load("data/graph_embd_yoga/output/des_net/b_des_net_model.m", map_location="cpu"))

NID.save_embeddings(best_model)

# Switch to evaluation mode once; it stays in effect for all prediction calls below.
best_model.eval()
best_model.predict_utype()

user2utype=best_model.predict_utype_all_users()
des2utype=best_model.predict_utype_all_des()

with open('data/graph_embd_yoga/output/des_net/predicted_utypes_des_net.pickle', 'wb') as out_file:
    pickle.dump([user2utype, des2utype], out_file)


# des + net: accuracy: 0.7811704834605598, macro_f1 0.7533442802408319