In [1]:
from pymongo import MongoClient
import pymongo
import pandas as pd
from collections import Counter
from datetime import datetime
from tqdm.notebook import tqdm
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
import os
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib.dates as mdates
import pickle

In [2]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Dot, Reshape, Add, Lambda, Concatenate
from tensorflow.keras.models import Model
import functools
import tensorflow.keras.backend as K
from scipy.sparse import dok_matrix
import random


# Seed every random number generator used in this notebook with one value.
seed_value = 42
# NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts, so
# setting it here presumably only affects child processes -- confirm.
os.environ['PYTHONHASHSEED'] = str(seed_value)
# Same call order as before: stdlib random, NumPy, TF2 API, TF1-compat API.
for _seed_fn in (random.seed,
                 np.random.seed,
                 tf.random.set_seed,
                 tf.compat.v1.set_random_seed):
    _seed_fn(seed_value)

# Data Sequence Generators

In [3]:
from transformers import BertTokenizer, TFBertModel
from tensorflow.keras.models import Model
from spektral.utils import normalized_adjacency
import tensorflow as tf
from tensorflow.keras.utils import Sequence
import random
import gridfs
import math
import functools

    
class TwitterDataset(Sequence):
    """Sequence of (user, target-batch) ego-graph inputs.

    Item ``idx`` pairs one "valid" user (a user that has at least one
    neighbour in ``graph_test``) with one slice of up to ``batch_size`` target
    users. Each side contributes four arrays: the ego graph's node-index list
    and the first row of the normalized replies / mentions / retweets
    adjacency matrices. An item is therefore a list of eight arrays: the four
    user arrays (repeated to the size of the target slice) followed by the
    four target arrays.
    """

    def __init__(self, user_id, users,
                 replies, mentions, retweets, full_graph, graph_test,
                 max_tweets, batch_size, date_limit, db, filename='user_tweets.np'):
        """Store the graphs, select the valid users and load the tweet cache.

        Args:
            user_id: dict mapping an original user id to its integer index.
            users: not used by this class.
            replies, mentions, retweets: graphs exposing ``neighbors(node)``.
            full_graph: stored as ``graph_full``; not read by this class.
            graph_test: graph used only to decide which users are "valid".
            max_tweets: maximum number of tweets fetched per user from GridFS.
            batch_size: number of target users per item.
            date_limit: optional upper bound for the ``created`` field of the
                GridFS query; ``None`` disables the filter.
            db: database handle passed to ``gridfs.GridFS``.
            filename: stored as ``self.filename``.
                NOTE(review): the tweet cache always uses
                'training_tweets.npy', not this value -- confirm intent.
        """
        self.users_id = user_id
        # Inverse mapping: integer index -> original user id.
        self.id_users = [None] * len(self.users_id)
        for u_id, idx in self.users_id.items():
            self.id_users[idx] = u_id
        self.graph_replies = replies
        self.graph_mentions = mentions
        self.graph_retweets = retweets
        self.graph_full = full_graph
        self.max_tweets = max_tweets
        self.batch_size = batch_size
        self.target_users = sorted(user_id.keys())
        # target_users is sorted, so valid_users comes out sorted as well.
        self.valid_users = [
            u for u in self.target_users
            if u in graph_test.nodes and any(True for _ in graph_test.neighbors(u))
        ]
        # Padding row for users with fewer than max_tweets tweets.
        # NOTE(review): with None the padded rows of _get_tweets_bert_db end up
        # as NaN (NumPy stores None as NaN in float arrays) -- confirm intent.
        self.empty_tweet = None
        self.date_limit = date_limit
        self.gridfs = gridfs.GridFS(db, collection='fsProcessedTweets')
        self.filename = filename
        self._init_tweet_cache()
        # Index of the target slice currently held in current_target_data.
        self.current_target = -1
        self.batch_per_pass = math.ceil(len(self.target_users) / self.batch_size)

    def create_data(self):
        """Precompute the instance of every valid user and enable the cached path.

        After this call ``__getitem__`` dispatches to
        ``internal_get_item_cache`` instead of ``internal_get_item``.
        """
        print('Preprocessing batchs')
        self.user_data = [self._get_instance(user) for user in tqdm(self.valid_users)]
        self.internal_get_item = self.internal_get_item_cache

    def _init_tweet_cache(self):
        """Load (or build and save) the per-user tweet matrix ``self.tweets``.

        The cache lives in 'training_tweets.npy' in the working directory.
        When built here it has shape ``(n_users, 768)``. A cached file that is
        not 2-D is averaged over axis 1; a 2-D file is used as is, so the
        build path and the reload path yield the same shape.
        """
        cache_path = 'training_tweets.npy'
        if not os.path.exists(cache_path):
            self.tweets = np.zeros((len(self.id_users), 768), dtype=np.float32)
            # NOTE(review): _get_tweets_bert_base is not defined in this class;
            # this branch raises AttributeError unless it is provided elsewhere.
            for i, t in tqdm(enumerate(self.id_users), total=len(self.id_users)):
                self.tweets[i, ...] = self._get_tweets_bert_base(t)
            np.save(cache_path, self.tweets)
            return
        cached = np.load(cache_path)
        if cached.ndim != 2:
            # e.g. (n_users, n_tweets, dim): collapse the per-tweet axis.
            cached = np.mean(cached, axis=1)
        self.tweets = cached

    def __len__(self):
        """Number of items: every valid user against every target slice."""
        return len(self.valid_users) * self.batch_per_pass

    def _get_graph_for_node(self, node):
        """Build the 1-hop ego graph of ``node`` for the three relations.

        Returns:
            ``(embeddings, replies, mentions, retweets)``: ``embeddings`` holds
            the integer index (from ``users_id``) of every node in the ego
            graph, with ``node`` itself at position 0; the other three are the
            normalized adjacency matrices (self-loops included) over that
            node ordering.
        """
        relation_graphs = (self.graph_replies, self.graph_mentions, self.graph_retweets)
        # Local numbering: ego node first, then neighbours in discovery order.
        node_map = {node: 0}
        for graph in relation_graphs:
            for neighbor in graph.neighbors(node):
                node_map.setdefault(neighbor, len(node_map))
        size = len(node_map)
        adjacencies = []
        for graph in relation_graphs:
            # Identity start = self-loops; edges leaving the ego graph are dropped.
            adjacency = np.eye(size)
            for member, row in node_map.items():
                for neighbor in graph.neighbors(member):
                    col = node_map.get(neighbor)
                    if col is not None:
                        adjacency[row, col] = 1
            adjacencies.append(normalized_adjacency(adjacency))
        replies, mentions, retweets = adjacencies
        # Translate local positions into the indices used by the network.
        embeddings = np.zeros((size))
        for member, position in node_map.items():
            embeddings[position] = self.users_id[member]
        return embeddings, replies, mentions, retweets

    def _get_tweets_bert(self, node):
        """Return the cached tweet row at index ``int(node)``."""
        idx = int(node)
        return self.tweets[idx, ...]

    def _get_tweets_bert_db(self, node):
        """Fetch up to ``max_tweets`` stored 'pooler_output' vectors for a user.

        Files are queried from GridFS by ``userId`` (and ``created`` <=
        ``date_limit`` when set), newest first. Rows without a tweet are
        filled with ``self.empty_tweet``.

        Returns:
            Array of shape ``(max_tweets, 768)``.
        """
        query = {'userId': int(node)}
        if self.date_limit is not None:
            query['created'] = {'$lte': self.date_limit}
        cursor = (
            self.gridfs
            .find(query)
            .sort([('created', pymongo.DESCENDING)])
            .limit(self.max_tweets)
        )
        result = np.empty((self.max_tweets, 768))
        found = 0
        for file in cursor:
            result[found, :] = np.load(file)['pooler_output']
            found += 1
        for row in range(found, self.max_tweets):
            result[row, :] = self.empty_tweet
        return result

    def _get_instance(self, node):
        """Return the node list and the first adjacency row of each relation."""
        embeddings, replies, mentions, retweets = self._get_graph_for_node(node)
        return embeddings, replies[:1, :], mentions[:1, :], retweets[:1, :]

    def _to_batch(self, instances, max_users, batch_size):
        """Zero-pad ``instances`` to ``max_users`` nodes and stack them.

        Returns:
            ``[user_i, user_replies, user_mentions, user_retweet]`` with shapes
            ``(batch_size, max_users)`` and three times
            ``(batch_size, 1, max_users)``.
        """
        user_i = np.zeros((batch_size, max_users))
        user_replies, user_mentions, user_retweet = (
            np.zeros((batch_size, 1, max_users)) for _ in range(3)
        )
        for row, (embeddings, replies, mentions, retweets) in enumerate(instances):
            user_i[row, :embeddings.shape[0]] = embeddings
            user_replies[row, :replies.shape[0], :replies.shape[1]] = replies
            user_mentions[row, :mentions.shape[0], :mentions.shape[1]] = mentions
            user_retweet[row, :retweets.shape[0], :retweets.shape[1]] = retweets
        return [user_i, user_replies, user_mentions, user_retweet]

    def _to_batch_single(self, instance, repeat):
        """Add a leading batch axis to the four arrays and repeat them ``repeat`` times."""
        return [
            np.repeat(np.expand_dims(part, axis=0), repeat, axis=0)
            for part in instance[:4]
        ]

    def _target_slice(self, current_target):
        """Return the target users belonging to slice number ``current_target``."""
        start = current_target * self.batch_size
        return self.target_users[start:start + self.batch_size]

    def _get_target_batch(self, current_target):
        """Return the batched target arrays for a slice, rebuilding only on change."""
        if current_target != self.current_target:
            instances = [self._get_instance(target)
                         for target in self._target_slice(current_target)]
            max_user = max(len(instance[0]) for instance in instances)
            self.current_target_data = self._to_batch(instances, max_user, len(instances))
            self.current_target = current_target
        return self.current_target_data

    def internal_get_item_cache(self, idx):
        """Item ``idx`` using the user instances precomputed by ``create_data``."""
        current_target, current_user = divmod(idx, len(self.valid_users))
        target_batch = self._get_target_batch(current_target)
        # Grow the single user instance to the size of the target batch.
        user_data = self._to_batch_single(self.user_data[current_user],
                                          target_batch[0].shape[0])
        return user_data + target_batch

    def internal_get_item(self, idx):
        """Item ``idx``, computing the user instance on the fly."""
        current_target, user_pos = divmod(idx, len(self.valid_users))
        target_batch = self._get_target_batch(current_target)
        # Grow the single user instance to the size of the target batch.
        user_data = self._to_batch_single(self._get_instance(self.valid_users[user_pos]),
                                          target_batch[0].shape[0])
        return user_data + target_batch

    def __getitem__(self, idx):
        """Return item ``idx`` as a list of eight arrays (user side, then target side)."""
        return self.internal_get_item(idx)

    def gen_users_pairs(self, idx):
        """Return the ``(user, target)`` id pairs covered by item ``idx``, in row order."""
        current_target, user_pos = divmod(idx, len(self.valid_users))
        current_user = self.valid_users[user_pos]
        return [(current_user, target) for target in self._target_slice(current_target)]
            

In [4]:
# Notebook-level settings.
# NOTE(review): neither value is passed to the unpickled dataset below; the
# recorded output of the next cell shows 500-row batches, so the dataset was
# presumably built with a different batch size -- confirm.
max_tweets, batch_size = 15, 50

# Restore the prebuilt test dataset. Unpickling needs the TwitterDataset class
# defined above, and pickle.load can execute arbitrary code, so only load a
# file from a trusted source.
with open('test_ds.pickle', 'rb') as f:
    dataset = pickle.load(f)

# Mapping original user id -> integer index; sizes the embedding table later on.
user_id = dataset.users_id

In [5]:
# Sanity check: print the shape of each array in the first item
# (user-side arrays first, then the target-side arrays).
first_item = dataset[0]
for part in first_item:
    print(part.shape)

(500, 25)
(500, 1, 25)
(500, 1, 25)
(500, 1, 25)
(500, 341)
(500, 1, 341)
(500, 1, 341)
(500, 1, 341)


# Neural Network

In [6]:
from transformers import BertTokenizer, TFBertModel, BertConfig
from tensorflow.keras.layers import LSTM, Bidirectional, Input, Embedding, Concatenate, \
                TimeDistributed, Lambda, Dot, Attention, GlobalMaxPool1D, Dense
from tensorflow.keras.models import Model
from spektral.layers.convolutional import GCNConv
import tensorflow as tf


def loss(y_true, y_pred):
    """Weighted squared error against a log2-scaled target.

    Column 0 of ``y_true`` is the raw target value and column 1 is a
    per-sample weight. The regression target is ``log2(2 * value)`` and the
    result is the mean of ``weight * (y_pred - target) ** 2``.

    NOTE(review): the previous comments described a "1 x values x 3" input
    that does not match the two columns read here. Also, if ``y_pred`` has
    shape (N, 1) while the sliced columns have shape (N,), the subtraction
    broadcasts to (N, N) -- confirm the shapes produced by the training
    generator.
    """
    v_true = y_true[:, 0]
    dist = y_true[:, 1]
    # log(2 * v) / log(2) == log2(2 * v)
    target = K.log(2 * v_true) / K.log(2.0)
    weighted_sq_error = dist * K.square(y_pred - target)
    return K.mean(weighted_sq_error)

In [7]:
# Hyperparameters of the network.
emb_size = 64   # width of each user embedding vector
kernels = 32    # output channels of every GCNConv and width of the hidden Dense layer
deep = 1        # number of stacked GCNConv layers per relation

# One embedding table, shared by the user side and the target side.
embedded = Embedding(len(user_id), emb_size, name='user_embeddings')

# Node-index lists of the ego graphs (variable length per batch).
user_i = Input(shape=(None,), name='user_list', dtype=tf.int32)
emb_user = embedded(user_i)

target_i = Input(shape=(None,), name='target_list', dtype=tf.int32)
emb_target = embedded(target_i)

# Adjacency inputs, one per relation and side. The dataset supplies only the
# first adjacency row (shape (batch, 1, n_nodes) in the recorded output above).
replies_user_i = Input(shape=(None, None), name='replies_user', dtype=tf.float32)
mentions_user_i = Input(shape=(None, None), name='mentions_user', dtype=tf.float32)
retweets_user_i = Input(shape=(None, None), name='retweets_user', dtype=tf.float32)

replies_target_i = Input(shape=(None, None), name='replies_target', dtype=tf.float32)
mentions_target_i = Input(shape=(None, None), name='mentions_target', dtype=tf.float32)
retweets_target_i = Input(shape=(None, None), name='retweets_target', dtype=tf.float32)

user_emb = emb_user
target_emb = emb_target

# Every relation starts from the same node embeddings and gets its own GCN stack;
# the user side and the target side do not share GCN weights (distinct layers).
emb_rep, emb_men, emb_rt = user_emb, user_emb, user_emb
emb_t_rep, emb_t_men, emb_t_rt = target_emb, target_emb, target_emb
for i in range(deep):
    emb_rep = GCNConv(kernels, name='gcn_replies_{}'.format(i))([emb_rep, replies_user_i])
    emb_men = GCNConv(kernels, name='gcn_mentions_{}'.format(i))([emb_men, mentions_user_i])
    emb_rt = GCNConv(kernels, name='gcn_retweets_{}'.format(i))([emb_rt, retweets_user_i])
    
    emb_t_rep = GCNConv(kernels, name='gcn_t_replies_{}'.format(i))([emb_t_rep, replies_target_i])
    emb_t_men = GCNConv(kernels, name='gcn_t_mentions_{}'.format(i))([emb_t_men, mentions_target_i])
    emb_t_rt = GCNConv(kernels, name='gcn_t_retweets_{}'.format(i))([emb_t_rt, retweets_target_i])
    
# Concatenate the three relation outputs and keep row 0 of the node axis.
mat = Concatenate(name='user_gnc')([emb_rep, emb_men, emb_rt])
mat = Lambda(lambda x: x[:, 0, :], name='user_row')(mat)

mat_t = Concatenate(name='target_gnc')([emb_t_rep, emb_t_men, emb_t_rt])
mat_t = Lambda(lambda x: x[:, 0, :], name='target_row')(mat_t)
# Wide part: raw embeddings at position 0 of each node list (the dataset puts
# the ego node there), concatenated and mapped to a single score.
user_wide = Lambda(lambda x: x[:, 0, :], name='user_wide')(emb_user) 
target_wide = Lambda(lambda x: x[:, 0, :], name='target_wide')(emb_target) 
wide = Concatenate(name='reps_concat')([user_wide, target_wide])
wide = Dense(1)(wide)
# TODO: the BERT tweet representation still has to be merged in.
# Deep part: both graph representations -> Dense(kernels) -> Dense(1); no
# activation argument is given to these Dense layers.
mat = Concatenate(name='graph_reps_concat')([mat, mat_t])
mat = Dense(kernels)(mat)#, [0, 2, 1]
mat = Dense(1)(mat)
# Final prediction = deep score + wide score.
mat = mat + wide
# Input order matches the eight arrays yielded by the dataset (user side first).
model = Model([user_i, replies_user_i, mentions_user_i, retweets_user_i,
              target_i, replies_target_i, mentions_target_i, retweets_target_i], mat)

model.summary()

# NOTE(review): keep the layer construction order above unchanged; weights are
# loaded from an .h5 file in a later cell and auto-generated layer names depend on it.
model.compile(loss=loss, optimizer='adam')

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user_list (InputLayer)          [(None, None)]       0                                            
__________________________________________________________________________________________________
target_list (InputLayer)        [(None, None)]       0                                            
__________________________________________________________________________________________________
user_embeddings (Embedding)     (None, None, 64)     412800      user_list[0][0]                  
                                                                 target_list[0][0]                
__________________________________________________________________________________________________
replies_user (InputLayer)       [(None, None, None)] 0                                        

In [8]:
# Restore the weights saved in the HDF5 file below into the model built above.
# NOTE(review): presumably this requires the same architecture that produced
# the file -- confirm against the training notebook.
model.load_weights('connected-neg-no-bert/model_rec-neg.h5')

In [9]:
import csv

class OffsetLimitedDs(Sequence):
    """Window over another sequence: ``limit`` items starting at ``offset``.

    Item ``i`` of the window is item ``offset + i`` of ``ds``. The window is
    truncated at the end of ``ds`` and is empty when ``offset`` lies beyond it.
    """

    def __init__(self, ds, offset, limit):
        """
        Args:
            ds: underlying sequence (must support ``len`` and integer indexing).
            offset: index in ``ds`` of the first item of the window.
            limit: maximum number of items exposed by the window.
        """
        self.ds = ds
        self.offset = offset
        self.limit = limit
        # Clamp at zero: a negative value would make len() raise ValueError.
        self.len = max(0, min(self.limit, len(self.ds) - self.offset))

    def __len__(self):
        """Number of items in the window."""
        return self.len

    def __getitem__(self, idx):
        """Return item ``idx`` of the window.

        Raises:
            IndexError: if ``idx`` is outside ``[0, len(self))``; the window
                never exposes items of ``ds`` that lie outside it.
        """
        if not 0 <= idx < self.len:
            raise IndexError('index {} out of range for window of length {}'.format(idx, self.len))
        return self.ds[idx + self.offset]
    

# Number of dataset items scored per model.predict call.
partial = 100
with open('connected-neg-no-bert/predictions-neg.csv', 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(['Origin', 'Destiny', 'Prediction'])
    for offset in tqdm(range(0, len(dataset), partial)):
        # Score one window of the dataset at a time.
        c_ds = OffsetLimitedDs(dataset, offset, partial)
        pred = model.predict(c_ds, max_queue_size=10)
        # Flatten the (origin, destiny) pairs of every item in the window; they
        # are consumed in the same order as the rows of `pred`.
        chunk_pairs = [
            pair
            for item in range(len(c_ds))
            for pair in dataset.gen_users_pairs(offset + item)
        ]
        for j, (o, d) in enumerate(chunk_pairs):
            csvwriter.writerow([o, d, pred[j, 0]])

print(len(dataset))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=475.0), HTML(value='')))


47476
