In [1]:
from pymongo import MongoClient
import pymongo
import math
import pandas as pd
from collections import Counter
from datetime import datetime
from tqdm.notebook import tqdm
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
import os
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib.dates as mdates
import pickle
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Dot, Reshape, Add, Lambda, Concatenate
from tensorflow.keras.models import Model
import functools
import tensorflow.keras.backend as K
from scipy.sparse import dok_matrix
import random


# Seed every random source used in this notebook with the same value.
seed_value = 42
# NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so setting it
# here is only visible to child processes, not to the already-running kernel.
os.environ['PYTHONHASHSEED']=str(seed_value)
random.seed(seed_value)
np.random.seed(seed_value)
tf.random.set_seed(seed_value)
# TF1-compat seeding call, issued in addition to tf.random.set_seed above.
tf.compat.v1.set_random_seed(seed_value)

# Data Sequence Generators

In [2]:
from transformers import BertTokenizer, TFBertModel
from tensorflow.keras.models import Model
from tensorflow.keras.utils import Sequence
from spektral.utils import normalized_adjacency
import tensorflow as tf
import random
import gridfs
import functools


def get_target(users, graph, cos, user_id):
    """Collect weighted links between ``users`` and turn similarities into distances.

    Parameters
    ----------
    users : iterable of node keys; only links whose both ends are in it are kept.
    graph : object exposing ``neighbors(u)`` and ``edges[(u, d)]['weight']``.
    cos : 2-D similarity matrix indexed as ``cos[i, j]`` (assumed to be a dense
        array, since a scalar is subtracted from it — TODO confirm).
    user_id : mapping from node key to row/column index in ``cos``.

    Returns
    -------
    indices : array of the kept ``(u, d)`` pairs, in discovery order.
    v_true : array with the ``'weight'`` of each kept pair.
    dist : per-pair distance, clipped to ``[0.1, 0.9]``.
    full_dist : the same transform applied to the whole ``cos`` matrix, clipped
        to ``[0.1, 0.9]``.
    """
    members = set(users)
    # A dict keeps one entry per directed pair and preserves discovery order.
    link_weight = {
        (src, dst): graph.edges[(src, dst)]['weight']
        for src in users
        for dst in graph.neighbors(src)
        if dst in members
    }

    pairs = list(link_weight.keys())
    indices = np.asarray(pairs)
    v_true = np.asarray(list(link_weight.values()))
    sim = np.asarray([cos[user_id[a], user_id[b]] for a, b in pairs])

    # Map the band [mean - 2*std, mean + 2*std] of the observed similarities
    # linearly onto [1, 0], i.e. high similarity -> small distance.
    mean = np.mean(sim)
    std = np.std(sim)
    lower = mean - 2 * std
    span = 4 * std

    def to_distance(values):
        return np.clip(1 - (values - lower) / span, 0.1, 0.9)

    return indices, v_true, to_distance(sim), to_distance(cos)

    
class TwitterDataset(Sequence):
    """Keras ``Sequence`` yielding paired ego-graph batches for link-weight training.

    Every batch is ``(inputs, y)`` where ``inputs`` is a list of six arrays:
    ``[ids, adjacency, bert]`` for the source user followed by the same three
    for the target user. ``y`` has two columns.

    * Batch indices that are multiples of ``neg_sample_batch`` are built from
      the linked pairs returned by ``get_target``: ``y[:, 0]`` is the edge
      weight and ``y[:, 1]`` the clipped distance of the pair.
    * All other indices are random pairs of central users: ``y[:, 0]`` is 0.5
      and ``y[:, 1]`` is ``1 - distance`` between the two users.
    """

    def __init__(self, user_id, users,
                 replies, mentions, retweets, full_graph,
                 cos, max_tweets, batch_size, date_limit, db,
                 neg_sample=1):
        """Store the graphs, build the training pairs and load the tweet cache.

        Parameters
        ----------
        user_id : dict mapping a user key to its integer model index.
        users : users whose mutual links become the positive pairs.
        replies, mentions, retweets, full_graph : graph objects; this class
            only calls ``neighbors`` on ``replies`` and reads ``nodes`` /
            edges of ``full_graph``.
        cos : similarity matrix forwarded to ``get_target``.
        max_tweets : number of tweet vectors kept per user.
        batch_size : number of pairs per batch.
        date_limit : if not ``None``, only tweets with ``created`` <= this
            value are read.
        db : database handle passed to ``gridfs.GridFS``.
        neg_sample : number of random batches emitted after each positive one.
        """
        self.users_id = user_id
        # Inverse lookup: model index -> user key.
        self.id_users = [0] * len(self.users_id)
        for key, index in user_id.items():
            self.id_users[index] = key
        self.graph_replies = replies
        self.graph_mentions = mentions
        self.graph_retweets = retweets
        self.graph_full = full_graph
        self.center_users = [u for u in self.graph_full.nodes if self.graph_full.nodes[u]['central']]
        self.center_users.sort()
        self.user_pairs, self.y_true, \
        self.y_dist, self.distance = get_target(users, self.graph_full, cos, user_id)
        # Visiting order of the positive pairs; reshuffled after every epoch.
        self.idx_random = list(range(len(self.user_pairs)))
        random.shuffle(self.idx_random)
        self.max_tweets = max_tweets
        self.batch_size = batch_size
        # Representation of the empty string, used to pad users with too few tweets.
        bert_model = TFBertModel.from_pretrained("bert-base-uncased")
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.empty_tweet = bert_model(**tokenizer('', return_tensors='tf'))['pooler_output'].numpy()
        del bert_model
        del tokenizer
        self.date_limit = date_limit
        self.gridfs = gridfs.GridFS(db, collection='fsProcessedTweets')
        self._init_tweet_cache()
        self.neg_sample_batch = neg_sample + 1

    def _init_tweet_cache(self):
        """Load (or build and save) the per-user tweet vectors and average them.

        The file ``training_tweets.npy`` stores one ``(max_tweets, 768)`` block
        per user. ``self.tweets`` always ends up as the per-user mean over the
        tweet axis, whether the file already existed or was just created, so
        ``_get_tweets_bert`` returns a single 768-vector in both cases.
        """
        cache_path = 'training_tweets.npy'
        if os.path.exists(cache_path):
            per_tweet = np.load(cache_path)
        else:
            per_tweet = np.zeros((len(self.id_users), self.max_tweets, 768), dtype=np.float32)
            for i, t in tqdm(enumerate(self.id_users), total=len(self.id_users)):
                per_tweet[i, ...] = self._get_tweets_bert_base(t)
            np.save(cache_path, per_tweet)
        # Average in both branches: returning early on a cold cache would leave
        # a 3-D array that cannot be written into the (n, 768) rows of
        # _get_instance.
        self.tweets = np.mean(per_tweet, axis=1)

    def __len__(self):
        """Number of batches per epoch: each positive batch plus its negatives."""
        return self.neg_sample_batch * math.ceil(len(self.idx_random) / self.batch_size)

    def _get_graph_for_node(self, node):
        """Build the 1-hop replies ego graph of ``node``.

        Returns
        -------
        embeddings : 1-D float array with the model index (``users_id``) of
            every node in the ego graph; position 0 is ``node`` itself.
        replies : adjacency with self-loops over those nodes, passed through
            spektral's ``normalized_adjacency``.
        """
        # Local position of every node in the ego graph; the centre is row 0.
        node_map = {node: 0}
        for neighbor in self.graph_replies.neighbors(node):
            if neighbor not in node_map:
                node_map[neighbor] = len(node_map)
        # Identity start = self-loops; then mark edges among the included nodes.
        replies = np.eye(len(node_map))
        for member, member_pos in node_map.items():
            for neighbor in self.graph_replies.neighbors(member):
                if neighbor in node_map:
                    replies[member_pos, node_map[neighbor]] = 1

        replies = normalized_adjacency(replies)
        # Translate user keys into the indices the embedding layer expects.
        embeddings = np.zeros((len(node_map)))
        for member, member_pos in node_map.items():
            embeddings[member_pos] = self.users_id[member]
        return embeddings, replies

    def _get_tweets_bert(self, node):
        """Return the cached tweet vector stored at model index ``node``."""
        return self.tweets[int(node), ...]

    def _get_tweets_bert_base(self, node):
        """Read up to ``max_tweets`` stored ``pooler_output`` vectors for a user.

        Files are queried from GridFS by ``userId`` (optionally bounded by
        ``date_limit``), newest first. Missing rows are filled with the
        empty-tweet vector. Returns a ``(max_tweets, 768)`` array.
        """
        query = {'userId': int(node)}
        if self.date_limit is not None:
            query['created'] = {'$lte': self.date_limit}
        cursor = (
            self.gridfs.
            find(query).
            sort([('created', pymongo.DESCENDING)]).
            limit(self.max_tweets)
        )
        result = np.empty((self.max_tweets, 768))
        i = 0
        for file in cursor:
            result[i, :] = np.load(file)['pooler_output']
            i += 1
        # Pad users that have fewer than max_tweets stored tweets.
        while i < self.max_tweets:
            result[i, :] = self.empty_tweet
            i += 1
        return result

    def _get_instance(self, node):
        """Return ``(ids, adjacency, bert)`` for the ego graph of ``node``."""
        embeddings, replies = self._get_graph_for_node(node)
        bert_emb = np.empty((embeddings.shape[0], 768))
        for i, model_index in enumerate(embeddings):
            bert_emb[i, ...] = self._get_tweets_bert(model_index)
        return embeddings, replies, bert_emb

    def _to_batch(self, instances, max_users):
        """Zero-pad a list of instances to ``max_users`` nodes and stack them.

        The leading dimension is ``len(instances)``, so the arrays always have
        exactly one row per instance, including a shorter final batch, and the
        method does not depend on any notebook-level variable.
        """
        rows = len(instances)
        user_i = np.zeros((rows, max_users))
        user_replies = np.zeros((rows, max_users, max_users))
        user_bert = np.zeros((rows, max_users, 768))
        for i, (embeddings, replies, bert_emb) in enumerate(instances):
            user_i[i, :embeddings.shape[0]] = embeddings
            user_replies[i, :replies.shape[0], :replies.shape[1]] = replies
            user_bert[i, :bert_emb.shape[0], ...] = bert_emb
        return [user_i, user_replies, user_bert]

    def gen_neg_sample(self):
        """Build one batch of randomly paired central users.

        ``random.sample`` draws without replacement inside each list, so it
        raises ``ValueError`` when there are fewer central users than
        ``batch_size``.
        """
        users = random.sample(self.center_users, self.batch_size)
        targets = random.sample(self.center_users, self.batch_size)
        user_instances = [self._get_instance(u) for u in users]
        target_instances = [self._get_instance(u) for u in targets]
        max_user = max([len(instance[0]) for instance in user_instances])
        max_target = max([len(instance[0]) for instance in target_instances])
        y = np.empty((self.batch_size, 2))
        y[:, 0] = 0.5
        for i, (u, t) in enumerate(zip(users, targets)):
            y[i, 1] = 1 - self.distance[self.users_id[u], self.users_id[t]]
        return self._to_batch(user_instances, max_user) + self._to_batch(target_instances, max_target), y

    def __getitem__(self, idx):
        """Return batch ``idx``: positive if ``idx`` is a multiple of
        ``neg_sample_batch``, otherwise a fresh random negative batch."""
        if (idx % self.neg_sample_batch) != 0:
            return self.gen_neg_sample()
        idx = idx // self.neg_sample_batch
        ids = self.idx_random[idx * self.batch_size: (idx + 1) * self.batch_size]
        user_instances = [self._get_instance(self.user_pairs[pair][0]) for pair in ids]
        target_instances = [self._get_instance(self.user_pairs[pair][1]) for pair in ids]
        max_user = max([len(instance[0]) for instance in user_instances])
        max_target = max([len(instance[0]) for instance in target_instances])
        current_batch_size = len(ids)
        y = np.empty((current_batch_size, 2))
        y[:, 0] = self.y_true[ids]
        y[:, 1] = self.y_dist[ids]
        return self._to_batch(user_instances, max_user) + self._to_batch(target_instances, max_target), y

    def on_epoch_end(self):
        """Reshuffle the order in which positive pairs are visited."""
        random.shuffle(self.idx_random)

In [3]:
max_tweets = 15
batch_size = 20
# NOTE(review): pickle.load can execute arbitrary code stored in the file; only
# load 'train_ds.pickle' when it was produced by a trusted run of this project.
with open('train_ds.pickle', 'rb') as f:
    dataset = pickle.load(f)
user_id = dataset.users_id

# Use the notebook-level constant rather than repeating the literal, so the
# dataset and the rest of the notebook cannot drift apart.
dataset.batch_size = batch_size

In [4]:
# Show the target array (second element) of batch 1.
# NOTE(review): column 0 is the constant 0.5 that TwitterDataset.gen_neg_sample
# writes, so index 1 is presumably a negative-sample batch — confirm against
# dataset.neg_sample_batch.
dataset[1][1]

array([[0.5       , 0.10000002],
       [0.5       , 0.10000002],
       [0.5       , 0.10000002],
       [0.5       , 0.10000002],
       [0.5       , 0.42597759],
       [0.5       , 0.10000002],
       [0.5       , 0.10000002],
       [0.5       , 0.6734482 ],
       [0.5       , 0.10000002],
       [0.5       , 0.10000002],
       [0.5       , 0.10000002],
       [0.5       , 0.10000002],
       [0.5       , 0.10000002],
       [0.5       , 0.10000002],
       [0.5       , 0.10000002],
       [0.5       , 0.10000002],
       [0.5       , 0.10000002],
       [0.5       , 0.74929076],
       [0.5       , 0.47525048],
       [0.5       , 0.10000002]])

In [5]:
# Print the shape of every input array in batch 0 (source triple, then target triple).
for batch_input in dataset[0][0]:
    print(batch_input.shape)

(20, 62)
(20, 62, 62)
(20, 62, 768)
(20, 28)
(20, 28, 28)
(20, 28, 768)


# Neural Network

In [6]:
from transformers import BertTokenizer, TFBertModel, BertConfig
from tensorflow.keras.layers import LSTM, Bidirectional, Input, Embedding, Concatenate, \
                TimeDistributed, Lambda, Dot, Attention, GlobalMaxPool1D, Dense
from tensorflow.keras.models import Model
from spektral.layers.convolutional import GCNConv
import tensorflow as tf


def loss(y_true, y_pred):
    """Distance-weighted squared error against the log2-scaled link weight.

    Parameters
    ----------
    y_true : tensor of shape (batch, 2); column 0 is the link weight and
        column 1 the per-sample weight (distance) applied to the error.
    y_pred : model output, one value per sample, shaped (batch, 1) or (batch,).

    Returns
    -------
    Scalar mean of ``dist * (y_pred - log2(2 * v_true)) ** 2``.
    """
    v_true, dist = y_true[:, 0], y_true[:, 1]
    # Flatten to (batch,) so the subtraction is element-wise. A (batch, 1)
    # prediction against a (batch,) target would broadcast to (batch, batch)
    # and compare every prediction with every target.
    y_pred = K.flatten(y_pred)
    target = K.log(2 * v_true) / K.log(2.0)
    return K.mean(dist * K.square(y_pred - target))

In [7]:
# Hyper-parameters: size of the user-id embedding (also of the projected BERT
# features), number of GCN/Dense units, and number of stacked GCN layers.
emb_size = 64
kernels = 32
deep = 1

# One embedding table, shared by the source-user and target-user branches.
embedded = Embedding(len(user_id), emb_size, name='user_embeddings')

# Model indices of the nodes in each ego graph (variable node count).
user_i = Input(shape=(None,), name='user_list', dtype=tf.int32)
emb_user = embedded(user_i)

target_i = Input(shape=(None,), name='target_list', dtype=tf.int32)
emb_target = embedded(target_i)

# Adjacency matrices of the replies ego graphs.
replies_user_i = Input(shape=(None, None), name='replies_user', dtype=tf.float32)

replies_target_i = Input(shape=(None, None), name='replies_target', dtype=tf.float32)

# One 768-dimensional tweet vector per node.
user_tweets_bert = Input(shape=(None, 768), name='user_tweets_bert')
target_tweets_bert = Input(shape=(None, 768), name='target_tweets_bert')

# Project the tweet vectors down to emb_size; each branch has its own Dense layer.
user_bert = Dense(emb_size, name='user_bert_dense')(user_tweets_bert)
target_bert = Dense(emb_size, name='target_bert_dense')(target_tweets_bert)

# Node features = id embedding concatenated with the projected tweet vector.
user_emb = Concatenate(name='user_emb_plus_bert', axis=-1)([emb_user, user_bert])
target_emb = Concatenate(name='target_emb_plus_bert', axis=-1)([emb_target, target_bert])

# Only the *_rep tensors are used below; the mentions/retweets aliases are
# assigned but never consumed in this cell.
emb_rep, emb_men, emb_rt = user_emb, user_emb, user_emb
emb_t_rep, emb_t_men, emb_t_rt = target_emb, target_emb, target_emb
# Stack `deep` graph convolutions over the replies adjacency of each branch.
for i in range(deep):
    emb_rep = GCNConv(kernels, name='gcn_replies_{}'.format(i))([emb_rep, replies_user_i])
    
    emb_t_rep = GCNConv(kernels, name='gcn_t_replies_{}'.format(i))([emb_t_rep, replies_target_i])
    
# Keep only row 0 of each graph, which is where TwitterDataset places the
# central node of the ego graph.
mat = emb_rep
mat = Lambda(lambda x: x[:, 0, :], name='user_row')(mat)

mat_t = emb_t_rep
mat_t = Lambda(lambda x: x[:, 0, :], name='target_row')(mat_t)
# Wide part: a single linear unit over the raw id embeddings of both central users.
user_wide = Lambda(lambda x: x[:, 0, :], name='user_wide')(emb_user) 
target_wide = Lambda(lambda x: x[:, 0, :], name='target_wide')(emb_target) 
wide = Concatenate(name='reps_concat')([user_wide, target_wide])
wide = Dense(1)(wide)
# Original note: "still need to join with BERT".
# NOTE(review): the tweet features already enter through the *_emb_plus_bert
# concatenations above, so this note may be stale — confirm.
# Deep part: graph representations of both central users -> Dense -> scalar.
mat = Concatenate(name='graph_reps_concat')([mat, mat_t])
mat = Dense(kernels)(mat)  # no activation is passed to this layer
mat = Dense(1)(mat)
# Final prediction = deep output + wide output.
mat = mat + wide
# Input order matches the six arrays produced by TwitterDataset batches.
model = Model([user_i, replies_user_i, user_tweets_bert,
              target_i, replies_target_i, target_tweets_bert], mat)

model.summary()

model.compile(loss=loss, optimizer='adam')

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'
Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user_list (InputLayer)          [(None, None)]       0                                            
__________________________________________________________________________________________________
user_tweets_bert (InputLayer)   [(None, None, 768)]  0                                            
_______________________________________________________________________

In [8]:
from tensorflow.keras.callbacks import ModelCheckpoint, LearningRateScheduler
import os
# Output directory for the model files written below. exist_ok removes the
# check-then-create race and keeps the cell safe to re-run.
os.makedirs('connected-neg-replies', exist_ok=True)

def scheduler(epoch, lr):
    """Learning-rate schedule for ``LearningRateScheduler``.

    Keeps ``lr`` unchanged while ``epoch <= 2`` and multiplies it by
    ``exp(-0.1)`` on every later epoch.

    Parameters
    ----------
    epoch : zero-based epoch index supplied by the callback.
    lr : current learning rate.

    Returns
    -------
    The learning rate to use for this epoch; a plain Python ``float`` once the
    decay applies, which every Keras version accepts (a tensor is not).
    """
    if epoch <= 2:
        return lr
    return float(lr * math.exp(-0.1))


# Write one file per epoch (save_best_only=False), named after the epoch
# number and the training loss.
checkpoint_cb = ModelCheckpoint(
    filepath='connected-neg-replies/{epoch:02d}-weights-neg-{loss:.5f}.hdf5',
    monitor='loss',
    save_best_only=False,
)
# Apply the `scheduler` function defined above at the start of each epoch.
lr_cb = LearningRateScheduler(scheduler)

model.fit(dataset, epochs=4, callbacks=[checkpoint_cb, lr_cb], workers=1)

Epoch 1/4
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'
Epoch 2/4
Epoch 3/4
Epoch 4/4


<tensorflow.python.keras.callbacks.History at 0x2373f3d36a0>

In [9]:
# Save the final weights (weights only) into the same directory as the
# per-epoch checkpoints written during training.
model.save_weights('connected-neg-replies/model_rec-neg.h5')