In [1]:
# Higgs
# edge_filepath = "/remote-home/share/dmb_nas/wangzejian/TR_TNSE/higgs_twitter/higgs-social_network.edgelist"
# actionlog_filepath = "/remote-home/share/dmb_nas/wangzejian/TR_TNSE/higgs_twitter/higgs_gen-actionlog-300.txt"

# Munmun
# edge_filepath = "/remote-home/share/dmb_nas/wangzejian/TR_TNSE/munmun_twitter_social/munmun_twitter_social.edges"
# actionlog_filepath = "/remote-home/share/dmb_nas/wangzejian/TR_TNSE/munmun_twitter_social/munmun_gen-actionlog-300.txt"

# Virality2013 (the dataset currently selected).
# edge_filepath is later passed to read_network() and actionlog_filepath to build_actionlog().
# NOTE(review): these are absolute paths on a shared mount — the notebook will not run elsewhere as-is.
edge_filepath = "/remote-home/share/dmb_nas/wangzejian/TR_TNSE/virality2013_twitter/follower_gcc.anony.txt"
actionlog_filepath = "/remote-home/share/dmb_nas/wangzejian/TR_TNSE/virality2013_twitter/timeline_tag.anony.txt"

In [2]:
import datetime
import logging
import pickle
from sklearn.model_selection import train_test_split
import random

# logging config
def Beijing_TimeZone_Converter(sec, what=None):
    """Convert an epoch timestamp to a ``time.struct_time`` in Beijing time (UTC+8).

    This function is installed as ``logging.Formatter.converter``. Because it is
    a plain Python function stored on the class, it is bound like a method, so
    the logging machinery calls it as ``converter(formatter, record.created)``:

    Args:
        sec: The ``Formatter`` instance when invoked through logging. If the
            function is called directly with a single argument, this is taken
            to be the epoch timestamp instead.
        what: Epoch seconds of the log record (``record.created``).

    Returns:
        time.struct_time: Wall-clock time at a fixed UTC+8 offset.

    The previous implementation ignored the record timestamp and returned
    ``datetime.now() + 8h``, which is only correct when the host clock runs in
    UTC and which stamps records with the formatting time, not the time they
    were created.
    """
    # Support direct single-argument calls: converter(epoch_seconds).
    timestamp = sec if what is None else what
    beijing_tz = datetime.timezone(datetime.timedelta(hours=8))
    return datetime.datetime.fromtimestamp(timestamp, tz=beijing_tz).timetuple()

# Notebook-wide logger; the functions and cells below log through this global.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s') # include timestamp
# logging.Formatter.converter = time.gmtime
# Set on the Formatter class (not an instance), so %(asctime)s is rendered through
# Beijing_TimeZone_Converter for formatters that do not override `converter`.
logging.Formatter.converter = Beijing_TimeZone_Converter

def add_diffusion(diffusion_dict: dict, user, diffusion, timestamp):
    """Append one ``(user, timestamp)`` event to the list stored under ``diffusion``.

    The list is created on first use. ``diffusion_dict`` is modified in place
    and nothing is returned.
    """
    diffusion_dict.setdefault(diffusion, []).append((user, timestamp))

def build_actionlog(actionlog_filepath: str, sep=','):
    """Parse a cascade-per-line action log into per-diffusion event lists.

    Every non-empty line is expected to have the form
    ``hashtag<sep>timestamp<sep>user_id<sep>timestamp<sep>user_id...``,
    i.e. a hashtag followed by any number of (timestamp, user_id) pairs.

    Args:
        actionlog_filepath: Path of the text file to read.
        sep: Field delimiter. This parameter used to be ignored (the parser
            always split on ','); it is now honoured, so callers must pass
            the delimiter the file really uses.

    Returns:
        dict: hashtag -> list of ``(user_id, timestamp)`` tuples, with
        ``user_id`` kept as a string and ``timestamp`` converted to ``int``,
        sorted by ascending timestamp.

    Raises:
        ValueError: If a line ends with a timestamp that has no user id, or
            if a timestamp is not an integer.
    """
    diffusion_dict = {}
    with open(actionlog_filepath, "r") as f:
        for line_no, line in enumerate(f, start=1):
            # Strip only the line terminator. `line[:-1]` chopped a real
            # character when the final line had no trailing newline.
            line = line.rstrip("\r\n")
            if not line:
                continue
            hashtag, *fields = line.split(sep)
            if len(fields) % 2:
                raise ValueError(
                    f"{actionlog_filepath}:{line_no}: dangling field after the last "
                    f"(timestamp, user) pair: {fields[-1]!r}"
                )
            for timestamp, user_id in zip(fields[0::2], fields[1::2]):
                add_diffusion(diffusion_dict, user_id, hashtag, int(timestamp))
    # Alternative parser for logs with one record per line (kept for reference):
    # with open(actionlog_filepath, "r") as f:
    #     for line in f:
    #         user_id, hashtag, timestamp = line[:-1].split(sep)
    #         add_diffusion(diffusion_dict, user_id, hashtag, int(timestamp))
    logger.info(f"Load ActionLog from File {actionlog_filepath}, ActionLog is Composed of {len(diffusion_dict)} Diffusions")

    # Sort by Timestamp (timestamps are already ints; list.sort is stable like sorted()).
    for events in diffusion_dict.values():
        events.sort(key=lambda item: item[1])
    logger.info("Sort ActionLog by Timestamp")

    return diffusion_dict

def get_avail_users(diffusion_dict: dict):
    """Return the set of user ids that appear in at least one diffusion.

    Args:
        diffusion_dict: Mapping of diffusion -> list of ``(user, timestamp)``.

    Returns:
        set: All distinct users found in the event lists.
    """
    user_s = {item[0] for items in diffusion_dict.values() for item in items}
    logger.info(f"Get {len(user_s)} Available Users")
    return user_s

# The parser previously ignored `sep` and always split on ','. Passing ','
# explicitly keeps the behaviour that produced the recorded results now that
# `sep` is honoured.
diffusion_dict = build_actionlog(actionlog_filepath, sep=',')
user_s = get_avail_users(diffusion_dict)

# Keep only cascades with between 10 and 200 events (both bounds inclusive).
diffusion_dict = dict(
    filter(lambda kv: 10 <= len(kv[1]) <= 200, diffusion_dict.items())
)

# Split the cascade keys 80/20 with a fixed seed, then materialise one
# {diffusion: events} dict per split (train first, test second).
train_diffusion_dict, test_diffusion_dict = (
    {diffusion: diffusion_dict[diffusion] for diffusion in split_keys}
    for split_keys in train_test_split(
        list(diffusion_dict), test_size=0.2, random_state=42)
)
logger.info(f"Split Train Diffusion {len(train_diffusion_dict)} and Test Diffusion {len(test_diffusion_dict)}")

2024-04-05 21:52:22,510 Load ActionLog from File /remote-home/share/dmb_nas/wangzejian/TR_TNSE/virality2013_twitter/timeline_tag.anony.txt, ActionLog is Composed of 1345913 Diffusions
2024-04-05 21:52:25,611 Sort ActionLog by Timestamp
2024-04-05 21:52:29,815 Get 510795 Available Users
2024-04-05 21:52:30,908 Split Train Diffusion 61070 and Test Diffusion 15268


In [3]:
import networkx as nx
import random

def read_network(filename, sep=',', seed=None):
    """Load a directed graph from a delimited edge-list file.

    Each data line must contain at least two ``sep``-separated fields; the
    first two are used as ``source`` and ``target`` and any further fields are
    ignored. Blank lines and lines starting with ``%`` are skipped. Every node
    receives a ``threshold`` attribute drawn uniformly from [0, 1] the first
    time it is seen. Edges carry no attributes.

    Args:
        filename: Path of the edge-list file.
        sep: Field delimiter (default ',', the value that used to be hard-coded).
        seed: Optional seed for the node thresholds. With ``None`` (default)
            the thresholds come from the global ``random`` state exactly as
            before; pass an int to make them reproducible.

    Returns:
        networkx.DiGraph: Graph whose node ids are the raw strings from the file.

    Raises:
        ValueError: If a data line has fewer than two fields.
    """
    rng = random if seed is None else random.Random(seed)
    G = nx.DiGraph()
    with open(filename, 'r') as f:
        for line_no, line in enumerate(f, start=1):
            # Strip only the line terminator. `line[:-1]` truncated the target
            # id when the last line had no trailing newline.
            line = line.rstrip("\r\n")
            if not line or line.startswith('%'):
                continue
            fields = line.split(sep)
            if len(fields) < 2:
                raise ValueError(
                    f"{filename}:{line_no}: expected at least 2 fields separated by "
                    f"{sep!r}, got {line!r}"
                )
            from_, to_ = fields[:2]
            # Source first, then target, so thresholds are drawn in the same
            # order as before. The membership test matters: add_node() on an
            # existing node would overwrite its threshold.
            for node in (from_, to_):
                if node not in G:
                    G.add_node(node, threshold=rng.uniform(0, 1))
            G.add_edge(from_, to_)
    return G

# Build the directed graph from the configured edge list; the evaluation cells below read it via the global `g`.
g = read_network(edge_filepath)

In [15]:
from tqdm import tqdm
import time

# Statistics gathered from the training cascades:
#   a_u[u]              number of events by user u
#   a_u2v[(u, v)]       number of times an event by u precedes an event by v in a cascade
#   delta_t_u2v[(u, v)] SUM of the time gaps over those occurrences (not the mean)
#   credit_u2v[(u, v)]  SUM of 1/(position of v's event, 1-based) over those occurrences
# Plain dicts are used on purpose: later code relies on missing keys
# (`in` checks and KeyError) to detect unseen users/pairs.
a_u = {}
a_u2v = {}
delta_t_u2v = {}
credit_u2v = {}

start_time = time.time()

for key, items in tqdm(train_diffusion_dict.items()):
    for i, (user, ts) in enumerate(items):
        a_u[user] = a_u.get(user, 0) + 1

        # Credit handed to every earlier event in this cascade.
        share = 1/(i+1)
        for last_user, last_ts in items[:i]:
            pair = (last_user, user)
            a_u2v[pair] = a_u2v.get(pair, 0) + 1
            delta_t_u2v[pair] = delta_t_u2v.get(pair, 0) + (ts - last_ts)
            credit_u2v[pair] = credit_u2v.get(pair, 0) + share

logger.info("Total Training Time: {}".format(time.time()-start_time))

  0%|          | 0/61070 [00:00<?, ?it/s]

100%|██████████| 61070/61070 [02:48<00:00, 362.89it/s]
2024-04-05 20:32:12,484 Total Training Time: 168.29227209091187


In [16]:
### CT
import numpy as np
import time

def safe_divide(a, b, eps=1e-6):
    """Divide ``a`` by ``b + eps``; the ``eps`` offset avoids dividing by an exact zero ``b``."""
    denominator = b + eps
    return a / denominator

def p_u2v(u, v, option=0):
    """Estimate the influence probability of ``u`` on ``v`` from the training counts.

    Reads the global dicts ``a_u``, ``a_u2v`` and ``credit_u2v``.

    Args:
        u: Source user.
        v: Target user.
        option: Estimator to use:
            0 - ``a_u2v[(u, v)] / a_u[u]``
            1 - ``a_u2v[(u, v)] / (a_u[u] + a_u[v] - a_u2v[(u, v)] - a_u2v[(v, u)])``
            2 - ``credit_u2v[(u, v)] / a_u[u]``
            3 - ``credit_u2v[(u, v)] / (same denominator as option 1)``

    Returns:
        The estimate (computed with ``safe_divide``), or 0 when ``u`` or the
        pair ``(u, v)`` was never observed in training.

    Raises:
        ValueError: For an unknown ``option``. A blanket ``except`` used to
            swallow this error and return 0.
    """
    if option not in (0, 1, 2, 3):
        raise ValueError("Invalid Option")
    try:
        numerator = a_u2v[(u, v)] if option in (0, 1) else credit_u2v[(u, v)]
        if option in (0, 2):
            denominator = a_u[u]
        else:
            # A reverse pair that was never observed counts as 0; previously
            # its absence raised KeyError and forced the whole estimate to 0.
            denominator = a_u[u] + a_u[v] - a_u2v[(u, v)] - a_u2v.get((v, u), 0)
    except KeyError:
        # Unseen user or pair: no evidence of influence.
        return 0
    return safe_divide(numerator, denominator)

start_time = time.time()

for key, items in tqdm(test_diffusion_dict.items()):

    # Per-cascade state, all keyed by user id:
    #   y_trues: 2 = first seen as an event author, 1 = authored an event after
    #            already being registered (repeat author or earlier candidate),
    #            0 = successor of an author that never authored an event.
    #   y_preds: predicted score (starts at 0).
    #   u2ts:    timestamp stored at registration (-1 for candidates).
    #   ans_list: (user, ts) in registration order; drives scoring and counting.
    y_trues = {}
    y_preds = {}
    ans_list = []
    u2ts = {}

    for user, ts in items:
        if user not in y_preds:
            y_preds[user] = 0
            y_trues[user] = 2
            ans_list.append((user, ts))
            u2ts[user] = ts
        else:
            # NOTE(review): a user first registered as a candidate keeps ts == -1
            # in u2ts/ans_list even though its real activation time is `ts` here.
            y_trues[user] = 1

        if user not in g.nodes: continue
        for u in g.successors(user):
            if u not in y_preds:
                y_preds[u] = 0
                y_trues[u] = 0
                ans_list.append((u, -1))
                u2ts[u] = -1

    for user, ts in ans_list:
        if user not in g.nodes: continue
        acc_p = 0
        for pa in g.predecessors(user):
            if pa not in y_trues or pa not in u2ts: continue
            if (pa, user) not in delta_t_u2v: continue
            # NOTE(review): only label-1 parents contribute; first-time authors
            # (label 2) are skipped as influencers — confirm this is intended.
            if y_trues[pa] != 1: continue
            # NOTE(review): with ts == -1 and a real parent timestamp the exponent
            # becomes large and positive; the captured cell output echoes this line
            # as a warning, presumably an overflow in np.exp. delta_t_u2v holds the
            # summed delay for the pair, not the mean.
            acc_p += p_u2v(pa, user, option=0) * np.exp(-safe_divide(ts-u2ts[pa], delta_t_u2v[(pa, user)]))
        y_preds[user] = max(y_preds[user], acc_p)

    # Confusion counts for this cascade (labels 0 and 1 only; label 2 is not scored).
    # NOTE(review): the counts are neither accumulated across cascades nor reported.
    tp, fp, tn, fn = 0, 0, 0, 0
    for user, ts in ans_list:
        if user not in y_preds or not g.has_node(user): continue
        try:
            threshold = g.nodes[user]["threshold"]
        except KeyError:
            # Node without a threshold attribute: show it and skip. This replaces
            # a bare `except:` that would also have hidden unrelated errors.
            print(g.nodes[user])
            continue
        score, label = y_preds[user], y_trues[user]
        if label == 1 and score >= threshold: tp += 1
        elif label == 1 and score < threshold: fn += 1
        elif label == 0 and score >= threshold: fp += 1
        elif label == 0 and score < threshold: tn += 1

    # logger.info("tag: {}, TP: {}, FP: {}, TN: {}, FN: {}".format(key, tp, fp, tn, fn))

logger.info("Total Time: {}".format(time.time()-start_time))

  acc_p += p_u2v(pa, user, option=0) * np.exp(-safe_divide(ts-u2ts[pa], delta_t_u2v[(pa, user)]))
 29%|██▉       | 4415/15268 [02:04<04:39, 38.85it/s]

In [6]:
### DT
import numpy as np
import time

def safe_divide(a, b, eps=1e-6):
    """Return ``a / (b + eps)``; ``eps`` keeps the denominator away from an exact zero ``b``."""
    shifted = b + eps
    return a / shifted

def p_u2v(u, v, option=0):
    """Estimate the influence probability of ``u`` on ``v`` from the training counts.

    Reads the global dicts ``a_u``, ``a_u2v`` and ``credit_u2v``.

    Args:
        u: Source user.
        v: Target user.
        option: Estimator to use:
            0 - ``a_u2v[(u, v)] / a_u[u]``
            1 - ``a_u2v[(u, v)] / (a_u[u] + a_u[v] - a_u2v[(u, v)] - a_u2v[(v, u)])``
            2 - ``credit_u2v[(u, v)] / a_u[u]``
            3 - ``credit_u2v[(u, v)] / (same denominator as option 1)``

    Returns:
        The estimate (computed with ``safe_divide``), or 0 when ``u`` or the
        pair ``(u, v)`` was never observed in training.

    Raises:
        ValueError: For an unknown ``option``. A blanket ``except`` used to
            swallow this error and return 0.
    """
    if option not in (0, 1, 2, 3):
        raise ValueError("Invalid Option")
    try:
        numerator = a_u2v[(u, v)] if option in (0, 1) else credit_u2v[(u, v)]
        if option in (0, 2):
            denominator = a_u[u]
        else:
            # A reverse pair that was never observed counts as 0; previously
            # its absence raised KeyError and forced the whole estimate to 0.
            denominator = a_u[u] + a_u[v] - a_u2v[(u, v)] - a_u2v.get((v, u), 0)
    except KeyError:
        # Unseen user or pair: no evidence of influence.
        return 0
    return safe_divide(numerator, denominator)

start_time = time.time()

for key, items in tqdm(test_diffusion_dict.items()):

    # Per-cascade state, all keyed by user id:
    #   y_trues: 2 = first seen as an event author, 1 = authored an event after
    #            already being registered, 0 = successor that never authored an event.
    #   y_preds: predicted score (starts at 0).
    #   u2ts:    timestamp stored at registration (-1 for candidates).
    #   ans_list: (user, ts) for first-time authors only; candidates are not
    #             appended in this cell, so they are never scored or counted below.
    y_trues = {}
    y_preds = {}
    ans_list = []
    u2ts = {}

    for user, ts in items:
        if user not in y_preds:
            y_preds[user] = 0
            y_trues[user] = 2
            ans_list.append((user, ts))
            u2ts[user] = ts
        else:
            y_trues[user] = 1

        if user not in g.nodes: continue
        for u in g.successors(user):
            if u not in y_preds:
                y_preds[u] = 0
                y_trues[u] = 0
                # ans_list.append((u, -1))
                u2ts[u] = -1

        # Discrete-time score for `user`: sum p(pa -> user) over label-1 parents
        # whose stored timestamp lies within the pair's delay window.
        # This used to sit inside the successor loop above, so the identical sum
        # was recomputed once per successor (successors registered there carry
        # label 0 and can never contribute) and was skipped altogether for users
        # with no successors. It is now computed exactly once per event.
        acc_p = 0
        for pa in g.predecessors(user):
            if pa not in y_trues or pa not in u2ts: continue
            if (pa, user) not in delta_t_u2v: continue
            if y_trues[pa] != 1: continue
            if ts - u2ts[pa] > delta_t_u2v[(pa, user)]: continue
            acc_p += p_u2v(pa, user, option=0)
        y_preds[user] = max(y_preds[user], acc_p)

    # NOTE(review): this pass applies the exponential-decay (continuous-time)
    # formula on top of the discrete-time score and keeps the larger value —
    # confirm it belongs in the DT baseline.
    for user, ts in ans_list:
        if user not in g.nodes: continue
        acc_p = 0
        for pa in g.predecessors(user):
            if pa not in y_trues or pa not in u2ts: continue
            if (pa, user) not in delta_t_u2v: continue
            if y_trues[pa] != 1: continue
            acc_p += p_u2v(pa, user, option=0) * np.exp(-safe_divide(ts-u2ts[pa], delta_t_u2v[(pa, user)]))
        y_preds[user] = max(y_preds[user], acc_p)

    # Confusion counts for this cascade (labels 0 and 1 only).
    # NOTE(review): the counts are neither accumulated across cascades nor reported.
    tp, fp, tn, fn = 0, 0, 0, 0
    for user, ts in ans_list:
        if user not in y_preds or not g.has_node(user): continue
        try:
            threshold = g.nodes[user]["threshold"]
        except KeyError:
            # Node without a threshold attribute: show it and skip. This replaces
            # a bare `except:` that would also have hidden unrelated errors.
            print(g.nodes[user])
            continue
        score, label = y_preds[user], y_trues[user]
        if label == 1 and score >= threshold: tp += 1
        elif label == 1 and score < threshold: fn += 1
        elif label == 0 and score >= threshold: fp += 1
        elif label == 0 and score < threshold: tn += 1

    # logger.info("tag: {}, TP: {}, FP: {}, TN: {}, FN: {}".format(key, tp, fp, tn, fn))

logger.info("Total Time: {}".format(time.time()-start_time))

  0%|          | 0/60 [00:00<?, ?it/s]

100%|██████████| 60/60 [00:24<00:00,  2.44it/s]
2024-04-05 20:09:49,471 Total Time: 24.558175802230835


In [None]:
### GT

import networkx as nx
from tqdm import tqdm
import time

# Global influence: PageRank score of every node in the follower graph.
pr = nx.pagerank(g, alpha=0.85)

# Social influence counts, rebuilt from the training cascades:
#   a_u[u]        number of events by user u
#   a_u2v[(u, v)] number of times an event by u precedes an event by v in a cascade
a_u = {}
a_u2v = {}

# The timer starts after PageRank, so the reported training time excludes it.
start_time = time.time()

for key, items in tqdm(train_diffusion_dict.items()):
    for i, (user, ts) in enumerate(items):
        a_u[user] = a_u.get(user, 0) + 1

        for last_item in items[:i]:
            pair = (last_item[0], user)
            a_u2v[pair] = a_u2v.get(pair, 0) + 1

def safe_divide(a, b, eps=1e-6):
    """Compute ``a / (b + eps)``; the small ``eps`` guards against an exact-zero ``b``."""
    guarded = b + eps
    return a / guarded

def p_u2v(u, v):
    """Estimate the influence of ``u`` on ``v`` as ``a_u2v[(u, v)] / a_u[u]``.

    Reads the global dicts ``a_u`` and ``a_u2v`` and divides with
    ``safe_divide``. Returns 0 when ``u`` or the pair ``(u, v)`` was never
    observed in training.
    """
    try:
        return safe_divide(a_u2v[(u, v)], a_u[u])
    except KeyError:
        # Only a missing user/pair means "no evidence"; any other error now
        # propagates instead of being hidden by a bare `except:`.
        return 0

# Preference Payoff

# cascade->user sentences
from sklearn.feature_extraction.text import TfidfVectorizer

def get_ifidf_for_words(sentences):
    """Fit TF-IDF on ``sentences`` and return the term weights of the FIRST document.

    Args:
        sentences: Iterable of documents. Each document is either a string
            (tokenised by scikit-learn with English stop words removed, as
            before) or an already-tokenised sequence such as a list of user
            ids, whose items are used verbatim as terms. Token sequences used
            to crash inside the default string preprocessor.

    Returns:
        dict: term -> TF-IDF weight for the terms with a non-zero weight in
        document 0. Empty when ``sentences`` is empty.

    NOTE(review): only document 0 is reported. Callers that look up arbitrary
    users or cascades in the result will miss everything outside that first
    document — confirm this is the intended contract.
    """
    documents = list(sentences)
    if not documents:
        return {}

    if isinstance(documents[0], str):
        tfidf_vectorizer = TfidfVectorizer(stop_words="english")
    else:
        # Pre-tokenised input: a callable analyzer bypasses preprocessing and
        # tokenisation, so each item of the sequence is one term.
        tfidf_vectorizer = TfidfVectorizer(analyzer=lambda tokens: tokens)

    # Keep the matrix sparse. Only row 0 is read, and densifying
    # (n_documents x vocabulary) is needlessly expensive for large corpora.
    tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
    feature_names = tfidf_vectorizer.get_feature_names_out()

    first_row = tfidf_matrix[:1]
    feature_index = sorted(first_row.nonzero()[1])
    return {feature_names[i]: first_row[0, i] for i in feature_index}

# One "document" per training cascade: the user ids of its events, in order.
cascade2users = [
    [event[0] for event in events]
    for events in train_diffusion_dict.values()
]
user_embs = get_ifidf_for_words(cascade2users)

# One "document" per user: the keys of the cascades the user took part in
# (a key repeats when the user has several events in the same cascade).
user2cascades = {}
for key, items in train_diffusion_dict.items():
    for item in items:
        user = item[0]
        user2cascades.setdefault(user, []).append(key)
# Only the per-user lists are needed from here on; the user ids are dropped.
user2cascades = list(user2cascades.values())
cas_embs = get_ifidf_for_words(user2cascades)

logger.info("Total Training Time: {}".format(time.time()-start_time))

In [None]:
start_time = time.time()

for key, items in tqdm(test_diffusion_dict.items()):

    # Users with an event in this cascade, and every graph successor of one of them.
    active_users = set()
    candidates_s = set()
    for user, ts in items:
        active_users.add(user)
        if user not in g.nodes: continue
        for u in g.successors(user):
            candidates_s.add(u)

    # Confusion counts for the whole cascade. They used to be reset inside the
    # candidate loop, so they only ever reflected the last candidate.
    tp, fp, tn, fn = 0, 0, 0, 0

    for c in candidates_s:
        # Influence of active vs. inactive predecessors, weighted by PageRank.
        soc_pos, soc_neg = 0, 0

        for u in g.predecessors(c):
            if u in active_users:
                soc_pos += p_u2v(u, c) * pr[u]
            else:
                soc_neg += p_u2v(u, c) * pr[u]

        # Preference term, added only when both scores are available.
        # NOTE(review): cas_embs is built from training cascades while `key`
        # is a test cascade, so this lookup looks unlikely to ever hit — verify.
        if key in cas_embs and c in user_embs:
            soc_pos += cas_embs[key] * user_embs[c]

        is_active = c in active_users
        predicted_active = soc_pos >= soc_neg
        if is_active and predicted_active: tp += 1
        elif is_active: fn += 1
        elif predicted_active: fp += 1
        else: tn += 1

    # logger.info("tag: {}, TP: {}, FP: {}, TN: {}, FN: {}".format(key, tp, fp, tn, fn))

logger.info("Total Testing Time: {}".format(time.time()-start_time))