In [1]:
"""
思路: 在TR模型的基础上, 融合用户的文本特征, 以异质图Heter-GAT的方式融合用户特征, 做信息传播预测任务
方法:
1. 整理原始数据, 构建用户有向关联网络, 并根据原始文本内容计算用户文本嵌入向量
2. 考虑节点类型为User和Tweet, 边类型为U-U和U-T, 分别从用户特征和文本特征的角度通过GAT网络融合邻域节点特征;
   Heter-GAT模型的输出为(N,|Rs|+1,D')维度, 模型后面需要接一个全连接层FC=(|Rs|+1)*D'->2, 损失函数保持为NLL-Loss
3. 可视化局部邻域, 观察不同注意力头、不同异质图邻域卷积的偏向
"""

"\n思路: 在TR模型的基础上, 融合用户的文本特征, 以异质图Heter-GAT的方式融合用户特征, 做信息传播预测任务\n方法:\n1. 整理原始数据, 构建用户有向关联网络, 并根据原始文本内容计算用户文本嵌入向量\n2. 考虑节点类型为User和Tweet, 边类型为U-U和U-T, 分别从用户特征和文本特征的角度通过GAT网络融合邻域节点特征;\n   Heter-GAT模型的输出为(N,|Rs|+1,D')维度, 模型后面需要接一个全连接层FC=(|Rs|+1)*D'->2, 损失函数保持为NLL-Loss\n3. 可视化局部邻域, 观察不同注意力头、不同异质图邻域卷积的偏向\n"

In [4]:
import sys
import os
sys.path.append(os.path.dirname(os.getcwd()))

from lib.log import logger
from lib.utils import get_sparse_tensor
from utils import load_pickle, save_pickle, ChunkSampler, SubGraphSample, load_w2v_feature, sample_tweets_around_user, summarize_distribution
from model import BatchdenseGAT, HeterdenseGAT
import numpy as np
import copy
import time
from scipy import sparse
from scipy import io as sio
import random
import torch
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score, precision_recall_curve
from tensorboard_logger import tensorboard_logger
from torch.utils.data import Dataset
import configparser
from dgl.data.utils import download, get_download_dir, _get_dgl_url
from pprint import pprint
import datetime
import dgl
import errno
import pickle

# Resolve the data root directory from `config.ini` (relative path, so it is looked up in the
# current working directory) and log it.
config = configparser.ConfigParser()
# NOTE(review): ConfigParser.read() silently skips files it cannot open, so a missing
# config.ini only surfaces as a KeyError on the 'DataRootPath' lookup below.
config.read('config.ini')
DATA_ROOTPATH = config['DEFAULT']['DataRootPath']
logger.info(f"Reading From config.ini... DATA_ROOTPATH={DATA_ROOTPATH}")

2022-11-04 10:11:03,676 Reading From config.ini... DATA_ROOTPATH=/remote-home/share/dmb_nas/wangzejian/


In [2]:
# Load the pickled object stored under the data root. Later cells treat `df` as a mapping
# (they call `df.values()` and iterate over it), not as a pandas DataFrame.
# NOTE(review): presumably keyed by hashtag id — TODO confirm against the code that wrote this file.
df = load_pickle(os.path.join(DATA_ROOTPATH, "HeterGAT/basic/deg_le483_df.p"))


In [6]:
# Length of every value stored in `df`, displayed in ascending order to inspect the distribution.
# NOTE(review): this prints one line per entry; a summary (min/median/max) would keep the output short.
l = [len(elem) for elem in df.values()]
sorted(l)

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 2,
 3,
 3,
 3,
 4,
 4,
 5,
 7,
 8,
 8,
 9,
 22,
 32,
 33,
 40,
 43,
 47,
 69,
 71,
 80,
 97,
 99,
 111,
 126,
 129,
 140,
 148,
 151,
 158,
 164,
 183,
 189,
 190,
 197,
 200,
 203,
 206,
 210,
 213,
 263,
 270,
 305,
 322,
 329,
 389,
 389,
 404,
 425,
 453,
 478,
 523,
 538,
 543,
 623,
 700,
 743,
 764,
 778,
 812,
 819,
 862,
 1020,
 1051,
 1068,
 1163,
 1310,
 1350,
 1399,
 1411,
 1672,
 1914,
 2053,
 2147,
 2148,
 2344,
 2502,
 3004,
 3077,
 3092,
 3298,
 3372,
 3396,
 3440,
 5437,
 5680,
 6177,
 7603,
 8451,
 8523,
 10211,
 10616,
 10749,
 10995,
 13013,
 13913,
 14633,
 15504,
 15979,
 18446,
 20835,
 21219,
 23519,
 28042,
 30099,
 30572,
 31002,
 33628,
 34963,
 36865,
 38463,
 40179,
 59448,
 59844,
 62997,
 66266,
 67259,

In [None]:
def get_binary_mask(total_size, indices):
    """Build a 0/1 membership mask.

    Args:
        total_size: length (or shape) of the mask to allocate.
        indices: positions to mark; anything accepted by tensor index assignment.

    Returns:
        A ``torch.uint8`` tensor that is 1 at ``indices`` and 0 everywhere else.
    """
    selected = torch.zeros(total_size, dtype=torch.uint8)
    selected[indices] = 1
    return selected

def load_labels(args, hashtag, g, df):
    """Generate labels and a per-class train/valid/test split.

    Labels come from ``gen_labels(hashtag=hashtag, g=g, df=df)``. Entries labelled -1 and 1
    are split separately: each class is spread over (0, 1] with ``np.linspace`` and cut at
    ``train_ratio`` / ``train_ratio + valid_ratio`` percent. Entries with any other label keep
    rank 0 and fall into no split (for non-negative ratios). After the split, -1 is rewritten
    to 0 in ``labels`` in place.

    Args:
        args: object providing ``shuffle``, ``train_ratio``, ``valid_ratio`` (percentages)
            and ``class_weight_balanced``.
        hashtag, df: forwarded unchanged to ``gen_labels``.
        g: graph forwarded to ``gen_labels``; ``g.vcount()`` sets the mask length.

    Returns:
        ``(labels, train_mask, val_mask, test_mask, nb_classes, class_weight)``.

    NOTE(review): presumably ``gen_labels`` returns a NumPy integer array with one entry per
    vertex of ``g`` — TODO confirm; ``np.bincount`` below needs non-negative integers.
    """
    labels = gen_labels(hashtag=hashtag, g=g, df=df)

    # Rank in (0, 1] for every -1/1 entry; the class loop order fixes the RNG call order.
    split_rank = np.zeros(len(labels))
    for cls in (-1, 1):
        members = np.where(labels == cls)[0]
        ranks = np.linspace(1e-10, 1, len(members))
        if args.shuffle:
            ranks = np.random.permutation(ranks)
        split_rank[members] = ranks

    train_cut = args.train_ratio/100
    valid_cut = (args.train_ratio+args.valid_ratio)/100
    in_train = (split_rank > 0) & (split_rank <= train_cut)
    in_valid = (split_rank > train_cut) & (split_rank <= valid_cut)
    in_test = split_rank > valid_cut
    train_ids = np.where(in_train)[0]
    val_ids = np.where(in_valid)[0]
    test_ids = np.where(in_test)[0]
    logger.info(f"train/valid/test={len(train_ids)},{len(val_ids)},{len(test_ids)}")

    num_user = g.vcount()
    train_mask, val_mask, test_mask = [
        get_binary_mask(num_user, ids) for ids in (train_ids, val_ids, test_ids)
    ]
    if hasattr(torch, 'BoolTensor'):
        # Convert the uint8 masks to bool when this torch build provides BoolTensor.
        train_mask, val_mask, test_mask = train_mask.bool(), val_mask.bool(), test_mask.bool()

    # Collapse -1 into 0 only after the split above has used the original -1/1 values.
    labels[labels==-1] = 0
    nb_classes = np.unique(labels).shape[0]
    if args.class_weight_balanced:
        # Inverse-frequency weights computed over all entries of `labels`.
        class_weight = torch.FloatTensor(len(labels) / (nb_classes*np.bincount(labels)))
    else:
        class_weight = torch.ones(nb_classes)

    return labels, train_mask, val_mask, test_mask, nb_classes, class_weight


In [15]:
# Log only the first key yielded by iterating `df`, then stop.
# `hashtag` stays bound to that key after the loop; nothing is logged if `df` is empty.
for hashtag in df:
    logger.info(hashtag)
    break

2022-11-03 14:17:25,233 186


In [None]:
# Build a per-vertex label tensor for one stage and log how many entries carry each label.
# NOTE(review): `g`, `pos_users` and `neg_users` are not defined in any cell visible here —
# TODO confirm which cell creates them, otherwise this fails on a fresh kernel.
graph_vcount = g.vcount()
stage = 7

# Start with every entry at 0, then mark the indices from pos_users[stage] with 1.
labels = torch.zeros(graph_vcount)
labels[list(pos_users[stage])] =  1
arr1 = labels.int().numpy()
# Counts of 0 and 1 before the -1 entries are written.
logger.info(np.bincount(arr1))

# Indices from neg_users[stage] become -1; an index present in both sets ends up -1
# because this assignment runs last.
labels[list(neg_users[stage])] = -1

arr1 = labels.int().numpy()
# np.bincount rejects negative values, so -1 is remapped to 2 purely for counting.
logger.info(np.bincount(np.where(arr1==-1, 2, arr1)))

In [3]:
# Display the label counts: index 0 counts label 0, index 1 counts label 1, and index 2
# counts label -1 (remapped to 2 because np.bincount rejects negative values).
arr1 = labels.int().numpy()
np.bincount(np.where(arr1==-1, 2, arr1))

array([  957,   147, 43792])

In [40]:
# NOTE: tweets group by hashtag
# Invert the loaded graph-id -> tag mapping into `ts`, a list where ts[tag] is the set of
# graph ids carrying that tag.

graphid2tag = load_pickle(os.path.join(DATA_ROOTPATH, "HeterGAT/basic/text/graphid2tag.p"))

# NOTE(review): the hard-coded 200 assumes every tag is an integer index below 200 — TODO confirm.
ts = [set() for _ in range(200)]
for gid, tag in graphid2tag.items():
    ts[tag].add(gid)


In [None]:
# # NOTE: the goal is to build Hadjs and Feats, and also to generate the labels required for the loss

# # Total 44896 User Nodes
# subgraph_deg483 = load_pickle(os.path.join(DATA_ROOTPATH, "HeterGAT/basic/deg_le483_subgraph.p"))

# # Find Tweet Nodes for each User Node
# #   P.S. Don't use sampling, since we are not constructing subnetworks
# # TODO: use text/utmp_groupbystage.p instead

# ut_mp = load_pickle(os.path.join(DATA_ROOTPATH, "HeterGAT/basic/usertweet_mp.p"))

# # NOTE: Keep only part of the tweets; ~10 million is far too large!
# max_user_tweets = 40
# tweet_nodes = []
# ut_edges    = []
# for user in subgraph_deg483.vs["label"]:
#     selected_tweets = random.choices(ut_mp[user], k=min(len(ut_mp[user]), max_user_tweets))
#     tweet_nodes.extend(selected_tweets)
#     for tweet in selected_tweets:
#         ut_edges.append((user, tweet))
# logger.info(f"Tweet Nodes={len(tweet_nodes)}, Edges={len(ut_edges)}")

# nodes = {}
# node_indices = 0
# # Users: 44896, Tweets: 10008103, Total: 10052999
# for node in subgraph_deg483.vs["label"]+[tweet+208894 for tweet in tweet_nodes]:
#     nodes[node] = node_indices
#     node_indices += 1

# edges = [[], []]
# for uu_edge in subgraph_deg483.es:
#     source, target = subgraph_deg483.vs[uu_edge.source]["label"], subgraph_deg483.vs[uu_edge.target]["label"]
#     edges[0].append([nodes[source], nodes[target]])

# for from_, to_ in ut_edges:
#     edges[1].append([nodes[from_], nodes[to_+208894]])

# # Add self-loops
# for node in range(len(subgraph_deg483.vs["label"])):
#     edges[0].append([node, node])
# for node in range(len(subgraph_deg483.vs["label"]), node_indices):
#     edges[1].append([node,node])

# logger.info(f"{len(edges[0])}, {len(edges[1])}")
# # 2022-10-27 11:13:49,333 480540, 10008103
# # 2022-10-27 11:14:40,915 525436, 20016206
# # NOTE: 2022-10-27 13:54:47,112 525436, 7320540

# def create_sparsemat_from_edgelist(edgelist, m, n):
#     rows, cols = edgelist[:,0], edgelist[:,1]
#     ones = np.ones(len(rows), np.uint8)
#     mat = sparse.coo_matrix((ones, (rows, cols)), shape=(m, n))
#     return mat.tocsr()

# uu_mat = create_sparsemat_from_edgelist(np.array(edges[0]), node_indices, node_indices)
# ut_mat = create_sparsemat_from_edgelist(np.array(edges[1]), node_indices, node_indices)
# hadjs = [uu_mat, ut_mat]
# save_pickle(hadjs, os.path.join(DATA_ROOTPATH, f"HeterGAT/basic/deg_le483_hadjs_selfloop_max{max_user_tweets}tweet.p"))

# # TODO: vertices, stages, tags -> user_features[*]
# # user_features = load_pickle(os.path.join(DATA_ROOTPATH, "HeterGAT/user_features/user_features.p"))

# user_features = load_pickle(os.path.join(DATA_ROOTPATH, "HeterGAT/user_features/user_features_avg.p"))
# deepwalk_feats = load_w2v_feature(os.path.join(DATA_ROOTPATH, "HeterGAT/basic/deepwalk/deepwalk_added.emb_64"), 208894)
# tweet_features = load_pickle(os.path.join(DATA_ROOTPATH, "HeterGAT/basic/doc2topic_tweetfeat.p"))
# user_feats = np.concatenate((user_features[subgraph_deg483.vs["label"]], deepwalk_feats[subgraph_deg483.vs["label"]]), axis=1) 
# tweet_feats = tweet_features[tweet_nodes]
# # logger.info(f"{user_feats.shape}, {tweet_feats.shape}")

# feats = np.concatenate((
#     np.append(user_feats, np.zeros(shape=(user_feats.shape[0], tweet_feats.shape[1])),  axis=1), 
#     np.append(np.zeros(shape=(tweet_feats.shape[0], user_feats.shape[1])), tweet_feats, axis=1), 
# ), axis=0)
# logger.info(feats.shape)
# save_pickle(feats, os.path.join(DATA_ROOTPATH, f"HeterGAT/basic/deg_le483_feats_max{max_user_tweets}tweet.p"))

In [5]:
# def gen_random_tweet_ids(samples: SubGraphSample, outdir: str, tweets_per_user:int=5):
#     tweet_ids = []
#     sample_ids = []
#     ut_mp = load_pickle(os.path.join(DATA_ROOTPATH, "HeterGAT/basic/text/utmp_groupbystage.p"))

#     for idx in range(len(samples.labels)):
#         if idx and idx % 10000 == 0:
#             logger.info(f"idx={idx}, sample_ids={len(sample_ids)}, tweet_ids={len(tweet_ids)}")
#         stage = samples.time_stages[idx]
#         selected_tweet_ids  = set()
#         candidate_tweet_ids = set()
#         for vertex_id in samples.vertex_ids[idx]:
#             available_tweet_ids = ut_mp[stage][vertex_id]
#             random_ids = np.random.choice(available_tweet_ids, size=min(tweets_per_user, len(available_tweet_ids)), replace=False)
#             selected_tweet_ids  |= set(random_ids)
#             candidate_tweet_ids |= set(available_tweet_ids)-set(random_ids)
#         candidate_tweet_ids -= selected_tweet_ids
#         # logger.info(f"Length: sample={len(selected_tweet_ids)}, remain={len(candidate_tweet_ids)}, expected={len(samples.vertex_ids[idx])*tweets_per_user}")

#         if len(selected_tweet_ids) != len(samples.vertex_ids[idx])*tweets_per_user:
#             diff = len(samples.vertex_ids[idx])*tweets_per_user - len(selected_tweet_ids)
#             if diff > len(candidate_tweet_ids):
#                 continue
#             selected_tweet_ids |= set(np.random.choice(list(candidate_tweet_ids), size=diff, replace=False))
#         sample_ids.append(idx)
#         tweet_ids.append(selected_tweet_ids)
#     logger.info(f"Finish Sampling Random Tweets... sample_ids={len(sample_ids)}, tweet_ids={len(tweet_ids)}")

#     os.makedirs(outdir, exist_ok=True)
#     selected_samples = SubGraphSample(
#         adj_matrices=samples.adj_matrices[sample_ids],
#         influence_features=samples.influence_features[sample_ids],
#         vertex_ids=samples.vertex_ids[sample_ids],
#         labels=samples.labels[sample_ids],
#         tags=samples.tags[sample_ids],
#         time_stages=samples.time_stages[sample_ids],
#     )
#     save_pickle(sample_ids, os.path.join(outdir, "sample_ids.p"))
#     save_pickle(tweet_ids, os.path.join(outdir, "tweet_ids.p"))
#     save_pickle(selected_samples, os.path.join(outdir, "selected_samples.p"))
#     logger.info("Finish Saving pkl...")

# def extend_subnetwork(file_dir: str):
#     hs_filedir = os.path.join(DATA_ROOTPATH, file_dir).replace('stages_', 'hs_')
#     samples = load_pickle(os.path.join(hs_filedir, "selected_samples.p"))
#     tweet_ids = load_pickle(os.path.join(hs_filedir, "tweet_ids.p"))
#     assert len(samples) == len(tweet_ids)

#     tweetid2userid_mp = load_pickle(os.path.join(DATA_ROOTPATH, "HeterGAT/basic/text/tweetid2userid_mp.p"))
#     vertex_ids = samples.vertex_ids
#     adjs       = samples.adj_matrices
#     adjs[adjs != 0] = 1.0
#     adjs = adjs.astype(np.dtype('B'))

#     extended_vertices, extended_adjs = [], []
#     for idx in range(len(samples)):
#         subnetwork = np.array(np.concatenate((vertex_ids[idx], np.array(list(tweet_ids[idx])))), dtype=int)
#         extended_vertices.append(subnetwork)

#         subnetwork_size, num_users = len(subnetwork), len(vertex_ids[idx])
#         elem_idx_mp = {elem:idx for idx,elem in enumerate(subnetwork)}
#         uu_adj = np.array([[0]*subnetwork_size for _ in range(subnetwork_size)], dtype='B')
#         uu_adj[:num_users,:num_users] = adjs[idx]
#         # NOTE: Get Corresponding User_id By Tweet_id, and then convert them into indexes in extend_subnetwork
#         ut_adj = copy.deepcopy(uu_adj)
#         for tweet_id in tweet_ids[idx]:
#             user_id = tweetid2userid_mp[tweet_id]
#             net_userid = elem_idx_mp[user_id]
#             net_tweetid = elem_idx_mp[tweet_id]
#             ut_adj[net_userid][net_tweetid] = 1
#         extended_adjs.append([uu_adj, ut_adj])
#     extended_vertices, extended_adjs = np.array(extended_vertices), np.array(extended_adjs)
#     save_pickle(extended_vertices, os.path.join(hs_filedir, "extended_vertices.p"))
#     save_pickle(extended_adjs, os.path.join(hs_filedir, "extended_adjs.p"))

# data_dirpath = os.path.join(DATA_ROOTPATH, "HeterGAT/stages/stages_subg483_inf_40_1718027_deg_18_483_ego_20_neg_1_restart_20/")
# samples = SubGraphSample(
#     adj_matrices=np.load(os.path.join(data_dirpath, "adjacency_matrix.npy")),
#     influence_features=np.load(os.path.join(data_dirpath, "influence_feature.npy")),
#     vertex_ids=np.load(os.path.join(data_dirpath, "vertex_id.npy")),
#     labels=np.load(os.path.join(data_dirpath, "label.npy")),
#     tags=np.load(os.path.join(data_dirpath, "hashtag.npy")),
#     time_stages=np.load(os.path.join(data_dirpath, "stage.npy"))
# )
# # gen_random_tweet_ids(samples, os.path.join(DATA_ROOTPATH, "HeterGAT/stages/hs_subg483_inf_40_1718027_deg_18_483_ego_20_neg_1_restart_20/"))
# # extend_subnetwork("HeterGAT/stages/stages_subg483_inf_40_1718027_deg_18_483_ego_20_neg_1_restart_20/")

# def gen_user_emb(tot_user_num):
#     # 208894*200*8*3
#     user_feats = [[0.]*200*8*3 for _ in range(tot_user_num)]
#     for tag in range(200):
#         logger.info(f"tag={tag}")
#         for stage in range(8):
#             for feats_idx, feats in enumerate(["norm_gravity_feature", "norm_exptime_feature1", "norm_ce_feature"]):
#                 feats = load_pickle(f"/root/data/HeterGAT/user_features/{feats}/hashtag{tag}_t{stage}.p")
#                 for idx in range(tot_user_num):
#                     user_feats[idx][tag*3*8+stage*3+feats_idx] = float(feats[idx])
#     logger.info(f"shape={user_feats.shape}")
#     return torch.FloatTensor(user_feats)

# user_emb = gen_user_emb(208894)
