In [1]:
"""
思路: 在TR模型的基础上, 融合用户的文本特征, 以异质图Heter-GAT的方式融合用户特征, 做信息传播预测任务
方法:
1. 整理原始数据, 构建用户有向关联网络, 并根据原始文本内容计算用户文本嵌入向量
2. 考虑节点类型为User和Tweet, 边类型为U-U和U-T, 分别从用户特征和文本特征的角度通过GAT网络融合邻域节点特征;
   Heter-GAT模型的输出为(N,|Rs|+1,D')维度, 模型后面需要接一个全连接层FC=(|Rs|+1)*D'->2, 损失函数保持为NLL-Loss
3. 可视化局部邻域, 观察不同注意力头、不同异质图邻域卷积的偏向
"""

"\n思路: 在TR模型的基础上, 融合用户的文本特征, 以异质图Heter-GAT的方式融合用户特征, 做信息传播预测任务\n方法:\n1. 整理原始数据, 构建用户有向关联网络, 并根据原始文本内容计算用户文本嵌入向量\n2. 考虑节点类型为User和Tweet, 边类型为U-U和U-T, 分别从用户特征和文本特征的角度通过GAT网络融合邻域节点特征;\n   Heter-GAT模型的输出为(N,|Rs|+1,D')维度, 模型后面需要接一个全连接层FC=(|Rs|+1)*D'->2, 损失函数保持为NLL-Loss\n3. 可视化局部邻域, 观察不同注意力头、不同异质图邻域卷积的偏向\n"

In [1]:
import sys
import os
sys.path.append(os.path.dirname(os.getcwd()))

from lib.log import logger
from utils import load_pickle, save_pickle, ChunkSampler, SubGraphSample, load_w2v_feature
from model import BatchdenseGAT, HeterdenseGAT
import numpy as np
import copy
import time
import torch
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score, precision_recall_curve
from tensorboard_logger import tensorboard_logger
from torch.utils.data import Dataset
import configparser

# Resolve the shared data root from config.ini (path is relative to the current working directory).
config = configparser.ConfigParser()
# NOTE: ConfigParser.read() silently ignores files it cannot open, so a missing
# config.ini only surfaces as a KeyError on the 'DataRootPath' lookup below.
config.read('config.ini')
DATA_ROOTPATH = config['DEFAULT']['DataRootPath']
logger.info(f"Reading From config.ini... DATA_ROOTPATH={DATA_ROOTPATH}")

2022-10-25 15:19:52,519 Note: NumExpr detected 64 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2022-10-25 15:19:52,520 NumExpr defaulting to 8 threads.
2022-10-25 07:19:53.304131: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
2022-10-25 15:19:54,658 Reading From config.ini... DATA_ROOTPATH=/remote-home/share/dmb_nas/wangzejian


In [2]:
# Load the pickled tweet feature matrix from the shared data root.
# The recorded outputs in this notebook show a 2-D float array with 20 columns per row.
tweet_feats = load_pickle(os.path.join(DATA_ROOTPATH, "HeterGAT/basic/doc2topic_tweetfeat.p"))

In [5]:
# Size of the second axis, i.e. the length of each row of the tweet feature matrix.
tweet_feats.shape[1]

20

In [6]:
# Load the extended vertex lists saved next to the sampled sub-graphs. Per the
# (commented-out) extend_subnetwork helper in this notebook, each row is one sample's
# user ids followed by its sampled tweet ids.
# The path is resolved against DATA_ROOTPATH, like the tweet-feature load, instead of
# repeating the absolute data root.
vertex_ids = load_pickle(os.path.join(DATA_ROOTPATH, "HeterGAT/stages/hs_subg483_inf_40_1718027_deg_18_483_ego_20_neg_1_restart_20", "extended_vertices.p"))

In [12]:
# Inspect the raw feature values.
# NOTE(review): this displays the whole array and relies on NumPy's summarised repr;
# a slice such as the first few rows would keep the output bounded.
tweet_feats

array([[0.20903095, 0.00625   , 0.44792733, ..., 0.00625   , 0.00625   ,
        0.00625   ],
       [0.005     , 0.005     , 0.48467028, ..., 0.005     , 0.005     ,
        0.005     ],
       [0.00714286, 0.00714286, 0.53198843, ..., 0.00714286, 0.00714286,
        0.00714286],
       ...,
       [0.00384615, 0.00384615, 0.92692308, ..., 0.00384615, 0.00384615,
        0.00384615],
       [0.00454545, 0.00454545, 0.79811734, ..., 0.00454545, 0.00454545,
        0.00454545],
       [0.00454545, 0.00454545, 0.64677627, ..., 0.00454545, 0.00454545,
        0.00454545]])

In [13]:
# Gather one tweet feature row for every id in columns 20+ of each sample and
# check the resulting (samples, ids-per-sample, feature-dim) shape.
# presumably the first 20 columns are the ego-network user ids and the rest are
# tweet ids (extend_subnetwork concatenates users first) — TODO confirm the offset.
tweet_feats[vertex_ids[:,20:]].shape

(58185, 100, 20)

In [5]:
# def gen_random_tweet_ids(samples: SubGraphSample, outdir: str, tweets_per_user:int=5):
#     tweet_ids = []
#     sample_ids = []
#     ut_mp = load_pickle(os.path.join(DATA_ROOTPATH, "HeterGAT/basic/text/utmp_groupbystage.p"))

#     for idx in range(len(samples.labels)):
#         if idx and idx % 10000 == 0:
#             logger.info(f"idx={idx}, sample_ids={len(sample_ids)}, tweet_ids={len(tweet_ids)}")
#         stage = samples.time_stages[idx]
#         selected_tweet_ids  = set()
#         candidate_tweet_ids = set()
#         for vertex_id in samples.vertex_ids[idx]:
#             available_tweet_ids = ut_mp[stage][vertex_id]
#             random_ids = np.random.choice(available_tweet_ids, size=min(tweets_per_user, len(available_tweet_ids)), replace=False)
#             selected_tweet_ids  |= set(random_ids)
#             candidate_tweet_ids |= set(available_tweet_ids)-set(random_ids)
#         candidate_tweet_ids -= selected_tweet_ids
#         # logger.info(f"Length: sample={len(selected_tweet_ids)}, remain={len(candidate_tweet_ids)}, expected={len(samples.vertex_ids[idx])*tweets_per_user}")

#         if len(selected_tweet_ids) != len(samples.vertex_ids[idx])*tweets_per_user:
#             diff = len(samples.vertex_ids[idx])*tweets_per_user - len(selected_tweet_ids)
#             if diff > len(candidate_tweet_ids):
#                 continue
#             selected_tweet_ids |= set(np.random.choice(list(candidate_tweet_ids), size=diff, replace=False))
#         sample_ids.append(idx)
#         tweet_ids.append(selected_tweet_ids)
#     logger.info(f"Finish Sampling Random Tweets... sample_ids={len(sample_ids)}, tweet_ids={len(tweet_ids)}")

#     os.makedirs(outdir, exist_ok=True)
#     selected_samples = SubGraphSample(
#         adj_matrices=samples.adj_matrices[sample_ids],
#         influence_features=samples.influence_features[sample_ids],
#         vertex_ids=samples.vertex_ids[sample_ids],
#         labels=samples.labels[sample_ids],
#         tags=samples.tags[sample_ids],
#         time_stages=samples.time_stages[sample_ids],
#     )
#     save_pickle(sample_ids, os.path.join(outdir, "sample_ids.p"))
#     save_pickle(tweet_ids, os.path.join(outdir, "tweet_ids.p"))
#     save_pickle(selected_samples, os.path.join(outdir, "selected_samples.p"))
#     logger.info("Finish Saving pkl...")

# def extend_subnetwork(file_dir: str):
#     hs_filedir = os.path.join(DATA_ROOTPATH, file_dir).replace('stages_', 'hs_')
#     samples = load_pickle(os.path.join(hs_filedir, "selected_samples.p"))
#     tweet_ids = load_pickle(os.path.join(hs_filedir, "tweet_ids.p"))
#     assert len(samples) == len(tweet_ids)

#     tweetid2userid_mp = load_pickle(os.path.join(DATA_ROOTPATH, "HeterGAT/basic/text/tweetid2userid_mp.p"))
#     vertex_ids = samples.vertex_ids
#     adjs       = samples.adj_matrices
#     adjs[adjs != 0] = 1.0
#     adjs = adjs.astype(np.dtype('B'))

#     extended_vertices, extended_adjs = [], []
#     for idx in range(len(samples)):
#         subnetwork = np.array(np.concatenate((vertex_ids[idx], np.array(list(tweet_ids[idx])))), dtype=int)
#         extended_vertices.append(subnetwork)

#         subnetwork_size, num_users = len(subnetwork), len(vertex_ids[idx])
#         elem_idx_mp = {elem:idx for idx,elem in enumerate(subnetwork)}
#         uu_adj = np.array([[0]*subnetwork_size for _ in range(subnetwork_size)], dtype='B')
#         uu_adj[:num_users,:num_users] = adjs[idx]
#         # NOTE: Get Corresponding User_id By Tweet_id, and then convert them into indexes in extend_subnetwork
#         ut_adj = copy.deepcopy(uu_adj)
#         for tweet_id in tweet_ids[idx]:
#             user_id = tweetid2userid_mp[tweet_id]
#             net_userid = elem_idx_mp[user_id]
#             net_tweetid = elem_idx_mp[tweet_id]
#             ut_adj[net_userid][net_tweetid] = 1
#         extended_adjs.append([uu_adj, ut_adj])
#     extended_vertices, extended_adjs = np.array(extended_vertices), np.array(extended_adjs)
#     save_pickle(extended_vertices, os.path.join(hs_filedir, "extended_vertices.p"))
#     save_pickle(extended_adjs, os.path.join(hs_filedir, "extended_adjs.p"))

# data_dirpath = os.path.join(DATA_ROOTPATH, "HeterGAT/stages/stages_subg483_inf_40_1718027_deg_18_483_ego_20_neg_1_restart_20/")
# samples = SubGraphSample(
#     adj_matrices=np.load(os.path.join(data_dirpath, "adjacency_matrix.npy")),
#     influence_features=np.load(os.path.join(data_dirpath, "influence_feature.npy")),
#     vertex_ids=np.load(os.path.join(data_dirpath, "vertex_id.npy")),
#     labels=np.load(os.path.join(data_dirpath, "label.npy")),
#     tags=np.load(os.path.join(data_dirpath, "hashtag.npy")),
#     time_stages=np.load(os.path.join(data_dirpath, "stage.npy"))
# )
# # gen_random_tweet_ids(samples, os.path.join(DATA_ROOTPATH, "HeterGAT/stages/hs_subg483_inf_40_1718027_deg_18_483_ego_20_neg_1_restart_20/"))
# # extend_subnetwork("HeterGAT/stages/stages_subg483_inf_40_1718027_deg_18_483_ego_20_neg_1_restart_20/")


In [None]:
# def gen_user_emb(tot_user_num):
#     # 208894*200*8*3
#     user_feats = [[0.]*200*8*3 for _ in range(tot_user_num)]
#     for tag in range(200):
#         logger.info(f"tag={tag}")
#         for stage in range(8):
#             for feats_idx, feats in enumerate(["norm_gravity_feature", "norm_exptime_feature1", "norm_ce_feature"]):
#                 feats = load_pickle(f"/root/data/HeterGAT/user_features/{feats}/hashtag{tag}_t{stage}.p")
#                 for idx in range(tot_user_num):
#                     user_feats[idx][tag*3*8+stage*3+feats_idx] = float(feats[idx])
#     logger.info(f"shape={user_feats.shape}")
#     return torch.FloatTensor(user_feats)

# user_emb = gen_user_emb(208894)

In [66]:
# NOTE: Fake Digg Heter Dataset
from torch.utils.data import Dataset
from utils import SubGraphSample, load_w2v_feature

class DiggDataset(Dataset):
    """Map-style dataset that pairs each sampled sub-graph with per-vertex features.

    Each item is an ``(adjacency, label, features)`` triple. ``features`` is the
    sample's influence-feature rows with the embedding vector of every vertex
    appended along axis 1.
    """

    def __init__(self, samples: SubGraphSample, embedding) -> None:
        """Take the arrays from ``samples`` and append embedding vectors to the features.

        Args:
            samples: container exposing ``adj_matrices``, ``labels``,
                ``influence_features`` and ``vertex_ids``.
            embedding: lookup indexed by vertex id, yielding one vector per vertex.
        """
        super().__init__()
        self.adjs, self.labels = samples.adj_matrices, samples.labels
        self.feats, self.vertex_ids = samples.influence_features, samples.vertex_ids
        self.concact_feats(embedding)

    def concact_feats(self, embedding):
        """Rebind ``self.feats`` to influence features joined with vertex embeddings.

        The result is logged by shape only. Calling this a second time would append
        the embeddings again, because it reads from the already-extended ``self.feats``.
        """
        combined = []
        for row in range(len(self.vertex_ids)):
            # One embedding vector per vertex of this sample, in vertex order.
            vertex_vectors = [embedding[vertex] for vertex in self.vertex_ids[row]]
            combined.append(
                np.concatenate((self.feats[row], vertex_vectors), axis=1)
            )
        self.feats = np.array(combined)
        logger.info(self.feats.shape)

    def __len__(self):
        """Number of samples, taken from the first axis of the label array."""
        sample_count = self.labels.shape[0]
        return sample_count

    def __getitem__(self, index):
        """Return the ``(adjacency, label, features)`` triple stored at ``index``."""
        adjacency = self.adjs[index]
        label = self.labels[index]
        features = self.feats[index]
        return adjacency, label, features

def collate_fn2(batch:list): 
    """
    Collate a list of ``(adjacency, label, features)`` samples into batched tensors.

    Args:
        batch: list of 3-tuples as produced by the dataset's ``__getitem__``.

    Returns:
        ``(adjs_batch, labels_batch, feats_batch)``. Adjacency matrices are always
        stacked into a float tensor. Labels become a long tensor and features a
        float tensor when the first element is a NumPy object; otherwise they are
        returned as the tuples produced by ``zip`` without conversion.
    """
    adjs_batch, labels_batch, feats_batch = zip(*batch)
    adjs_batch = torch.FloatTensor(np.array(adjs_batch))
    
    if type(labels_batch[0]).__module__ == 'numpy':
        # NOTE: https://stackoverflow.com/questions/69742930/runtimeerror-nll-loss-forward-reduce-cuda-kernel-2d-index-not-implemented-for
        # NLL loss needs int64 targets. Stack with NumPy first, as for the other two
        # fields, so the tensor is built from one array rather than element by element.
        labels_batch = torch.LongTensor(np.array(labels_batch))
    
    if type(feats_batch[0]).__module__ == 'numpy':
        feats_batch = torch.FloatTensor(np.array(feats_batch))
    return adjs_batch, labels_batch, feats_batch

def digg_load_dataset(train_ratio=60, valid_ratio=20, batch_size=256,
                      data_dir="/root/TR-pptusn/DeepInf-preprocess/preprocess/stages_op_inf_100_1k",
                      embedding_path="/root/Lab_Related/data/Heter-GAT/Classic/deepwalk/deepwalk_added.emb_64"):
    """Load the preprocessed sub-graph samples and build train/valid/test loaders.

    Args:
        train_ratio: percentage of samples assigned to the training split.
        valid_ratio: percentage of samples assigned to the validation split;
            whatever remains after train + valid goes to the test split.
        batch_size: batch size shared by the three loaders.
        data_dir: directory holding ``adjacency_matrix.npy``,
            ``influence_feature.npy``, ``vertex_id.npy`` and ``label.npy``.
            Defaults to the location that was previously hard-coded.
        embedding_path: embedding file passed to ``load_w2v_feature``.
            Defaults to the location that was previously hard-coded.

    Returns:
        ``(samples, train_loader, valid_loader, test_loader)``. ``samples`` is the
        ``SubGraphSample`` built from the raw arrays; the embedding-extended
        features live only on the ``DiggDataset`` that backs the loaders.
    """
    # vertex_id.npy is needed both to size the embedding table and as the sample
    # vertex ids, so load it once and reuse the array.
    vertices = np.load(os.path.join(data_dir, "vertex_id.npy"))
    max_vertex_idx = np.max(vertices)
    embedding = load_w2v_feature(embedding_path, max_vertex_idx)

    samples = SubGraphSample(
        adj_matrices=np.load(os.path.join(data_dir, "adjacency_matrix.npy")),
        influence_features=np.load(os.path.join(data_dir, "influence_feature.npy")),
        vertex_ids=vertices,
        labels=np.load(os.path.join(data_dir, "label.npy"))
    )
    dataset = DiggDataset(samples, embedding)
    nb_samples    = len(dataset)

    # Split boundaries are sample indexes derived from the percentage ratios.
    # presumably ChunkSampler(count, start) yields `count` consecutive indexes
    # beginning at `start` — TODO confirm against utils.ChunkSampler.
    train_start = 0
    valid_start = int(nb_samples*train_ratio/100)
    test_start  = int(nb_samples*(train_ratio+valid_ratio)/100)
    train_loader = DataLoader(dataset, batch_size=batch_size, sampler=ChunkSampler(valid_start-train_start, 0), collate_fn=collate_fn2)
    valid_loader = DataLoader(dataset, batch_size=batch_size, sampler=ChunkSampler(test_start-valid_start, valid_start), collate_fn=collate_fn2)
    test_loader  = DataLoader(dataset, batch_size=batch_size, sampler=ChunkSampler(nb_samples - test_start, test_start), collate_fn=collate_fn2)
    # len() of a DataLoader is its number of batches, so these are batch counts.
    logger.info(f"Finish Loading Dataset... train={len(train_loader)}, valid={len(valid_loader)}, test={len(test_loader)}")

    return samples, train_loader, valid_loader, test_loader