In [1]:
# 原始数据格式分析
#   Initial Feature List (Randomly Initialized)
#       TODO: user profile可以作为User的Initial Feature的若干维度, i.e. Location
#   Heterogeneous Adjacency Matrix <- Load Data
#   Anchor Link Matrix <- Load Data

# Twitter <-> FourSquare
#   Anchor Link Matrix: fsquare-users.pkl.gz id<->twitter
#   NOTE: df["twitter"].dropna(), 有些FourSquare User不存在Twitter Id

# Foursquare Social Network
#   User: fsquare-users.pkl.gz id
#   Tweet: fsquare-tips.pkl.gz text <- 构造text_id
#   Loc: fsquare-locations.pkl.gz id <- 抽象为简单的数字loc_id
#   User-User: fsquare-follows.pkl.gz user1->user2
#   User-Tweet: fsquare-tips.pkl.gz user_id -> text <- text_id
#   User-Loc: fsquare-tips.pkl.gz user_id -> loc_id <- (int)loc_id

# Twitter Social Network
# NOTE: Loc类型的节点是没有的, 只有部分User(~514610/8178957)存在User-Loc(Lat,Lng)的关系
# 意思是只有少部分用户去了某些地方可能会附上定位坐标
# NOTE: 从数据分析来看(见该数据文件夹中的data-format.ipynb), 
# Location指的就是(Lat,Lng)组合, User-Location指的是user-(Lat,Lng)
#   User: twitter-users.pkl.gz username
#   Tweet: twitter-tweets.pkl.gz text <- 构造text_id
#   Loc: twitter-tweets.pkl.gz (lat,lng) <- 去NaN, 去重
#   User-User: twitter-follows.pkl.gz user1 -> user2
#   User-Tweet: twitter-tweets.pkl.gz username -> text <- text_id
#   User-Loc: twitter-tweets.pkl.gz username -> (lat,lng) <- 去NaN

In [2]:
from log import logger
from utils import network_types, node_types, edge_types, get_node_types, extend_edges, create_sparse, get_sparse_tensor, get_anchor_link_matrix, get_train_test_pairs

2022-07-22 19:06:53,861 Note: NumExpr detected 56 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2022-07-22 19:06:53,863 NumExpr defaulting to 8 threads.


In [3]:
nodes = {}

# Highest line index (inclusive) to read from the node-map files that are only
# partially loaded; files without an entry here are read in full.
max_line_idx = {
    ("Twitter", "Tweet"): 50000,
    ("Twitter", "Location"): 100000,
}

# Init Var: nodes
#   nodes[network_type][node_type] maps the first column of each node-map line
#   (converted to int when it is all digits) to the integer in the second column.
for network_type in network_types:
    nodes[network_type] = {}
    for node_type in node_types:
        nodes[network_type][node_type] = {}
        line_cap = max_line_idx.get((network_type, node_type))
        with open(f"../output/graph/nodemap-{network_type}-{node_type}.txt", "r") as f:
            for idx, line in enumerate(f):
                if line_cap is not None and idx > line_cap:
                    break
                # Strip only the line terminator. Slicing with line[:-1] would
                # also eat the last digit of the value when the final line of
                # the file has no trailing newline.
                parts = line.rstrip("\n").split(' ')
                # NOTE: strange!!! Tweet Nodes Nums are different between r and w
                # (lines that do not contain both columns are skipped below).
                if len(parts) < 2:
                    continue
                key, value = parts
                if key.isdigit():
                    key = int(key)
                nodes[network_type][node_type][key] = int(value)


In [4]:
edges = {}

# Largest endpoint id (inclusive) kept for the edge lists that are only
# partially loaded; an edge is dropped when either endpoint exceeds the cap.
# Edge lists without an entry here are loaded in full.
max_endpoint_idx = {
    ("Twitter", "U-T"): 50000,
    ("Twitter", "U-L"): 100000,
}

# Init Var: edges
#   edges[network_type][edge_type] is a list of (from_, to_) pairs, kept as the
#   strings read from the file.
for network_type in network_types:
    edges[network_type] = {}
    for edge_type in edge_types:
        edges[network_type][edge_type] = []
        endpoint_cap = max_endpoint_idx.get((network_type, edge_type))
        with open(f"../output/graph/edgelist-{network_type}-{edge_type}.txt", "r") as f:
            for line in f:
                # Strip only the line terminator. Slicing with line[:-1] would
                # truncate the target id when the final line of the file has
                # no trailing newline.
                record = line.rstrip("\n")
                if not record:
                    # A blank line (e.g. at end of file) carries no edge.
                    continue
                from_, to_ = record.split(' ')
                if endpoint_cap is not None and (int(from_) > endpoint_cap or int(to_) > endpoint_cap):
                    continue
                edges[network_type][edge_type].append((from_, to_))


In [5]:
"""
Twitter Nodes: num_users=5223, num_tweets=6960800, num_locs=256497
Twitter Edges: num_U-U=164919, num_U-T=8178952, num_U-L=514610
Foursquare Nodes: num_users=5392, num_tweets=46617, num_locs=94187
Foursquare Edges: num_U-U=76972, num_U-T=48585, num_U-L=48585

Twitter Nodes: num_users=5223, num_tweets=6960800, num_locs=256497
Twitter Nodes: num_users=5223, num_tweets=6960377, num_locs=256497
Twitter Edges: num_U-U=164919, num_U-T=8178952, num_U-L=514610
Foursquare Nodes: num_users=5392, num_tweets=46617, num_locs=94187
Foursquare Nodes: num_users=5392, num_tweets=46616, num_locs=94187
Foursquare Edges: num_U-U=76972, num_U-T=48585, num_U-L=48585
"""
for network_type in network_types:
    # Number of loaded nodes of each type in this network.
    num_users, num_tweets, num_locs = (
        len(nodes[network_type][kind]) for kind in ("User", "Tweet", "Location")
    )
    print(f"{network_type} Nodes: total={num_users+num_tweets+num_locs}, num_users={num_users}, num_tweets={num_tweets}, num_locs={num_locs}")

    # Number of loaded edges of each relation in this network.
    num_uu, num_ut, num_ul = (
        len(edges[network_type][relation]) for relation in ("U-U", "U-T", "U-L")
    )
    print(f"{network_type} Edges: total={num_uu+num_ut+num_ul}, num_U-U={num_uu}, num_U-T={num_ut}, num_U-L={num_ul}")

Twitter Nodes: total=155195, num_users=5223, num_tweets=49971, num_locs=100001
Twitter Edges: total=1353786, num_U-U=164919, num_U-T=899773, num_U-L=289094
Foursquare Nodes: total=146195, num_users=5392, num_tweets=46616, num_locs=94187
Foursquare Edges: total=174142, num_U-U=76972, num_U-T=48585, num_U-L=48585


In [6]:
# NOTE: the heterogeneous graph to build has shape (|Rs|, N, N), with
#       N = N_user + N_tweet + N_location.
# -> In the first-level GAT every relation has its own adjacency matrix, so the
#    resulting N*D' embeddings differ as well; each time only the (N_user, D')
#    rows are selected for later use.
# -> In the second-level GAT, N = N_user by default.

matrices = {}

for network_type in network_types:
    # Node types are laid out back to back, in node_types order, inside one
    # shared index space: start_idx_mp[t] is the offset of type t's first node
    # and `indices` ends up as the total node count N.
    type_sizes = [len(nodes[network_type][node_type]) for node_type in node_types]
    start_idx_mp = {
        node_type: sum(type_sizes[:position])
        for position, node_type in enumerate(node_types)
    }
    indices = sum(type_sizes)
    # print(start_idx_mp)

    relation_matrices = {}
    for edge_type in edge_types:
        node1_t, node2_t = get_node_types(edge_type)
        if start_idx_mp[node1_t] == 0 and start_idx_mp[node2_t] == 0:
            # Both endpoint types start at offset 0: use the edges unchanged.
            extended_edges = edges[network_type][edge_type]
        else:
            print(f"node1_t={node1_t}, node2_t={node2_t}, start_idx_node1_t={start_idx_mp[node1_t]}, start_idx_node2_t={start_idx_mp[node2_t]}")
            extended_edges = extend_edges(
                edges[network_type][edge_type],
                start_idx_mp[node1_t],
                start_idx_mp[node2_t],
            )
        # One N x N sparse matrix per relation.
        relation_matrices[edge_type] = create_sparse(extended_edges, indices, indices)
    matrices[network_type] = relation_matrices


node1_t=User, node2_t=Tweet, start_idx_node1_t=0, start_idx_node2_t=5223
node1_t=User, node2_t=Location, start_idx_node1_t=0, start_idx_node2_t=55194
node1_t=User, node2_t=Tweet, start_idx_node1_t=0, start_idx_node2_t=5392
node1_t=User, node2_t=Location, start_idx_node1_t=0, start_idx_node2_t=52008


In [7]:
# Initial Features
#   Planned: use Word2Vec to initialize the Tweet nodes.
#   Current: every node (User, Tweet and Location) gets a random 100-d vector
#   drawn with torch.rand.
import torch

initial_features = {}
for network_type in network_types:
    # Total node count over all node types of this network.
    indices = sum(len(nodes[network_type][node_type]) for node_type in node_types)
    initial_features[network_type] = torch.rand(indices, 100)

# hadj[network_type] holds one sparse tensor per edge type, in edge_types order.
hadj = {
    network_type: [
        get_sparse_tensor(matrices[network_type][edge_type])
        for edge_type in edge_types
    ]
    for network_type in network_types
}


In [10]:
# Training and Testing
import torch.optim as optim
from itertools import chain
from model import HeterogeneousGraphAttention
from loss import TypeAwareAlignmentLoss

# 1. Prepare Data: anchor-link matrix plus the train/test index pairs.
alm = get_anchor_link_matrix(nodes)
train_row, train_col, test_row, test_col = get_train_test_pairs(nodes)

# 2. Model, Optimizer, Loss
#    One model per network; only the Twitter model is given explicit GPU ids.
model_twitter = HeterogeneousGraphAttention(
    n_user=len(nodes["Twitter"]["User"]),
    n_units=[100, 128],
    gpu_device_ids=[6,7,8],
)
model_fsquare = HeterogeneousGraphAttention(
    n_user=len(nodes["Foursquare"]["User"]),
    n_units=[100, 128],
)
# if torch.cuda.is_available():
#     model_twitter.cuda()
#     model_fsquare.cuda()

# A single optimizer updates the parameters of both models.
trainable_params = [*model_twitter.parameters(), *model_fsquare.parameters()]
optimizer = optim.Adam(trainable_params, lr=1e-3, weight_decay=5e-4)

loss_fn = TypeAwareAlignmentLoss()

In [11]:
# Training
# Runs one forward pass of each model on explicitly chosen GPUs. The loss,
# backward pass and optimizer step are commented out at the end of this cell.

model_twitter.train()
model_fsquare.train()

optimizer.zero_grad()

# Twitter inputs go to cuda:1. The Twitter model itself is not moved here (the
# .to() call below is commented out) -- presumably it places its own layers
# using the gpu_device_ids given at construction; TODO confirm in model.py.
x1 = initial_features["Twitter"].to('cuda:1')
hadj1 = [elem.to('cuda:1') for elem in hadj["Twitter"]]
# model_twitter = model_twitter.to('cuda:1')
output_tw = model_twitter(x1, hadj1)

# Foursquare inputs and the Foursquare model all live on cuda:6.
x2 = initial_features["Foursquare"].to('cuda:6')
hadj2 = [elem.to('cuda:6') for elem in hadj["Foursquare"]]
model_fsquare = model_fsquare.to('cuda:6')
output_fs = model_fsquare(x2, hadj2)

# loss_train = loss_fn(alm, output_tw, output_fs)

# loss_train.backward()
# optimizer.step()

# logger.info(f"Train in Epoch={0}: Loss={loss_train:f}")

In [13]:
# Bring both models' outputs onto the same device before computing the loss.
output_tw, output_fs = (emb.to('cuda:9') for emb in (output_tw, output_fs))

# Only the first 500 rows of the Twitter output are passed to the loss.
loss_train = loss_fn(alm, output_tw[:500], output_fs)


In [None]:
# Backpropagate the alignment loss; no optimizer.step() follows in this cell.
loss_train.backward()

In [15]:
# Report, for every visible CUDA device, the allocated and reserved memory
# (both divided by 1024**3).
for i in range(torch.cuda.device_count()):
    report = (
        f"Device idx={i} Allocated: {torch.cuda.memory_allocated(i)/1024**3}",
        f"Device idx={i} Cached: {torch.cuda.memory_reserved(i)/1024**3}",
    )
    for message in report:
        logger.info(message)

2022-07-22 19:09:50,903 Device idx=0 Allocated: 0.7221593856811523
2022-07-22 19:09:50,908 Device idx=0 Cached: 0.787109375
2022-07-22 19:09:50,909 Device idx=1 Allocated: 0.1059103012084961
2022-07-22 19:09:50,911 Device idx=1 Cached: 0.197265625
2022-07-22 19:09:50,913 Device idx=2 Allocated: 0.0
2022-07-22 19:09:50,915 Device idx=2 Cached: 0.0
2022-07-22 19:09:50,916 Device idx=3 Allocated: 0.0
2022-07-22 19:09:50,917 Device idx=3 Cached: 0.001953125
2022-07-22 19:09:50,919 Device idx=4 Allocated: 0.0
2022-07-22 19:09:50,921 Device idx=4 Cached: 0.001953125
2022-07-22 19:09:50,922 Device idx=5 Allocated: 0.0
2022-07-22 19:09:50,924 Device idx=5 Cached: 0.001953125
2022-07-22 19:09:50,925 Device idx=6 Allocated: 4.2127203941345215
2022-07-22 19:09:50,927 Device idx=6 Cached: 5.560546875
2022-07-22 19:09:50,928 Device idx=7 Allocated: 4.662710189819336
2022-07-22 19:09:50,930 Device idx=7 Cached: 7.896484375
2022-07-22 19:09:50,931 Device idx=8 Allocated: 1.996872901916504
2022-07-22 

In [None]:
# # Training

# model_twitter.train()
# model_fsquare.train()

# optimizer.zero_grad()

# output_tw = model_twitter(initial_features["Twitter"], hadj["Twitter"])
# output_fs = model_fsquare(initial_features["Foursquare"], hadj["Foursquare"])

# loss_train = loss_fn(alm, output_tw, output_fs)

# loss_train.backward()
# optimizer.step()

# logger.info(f"Train in Epoch={0}: Loss={loss_train:f}")

In [None]:
# # Evaluating

# k = 30
# ratio = 0.8

# model_twitter.eval()
# model_fsquare.eval()

# output_tw = model_twitter(initial_features["Twitter"], hadj["Twitter"])
# output_fs = model_fsquare(initial_features["Foursquare"], hadj["Foursquare"])
# print("Allocated: ", torch.cuda.memory_allocated(0)/1024**3)

# loss_val = loss_fn(alm, output_tw, output_fs)

# print("Allocated: ", torch.cuda.memory_allocated(0)/1024**3)

In [2]:
# Word2Vec Initial Features
# Loads the word2vec binary at ../data/raw2.bin into `model`.
# NOTE(review): the name `model` is generic and unrelated to model_twitter /
# model_fsquare used in the training cells; take care not to confuse them.

import word2vec

# The file is decoded as ISO-8859-1; new_lines=False is passed through to the
# loader (presumably the file has no newline after each vector -- TODO confirm).
model = word2vec.load('../data/raw2.bin', encoding="ISO-8859-1", new_lines=False)

In [None]:
from utils import df_twitter_tweets

# Probe on the first tweet only: the loop exits via `break` after one row.
for _, row in df_twitter_tweets.iterrows():
    # NOTE(review): in the `word2vec` package, `similar` appears to return the
    # nearest neighbours of a single vocabulary word rather than a vector, and
    # here it receives the whole tweet text -- confirm that this is the
    # intended way to obtain a tweet embedding.
    indexes, metrics = model.similar(row["text"])
    emb = model.generate_response(indexes, metrics).tolist()
    print(emb)
    # Keep only the first entry of the response list.
    emb = emb[0]
    break