In [1]:
#model.py
import torch
from torch_geometric.loader import NeighborLoader
from torch_geometric.nn import SAGEConv, to_hetero


class GNNEncoder(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        # Mean pooling, by default
        self.conv1 = SAGEConv((-1, -1), hidden_channels)
        self.conv2 = SAGEConv((-1, -1), out_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index)
        return x


class Model(torch.nn.Module):
    def __init__(self, hidden_channels, data):
        super().__init__()
        self.encoder = GNNEncoder(hidden_channels, hidden_channels)
        self.encoder = to_hetero(self.encoder, data.metadata(), aggr="sum")

    def forward(self, x_dict, edge_index_dict):
        # Node embedding here
        return self.encoder(x_dict, edge_index_dict)

    def inference(self, x_dict, edge_index_dict):
        return self.encoder(x_dict, edge_index_dict)


def prepare_data_loader(
    data,
    batch_size,
    num_neighbours,
):
    neighbourhood_sampling_loader = NeighborLoader(
        data,
        batch_size=batch_size,
        num_neighbors=num_neighbours,
        shuffle=True,
        input_nodes=('listing', None)
    )
    return neighbourhood_sampling_loader


def load_model(path, hidden_channels, data):
    model = Model(hidden_channels=hidden_channels, data=data).to("cpu")
    model.load_state_dict(torch.load(path))
    return model

#main_train.py
import wandb
import pandas as pd
from loguru import logger
import torch
import torch.nn.functional as F
from datetime import datetime as dt
import os
from dateutil.relativedelta import relativedelta  # type: ignore
import functools

from train_test_split import *
from evaluate import *
from constants import *


def initialize_run():
    try:
        api = wandb.Api()
        runs = api.runs(f"{ENTITY_NAME}/{PROJECT_NAME}")
        if len(runs) == 0:
            raise Exception("No runs")
        last_iteration = float("-inf")
        end_date_of_last_iteration = None

        for run in runs:
            config = {k: v for k, v in run.config.items() if not k.startswith("_")}
            run_name = run.name
            iteration_no = int(run_name.split("_")[1])
            if iteration_no > last_iteration:
                last_iteration = iteration_no
                end_date_of_last_iteration = config["end_date"]

        current_iteration = last_iteration + 1
        current_start_date = dt.strptime(
            end_date_of_last_iteration, "%Y-%m-%d"
        ).date() + relativedelta(days=1)
    except Exception as e:
        print(e)
        current_iteration = 0
        current_start_date = dt.strptime(SIMULATION_START_DATE, "%Y-%m-%d").date()

    return current_iteration, current_start_date


def filter_test_data_by_scenario(train_reviews, test_reviews, user_col, scenario_type):
    if scenario_type == "cold_start_new_user":
        train_reviewers = list(train_reviews[user_col].unique())
        return test_reviews[~test_reviews[user_col].isin(train_reviewers)]


def get_nunique(df, col):
    return df[col].nunique()

iteration = 0

start_date = dt.strptime("2021-10-24", "%Y-%m-%d").date()
if start_date == dt.strptime(MAX_START_DATE, "%Y-%m-%d").date():
    raise Exception("Stop Simulation")
end_date, nxt_start_date = split_date_by_period_months(start_date, TOTAL_MONTHS_PER_ITERATION)

logger.info("Start of Retraining")
print(iteration)
print(start_date, end_date)
directory = "/Users/yhchan/Downloads/FYP/data/processed"
reviews = pd.read_parquet(f"{directory}/reviews_with_interactions.parquet")
listings = pd.read_parquet(f"{directory}/listings_with_interactions.parquet")
reviewers = pd.read_parquet(f"{directory}/reviewers_with_interactions.parquet")


config={
    "architecture": "Unsupervised GraphSAGE",
    "iteration": iteration,
    "start_date": start_date,
    "end_date": end_date,
    "learning_rate": 0.01,
    "hidden_channels": 64,
    "train_batch_size": 128,
    "test_batch_size": 128,
    "epochs": 100,
    "train_num_neighbours": [10, 10],
    "test_num_neighbours": [-1],  # So no sampling happens
    "train_split_period_months": TRAIN_SPLIT_PERIOD_MONTHS,
    "total_months_of_data": TOTAL_MONTHS_PER_ITERATION,
}


# Split into train, test and test for cold start scenario
(
    train_reviews,
    train_listings,
    train_reviewers,
    test_reviews,
    test_listings,
    test_reviewers,
) = main_train_val_test(
    reviews,
    listings,
    reviewers,
    start_date,
    end_date,
    config["train_split_period_months"],
)
cold_start_test_reviews = filter_test_data_by_scenario(
    train_reviews, test_reviews, "reviewer_id", "cold_start_new_user"
)
cold_start_test_listings, cold_start_test_reviewers = build_partitioned_data(
    cold_start_test_reviews, listings, reviewers
)

def build_heterograph(reviews, listings, include_rating_as_edge_label=False):

    listing_features_cols = FEATURE_COLS["features_cols"]

    # Convert data type
    for col in listings[listing_features_cols].select_dtypes(include=["bool"]):
        listings[col] = listings[col].astype("category")

    user_x, user_mapping = load_node_from_df(reviews, index_col="reviewer_id")
    listing_x, listing_mapping = load_node_from_df(
        listings, index_col="listing_id", features_cols=listing_features_cols
    )
    edge_index, edge_label = load_edge_from_df(
        reviews,
        src_index_col="reviewer_id",
        src_mapping=user_mapping,
        dst_index_col="listing_id",
        dst_mapping=listing_mapping,
        encoders={"rating": IdentityEncoder(dtype=torch.long)},
    )

    data = HeteroData()
    num_user_nodes = len(user_mapping)
    print(edge_label)
    data["listing"].x = listing_x
    data["user"].x = torch.arange(num_user_nodes).view(num_user_nodes, 1).to(torch.float32)
    data["user", "rates", "listing"].edge_index = edge_index
    if include_rating_as_edge_label:
        data["user", "rates", "listing"].edge_label = edge_label
        data["user", "rates", "listing"].edge_label_index = edge_index
    # We can now convert `data` into an appropriate format for training a
    # graph-based machine learning model:

    # 1. Add a reverse ('listing', 'rev_rates', 'user') relation for message passing.
    data = ToUndirected()(data)
    del data["listing", "rev_rates", "user"].edge_label  # Remove "reverse" label.
    return data

# Build idx to id dict and reverse version of it
test_listings2dict = get_entity2dict(test_listings, "listing_id")
reverse_test_listings2dict = {k: v for v, k in test_listings2dict.items()}
test_reviewers2dict = get_entity2dict(test_reviewers, "reviewer_id")
reverse_test_reviewers2dict = {k: v for v, k in test_reviewers2dict.items()}
cold_start_test_listings2dict = get_entity2dict(cold_start_test_listings, "listing_id")
reverse_cold_start_test_listings2dict = {k: v for v, k in cold_start_test_listings2dict.items()}
cold_start_test_reviewers2dict = get_entity2dict(cold_start_test_reviewers, "reviewer_id")
reverse_cold_start_test_reviewers2dict = {
    k: v for v, k in cold_start_test_reviewers2dict.items()
}

# Build Graph
data = build_heterograph(reviews, listings, True)
train_data = build_heterograph(train_reviews, train_listings, True)
test_data = build_heterograph(test_reviews, test_listings, True)
cold_start_test_data = build_heterograph(cold_start_test_reviews, cold_start_test_listings, True)
print("Training Heterogenous Graph", train_data)
print("Test Heterogenous Graph", test_data)
print("Test Heterogenous Graph (Cold Start Scenerio)", cold_start_test_data)

train_loader = prepare_data_loader(
    data=train_data,
    batch_size=config["train_batch_size"],
    num_neighbours=config["train_num_neighbours"],
)
test_loader = prepare_data_loader(
    data=test_data,
    batch_size=config["test_batch_size"],
    num_neighbours=config["test_num_neighbours"],
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_data = train_data.to(device)

train_test_data_dict = {
    "num_reviews": len(train_reviews) + len(test_reviews),
    "num_train_reviews": len(train_reviews),
    "num_test_reviews": len(test_reviews),
    "num_cold_start_test_reviews": len(cold_start_test_reviews),
    "num_unique_cold_start_test_listings": get_nunique(cold_start_test_reviews, "listing_id"),
    "num_unique_cold_start_test_reviewers": get_nunique(cold_start_test_reviews, "reviewer_id"),
    "num_unique_train_listings": get_nunique(train_listings, "listing_id"),
    "num_unique_test_listings": get_nunique(test_listings, "listing_id"),
    "num_unique_train_reviewers": get_nunique(train_reviewers, "reviewer_id"),
    "num_unique_test_reviewers": get_nunique(test_reviewers, "reviewer_id"),
}
train_reviews.to_parquet("train/train_reviews.parquet", index=False)
train_listings.to_parquet("train/train_listings.parquet", index=False)
train_reviewers.to_parquet("train/train_reviewers.parquet", index=False)
test_reviews.to_parquet("test/test_reviews.parquet", index=False)
test_listings.to_parquet("test/test_listings.parquet", index=False)
test_reviewers.to_parquet("test/test_reviewers.parquet", index=False)
cold_start_test_reviews.to_parquet("test/cold_start_test_reviews.parquet", index=False)
cold_start_test_listings.to_parquet("test/cold_start_test_listings.parquet", index=False)
cold_start_test_reviewers.to_parquet("test/cold_start_test_reviewers.parquet", index=False)

print(train_test_data_dict)

2023-04-03 16:30:16.338 | INFO     | __main__:<module>:115 - Start of Retraining


0
2021-10-24 2022-10-23


2023-04-03 16:30:20.589 | INFO     | train_test_split:main_train_val_test:133 - Split df into train and test portion
  temp = torch.from_numpy(val).view(-1, 1).to(torch.float32)


tensor([5, 5, 4,  ..., 5, 4, 5])
tensor([5, 4, 5,  ..., 5, 5, 4])
tensor([5, 5, 5,  ..., 5, 5, 5])
tensor([5, 5, 5,  ..., 5, 5, 5])
Training Heterogenous Graph HeteroData(
  [1mlisting[0m={ x=[17229, 159] },
  [1muser[0m={ x=[324135, 1] },
  [1m(user, rates, listing)[0m={
    edge_index=[2, 334678],
    edge_label=[334678],
    edge_label_index=[2, 334678]
  },
  [1m(listing, rev_rates, user)[0m={ edge_index=[2, 334678] }
)
Test Heterogenous Graph HeteroData(
  [1mlisting[0m={ x=[14380, 159] },
  [1muser[0m={ x=[72447, 1] },
  [1m(user, rates, listing)[0m={
    edge_index=[2, 73918],
    edge_label=[73918],
    edge_label_index=[2, 73918]
  },
  [1m(listing, rev_rates, user)[0m={ edge_index=[2, 73918] }
)
Test Heterogenous Graph (Cold Start Scenerio) HeteroData(
  [1mlisting[0m={ x=[14254, 159] },
  [1muser[0m={ x=[70416, 1] },
  [1m(user, rates, listing)[0m={
    edge_index=[2, 71775],
    edge_label=[71775],
    edge_label_index=[2, 71775]
  },
  [1m(listing, r

In [3]:
import pandas as pd 

a = [{"a":2 ,"b":1},{"a":2 ,"b":1}]
pd.DataFrame(a)

Unnamed: 0,a,b
0,2,1
1,2,1


In [402]:
from torchmetrics.functional import pairwise_cosine_similarity


# Modelling
model = Model(hidden_channels=config["hidden_channels"], data=data).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=config["learning_rate"])


def triplet_ranking_loss(emb, rating, edge_index, batch_size, margin_constant=0.1):
    # Compute pairwise cosine similarity between user embedding and listing embedding
    cosine_similarity_matrix = pairwise_cosine_similarity(emb["user"], emb["listing"][:batch_size])
    # Sample adjacency matrix (binary) represented by two tensors
    src_tensor = edge_index[0]
    dst_tensor = edge_index[1]
    max_listing_idx = dst_tensor.max()
    # Number of nodes
    n_nodes = max(src_tensor.max(), max_listing_idx) + 1

    # Convert adjacency matrix to dictionary
    adj_dict = {}
    edge_label_dict = {}
    for i in range(n_nodes):
        adj_dict[i] = list(dst_tensor[src_tensor == i].numpy())
        edge_label_dict[i] = list(rating[src_tensor == i].numpy())

    def exclude_elem(a, b):
        mask = torch.ones_like(b, dtype=torch.bool)
        mask[a] = 0
        return torch.masked_select(b, mask)

    listing_indices = torch.arange(max_listing_idx + 1)
    hardest_pos_sims = []
    hardest_neg_sims = []
    ratings = []
    for i in adj_dict:
        pos_idx = torch.tensor(adj_dict[i])
        neg_idx = exclude_elem(pos_idx, listing_indices)
        # If there are no negative examples, skip this user
        if len(neg_idx) == 0:
            continue

        # Retrieve distance between the positive and negative examples
        pos_sim = cosine_similarity_matrix[i, pos_idx]
        neg_sim = cosine_similarity_matrix[i, neg_idx]
        # Select the hardest negative example and hardest postive example
        hardest_pos_idx = torch.argmin(pos_sim)
        hardest_pos_sim = pos_sim[hardest_pos_idx]
        pos_rating = torch.tensor(edge_label_dict[i])[hardest_pos_idx]
        hardest_neg_idx = torch.argmax(neg_sim)
        hardest_neg_sim = neg_sim[hardest_neg_idx]
        hardest_pos_sims.append(hardest_pos_sim)
        hardest_neg_sims.append(hardest_neg_sim)
        ratings.append(pos_rating)

    hardest_pos_sims = torch.stack(hardest_pos_sims, dim=0)
    hardest_neg_sims = torch.stack(hardest_neg_sims, dim=0)    
    ratings = torch.stack(ratings, dim=0)
    m = ratings * margin_constant

    # Combine most disimilar s(a, p) and most similar s(a, n) into final triplet loss
    triplet_loss = torch.maximum(torch.zeros(m.size()), - hardest_pos_sims + hardest_neg_sims + m)
    return triplet_loss.mean()

def train():
    model.train(True)
    total_examples = total_loss = 0
    # Why using mini-batch gradient descent
    # Update NN multiple times every epoch, Make more precise update to the parameters by calculating the average loss in each step
    # Reduce overall training time and num of required epochs for reaching convergence, computational efficiency
    i = 0
    n_samples = 0
    for batch in train_loader:
        batch = batch.to(device)
        # Zero gradients for every batch
        optimizer.zero_grad()
        # Make predictions for this batch
        emb = model(batch.x_dict, batch.edge_index_dict)
        batch_size = batch['listing'].batch_size
        rating = batch["user", "listing"].edge_label
        edge_index = batch["user", "listing"].edge_index
        loss = triplet_ranking_loss(emb, rating, edge_index, batch_size)
        # Compute the loss and its gradients
        loss.backward()
        # Adjust learning weights
        optimizer.step()
        total_loss += float(loss) * batch_size
        total_examples += batch_size

    train_loss = total_loss / total_examples
    return train_loss

@torch.no_grad()
def test(test_data_loader, test_data, model):
    model.eval()
    total_examples = total_loss = 0
    for batch in test_data_loader:
        batch = batch.to(device)
        # Make predictions for this batch
        emb = model(batch.x_dict, batch.edge_index_dict)
        batch_size = batch['listing'].batch_size
        rating = batch["user", "listing"].edge_label
        edge_index = batch["user", "listing"].edge_index
        loss = triplet_ranking_loss(emb, rating, edge_index, batch_size)
        total_loss += float(loss) * batch_size
        total_examples += batch_size
        
    test_loss = total_loss / total_examples
    return test_loss

best_train_loss = float("inf")
best_test_loss = float("inf")
best_model_path = None
# Train and Evaluate Loss
test_wrapper = functools.partial(test, test_loader, test_data)

for epoch in range(1, config["epochs"] + 1):
    model_is_best = False
    train_loss = train()
    test_loss = test_wrapper(model)

    if train_loss < best_train_loss:
        best_train_loss = train_loss

    if test_loss < best_test_loss:
        best_test_loss = test_loss
        model_is_best = True

    metrics_dict = {
        "train_loss": train_loss,
        "test_loss": test_loss,
        "epoch": epoch,
    }
    logger.info(
        f"Epoch: {epoch:03d}, Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f} "
    )

logger.info("End of Retraining")


2023-04-01 19:58:43.795 | INFO     | __main__:<module>:133 - Epoch: 001, Train Loss: 0.6289, Test Loss: 0.6297 
2023-04-01 19:59:03.103 | INFO     | __main__:<module>:133 - Epoch: 002, Train Loss: 0.6299, Test Loss: 0.6305 
2023-04-01 19:59:22.439 | INFO     | __main__:<module>:133 - Epoch: 003, Train Loss: 0.6286, Test Loss: 0.6297 
2023-04-01 19:59:41.468 | INFO     | __main__:<module>:133 - Epoch: 004, Train Loss: 0.6283, Test Loss: 0.6294 
2023-04-01 20:00:00.604 | INFO     | __main__:<module>:133 - Epoch: 005, Train Loss: 0.6284, Test Loss: 0.6298 
2023-04-01 20:00:19.838 | INFO     | __main__:<module>:133 - Epoch: 006, Train Loss: 0.6279, Test Loss: 0.6309 
2023-04-01 20:00:38.884 | INFO     | __main__:<module>:133 - Epoch: 007, Train Loss: 0.6276, Test Loss: 0.6369 
2023-04-01 20:00:57.997 | INFO     | __main__:<module>:133 - Epoch: 008, Train Loss: 0.6271, Test Loss: 0.6475 
2023-04-01 20:01:17.191 | INFO     | __main__:<module>:133 - Epoch: 009, Train Loss: 0.6277, Test Loss: 

2023-04-01 20:22:44.691 | INFO     | __main__:<module>:133 - Epoch: 075, Train Loss: 0.5445, Test Loss: 1.0079 
2023-04-01 20:23:03.712 | INFO     | __main__:<module>:133 - Epoch: 076, Train Loss: 0.5410, Test Loss: 1.0608 
2023-04-01 20:23:22.762 | INFO     | __main__:<module>:133 - Epoch: 077, Train Loss: 0.5389, Test Loss: 1.1014 
2023-04-01 20:23:41.784 | INFO     | __main__:<module>:133 - Epoch: 078, Train Loss: 0.5383, Test Loss: 1.0186 
2023-04-01 20:24:00.766 | INFO     | __main__:<module>:133 - Epoch: 079, Train Loss: 0.5350, Test Loss: 1.0230 
2023-04-01 20:24:19.824 | INFO     | __main__:<module>:133 - Epoch: 080, Train Loss: 0.5304, Test Loss: 0.9342 
2023-04-01 20:24:38.856 | INFO     | __main__:<module>:133 - Epoch: 081, Train Loss: 0.5337, Test Loss: 0.9360 
2023-04-01 20:24:57.869 | INFO     | __main__:<module>:133 - Epoch: 082, Train Loss: 0.5308, Test Loss: 0.9052 
2023-04-01 20:25:16.925 | INFO     | __main__:<module>:133 - Epoch: 083, Train Loss: 0.5318, Test Loss: 

KeyboardInterrupt: 

In [63]:
len(train_reviews)

334678

In [272]:
batch = next(iter(train_loader))
edge_index = batch["user", "listing"].edge_index
print(batch)
print((edge_index))

HeteroData(
  [1mlisting[0m={
    x=[164, 159],
    input_id=[128],
    batch_size=128
  },
  [1muser[0m={ x=[940, 1] },
  [1m(user, rates, listing)[0m={
    edge_index=[2, 945],
    edge_label=[945],
    edge_label_index=[2, 945]
  },
  [1m(listing, rev_rates, user)[0m={ edge_index=[2, 987] }
)
tensor([[  0,   1,   2,  ..., 937, 938, 939],
        [  0,   0,   0,  ..., 127, 127, 127]])


In [307]:
edge_index

tensor([[  0,   1,   2,  ..., 937, 938, 939],
        [  0,   0,   0,  ..., 127, 127, 127]])

tensor(0.6335, grad_fn=<MeanBackward0>)

In [392]:
m.size()[0]

940

In [350]:
edge_label_dict

{0: [5],
 1: [5],
 2: [5],
 3: [5],
 4: [5],
 5: [5],
 6: [5],
 7: [5],
 8: [5],
 9: [5],
 10: [5],
 11: [5],
 12: [5],
 13: [5],
 14: [5],
 15: [5],
 16: [5],
 17: [5],
 18: [5],
 19: [5],
 20: [5],
 21: [5],
 22: [4],
 23: [5],
 24: [5],
 25: [5],
 26: [5],
 27: [5],
 28: [5],
 29: [5],
 30: [5],
 31: [5],
 32: [5],
 33: [4],
 34: [4],
 35: [5],
 36: [5],
 37: [5],
 38: [5],
 39: [5],
 40: [5],
 41: [5],
 42: [4],
 43: [5],
 44: [5],
 45: [5],
 46: [5],
 47: [5],
 48: [5],
 49: [5],
 50: [4],
 51: [5],
 52: [5],
 53: [5],
 54: [5],
 55: [5, 5],
 56: [5],
 57: [5],
 58: [5],
 59: [5],
 60: [5],
 61: [5],
 62: [5],
 63: [5],
 64: [5],
 65: [5],
 66: [5],
 67: [5],
 68: [5],
 69: [5],
 70: [3],
 71: [5],
 72: [5],
 73: [5],
 74: [5],
 75: [4],
 76: [5],
 77: [5],
 78: [4],
 79: [4],
 80: [5],
 81: [5],
 82: [5],
 83: [5],
 84: [5],
 85: [5],
 86: [5],
 87: [4],
 88: [5],
 89: [5],
 90: [5],
 91: [5],
 92: [5],
 93: [5],
 94: [5],
 95: [5],
 96: [5],
 97: [5],
 98: [5],
 99: [5],
 100: [

In [338]:
# torch.stack(pos_pairs, dim=0)

In [284]:
c = Counter(edge_index[0].tolist())

for key in c:
    if c[key] > 1:
        print(key)
c

55
389
650
748
778


Counter({0: 1,
         1: 1,
         2: 1,
         3: 1,
         4: 1,
         5: 1,
         6: 1,
         7: 1,
         8: 1,
         9: 1,
         10: 1,
         11: 1,
         12: 1,
         13: 1,
         14: 1,
         15: 1,
         16: 1,
         17: 1,
         18: 1,
         19: 1,
         20: 1,
         21: 1,
         22: 1,
         23: 1,
         24: 1,
         25: 1,
         26: 1,
         27: 1,
         28: 1,
         29: 1,
         30: 1,
         31: 1,
         32: 1,
         33: 1,
         34: 1,
         35: 1,
         36: 1,
         37: 1,
         38: 1,
         39: 1,
         40: 1,
         41: 1,
         42: 1,
         43: 1,
         44: 1,
         45: 1,
         46: 1,
         47: 1,
         48: 1,
         49: 1,
         50: 1,
         51: 1,
         52: 1,
         53: 1,
         54: 1,
         55: 2,
         56: 1,
         57: 1,
         58: 1,
         59: 1,
         60: 1,
         61: 1,
         62: 1,
  

In [269]:
temp = torch.clone(edge_index[0])
edge_index[0] = edge_index[1]
edge_index[1] = temp


In [270]:
edge_index

tensor([[  0,   0,   0,  ..., 127, 127, 127],
        [  0,   1,   2,  ..., 944, 945, 946]])

In [250]:
aa = etedge_index[1]

tensor([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   2,   3,   3,   3,   3,   3,   3,   3,
          3,   3,   3,   4,   4,   4,   4,   5,   6,   7,   7,   7,   7,   7,
          7,   7,   7,   7,   8,   9,   9,   9,   9,   9,   9,   9,   9,   9,
          9,  10,  10,  10,  10,  10,  10,  10,  10,  11,  11,  12,  12,  13,
         13,  13,  13,  13,  13,  13,  13,  13,  13,  14,  14,  14,  14,  14,
         14,  14,  14,  14,  14,  15,  15,  15,  16,  16,  16,  16,  16,  16,
         16,  16,  16,  16,  17,  17,  17,  17,  17,  17,  17,  17,  17,  17,
         18,  18,  19,  19,  19,  19,  19,  19,  19,  19,  19,  19,  20,  20,
         21,  21,  21,  21,  21,  21,  21,  21,  21,  22,  22,  22,  22,  22,
         22,  22,  22,  22,  22,  23,  24,  24,  24,  25,  25,  25,  25,  25,
         25,  25,  26,  27,  27,  27,  27,  28,  29,  29,  29,  29,  29,  29,
         29,  29,  29,  29,  30,  30,  30,  30,  31,  31,  31,  

In [245]:
ei = torch.as_tensor([[0, 0, 1, 2],
                              [0, 1, 2, 3]])
structured_negative_sampling(ei)

(tensor([0, 0, 1, 2]), tensor([0, 1, 2, 3]), tensor([2, 3, 3, 1]))

In [146]:
from collections import Counter
Counter(edge_index[0].tolist())

Counter({0: 1,
         1: 1,
         2: 1,
         3: 1,
         4: 1,
         5: 1,
         6: 1,
         7: 1,
         8: 1,
         9: 1,
         10: 1,
         11: 1,
         12: 1,
         13: 1,
         14: 1,
         15: 1,
         16: 1,
         17: 1,
         18: 1,
         19: 1,
         20: 1,
         21: 1,
         22: 1,
         23: 1,
         24: 1,
         25: 1,
         26: 1,
         27: 1,
         28: 1,
         29: 1,
         30: 1,
         31: 1,
         32: 1,
         33: 1,
         34: 1,
         35: 1,
         36: 1,
         37: 1,
         38: 1,
         39: 1,
         40: 1,
         41: 1,
         42: 1,
         43: 1,
         44: 1,
         45: 1,
         46: 1,
         47: 1,
         48: 1,
         49: 1,
         50: 1,
         51: 1,
         52: 1,
         53: 1,
         54: 1,
         55: 1,
         56: 1,
         57: 1,
         58: 1,
         59: 1,
         60: 1,
         61: 1,
         62: 1,
  

--------

In [None]:
directory = "../../../data/processed"
separate_date = "2022-01"

# Load data
reviews = pd.read_parquet(f"{directory}/reviews_with_interactions.parquet")
listings = pd.read_parquet(f"{directory}/listings_with_interactions.parquet")
reviewers = pd.read_parquet(f"{directory}/reviewers_with_interactions.parquet")

print("Full: ", len(reviews), len(listings), len(reviewers))
# Prepare data and graph
(
    train_reviews,
    train_listings,
    train_reviewers,
    test_reviews,
    test_listings,
    test_reviewers,
) = train_test_split(reviews, listings, reviewers, separate_date)
test_listings2dict = get_entity2dict(test_listings, "listing_id")
reverse_test_listings2dict = {k: v for v, k in test_listings2dict.items()}
test_reviewers2dict = get_entity2dict(test_reviewers, "reviewer_id")
reverse_test_reviewers2dict = {k: v for v, k in test_reviewers2dict.items()}

In [66]:
for batch in train_loader:
    print(batch)

HeteroData(
  [1mlisting[0m={
    x=[182, 159],
    input_id=[128],
    batch_size=128
  },
  [1muser[0m={ x=[850, 1] },
  [1m(user, rates, listing)[0m={
    edge_index=[2, 853],
    edge_label=[853],
    edge_label_index=[2, 853]
  },
  [1m(listing, rev_rates, user)[0m={ edge_index=[2, 917] }
)
HeteroData(
  [1mlisting[0m={
    x=[179, 159],
    input_id=[128],
    batch_size=128
  },
  [1muser[0m={ x=[874, 1] },
  [1m(user, rates, listing)[0m={
    edge_index=[2, 885],
    edge_label=[885],
    edge_label_index=[2, 885]
  },
  [1m(listing, rev_rates, user)[0m={ edge_index=[2, 956] }
)
HeteroData(
  [1mlisting[0m={
    x=[178, 159],
    input_id=[128],
    batch_size=128
  },
  [1muser[0m={ x=[948, 1] },
  [1m(user, rates, listing)[0m={
    edge_index=[2, 949],
    edge_label=[949],
    edge_label_index=[2, 949]
  },
  [1m(listing, rev_rates, user)[0m={ edge_index=[2, 1013] }
)
HeteroData(
  [1mlisting[0m={
    x=[173, 159],
    input_id=[128],
    batch_size

In [26]:
reviews.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1442687 entries, 0 to 1459535
Data columns (total 21 columns):
 #   Column                 Non-Null Count    Dtype              
---  ------                 --------------    -----              
 0   listing_id             1442687 non-null  object             
 1   id                     1442687 non-null  object             
 2   rating                 1442687 non-null  int64              
 3   comments               1442687 non-null  object             
 4   localized_comments     440951 non-null   object             
 5   response               165434 non-null   object             
 6   localized_response     40491 non-null    object             
 7   language               1442627 non-null  object             
 8   created_at             1442687 non-null  datetime64[ns, UTC]
 9   localized_date         1442687 non-null  object             
 10  reviewee_id            1442687 non-null  object             
 11  reviewee_first_name    1

In [30]:
listings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20799 entries, 0 to 24286
Columns: 162 entries, num_of_guest_capacity to listing_id
dtypes: category(132), datetime64[ns](1), float64(27), object(2)
memory usage: 7.6+ MB


In [44]:
reviewers.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1359391 entries, 0 to 1378156
Data columns (total 2 columns):
 #   Column                Non-Null Count    Dtype 
---  ------                --------------    ----- 
 0   reviewer_id           1359391 non-null  object
 1   reviewer_picture_url  1359391 non-null  object
dtypes: object(2)
memory usage: 31.1+ MB


In [45]:
reviews['created_at'].min()

Timestamp('2009-05-23 16:27:17+0000', tz='UTC')

In [46]:
reviews['created_at'].max()

Timestamp('2022-10-23 13:01:08+0000', tz='UTC')

In [2]:
data.num_nodes

1380190

In [None]:
df[col] = df[col].astype('category')

In [33]:
model.eval()
embeddings = model.inference(test_data.x_dict, test_data.edge_index_dict)
user_embeddings = embeddings["user"]
listing_embeddings = embeddings["listing"]

In [56]:
import numpy as np
import torch

from enum import Enum


class RECOMMENDENTATION_TYPE(Enum):
    USER_TO_ITEM = "USER_TO_ITEM"
    ITEM_TO_ITEM = "ITEM_TO_ITEM"


def get_entity2dict(df, id_col):
    entity2dict = {}

    for idx, _id in enumerate(df[id_col].to_list()):
        entity2dict[_id] = idx

    return entity2dict


def get_similar_listings_by_graph_embeddings(
    query_listing_idx, listing_embeddings, reverse_test_listings2dict, K
):
    """
    Generate the top-k closest listings with the first listing the reviewers has reviewed in terms of embeddings
    """
    query_listing_embedding = listing_embeddings[query_listing_idx]
    cos_t = torch.nn.CosineSimilarity(dim=1)(query_listing_embedding, listing_embeddings)
    top_ids = torch.argsort(-cos_t).numpy()
    query_listing_id = reverse_test_listings2dict[query_listing_idx]
    recommendation_list = []
    for index in top_ids:
        if len(recommendation_list) == K:
            break
        candidate_listing_id = reverse_test_listings2dict[index]
        if candidate_listing_id != query_listing_id:
            recommendation_list.append(candidate_listing_id)
    return recommendation_list


def get_recommended_listings_by_graph_embeddings(
    query_user_embedding, listing_embeddings, reverse_test_listings2dict, K
):
    """
    for each user, generate the top-k closest listing in terms of embeddings
    """
    cos_t = torch.nn.CosineSimilarity(dim=1)(query_user_embedding, listing_embeddings)
    top_ids = torch.argsort(-cos_t).numpy()[:K]
    def get_id_by_index(index):
        return reverse_test_listings2dict[index]
    recommendation_list = np.vectorize(get_id_by_index)(top_ids)
    return recommendation_list


import concurrent.futures



if __name__ == '__main__':
    main()

def prepare_evaluation_pairs(
    rec_type,
    test_reviews,
    test_listings,
    user_embeddings,
    listing_embeddings,
    test_listings2dict,
    reverse_test_listings2dict,
    test_reviewers2dict,
    reverse_test_reviewers2dict,
    K,
):
    # Generate (num_user, K) recommendation matrix
    # Generate (num_user, x) ground truth matrix where x is unsure
    recommendations = []
    ground_truths = []
    if rec_type == RECOMMENDENTATION_TYPE.USER_TO_ITEM.value:
        n_users = user_embeddings.shape[0]
        with concurrent.futures.ProcessPoolExecutor() as executor:
        for number, prime in zip(PRIMES, executor.map(is_prime, PRIMES)):
            print('%d is prime: %s' % (number, prime))
        for user_idx in range(n_users):
            print(user_idx)
            query_user_embedding = user_embeddings[user_idx]
            recommendation_list = get_recommended_listings_by_graph_embeddings(
                query_user_embedding, listing_embeddings, reverse_test_listings2dict, K
            )
            query_user_id = reverse_test_reviewers2dict[user_idx]
            ground_truth_list = list(
                test_reviews[test_reviews["reviewer_id"] == query_user_id]["listing_id"].values
            )
            recommendations.append(recommendation_list)
            ground_truths.append(ground_truth_list)

    elif rec_type == RECOMMENDENTATION_TYPE.ITEM_TO_ITEM.value:
        v = test_reviews["reviewer_id"].value_counts()
        reviews_that_user_interaction_more_than_once = test_reviews[
            test_reviews["reviewer_id"].isin(v.index[v.gt(1)])
        ]
        for reviewer_id in reviews_that_user_interaction_more_than_once["reviewer_id"].unique():
            user_interactions = reviews_that_user_interaction_more_than_once[
                reviews_that_user_interaction_more_than_once["reviewer_id"] == reviewer_id
            ]
            assert len(user_interactions) > 1
            user_first_interaction = user_interactions.sort_values(by="created_at").iloc[0]
            query_listing_id = user_first_interaction["listing_id"]
            query_listing_idx = test_listings2dict[query_listing_id]
            recommendation_list = get_similar_listings_by_graph_embeddings(
                query_listing_idx, listing_embeddings, reverse_test_listings2dict, K
            )
            ground_truth_list = user_interactions[
                ~user_interactions["id"].isin([user_first_interaction["id"]])
            ]["listing_id"].values

            assert len(ground_truth_list) != 0
            recommendations.append(recommendation_list)
            ground_truths.append(ground_truth_list)

    return np.array(recommendations), np.array(ground_truths)


def hit_rate(recommendations, ground_truths):
    n_users = recommendations.shape[0]
    hits = []

    for user_idx in range(n_users):
        recommendation = recommendations[user_idx]
        ground_truth = ground_truths[user_idx]
        if len(set(recommendation).intersection(ground_truth)) > 0:
            hit = 1
        else:
            hit = 0
        hits.append(hit)
    return np.array(hits).mean()


def evaluate_nn(
    user_embeddings,
    listing_embeddings,
    test_reviews,
    test_listings,
    test_listings2dict,
    reverse_test_listings2dict,
    test_reviewers2dict,
    reverse_test_reviewers2dict,
    K=10,
):

    u2i_recommendations, u2i_ground_truths = prepare_evaluation_pairs(
        RECOMMENDENTATION_TYPE.USER_TO_ITEM.value,
        test_reviews,
        test_listings,
        user_embeddings,
        listing_embeddings,
        test_listings2dict,
        reverse_test_listings2dict,
        test_reviewers2dict,
        reverse_test_reviewers2dict,
        K,
    )
    u2i_hit_rate = hit_rate(u2i_recommendations, u2i_ground_truths)
    i2i_hit_rate = None
#     i2i_recommendations, i2i_ground_truths = prepare_evaluation_pairs(
#         RECOMMENDENTATION_TYPE.ITEM_TO_ITEM.value,
#         test_reviews,
#         test_listings,
#         user_embeddings,
#         listing_embeddings,
#         test_listings2dict,
#         reverse_test_listings2dict,
#         test_reviewers2dict,
#         reverse_test_reviewers2dict,
#         K,
#     )
#     i2i_hit_rate = hit_rate(i2i_recommendations, i2i_ground_truths)

    return u2i_hit_rate, i2i_hit_rate


In [57]:
u2i_hit_rate = evaluate_nn(
        user_embeddings,
        listing_embeddings,
        test_reviews,
        test_listings,
        test_listings2dict,
        reverse_test_listings2dict,
        test_reviewers2dict,
        reverse_test_reviewers2dict,
)
u2i_hit_rate

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185


KeyboardInterrupt: 

In [58]:
cos_t = torch.nn.CosineSimilarity(dim=1)(user_embeddings[0], listing_embeddings)
top_ids = torch.argsort(-cos_t).numpy()
print(cos_t)
print(-cos_t)
top_ids

tensor([0.1113, 0.1113, 0.1113,  ..., 0.1113, 0.1113, 0.1113],
       grad_fn=<SumBackward1>)
tensor([-0.1113, -0.1113, -0.1113,  ..., -0.1113, -0.1113, -0.1113],
       grad_fn=<NegBackward0>)


array([13881, 16971, 10351, ..., 16445, 17744,   746])

In [None]:
i2i_hit_rate = evaluate_nn(
        user_embeddings,
        listing_embeddings,
        test_reviews,
        test_listings,
        test_listings2dict,
        reverse_test_listings2dict,
        test_reviewers2dict,
        reverse_test_reviewers2dict,
)
i2i_hit_rate

8940811.0