In [1]:
import wandb
import pandas as pd
from loguru import logger
import torch
import torch.nn.functional as F
from datetime import datetime as dt
import os
from dateutil.relativedelta import relativedelta  # type: ignore
import functools

from process_data import *
from constants import *
from unsup_model import *
from evaluate import *

def get_entity2dict(df, id_col):
    entity2dict = {}

    for idx, _id in enumerate(df[id_col].to_list()):
        entity2dict[_id] = idx

    return entity2dict

wandb.login()

start_date = dt.strptime("2021-10-24", "%Y-%m-%d").date()
if start_date == dt.strptime(MAX_START_DATE, "%Y-%m-%d").date():
    raise Exception("Stop Simulation")
end_date, nxt_start_date = split_date_by_period_months(start_date, TOTAL_MONTHS_PER_ITERATION)
print(start_date, end_date)
directory = "/Users/yhchan/Downloads/FYP/data/processed"
reviews = pd.read_parquet(f"{directory}/reviews_with_interactions.parquet")
listings = pd.read_parquet(f"{directory}/listings_with_interactions.parquet")

config = {
        "architecture": "Unsupervised GraphSAGE",
        "start_date": start_date,
        "end_date": end_date,
        "learning_rate": 0.01,
        "hidden_channels": 64,
        "train_batch_size": 128,
        "test_batch_size": 128,
        "epochs": 50,
        "train_num_neighbours": [10, 10],
        "test_num_neighbours": [-1],
        "train_split_period_months": 10,
        "total_months_of_data": TOTAL_MONTHS_PER_ITERATION,
        "rec_K":10

    }

wandb.init(
    project=PROJECT_NAME,
    config=config,
)
wandb.define_metric("train_loss", step_metric="epoch", summary="min")
wandb.define_metric("test_loss", step_metric="epoch", summary="min")

# Split into train, test and test for cold start scenario
(
    train_reviews,
    train_listings,
    train_reviewers,
    test_reviews,
    test_listings,
    test_reviewers,
) = main_train_test(
    reviews,
    listings,
    start_date,
    end_date,
    config["train_split_period_months"],
)

cold_start_test_reviews = filter_test_data_by_scenario(
    train_reviews, test_reviews, "reviewer_id", "cold_start_new_user"
)
cold_start_test_listings, cold_start_test_reviewers = build_partitioned_data(
    cold_start_test_reviews, listings
)
# Build Graph
involved_reviews = pd.concat([train_reviews, test_reviews])
involved_listings, involved_reviewers = build_partitioned_data(involved_reviews, listings)
involved_data = build_heterograph(involved_reviews, involved_listings, involved_reviewers, True)
train_data = build_heterograph(train_reviews, train_listings, train_reviewers, True)
test_data = build_heterograph(test_reviews, test_listings, test_reviewers, True)
cold_start_test_data = build_heterograph(cold_start_test_reviews, cold_start_test_listings, cold_start_test_reviewers, True)
print("Whole Graph", involved_data)
print("Training Heterogenous Graph", train_data)
print("Test Heterogenous Graph", test_data)
print("Test Heterogenous Graph (Cold Start Scenerio)", cold_start_test_data)

involved_listings2dict = get_entity2dict(involved_listings, "listing_id")
reverse_involved_listings2dict = {k: v for v, k in involved_listings2dict.items()}

metadata_dict = {
    "num_reviews": len(involved_reviews),
    "num_train_reviews": len(train_reviews),
    "num_test_reviews": len(test_reviews),
    "num_cold_start_test_reviews":len(cold_start_test_reviews),
    
    "num_unique_listings": len(involved_listings),
    "num_unique_train_listings": len(train_listings),
    "num_unique_test_listings": len(test_listings),
    "num_unique_cold_start_test_listings":len(cold_start_test_listings),
    
    "num_unique_reviewers": len(involved_reviewers),
    "num_unique_train_reviewers": len(train_reviewers),
    "num_unique_test_reviewers": len(test_reviewers),
    "num_unique_cold_start_test_reviewers":len(cold_start_test_reviewers),

}

wandb.log(metadata_dict)
train_reviews.to_parquet("train/train_reviews.parquet", index=False)
train_listings.to_parquet("train/train_listings.parquet", index=False)
train_reviewers.to_parquet("train/train_reviewers.parquet", index=False)
test_reviews.to_parquet("test/test_reviews.parquet", index=False)
test_listings.to_parquet("test/test_listings.parquet", index=False)
test_reviewers.to_parquet("test/test_reviewers.parquet", index=False)
cold_start_test_reviews.to_parquet("test/cold_start_test_reviews.parquet", index=False)
cold_start_test_listings.to_parquet("test/cold_start_test_listings.parquet", index=False)
cold_start_test_reviewers.to_parquet("test/cold_start_test_reviewers.parquet", index=False)

dataset_art = wandb.Artifact(f"{start_date}_{end_date}_data", type="dataset")
for dir in ["train", "test"]:
    dataset_art.add_dir(dir)
wandb.log_artifact(dataset_art)

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


2021-10-24 2022-10-23


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.0167176583333332, max=1.0))…

2023-04-26 19:21:41.948 | INFO     | process_data:main_train_test:140 - Split df into train and test portion
  temp = torch.from_numpy(val).view(-1, 1).to(torch.float32)


Whole Graph HeteroData(
  [1mlisting[0m={ x=[18523, 158] },
  [1muser[0m={ x=[394551, 384] },
  [1m(user, rates, listing)[0m={
    edge_index=[2, 408596],
    edge_label=[408596],
    edge_label_index=[2, 408596]
  },
  [1m(listing, rev_rates, user)[0m={ edge_index=[2, 408596] }
)
Training Heterogenous Graph HeteroData(
  [1mlisting[0m={ x=[17229, 158] },
  [1muser[0m={ x=[324135, 384] },
  [1m(user, rates, listing)[0m={
    edge_index=[2, 334678],
    edge_label=[334678],
    edge_label_index=[2, 334678]
  },
  [1m(listing, rev_rates, user)[0m={ edge_index=[2, 334678] }
)
Test Heterogenous Graph HeteroData(
  [1mlisting[0m={ x=[14380, 158] },
  [1muser[0m={ x=[72447, 384] },
  [1m(user, rates, listing)[0m={
    edge_index=[2, 73918],
    edge_label=[73918],
    edge_label_index=[2, 73918]
  },
  [1m(listing, rev_rates, user)[0m={ edge_index=[2, 73918] }
)
Test Heterogenous Graph (Cold Start Scenerio) HeteroData(
  [1mlisting[0m={ x=[14254, 158] },
  [1muser

[34m[1mwandb[0m: Adding directory to artifact (./train)... Done. 5.1s
[34m[1mwandb[0m: Adding directory to artifact (./test)... Done. 2.9s


<wandb.sdk.wandb_artifacts.Artifact at 0x336c4ed60>

In [2]:
# Modelling
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_data = train_data.to(device)
train_loader = prepare_data_loader(
    data=train_data,
    batch_size=config["train_batch_size"],
    num_neighbours=config["train_num_neighbours"],
)
test_loader = prepare_data_loader(
    data=test_data,
    batch_size=config["test_batch_size"],
    num_neighbours=config["test_num_neighbours"],
)
model = Unsupervised_Model(hidden_channels=config["hidden_channels"], data=involved_data).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=config["learning_rate"])

def train():
    model.train(True)
    total_loss = 0
    # Why using mini-batch gradient descent
    # Update NN multiple times every epoch, Make more precise update to the parameters by calculating the average loss in each step
    # Reduce overall training time and num of required epochs for reaching convergence, computational efficiency
    for batch in train_loader:
        batch = batch.to(device)
        # Zero gradients for every batch
        optimizer.zero_grad()
        # Make predictions for this batch
        h = model(batch.x_dict, batch.edge_index_dict)
        h_src = h["user"][batch["user", "listing"].edge_label_index[0]]
        h_dst = h["listing"][batch["user", "listing"].edge_label_index[1]]
        pred = (h_src * h_dst).sum(dim=-1)
        # Compute the loss and its gradients
        loss = F.binary_cross_entropy_with_logits(pred, batch["user", "listing"].edge_label)
        loss.backward()
        # Adjust learning weights
        optimizer.step()
        total_loss += float(loss) * pred.size(0)

    train_loss = total_loss / train_data.num_nodes
    return train_loss

@torch.no_grad()
def test(test_data_loader, test_data, model):
    model.eval()
    total_loss = 0
    for batch in test_data_loader:
        batch = batch.to(device)
        # Make predictions for this batch
        h = model(batch.x_dict, batch.edge_index_dict)
        h_src = h["user"][batch["user", "listing"].edge_label_index[0]]
        h_dst = h["listing"][batch["user", "listing"].edge_label_index[1]]
        pred = (h_src * h_dst).sum(dim=-1)
        # Compute the loss and its gradients
        loss = F.binary_cross_entropy_with_logits(pred, batch["user", "listing"].edge_label)
        total_loss += float(loss) * pred.size(0)

    test_loss = total_loss / test_data.num_nodes
    return test_loss


best_train_loss = float("inf")
best_test_loss = float("inf")
model_prefix = "./unsupervised_models"

# Train and Evaluate Loss
test_wrapper = functools.partial(test, test_loader, test_data)
for epoch in range(1, wandb.config["epochs"] + 1):
    model_is_best = False
    train_loss = train()
    test_loss = test_wrapper(model)

    if train_loss < best_train_loss:
        wandb.run.summary["best_train_loss"] = train_loss
        best_train_loss = train_loss

    if test_loss < best_test_loss:
        wandb.run.summary["best_test_loss"] = test_loss
        best_test_loss = test_loss
        model_is_best = True

    metrics_dict = {
        "train_loss": train_loss,
        "test_loss": test_loss,
        "epoch": epoch,
    }
    wandb.log(metrics_dict)
    logger.info(
        f"Epoch: {epoch:03d}, Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f} "
    )
    
    model_path = f"{model_prefix}/{epoch}_model_state_dict.pt"
    torch.save(model.state_dict(), model_path)
    model_art = wandb.Artifact(f"{MODEL_NAME}_epoch_epoch", type="model")
    model_art.add_file(model_path)
    wandb.log_artifact(
        model_art,
        aliases=[
            "BEST",
        ]
        if model_is_best
        else None,
    )
        
logger.info("End of Training")


2023-04-26 19:22:54.819 | INFO     | __main__:<module>:87 - Epoch: 001, Train Loss: 2.0640, Test Loss: 0.6423 
2023-04-26 19:23:21.718 | INFO     | __main__:<module>:87 - Epoch: 002, Train Loss: 0.6083, Test Loss: 0.5620 
2023-04-26 19:23:47.234 | INFO     | __main__:<module>:87 - Epoch: 003, Train Loss: 1.0001, Test Loss: 1.1732 
2023-04-26 19:24:12.500 | INFO     | __main__:<module>:87 - Epoch: 004, Train Loss: 1.0541, Test Loss: 1.7503 
2023-04-26 19:24:40.314 | INFO     | __main__:<module>:87 - Epoch: 005, Train Loss: 1.0036, Test Loss: 1.8673 
2023-04-26 19:25:05.544 | INFO     | __main__:<module>:87 - Epoch: 006, Train Loss: 1.0052, Test Loss: 1.7289 
2023-04-26 19:25:34.652 | INFO     | __main__:<module>:87 - Epoch: 007, Train Loss: 1.0057, Test Loss: 1.5686 
2023-04-26 19:26:02.474 | INFO     | __main__:<module>:87 - Epoch: 008, Train Loss: 1.0051, Test Loss: 1.3469 
2023-04-26 19:26:27.570 | INFO     | __main__:<module>:87 - Epoch: 009, Train Loss: 1.3591, Test Loss: 1.3173 
2

KeyboardInterrupt: 