In [None]:
import os
from collections import Counter
import numpy as np
import pandas as pd
from scipy import interp
import networkx as nx
from sklearn import metrics
from statistics import mean
from sklearn.preprocessing import MinMaxScaler
import logging
import warnings
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

import textwrap
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.collections import LineCollection
from matplotlib.colors import ListedColormap, BoundaryNorm, LinearSegmentedColormap
import numpy as np
from sklearn.decomposition import PCA
from matplotlib.cm import ScalarMappable


from IPython.display import HTML, display
import tabulate

import sys
sys.path.append("../src/models/")
import tgn_viz
import tgn
from src.tgn_viz.utils.data_processing import Data

# Silence matplotlib's font-manager logger so it does not flood cell output.
logging.getLogger('matplotlib.font_manager').disabled = True

sns.set_theme()
sns.set_style("whitegrid", {"grid.color": ".6", "grid.linestyle": ":"})

warnings.filterwarnings("ignore", category=DeprecationWarning)

# Project colour scheme, used both as the seaborn default palette and as an
# explicit `palette=` argument further down the notebook.
colors = ["#346178",  # blue 52,97,120
          "#FC611F",  # orange 252,97,31
          "#5C6547",  # green
          "#FDC10E",  # yellow
          "#8EC3D9",  # light blue
          "#EA99A2",  # pink
          "#A65C7B",
          "#5CA69A",  # teal
          ]
# sns.set_palette() returns None, so the palette object has to be built first
# and kept; otherwise `customPalette` would silently be None.
customPalette = sns.color_palette(colors)
sns.set_palette(customPalette)

def inter_from_256(x):
    """Linearly rescale a 0-255 colour channel value to the 0-1 range.

    Values outside [0, 255] are clamped by ``np.interp`` to 0 or 1.
    """
    channel_range = [0, 255]
    unit_range = [0, 1]
    return np.interp(x, channel_range, unit_range)

# Anchor colours (RGB, 0-255) of the custom colormap, listed from the low end
# to the high end of the scale.
rgb_list = [[52,97,120], [92, 166, 154], [252,97,31], [253, 193, 14]]

# Transpose the anchors into one list per colour channel.
all_red, all_green, all_blue = (list(channel) for channel in zip(*rgb_list))

# The anchors are spread evenly over [0, 1]; each channel becomes a tuple of
# (position, value_below, value_above) rows as LinearSegmentedColormap expects.
n_section = len(all_red) - 1
red, green, blue = (
    tuple((1/n_section*i, inter_from_256(v), inter_from_256(v)) for i, v in enumerate(channel))
    for channel in (all_red, all_green, all_blue)
)
cdict = {'red': red, 'green': green, 'blue': blue}
new_cmap = LinearSegmentedColormap('new_cmap', segmentdata=cdict)

# General statistics

In [None]:
def degree_distribution(dataset_name):
    """Plot the log-log degree distribution of user and item nodes.

    Reads ``../data/interim/<name>/ml_<name>.csv`` (``<name>`` is the
    lower-cased ``dataset_name``), builds an undirected graph from its
    ``u``/``i`` columns and draws one frequency-vs-degree curve per node set.
    Repeated (u, i) rows collapse into a single edge because ``nx.Graph`` is
    not a multigraph.
    """
    filename = dataset_name.lower()
    df = pd.read_csv(f"../data/interim/{filename}/ml_{filename}.csv", index_col=0)

    A = set(df.u.tolist())
    B = set(df.i.tolist())

    G = nx.Graph()
    G.add_nodes_from(A, bipartite=0)
    G.add_nodes_from(B, bipartite=1)
    G.add_edges_from(df[['u', 'i']].values)

    plt.figure(figsize=(10, 7))

    node_degree = G.degree()
    curves = (
        (A, [52/255,97/255,120/255]),   # user nodes
        (B, [252/255,97/255,31/255]),   # item nodes
    )
    for node_set, curve_color in curves:
        counts = Counter(node_degree[x] for x in node_set)
        # Dense histogram indexed by degree: position k holds the number of
        # nodes with degree k (zero where no node has that degree).
        hist = [counts.get(k, 0) for k in range(max(counts) + 1)]
        plt.loglog(range(len(hist)), hist, 'o-', color=curve_color, alpha=0.7)

    plt.xlabel('Degree', fontsize=15)
    plt.ylabel('Frequency', fontsize=15)
    plt.legend(["user", "item"], prop={'size': 15})
    plt.title(f"Degree distribution of {dataset_name}", fontsize=15)
    plt.show()

In [None]:
# Draw the user/item degree distribution for each of the four datasets.
names = ['LastFM', 'MOOC', 'Reddit', 'Wikipedia']
for name in names:
    degree_distribution(name)

In [None]:
# distinct degrees in time
def distinct_degrees_time_plot(dataset_name):
    """Plot how the number of distinct neighbours of each node grows over time.

    Reads ``../data/interim/<name>/ml_<name>.csv`` (``<name>`` is the
    lower-cased ``dataset_name``) and keeps only the first occurrence of every
    (u, i) pair. The left panel shows one curve per item node and the right
    panel one per user node, each overlaid with the mean curve and a +/- one
    standard deviation band. The figure is saved to
    ``../reports/figures/distinct_degrees_time_plot_<dataset_name>.png``.

    Raises:
        KeyError: if ``dataset_name`` has no entry in the ``alphas`` table.
    """
    # Per dataset: [item line alpha, item band alpha, user line alpha, user band alpha]
    alphas = {'LastFM':[0.02,0.3,0.02, 0.3],
              'MOOC':[0.05,0.3,0.005, 0.3],
              'Reddit':[0.02,0.3,0.015, 0.4],
              'Wikipedia':[0.02,0.3,0.015, 0.4]}

    filename = dataset_name.lower()
    path = f"../data/interim/{filename}/ml_{filename}.csv"
    df = pd.read_csv(path, index_col=0)

    item_line_alpha, item_band_alpha, user_line_alpha, user_band_alpha = alphas[dataset_name]

    # Dropping repeated (u, i) pairs once up front gives the same rows per
    # node as de-duplicating inside every per-node slice.
    distinct_edges = df.drop_duplicates(subset=['u', 'i'], keep='first')

    fig, [ax_1, ax_2] = plt.subplots(1,2, figsize=(12,4))

    # Common time grid on which every node curve is resampled before averaging.
    base_points = np.linspace(df.ts.min(), df.ts.max(), 1001)

    panels = [
        dict(ax=ax_1, column='i',
             line_alpha=item_line_alpha, band_alpha=item_band_alpha,
             line_color=[252/255,97/255,31/255],
             mean_color=[206/255,91/255,45/255],
             band_color=[252/255,97/255,31/255],
             title=f"Degree development of item nodes\n ({dataset_name})"),
        dict(ax=ax_2, column='u',
             line_alpha=user_line_alpha, band_alpha=user_band_alpha,
             line_color=[52/255,97/255,120/255],
             mean_color=[52/255,97/255,120/255],
             band_color=[143/255,195/255,216/255],
             title=f"Degree development of user nodes\n ({dataset_name})"),
    ]

    for panel in panels:
        ax = panel['ax']
        funs = []
        # One groupby pass instead of a boolean filter of the whole frame per
        # node; row order inside each group follows the file.
        for _, node_edges in tqdm(distinct_edges.groupby(panel['column'], sort=False)):
            ts = node_edges.ts.values
            # 0-based running count of the node's edges (the positional index
            # the original code plotted).
            # NOTE(review): the first interaction maps to 0, not 1 - confirm
            # that this offset is intended.
            running_count = np.arange(len(ts))
            ax.plot(ts, running_count, alpha=panel['line_alpha'], color=panel['line_color'])
            fun = np.interp(base_points, ts, running_count)
            fun[0] = 0.0
            funs.append(fun)

        funs = np.array(funs)
        mean_funs = funs.mean(axis=0)
        std = funs.std(axis=0)

        funs_upper = mean_funs + std
        funs_lower = np.maximum(mean_funs - std, 0)  # a count cannot be negative

        ax.plot(base_points, mean_funs, color=panel['mean_color'], lw=2)
        ax.fill_between(base_points, funs_lower, funs_upper,
                        color=panel['band_color'], alpha=panel['band_alpha'])
        ax.set_title(panel['title'], fontsize=14)

    plt.savefig(f"../reports/figures/distinct_degrees_time_plot_{dataset_name}.png")
    plt.show()

In [None]:
# Render (and save) the distinct-degree development figure for each dataset.
names = ['LastFM', 'MOOC', 'Reddit', 'Wikipedia']
for name in names:
    distinct_degrees_time_plot(name)

In [None]:
# degrees in time, counting every edge (repeated (u, i) pairs included)
def degrees_time_plot(dataset_name):
    """Plot how the number of interactions of each node grows over time.

    Reads ``../data/interim/<name>/ml_<name>.csv`` (``<name>`` is the
    lower-cased ``dataset_name``) and counts every row, repeated (u, i) pairs
    included. The left panel shows one curve per item node and the right panel
    one per user node, each overlaid with the mean curve and a +/- one
    standard deviation band. The figure is saved to
    ``../reports/figures/degrees_time_plot_<dataset_name>.png``.

    Raises:
        KeyError: if ``dataset_name`` has no entry in the ``alphas`` table.
    """
    # Per dataset: [item line alpha, item band alpha, user line alpha, user band alpha]
    alphas = {'LastFM':[0.025,0.3,0.04, 0.3],
              'MOOC':[0.06,0.3,0.01, 0.4],
              'Reddit':[0.03,0.3,0.02, 0.4],
              'Wikipedia':[0.03,0.3,0.02, 0.4]}

    filename = dataset_name.lower()
    path = f"../data/interim/{filename}/ml_{filename}.csv"
    df = pd.read_csv(path, index_col=0)

    item_line_alpha, item_band_alpha, user_line_alpha, user_band_alpha = alphas[dataset_name]

    fig, [ax_1, ax_2] = plt.subplots(1,2, figsize=(12,4))

    # Common time grid on which every node curve is resampled before averaging.
    base_points = np.linspace(df.ts.min(), df.ts.max(), 1001)

    panels = [
        dict(ax=ax_1, column='i',
             line_alpha=item_line_alpha, band_alpha=item_band_alpha,
             line_color=[252/255,97/255,31/255],
             mean_color=[206/255,91/255,45/255],
             band_color=[252/255,97/255,31/255],
             title=f"Degree development of item nodes\n ({dataset_name}, including duplicate edges)"),
        dict(ax=ax_2, column='u',
             line_alpha=user_line_alpha, band_alpha=user_band_alpha,
             line_color=[52/255,97/255,120/255],
             mean_color=[52/255,97/255,120/255],
             band_color=[143/255,195/255,216/255],
             title=f"Degree development of user nodes\n ({dataset_name}, all edges)"),
    ]

    for panel in panels:
        ax = panel['ax']
        funs = []
        # One groupby pass instead of a boolean filter of the whole frame per
        # node; row order inside each group follows the file.
        for _, node_edges in tqdm(df.groupby(panel['column'], sort=False)):
            ts = node_edges.ts.values
            # 0-based running count of the node's edges (the positional index
            # the original code plotted).
            # NOTE(review): the first interaction maps to 0, not 1 - confirm
            # that this offset is intended.
            running_count = np.arange(len(ts))
            ax.plot(ts, running_count, alpha=panel['line_alpha'], color=panel['line_color'])
            fun = np.interp(base_points, ts, running_count)
            fun[0] = 0.0
            funs.append(fun)

        funs = np.array(funs)
        mean_funs = funs.mean(axis=0)
        std = funs.std(axis=0)

        funs_upper = mean_funs + std
        funs_lower = np.maximum(mean_funs - std, 0)  # a count cannot be negative

        ax.plot(base_points, mean_funs, color=panel['mean_color'], lw=2)
        ax.fill_between(base_points, funs_lower, funs_upper,
                        color=panel['band_color'], alpha=panel['band_alpha'])
        ax.set_title(panel['title'], fontsize=14)

    plt.savefig(f"../reports/figures/degrees_time_plot_{dataset_name}.png")
    plt.show()

In [None]:
# Render (and save) the all-edges degree development figure for each dataset.
names = ['LastFM', 'MOOC', 'Reddit', 'Wikipedia']
for name in names:
    degrees_time_plot(name)

In [None]:
# Per-dataset summary: node counts, unique vs. total edges, edge repetition, density

In [None]:
def graph_statistics(dataset_name):
    """Display a table of basic size and density statistics for one dataset.

    Reads ``../data/interim/<name>/ml_<name>.csv`` (``<name>`` is the
    lower-cased ``dataset_name``) and renders an HTML table with node counts,
    unique and total edge counts, the edge repetition share, the average
    number of rows per unique edge, the number of possible user-item pairs
    and the resulting density.
    """
    filename = dataset_name.lower()
    path = f"../data/interim/{filename}/ml_{filename}.csv"
    df = pd.read_csv(path, index_col=0)

    edges = df[['u', 'i']]
    unique_edges = edges.drop_duplicates().shape[0]
    all_edges = edges.shape[0]
    # Series.append was removed in pandas 2.0; pd.concat is the replacement.
    unique_nodes = pd.concat([df.u, df.i]).unique().shape[0]
    unique_user_nodes = df.u.unique().shape[0]
    unique_item_nodes = df.i.unique().shape[0]

    # Average number of rows (interactions) per distinct (u, i) pair.
    average_edge_duplication = all_edges / unique_edges

    # Every user could in principle be linked to every item.
    all_possible_edges = unique_user_nodes * unique_item_nodes

    # Share of rows that repeat an already seen (u, i) pair.
    edge_repetition = 1 - (unique_edges/all_edges)

    density = unique_edges / all_possible_edges

    table = [[f"{dataset_name} statistics"],
             ["Nodes", "User nodes", "Item nodes", "Unique edges",
              "All edges", "Edge repetition", "Average edge duplication ratio",
              "All possible edges", "Density"],
             [unique_nodes, unique_user_nodes, unique_item_nodes, unique_edges,
              all_edges, edge_repetition, average_edge_duplication,
              all_possible_edges, density]]

    display(HTML(tabulate.tabulate(table, tablefmt='html')))
    print()


In [None]:
# Show the summary statistics table for each dataset.
names = ['LastFM', 'MOOC', 'Reddit', 'Wikipedia']
for name in names:
    graph_statistics(name)

# Experiments

In [None]:
def _plot_mean_band(ax, grid, curves, line_color, band_color):
    """Draw the point-wise mean of ``curves`` on ``ax`` with a +/- one std band."""
    curves = np.array(curves)
    mean_curve = curves.mean(axis=0)
    std = curves.std(axis=0)

    upper = np.minimum(mean_curve + std, 1)  # rates and precision cannot exceed 1
    lower = mean_curve - std

    ax.plot(grid, mean_curve, color=line_color, lw=2)
    ax.fill_between(grid, lower, upper, color=band_color, alpha=0.3)


def _auc_boxplot(scores, title, figure_path, n_models):
    """Draw, save and show a per-model box plot of AUC values with a 0.5 reference line."""
    plt.figure(figsize = (7,5))
    ax = sns.boxplot(x="model", y="auc",
                     data=scores,
                     linewidth=2.5,
                     color=[52/255,97/255,120/255],
                     width=0.5)
    # The reference line spans all category slots of the box plot.
    ax.plot([-0.5, n_models - 0.5], [0.5, 0.5],'--', color = [252/255,97/255,31/255])
    ax.set_xlabel("Model")
    ax.set_ylabel("AUC")
    plt.title(title, fontsize=14)
    plt.savefig(figure_path)
    plt.show()


def plot_results(test, data_name):
    """Plot ROC/PR curves per model and box plots of the AUC scores.

    Args:
        test: DataFrame with a ``ground_truth`` column, one prediction column
            per run listed in ``col_names`` and the static score columns
            ``pa``, ``nm``, ``ra`` and ``dist``.
        data_name: dataset name used in titles and in the saved figure names
            (``../reports/figures/models_{roc,pr}_auc_<data_name>.png``).

    Returns:
        A copy of ``test`` whose static score columns are min-max scaled with
        missing values replaced by 0.
    """
    col_names = {"TGN \n(no time)": ['tgn_tgn_ablation_time_pred_run_0', 'tgn_tgn_ablation_time_pred_run_1',
                                               'tgn_tgn_ablation_time_pred_run_2', 'tgn_tgn_ablation_time_pred_run_3',
                                               'tgn_tgn_ablation_time_pred_run_4'],
                "TGN": ['tgn_tgn_pred_run_0',
                       'tgn_tgn_pred_run_1', 'tgn_tgn_pred_run_2', 'tgn_tgn_pred_run_3',
                       'tgn_tgn_pred_run_4'],
                "Jodie \n(no time)": ['jodie_jodie_ablation_time_pred_run_0',
                                               'jodie_jodie_ablation_time_pred_run_1',
                                               'jodie_jodie_ablation_time_pred_run_2',
                                               'jodie_jodie_ablation_time_pred_run_3',
                                               'jodie_jodie_ablation_time_pred_run_4'],
                "Jodie": ['jodie_jodie_pred_run_0',
                           'jodie_jodie_pred_run_1', 'jodie_jodie_pred_run_2',
                           'jodie_jodie_pred_run_3', 'jodie_jodie_pred_run_4'],
                "DyRep \n(no time)": ['dyrep_dyrep_ablation_time_pred_run_0',
                                               'dyrep_dyrep_ablation_time_pred_run_1',
                                               'dyrep_dyrep_ablation_time_pred_run_2',
                                               'dyrep_dyrep_ablation_time_pred_run_3',
                                               'dyrep_dyrep_ablation_time_pred_run_4'],
                "DyRep": ['dyrep_dyrep_pred_run_0',
                           'dyrep_dyrep_pred_run_1', 'dyrep_dyrep_pred_run_2',
                           'dyrep_dyrep_pred_run_3', 'dyrep_dyrep_pred_run_4'],
                "Pref. \nAttach.":['pa'],
                "Neigh. \nMeasure":['nm'],
                # NOTE(review): the label says Katz but the column is 'ra' - confirm the mapping.
                "Katz":['ra'],
                "Algebraic \nDist.":['dist']
                #"Common Ego Edges": ['cee']
                }

    curve_color = [52/255,97/255,120/255]
    band_color = [143/255,195/255,216/255]
    reference_color = [252/255,97/255,31/255]

    df = test.copy()

    # Bring the static scores onto [0, 1] so they can be read like the model
    # outputs; rows without a score get 0. Assigning the filled Series avoids
    # the chained in-place fillna, which is unreliable on recent pandas.
    for col_pred in ['pa', 'nm', 'ra', 'dist']:
        scaled = MinMaxScaler().fit_transform(test[[col_pred]])
        df[col_pred] = pd.Series(scaled[:, 0], index=df.index).fillna(0)

    # Common grid on which each run's curve is resampled before averaging.
    base_grid = np.linspace(0, 1, 101)

    # Collected as plain records; the frames are built once at the end
    # (DataFrame.append no longer exists in pandas 2.x).
    roc_records = []
    pr_records = []

    for title, cols in col_names.items():
        fig, [ax_1, ax_2] = plt.subplots(1,2, figsize=(12,4))
        roc_aucs, roc_curves = [], []
        pr_aucs, pr_curves = [], []

        for pred in cols:
            # Rows without a prediction for this run are left out.
            mask = df[pred].notna()
            print(f"Skipping {(~mask).sum()} new edges...")
            pred_prob = df.loc[mask, pred].values
            label_col = df.loc[mask, 'ground_truth'].values

            # ROC curve of this run
            fpr, tpr, _ = metrics.roc_curve(label_col, pred_prob)
            ax_1.plot(fpr, tpr, color=curve_color, alpha=0.15)
            roc_auc = metrics.roc_auc_score(label_col, pred_prob)
            roc_aucs.append(roc_auc)
            roc_records.append({"model": title, "auc": roc_auc})

            tpr_on_grid = np.interp(base_grid, fpr, tpr)
            tpr_on_grid[0] = 0.0
            roc_curves.append(tpr_on_grid)

            # PR curve of this run
            precision, recall, _ = metrics.precision_recall_curve(label_col, pred_prob)
            pr_auc = metrics.auc(recall, precision)
            pr_aucs.append(pr_auc)
            pr_records.append({"model": title, "auc": pr_auc})
            ax_2.plot(recall, precision, color=curve_color, alpha=0.15)

            # precision_recall_curve returns recall in decreasing order, while
            # np.interp needs increasing x values, hence the reversal.
            precision_on_grid = np.interp(base_grid, recall[::-1], precision[::-1])
            precision_on_grid[0] = 1.0
            pr_curves.append(precision_on_grid)

        _plot_mean_band(ax_1, base_grid, roc_curves, curve_color, band_color)
        ax_1.plot([0, 1], [0, 1],'--', color=reference_color)
        ax_1.set_xlim([-0.01, 1.01])
        ax_1.set_ylim([-0.01, 1.01])
        ax_1.set_ylabel('True Positive Rate')
        ax_1.set_xlabel('False Positive Rate')
        ax_1.set_title(f"{title} ROC curve, AUC: {np.mean(roc_aucs):.4f}")

        _plot_mean_band(ax_2, base_grid, pr_curves, curve_color, band_color)
        # NOTE(review): a 0.5 precision reference assumes as many positive as
        # negative test edges - confirm.
        ax_2.plot([0, 1], [0.5, 0.5],'--', color=reference_color)
        ax_2.set_xlim([-0.01, 1.01])
        ax_2.set_ylim([-0.01, 1.01])
        ax_2.set_xlabel('Recall')
        ax_2.set_ylabel('Precision')
        ax_2.set_title(f"{title} PR curve, AUC: {np.mean(pr_aucs):.4f}")
        plt.show()

    rocauc_scores = pd.DataFrame(roc_records, columns=["model", "auc"])
    prauc_scores = pd.DataFrame(pr_records, columns=["model", "auc"])

    _auc_boxplot(rocauc_scores,
                 f"ROC AUC scores for {data_name}",
                 f"../reports/figures/models_roc_auc_{data_name}.png",
                 len(col_names))
    _auc_boxplot(prauc_scores,
                 f"PR AUC scores for {data_name}",
                 f"../reports/figures/models_pr_auc_{data_name}.png",
                 len(col_names))

    return df

In [None]:
# For each dataset: gather the test labels plus every stored prediction into
# one frame, then draw the ROC/PR figures.
for data_name in ['lastfm', 'mooc', 'wikipedia', 'reddit']:
    print(data_name)
    test = pd.read_csv(os.path.join("../data/processed", f"{data_name}/test/ml_{data_name}.csv"), index_col=0)

    # add embedding methods: each file with "pred" in its name becomes a
    # column named after the part of the file name before the first dot
    for root, dirs, files in os.walk(f"../data/results/{data_name}"):
        for file in files:
            if "pred" not in file:
                continue
            test[os.path.basename(file).split('.')[0]] = np.load(os.path.join(root, file))

    # add static methods
    test_static = pd.read_csv(f"../data/results/{data_name}/static_result_{data_name}.csv", index_col=0)
    test[['pa', 'nm', 'ra', 'dist']] = test_static[['pa', 'nm', 'ra', 'dist']]

    # test_cee = pd.read_csv(f"../data/results/{data_name}/static_result_cee_{data_name}.csv", index_col=0)
    # test[['cee']] = test_cee[['cee']]

    df = plot_results(test, data_name)

# Learning curve

In [None]:
def plot_learning_curve(dataset_name):
    """Plot ROC AUC and PR AUC of TGN, Jodie and DyRep across the data splits.

    Every folder in ``../data/results`` whose name starts with
    ``<dataset_name>_`` is treated as one split. For each split and model the
    stored ``.npy`` predictions of all runs are scored against the
    ``ground_truth`` column of the matching test file under
    ``../data/processed/split_data``.

    Returns:
        DataFrame with columns ``folder``, ``model``, ``rocauc``, ``prauc``
        and one row per (split, model, run) whose prediction file exists.
    """
    col_names = {
                "tgn": ['tgn_tgn_pred_run_0',
                       'tgn_tgn_pred_run_1', 'tgn_tgn_pred_run_2', 'tgn_tgn_pred_run_3',
                       'tgn_tgn_pred_run_4'],
                "jodie": ['jodie_jodie_pred_run_0',
                           'jodie_jodie_pred_run_1', 'jodie_jodie_pred_run_2',
                           'jodie_jodie_pred_run_3', 'jodie_jodie_pred_run_4'],
                "dyrep": ['dyrep_dyrep_pred_run_0',
                           'dyrep_dyrep_pred_run_1', 'dyrep_dyrep_pred_run_2',
                           'dyrep_dyrep_pred_run_3', 'dyrep_dyrep_pred_run_4']
                }
    legend_names = {"tgn": "TGN", "jodie": "Jodie", "dyrep": "DyRep"}

    path = "../data/results"
    prefix = f"{dataset_name}_"
    folders = [x for x in os.listdir(path) if x.startswith(prefix)]

    # Collected as plain records; the frame is built once at the end
    # (DataFrame.append no longer exists in pandas 2.x).
    records = []
    for folder in folders:
        full_path = os.path.join(path, folder)
        test = pd.read_csv(f"../data/processed/split_data/{dataset_name}/{folder}/test/ml_{dataset_name}.csv")
        label = test.ground_truth
        for model in os.listdir(full_path):
            # Skip anything in the split folder that is not a known model
            # directory instead of failing with a KeyError.
            if model not in col_names:
                continue
            for pred_file in col_names[model]:
                pred_path = os.path.join(full_path, model, pred_file)
                if not os.path.isfile(pred_path + ".npy"):
                    print(f"{pred_path} not found")
                    continue
                pred = np.load(pred_path + ".npy")
                precision, recall, _ = metrics.precision_recall_curve(label, pred)
                records.append({'folder': folder,
                                'model': model,
                                'rocauc': metrics.roc_auc_score(label, pred),
                                'prauc': metrics.auc(recall, precision)})

    scores = pd.DataFrame(records, columns=['folder', 'model', 'rocauc', 'prauc'])

    for metric, metric_label in (('rocauc', "ROC AUC"), ('prauc', "PR AUC")):
        ax = sns.lineplot(x='folder', y=metric,
                          data=scores,
                          hue='model', hue_order=["tgn", "jodie", "dyrep"],
                          palette=customPalette)
        ax.set_title(f"{metric_label} score development ({dataset_name})")
        # NOTE(review): assumes exactly ten split folders whose names sort
        # from the smallest to the largest share of data - confirm.
        ax.set_xticklabels([f"{x}0%" for x in range(1,11)])
        # A single legend call: a second plt.legend(loc=...) would rebuild the
        # legend and drop the display names again.
        handles, labels = ax.get_legend_handles_labels()
        ax.legend(handles,
                  [legend_names.get(entry, entry) for entry in labels],
                  loc='upper left')
        ax.set_ylabel(metric_label)
        ax.set_xlabel("Ratio of data used")
        plt.show()

    return scores

In [None]:
# Learning curves for 'wikipedia'; the returned score table is kept in `s`.
s = plot_learning_curve('wikipedia')

In [None]:
# Learning curves for 'reddit'; the returned score table is kept in `s`.
s = plot_learning_curve('reddit')

In [None]:
# Learning curves for 'mooc'; the returned score table is kept in `s`.
s = plot_learning_curve('mooc')

In [None]:
# Learning curves for 'lastfm'; the returned score table is kept in `s`.
s = plot_learning_curve('lastfm')

# Node2vec comparison

In [None]:
# Hard-coded score table: eight methods on each of three datasets, in the
# same method order for every dataset.
node2vec_methods = ['Common Neighbors', 'Jaccard’s Coefficient', 'Adamic-Adar ', 'Pref. Attachment',
                    'Spectral Clustering', 'DeepWalk', 'LINE', 'node2vec']
facebook_scores = [0.8100, 0.8880, 0.8289, 0.7137, 0.7200, 0.9680, 0.9490, 0.9680]
ppi_scores = [0.7142, 0.7018, 0.7126, 0.6670, 0.6588, 0.7441, 0.7249, 0.7719]
arxiv_scores = [0.8153, 0.8067, 0.8315, 0.6996, 0.7099, 0.9340, 0.8902, 0.9366]

scores = pd.DataFrame({
    'dataset': ['Facebook']*8 + ['PPI']*8 + ['arXiv']*8,
    'model': node2vec_methods*3,
    'score': facebook_scores + ppi_scores + arxiv_scores,
})
scores

In [None]:
# sns.catplot is figure-level and always creates its own figure, so a
# preceding plt.figure(figsize=...) only produced an empty extra figure and
# never affected the plot. Pass height=/aspect= to catplot to resize it.
f = sns.catplot(x='dataset', y='score',
                data=scores,
                hue='model',
                palette=customPalette,
                kind='swarm',
                s=10)

f.set_axis_labels("Dataset name", "AUC score")
f._legend.set_title("Method")
#plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.show()

# Jodie results

In [None]:
# Hard-coded score table: six methods on each of three datasets, in the same
# method order for every dataset.
scores = pd.DataFrame({
    'dataset': ['Reddit']*6 + ['Wikipedia']*6 + ['MOOC']*6,
    'model': ['LSTM', 'TIME-LSTM', 'RNN', 'LatentCross', 'DeepCoevolve', 'JODIE']*3,
    'score': [0.523,0.556,0.586,0.574,0.577,0.599,
              0.575,0.671,0.804,0.628,0.663,0.831,
              0.686,0.711,0.558,0.686,0.671,0.756],
})

# sns.catplot is figure-level and always creates its own figure, so a
# preceding plt.figure(figsize=...) only produced an empty extra figure and
# never affected the plot. Pass height=/aspect= to catplot to resize it.
f = sns.catplot(x='dataset', y='score',
                data=scores,
                hue='model',
                palette=customPalette,
                kind='swarm',
                s=10)

f.set_axis_labels("Dataset name", "AUC score")
f._legend.set_title("Method")
#plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.show()

# TGN results

In [None]:
# Hard-coded average-precision table (values in percent): five methods on
# each of two datasets.
scores = pd.DataFrame({
    'dataset': ['Wikipedia']*5 + ['Reddit']*5,
    'model': ['DeepWalk', 'Node2Vec', 'Jodie', 'DyRep', 'TGN']*2,
    'score': [90.71, 91.48, 94.62, 94.59, 98.46,
              83.10, 84.58, 97.11, 97.98, 98.7],
})

# sns.catplot is figure-level and always creates its own figure, so a
# preceding plt.figure(figsize=...) only produced an empty extra figure and
# never affected the plot. Pass height=/aspect= to catplot to resize it.
f = sns.catplot(x='dataset', y='score',
                data=scores,
                hue='model',
                palette=customPalette,
                kind='swarm',
                s=10)

f.set_axis_labels("Dataset name", "Average precision")
f._legend.set_title("Method")
#plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
# The catplot figure is the current figure, so plt.savefig writes that one.
plt.savefig("../reports/figures/original_tgn_prec.png")
plt.show()

In [None]:
# Hard-coded ROC AUC table (values in percent): three methods on each of two
# datasets.
scores = pd.DataFrame({
    'dataset': ['Wikipedia']*3 + ['Reddit']*3,
    'model': ['Jodie', 'DyRep', 'TGN']*2,
    'score': [85.84, 84.59, 87.81,
              61.83, 62.91, 67.06],
})

# sns.catplot is figure-level and always creates its own figure, so a
# preceding plt.figure(figsize=...) only produced an empty extra figure and
# never affected the plot. Pass height=/aspect= to catplot to resize it.
f = sns.catplot(x='dataset', y='score',
                data=scores,
                hue='model',
                palette=customPalette,
                kind='swarm',
                s=10)

f.set_axis_labels("Dataset name", "ROC AUC")
f._legend.set_title("Method")
#plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
# The catplot figure is the current figure, so plt.savefig writes that one.
plt.savefig("../reports/figures/original_tgn_auc.png")
plt.show()

# Node embedding viz

In [None]:
# What to visualize?
data = pd.read_csv("../data/interim/mooc/ml_mooc.csv", index_col=0)

# One histogram per endpoint column: rows per distinct item id (i) first,
# then rows per distinct user id (u).
for endpoint in ("i", "u"):
    plt.hist(data[endpoint].value_counts())
    plt.show()

In [None]:
# Number of rows per (u, i) pair, most frequent pairs first.
data.value_counts(subset=['u', 'i']).sort_values(ascending=False)

In [None]:
# Items with strictly between 2500 and 5000 rows.
item_counts = data.i.value_counts()
items = item_counts[(item_counts > 2500) & (item_counts < 5000)].index.tolist()

# Users with strictly between 50 and 100 rows.
user_counts = data.u.value_counts()
users = user_counts[(user_counts > 50) & (user_counts < 100)].index.tolist()

# `users` and `items` are picked up by name inside the query strings.
selected_both = data.query("u in  @users and i in @items")
selected_either = data.query("u in  @users or i in @items")

display(selected_both)
display(selected_either)
display(selected_both[['u', 'i']].value_counts().sort_values(ascending=False).head(10))

In [None]:

# best model
test = pd.read_csv("../data/processed/mooc/test/ml_mooc.csv")
res = ["../data/results/mooc/tgn/tgn_tgn_pred_run_0.npy",
       "../data/results/mooc/tgn/tgn_tgn_pred_run_1.npy",
       "../data/results/mooc/tgn/tgn_tgn_pred_run_2.npy",
       "../data/results/mooc/tgn/tgn_tgn_pred_run_3.npy",
       "../data/results/mooc/tgn/tgn_tgn_pred_run_4.npy"]

# The labels are the same for every run, so they are looked up once.
label = test.ground_truth

# ROC AUC of each run, printed one per line, followed by the average.
scores = []
for r in res:
    pred = np.load(r)
    run_auc = roc_auc_score(label, pred)
    scores.append(run_auc)
    print(run_auc)

print()
print(sum(scores)/len(scores))

In [None]:
# load best model
models_to_load = {0: "../models/mooc/tgn/mooc-tgn.pth"}
models_to_load_2 = {0: "../models/mooc/tgn/mooc-tgn_viz_extra_layers_had_tanh.pth"}
models_to_load_3 = {0: "../models/mooc/tgn/mooc-tgn_viz_extra_layers_extra6.pth"}

# Keyword arguments shared by all three tgn_viz.predict calls; only the
# checkpoint, the affinity merge layer and the two size arguments differ.
shared_predict_kwargs = dict(ablation=None, seed=0,
                             n_runs=5, n_epoch=50,
                             data_path=f"../data",
                             return_loaded_model=True)

# Default merge layer, 172-dimensional memory and node features.
model = tgn_viz.predict("mooc", "tgn",
                        models_to_load=models_to_load,
                        affinity_merge_layer="default",
                        memory_dim=172,
                        node_feature_size=172,
                        **shared_predict_kwargs)

# Two-dimensional variants.
model2 = tgn_viz.predict("mooc", "tgn",
                         models_to_load=models_to_load_2,
                         affinity_merge_layer="extra_layers_had_tanh",
                         memory_dim=2,
                         node_feature_size=2,
                         **shared_predict_kwargs)

model3 = tgn_viz.predict("mooc", "tgn",
                         models_to_load=models_to_load_3,
                         affinity_merge_layer="extra_layers_extra6",
                         memory_dim=2,
                         node_feature_size=2,
                         **shared_predict_kwargs)

# Memory snapshots taken right after loading; later cells pass them to
# restore_memory().
memory1, memory2, memory3 = (loaded.memory.backup_memory() for loaded in (model, model2, model3))

In [None]:
from src.tgn.utils.utils import get_neighbor_finder
dataset_name = "mooc"

data_full_path = "../data/interim/mooc"

# Interaction table plus the edge and node feature arrays of the full dataset.
full_graph_df = pd.read_csv(os.path.join(data_full_path, f"ml_{dataset_name}.csv"))
full_edge_features = np.load(os.path.join(data_full_path, f"ml_{dataset_name}.npy"))
full_node_features = np.load(os.path.join(data_full_path, f"ml_{dataset_name}_node.npy"))

# Column arrays in the positional order expected by Data(...).
full_sources, full_destinations, full_timestamps, full_edge_idxs, full_labels = (
    full_graph_df[column].values for column in ("u", "i", "ts", "idx", "label")
)

full_data = Data(full_sources,
                 full_destinations,
                 full_timestamps,
                 full_edge_idxs,
                 full_labels)

# All three loaded models share one neighbour finder built from the full data.
full_ngh_finder = get_neighbor_finder(full_data, False)
for loaded_model in (model, model2, model3):
    loaded_model.embedding_module.neighbor_finder = full_ngh_finder

In [None]:
# test model AUC
# Existing test edges, restricted to rows with i < 7135.
data_true = pd.read_csv("../data/processed/mooc/test/ml_mooc_true.csv", index_col=0)
data_true = data_true.loc[lambda frame: frame.i < 7135]
test_sources, test_destinations, test_timestamps, test_edge_idxs, test_labels = (
    data_true[column].values for column in ("u", "i", "ts", "idx", "label")
)
test_data_true = Data(test_sources,
                      test_destinations,
                      test_timestamps,
                      test_edge_idxs,
                      test_labels)

# Negative test edges, filtered the same way.
data_false = pd.read_csv("../data/processed/mooc/test/ml_mooc_false.csv", index_col=0)
data_false = data_false.loc[lambda frame: frame.i < 7135]
test_sources, test_destinations, test_timestamps, test_edge_idxs, test_labels = (
    data_false[column].values for column in ("u", "i", "ts", "idx", "label")
)
test_data_false = Data(test_sources,
                       test_destinations,
                       test_timestamps,
                       test_edge_idxs,
                       test_labels)

# Same evaluation for each loaded model: score the true edges, reset the
# memory, score the false edges without positive updates, print the ROC AUC,
# and reset the memory again so the next cell starts from the snapshot.
evaluation_runs = ((model, memory1, 172),
                   (model2, memory2, 2),
                   (model3, memory3, 2))
for net, saved_memory, embed_size in evaluation_runs:
    net.eval()

    res_t = tgn_viz.predict_links(net, test_data_true, 10, batch_size=200, embed_size=embed_size)
    net.memory.restore_memory(saved_memory)
    res_f = tgn_viz.predict_links(net, test_data_false, 10, batch_size=200, embed_size=embed_size,
                                  update_with_positives=False)

    pred_score = np.concatenate([res_t[0], res_f[0]])
    true_label = np.concatenate([np.ones(res_t[0].shape[0]),
                                 np.zeros(res_f[0].shape[0])])
    print(roc_auc_score(true_label, pred_score))
    net.memory.restore_memory(saved_memory)

In [None]:
# Ten hand-picked [first id, second id] node pairs, one pair per row.
node_pairs = np.array([
    [2134, 7066],
    [4857, 7066],
    [3958, 7058],
    [4309, 7058],
    [1046, 7058],
    [6419, 7079],
    [4861, 7079],
    [4386, 7082],
    [6680, 7082],
    [6629, 7075],
])

In [None]:
# predict all

# Every interaction of the interim MOOC file, wrapped as a Data object.
# Note that this rebinds `data` and `test_data_true`.
data = pd.read_csv("../data/interim/mooc/ml_mooc.csv", index_col=0)
test_sources, test_destinations, test_timestamps, test_edge_idxs, test_labels = (
    data[column].values for column in ("u", "i", "ts", "idx", "label")
)

test_data_true = Data(test_sources,
                      test_destinations,
                      test_timestamps,
                      test_edge_idxs,
                      test_labels)


In [None]:
def _standardized_pca(embeddings, n_components=2, random_state=0):
    """Standardize ``embeddings`` and project them onto their first principal components.

    Returns the fitted scaler, the standardized matrix, the fitted PCA and the
    projected matrix. ``random_state`` pins the result when scikit-learn
    selects a randomized SVD solver.
    """
    scaler = StandardScaler()
    standardized = scaler.fit_transform(embeddings)
    projector = PCA(n_components=n_components, random_state=random_state)
    projected = projector.fit_transform(standardized)
    return scaler, standardized, projector, projected


# Re-initialise each model's memory, run the prediction over
# `test_data_true`, then put the saved snapshots back.
model.memory.__init_memory__()
model2.memory.__init_memory__()
model3.memory.__init_memory__()

res = tgn_viz.predict_links(model, test_data_true, 10, batch_size=200, embed_size=172)

res2 = tgn_viz.predict_links(model2, test_data_true, 10, batch_size=200, embed_size=2)

res3 = tgn_viz.predict_links(model3, test_data_true, 10, batch_size=200, embed_size=2)

model.memory.restore_memory(memory1)
model2.memory.restore_memory(memory2)
model3.memory.restore_memory(memory3)

preds, (nodes, times, embeds) = res
preds2, (nodes2, times2, embeds2) = res2
preds3, (nodes3, times3, embeds3) = res3

# Two-component PCA of the standardized embeddings of each model.
sc, X_train_std, pca, embed_pca = _standardized_pca(embeds)
sc2, X_train_std2, pca2, embed_pca2 = _standardized_pca(embeds2)
sc3, X_train_std3, pca3, embed_pca3 = _standardized_pca(embeds3)


In [None]:
# Explained-variance profile of the standardised `embeds`, first with 4 and
# then with 6 principal components. Only the 6-component figure is saved.
# StandardScaler and PCA are already imported at the top of the notebook.

# Standardising does not depend on the number of components, so do it once.
sc = StandardScaler()
X_train_std = sc.fit_transform(embeds)

# (number of components, path to save the figure to or None to only show it)
variance_plots = [
    (4, None),
    (6, "../reports/figures/tgn_172_pca.png"),
]

for n_components, figure_path in variance_plots:
    pca = PCA(n_components=n_components)
    X_train_pca = pca.fit_transform(X_train_std)
    exp_var_pca = pca.explained_variance_ratio_
    cum_sum_eigenvalues = np.cumsum(exp_var_pca)

    component_idx = list(range(len(exp_var_pca)))

    fig, ax = plt.subplots()
    ax.bar(component_idx, exp_var_pca, alpha=0.5, align='center',
           label='Individual explained variance')
    ax.step(component_idx, cum_sum_eigenvalues, where='mid',
            label='Cumulative explained variance')
    ax.set_ylabel('Explained variance ratio', fontsize=14)
    ax.set_xlabel('Principal component index', fontsize=14)
    ax.set_xticks(component_idx)
    ax.legend(loc='best')
    # Lay out after ticks and labels exist so they are taken into account.
    fig.tight_layout()
    if figure_path is not None:
        fig.savefig(figure_path)
    plt.show()

## Plot node movement

In [None]:
# Scatter the 2-component PCA of every model's embeddings, coloured by time.
# One colour scale, spanning the range of `times`, is shared by all panels and
# is passed to the scatter itself so that points and colour bar agree.
y_min = np.min(times)
y_max = np.max(times)
norm = plt.Normalize(y_min, y_max)
cmap = plt.get_cmap(new_cmap)

# The third entry uses embed_pca3/times3; previously the model2 data was
# plotted twice and model3 was never shown.
movement_panels = (
    ("model", embed_pca, times),
    ("model2", embed_pca2, times2),
    ("model3", embed_pca3, times3),
)

for panel_name, panel_embeds, panel_times in movement_panels:
    fig, ax = plt.subplots()
    ax.scatter(x=panel_embeds[:, 0],
               y=panel_embeds[:, 1],
               c=panel_times,
               cmap=cmap,
               norm=norm,
               alpha=0.05)
    ax.set_title(f"PCA of node embeddings: {panel_name}")

    # Separate mappable so the colour bar is drawn opaque, not at alpha=0.05.
    sm = ScalarMappable(norm=norm, cmap=cmap)
    sm.set_array([])
    # The mappable is not attached to any Axes, so name the Axes explicitly.
    cbar = fig.colorbar(sm, ax=ax)
    plt.show()

## Plot user-item pairs

In [None]:
# (user id, item id) pairs to inspect: column 0 is used as the user filter
# and column 1 as the item filter when the data is subset.
node_pairs = np.array([
    (2134, 7066),
    (4857, 7066),
    (3958, 7058),
    (4309, 7058),
    (1046, 7058),
    (6419, 7079),
    (4861, 7079),
    (4386, 7082),
    (6680, 7082),
    (6629, 7075),
])

In [None]:

# Repeat the prediction run on a subset of the table: only interactions whose
# user appears in column 0 of node_pairs or whose item appears in column 1.
data = pd.read_csv("../data/interim/mooc/ml_mooc.csv", index_col=0)
users = node_pairs[:, 0]
items = node_pairs[:, 1]
data = data.query("u in @users or i in  @items")

test_sources = data.u.values
test_destinations = data.i.values
test_edge_idxs = data.idx.values
test_labels = data.label.values
test_timestamps = data.ts.values

test_data_true = Data(test_sources,
                      test_destinations,
                      test_timestamps,
                      test_edge_idxs,
                      test_labels)

# Re-initialise every model's memory before the prediction runs.
model.memory.__init_memory__()
model2.memory.__init_memory__()
model3.memory.__init_memory__()

res = tgn_viz.predict_links(model, test_data_true, 10, batch_size=200, embed_size=172)
res2 = tgn_viz.predict_links(model2, test_data_true, 10, batch_size=200, embed_size=2)
res3 = tgn_viz.predict_links(model3, test_data_true, 10, batch_size=200, embed_size=2)

# Restore the stored memory snapshots, as the full-data run does, so the
# models are not left in the state produced by this subset run.
model.memory.restore_memory(memory1)
model2.memory.restore_memory(memory2)
model3.memory.restore_memory(memory3)

preds, (nodes, times, embeds) = res
preds2, (nodes2, times2, embeds2) = res2
preds3, (nodes3, times3, embeds3) = res3

# Standardise each set of embeddings, then project onto two principal components.
sc = StandardScaler()
X_train_std = sc.fit_transform(embeds)
pca = PCA(n_components=2)
embed_pca = pca.fit_transform(X_train_std)

sc2 = StandardScaler()
X_train_std2 = sc2.fit_transform(embeds2)
pca2 = PCA(n_components=2)
embed_pca2 = pca2.fit_transform(X_train_std2)

sc3 = StandardScaler()
X_train_std3 = sc3.fit_transform(embeds3)
pca3 = PCA(n_components=2)
embed_pca3 = pca3.fit_transform(X_train_std3)

In [None]:
# Scatter the 2-component PCA of every model's embeddings, coloured by time.
# One colour scale, spanning the range of `times`, is shared by all panels and
# is passed to the scatter itself so that points and colour bar agree.
y_min = np.min(times)
y_max = np.max(times)
norm = plt.Normalize(y_min, y_max)
cmap = plt.get_cmap(new_cmap)

# (label, coordinates, timestamps, point alpha). The third entry uses
# embed_pca3/times3; previously the model2 data was plotted twice and model3
# was never shown.
movement_panels = (
    ("model", embed_pca, times, 0.05),
    ("model2", embed_pca2, times2, 0.1),
    ("model3", embed_pca3, times3, 0.1),
)

for panel_name, panel_embeds, panel_times, panel_alpha in movement_panels:
    fig, ax = plt.subplots()
    ax.scatter(x=panel_embeds[:, 0],
               y=panel_embeds[:, 1],
               c=panel_times,
               cmap=cmap,
               norm=norm,
               alpha=panel_alpha)
    ax.set_title(f"PCA of node embeddings: {panel_name}")

    # Separate mappable so the colour bar is drawn opaque, not at the point alpha.
    sm = ScalarMappable(norm=norm, cmap=cmap)
    sm.set_array([])
    # The mappable is not attached to any Axes, so name the Axes explicitly.
    cbar = fig.colorbar(sm, ax=ax)
    plt.show()

In [None]:
def plot_embeds(user, item, preds, nodes, times, embeds_pca, title):
    """Plot the embedding points of one user and one item side by side.

    The left panel shows the rows of ``embeds_pca`` whose entry in ``nodes``
    equals ``user``, the right panel those equal to ``item``. Points are
    coloured by ``times`` on a scale spanning the min/max of *all* ``times``,
    so both panels and the colour bar share one scale. The figure is saved
    under ``../reports/figures/``, shown, and then closed.

    Parameters
    ----------
    user, item
        Node ids compared against ``nodes`` to select the rows to draw.
    preds
        Not used by this function; kept so existing calls keep working.
    nodes
        Array-like with one node id per row of ``embeds_pca``.
    times
        Array-like with one time value per row of ``embeds_pca``.
    embeds_pca
        2-D array; column 0 is drawn on x and column 1 on y.
    title
        Figure title. Lower-cased and joined with underscores, it also forms
        the start of the saved file name.
    """
    # Element-wise comparison and mask indexing below need real arrays.
    nodes = np.asarray(nodes)
    times = np.asarray(times)

    user_mask = nodes == user
    item_mask = nodes == item

    fig, (ax_1, ax_2) = plt.subplots(1, 2, figsize=(12, 4), sharex=True, sharey=True)

    y_min = times.min()
    y_max = times.max()

    panels = (
        (ax_1, user_mask, f"USER: {user}"),
        (ax_2, item_mask, f"ITEM: {item}"),
    )
    for ax, mask, panel_title in panels:
        # x and y are passed by keyword: newer seaborn versions no longer
        # accept them positionally.
        sns.scatterplot(x=embeds_pca[mask, 0],
                        y=embeds_pca[mask, 1],
                        c=times[mask],
                        alpha=0.5,
                        ax=ax,
                        cmap=new_cmap,
                        vmin=y_min,
                        vmax=y_max)
        ax.set_title(panel_title)

    fig.suptitle(title)

    # Separate mappable so the colour bar is drawn opaque, not at alpha=0.5.
    cmap = plt.get_cmap(new_cmap)
    norm = plt.Normalize(y_min, y_max)
    sm = ScalarMappable(norm=norm, cmap=cmap)
    sm.set_array([])
    fig.colorbar(sm, ax=ax_2)

    # Save through the figure handle rather than the implicit current figure.
    fig.savefig(f"../reports/figures/{'_'.join(title.lower().split())}_user{user}_item{item}.png")
    plt.show()
    # This function is called in a loop; release the figure once it is shown.
    plt.close(fig)

In [None]:
# Draw the user/item panels for every selected pair, once per model.
# The title is also turned into the saved file name by plot_embeds, so the
# spelling fix ("dimensions") changes the name of the files written for the
# first model.
for user, item in node_pairs:
    print(user, item)
    plot_embeds(user, item, preds, nodes, times, embed_pca, "Original model with reduced dimensions")
    plot_embeds(user, item, preds2, nodes2, times2, embed_pca2, "Model with 2D node embedding")
    # The third model is currently disabled. NOTE(review): plot_embeds also
    # requires a title argument, which this call would need before re-enabling.
    #plot_embeds(user, item, preds3, nodes3, times3, embed_pca3)