# Libraries

In [None]:
# Install required packages (run these cells once; comment out after installation)
!pip install torch_geometric --quiet
!pip install scikit-learn pandas scipy scikit-network --quiet
!pip install scikit-network --quiet

# Imports
import torch
import time
import psutil
import os
import gc
import numpy as np
import pandas as pd
import scipy.sparse as sp
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity

# scikit-learn
from sklearn.semi_supervised import LabelPropagation as SKLabelPropagation

# scikit-network
from sknetwork.classification import Propagation as SKNPropagation

# PyTorch Geometric
from torch_geometric.nn import LabelPropagation as PyGLabelPropagation
from torch_geometric.data import Data

# !pip install torch==2.0.0+cpu torchvision==0.15.1+cpu torchaudio==2.0.1 --index-url https://download.pytorch.org/whl/cpu > /dev/null
# !pip install  dgl==1.1.0 -f https://data.dgl.ai/wheels/repo.html > /dev/null
# # !pip install  dgl -f https://data.dgl.ai/wheels/torch-2.4/cu124/repo.html
# os.environ['TORCH'] = torch.__version__
# os.environ['DGLBACKEND'] = "pytorch"
# import dgl
# import dgl.function as fn

# For dataset loading via torch_geometric
from torch_geometric.datasets import Flickr, AmazonProducts, Yelp, Taobao

# Pick the compute device once for the whole notebook: CUDA when a GPU is
# visible to PyTorch, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print("Using device:", device)


Using device: cuda


In [None]:
# Report the CUDA compiler (nvcc) version installed in this runtime.
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0


In [None]:
# Print the installed PyTorch version string.
print(torch.__version__)

2.5.1+cu124


# Memory Management

In [None]:
def get_max_nodes_dynamic(feature_dim, dense=False, safety_factor=0.8):
    """
    Estimate how many nodes fit in the memory that is currently free.

    The budget is `safety_factor` times the free memory of the module-level
    `device` (GPU memory when it is CUDA, system RAM otherwise).

    Args:
        feature_dim: number of float32 features stored per node.
        dense: if True, size for an N x N float32 similarity matrix
            (~ N^2 * 4 bytes); otherwise size for the node features only
            (~ feature_dim * 4 bytes per node).
        safety_factor: fraction of the free memory that may be used.

    Returns:
        The estimated maximum number of nodes as an int.
    """
    on_gpu = device.type == 'cuda'
    if on_gpu:
        raw_free = torch.cuda.mem_get_info()[0]  # (free, total) in bytes
    else:
        raw_free = psutil.virtual_memory().available
    budget = int(raw_free * safety_factor)

    if not dense:
        # Sparse case: only the feature rows have to fit.
        return budget // (feature_dim * 4)
    # Dense case: solve N^2 * 4 <= budget for N.
    return int((budget / 4) ** 0.5)

# Hand-picked node caps per dataset name.
STATIC_LIMITS = dict(
    Flickr=100000,
    AmazonProducts=50000,
    Yelp=100000,
    Taobao=50000,
)


def get_max_nodes_static(dataset_name):
    """Look up the preset node cap for `dataset_name`; None when no cap is registered."""
    if dataset_name in STATIC_LIMITS:
        return STATIC_LIMITS[dataset_name]
    return None


# VLP

In [None]:
def transform_node_vectors(V):
    """
    Map raw node vectors to the augmented vectors used by VLP.

    Each row v_i becomes [v_i / ||v_i||, 1] / sqrt(2), so the inner product
    of two transformed rows is (cos(v_i, v_j) + 1) / 2, i.e. the cosine
    similarity rescaled to [0, 1].

    Args:
        V: (N, d) tensor of node features.

    Returns:
        (N, d + 1) tensor of transformed node vectors.
    """
    # The small epsilon keeps all-zero rows from dividing by zero.
    unit_rows = V / (torch.norm(V, p=2, dim=1, keepdim=True) + 1e-10)
    bias_col = torch.ones(unit_rows.size(0), 1, device=unit_rows.device)
    root_two = torch.sqrt(torch.tensor(2.0, device=unit_rows.device))
    return torch.cat((unit_rows, bias_col), dim=1) / root_two

def compute_degrees(V):
    """
    Compute node degrees of the implicit similarity graph without building it.

    For transformed node vectors V (N x (d+1)):
        deg_i = v_i^T (sum_j v_j) - v_i^T v_i
    The second term removes each node's self-similarity.

    Returns:
        (N,) tensor of degrees.
    """
    column_totals = V.sum(dim=0)
    self_similarity = (V * V).sum(dim=1)
    return torch.matmul(V, column_totals) - self_similarity

def lp_one_iter(V, Y, degrees):
    """
    Perform a single VLP propagation step.

    Args:
        V: transformed node vectors, shape (N, d+1).
        Y: current label probability matrix, shape (N, K).
        degrees: precomputed degree vector, shape (N,).

    Returns:
        Y_next with rows (1 / deg_i) * ((V (V^T Y))_i - (v_i^T v_i) * Y_i),
        shape (N, K).
    """
    # V (V^T Y) applies the implicit adjacency (including self-loops) to Y
    # without ever forming the N x N matrix.
    projected = torch.matmul(V.T, Y)
    with_self_loops = torch.matmul(V, projected)

    # Subtract each node's own contribution v_i^T v_i * Y_i.
    diag = (V * V).sum(dim=1)
    neighbour_sum = with_self_loops - diag.unsqueeze(1) * Y

    # Row-normalise by the degree; epsilon guards against zero degrees.
    reciprocal_deg = 1.0 / (degrees + 1e-10)
    return reciprocal_deg.unsqueeze(1) * neighbour_sum

def lp(V, Y_init, num_iter):
    """
    Run `num_iter` VLP propagation steps starting from `Y_init`.

    Degrees are computed once from V and reused for every step.
    Returns the label matrix after the last step (Y_init itself when
    num_iter is 0).
    """
    node_degrees = compute_degrees(V)
    current = Y_init
    for _ in range(num_iter):
        current = lp_one_iter(V, current, node_degrees)
    return current

def vlp_run(features, num_iter, num_labels):
    """
    Time `num_iter` VLP label-propagation steps on `features`.

    Args:
        features: (N, d) node feature tensor.
        num_iter: number of propagation steps to time.
        num_labels: number of classes for the random one-hot initial labels.

    Returns:
        Elapsed seconds for the propagation loop only; the vector transform,
        degree computation and label initialisation are not included.
    """
    n_nodes = features.size(0)
    # Transform node vectors and precompute degrees (setup, not timed).
    V = transform_node_vectors(features)
    deg = compute_degrees(V)
    # Random one-hot labels, moved next to V so the matmuls never mix devices.
    Y = assign_initial_labels(n_nodes, num_labels).to(V.device)

    # CUDA kernels are launched asynchronously: without a synchronize the
    # timer would only measure launch overhead, not the actual computation.
    on_cuda = V.is_cuda
    if on_cuda:
        torch.cuda.synchronize(V.device)
    start_t = time.perf_counter()
    with torch.no_grad():
        for _ in range(num_iter):
            Y = lp_one_iter(V, Y, deg)
    if on_cuda:
        torch.cuda.synchronize(V.device)
    elapsed = time.perf_counter() - start_t

    return elapsed

# Other methods

In [None]:
def can_build_dense_graph(num_nodes):
    """
    Rough feasibility check for a dense n x n float32 adjacency matrix.

    The matrix needs n^2 * 4 bytes; it is considered feasible only when that
    is strictly less than 80% of the system RAM currently available.
    """
    available_bytes = psutil.virtual_memory().available
    required_bytes = num_nodes**2 * 4
    budget_bytes = 0.8 * available_bytes
    return required_bytes < budget_bytes

In [None]:
def my_kernel(X, Y):
    """
    Pairwise cosine similarity between the rows of X and Y, rescaled from
    [-1, 1] to [0, 1].

    Returns an array of shape (len(X), len(Y)).
    """
    def _unit_rows(M):
        # Epsilon keeps all-zero rows from dividing by zero.
        return M / (np.linalg.norm(M, axis=1, keepdims=True) + 1e-10)

    cosine = np.dot(_unit_rows(X), _unit_rows(Y).T)
    return (cosine + 1.0) / 2.0

In [None]:
###############################################################################
# scikit-learn Label Propagation using a Custom Kernel
def sklearn_lp_run(features, num_iter, num_labels=50):
    """
    Time scikit-learn's LabelPropagation with the custom cosine kernel.

    Args:
        features: (N, d) node feature tensor (any device).
        num_iter: passed to LabelPropagation as `max_iter`.
        num_labels: number of distinct random labels to draw (default 50).

    Returns:
        Elapsed seconds for constructing and fitting the model, or -1 when a
        dense N x N matrix is judged infeasible by can_build_dense_graph.
    """
    n_nodes = features.size(0)
    if not can_build_dense_graph(n_nodes):
        return -1

    # scikit-learn works on CPU NumPy arrays.
    feats_cpu = features.detach().cpu().numpy().astype(np.float32)

    # Every node receives a random label.
    # NOTE(review): no node is marked with -1 (unlabeled), so there may be
    # nothing left to propagate and `fit` may stop early - confirm this is
    # the intended benchmark setup.
    labels = np.random.randint(0, num_labels, size=n_nodes)

    # `fit` calls my_kernel(feats_cpu, feats_cpu) internally, so the dense
    # similarity computation is part of the measured time.
    start_t = time.perf_counter()
    lp_model = SKLabelPropagation(kernel=my_kernel, max_iter=num_iter)
    lp_model.fit(feats_cpu, labels)
    elapsed = time.perf_counter() - start_t
    return elapsed

###############################################################################
# scikit-network Label Propagation
def skn_lp_run(features, num_iter, num_labels=50):
    """
    Time scikit-network's Propagation on the full cosine-similarity graph.

    Args:
        features: (N, d) node feature tensor (any device).
        num_iter: passed to Propagation as `n_iter`.
        num_labels: number of distinct random labels to draw (default 50).

    Returns:
        Elapsed seconds for constructing the model and running fit_predict
        (graph construction is not timed), or -1 when a dense N x N matrix
        is judged infeasible by can_build_dense_graph.
    """
    n_nodes = features.size(0)
    if not can_build_dense_graph(n_nodes):
        return -1

    # Dense rescaled cosine similarity with the diagonal (self-loops) zeroed.
    feats_cpu = features.detach().cpu().numpy().astype(np.float32)
    norms = np.linalg.norm(feats_cpu, axis=1, keepdims=True) + 1e-10
    feats_norm = feats_cpu / norms
    A = (feats_norm @ feats_norm.T + 1.0) / 2.0
    np.fill_diagonal(A, 0.0)

    # Store the non-zero entries as a CSR adjacency matrix.
    row_idx, col_idx = np.nonzero(A)
    adjacency = sp.csr_matrix(
        (A[row_idx, col_idx], (row_idx, col_idx)), shape=(n_nodes, n_nodes)
    )
    # Release the dense intermediates before the timed section starts.
    del A, row_idx, col_idx, feats_norm
    gc.collect()

    # One random label per node, drawn in a single vectorised call.
    random_labels = np.random.randint(0, num_labels, size=n_nodes)
    label_dict = {i: int(lbl) for i, lbl in enumerate(random_labels)}

    start_t = time.perf_counter()
    prop = SKNPropagation(n_iter=num_iter, weighted=True)
    prop.fit_predict(adjacency, labels=label_dict)
    elapsed = time.perf_counter() - start_t
    return elapsed

###############################################################################
# PyTorch Geometric Label Propagation
def pyg_lp_run(features, num_iter, num_labels=50, max_nodes=11000):
    """
    Time PyTorch Geometric's LabelPropagation on the full similarity graph.

    Args:
        features: (N, d) node feature tensor (any device).
        num_iter: passed to LabelPropagation as `num_layers`.
        num_labels: number of distinct random labels to draw (default 50).
        max_nodes: node-count cap above which the run is skipped
            (default 11000, the value previously hard-coded here).

    Returns:
        Elapsed seconds for the propagation call only, or -1 when
        N > max_nodes.
    """
    n_nodes = features.size(0)
    if n_nodes > max_nodes:
        return -1

    # Dense rescaled cosine similarity on CPU, diagonal (self-loops) zeroed.
    feats_cpu = features.detach().cpu().numpy().astype(np.float32)
    norms = np.linalg.norm(feats_cpu, axis=1, keepdims=True) + 1e-10
    feats_norm = feats_cpu / norms
    sim = (feats_norm @ feats_norm.T + 1.0) / 2.0
    np.fill_diagonal(sim, 0.0)

    # Every non-zero entry becomes a directed edge; the similarity values
    # themselves are not passed on, so the propagation below is unweighted.
    row_idx, col_idx = np.nonzero(sim)
    edge_index = torch.stack(
        [
            torch.as_tensor(row_idx, dtype=torch.long),
            torch.as_tensor(col_idx, dtype=torch.long),
        ],
        dim=0,
    ).to(device)
    # Release the dense intermediates before the timed section starts.
    del sim, row_idx, col_idx, feats_norm
    gc.collect()

    # Assign a random valid label in [0, num_labels) to every node.
    label_arr = torch.randint(0, num_labels, (n_nodes,), device=device, dtype=torch.long)

    model = PyGLabelPropagation(num_layers=num_iter, alpha=0.8)

    # CUDA kernels run asynchronously; synchronise on both sides of the timed
    # call so the measurement covers the real computation.
    on_cuda = edge_index.is_cuda
    if on_cuda:
        torch.cuda.synchronize()
    start_t = time.perf_counter()
    model(label_arr, edge_index)
    if on_cuda:
        torch.cuda.synchronize()
    elapsed = time.perf_counter() - start_t

    del edge_index, label_arr
    gc.collect()
    return elapsed


###############################################################################
# DGL Label Propagation
def dgl_lp_run(features, num_iter=5):
    """
    Time a mean-aggregation label propagation with DGL on the full graph.

    Steps: compute the dense rescaled cosine similarity on CPU, convert its
    non-zero entries to a SciPy CSR matrix, build a DGL graph with
    dgl.from_scipy(), remove any self-loops and re-add one per node, then
    try to move the graph to `device` (falling back to CPU on failure).

    NOTE(review): the `dgl` and `dgl.function as fn` imports are commented
    out at the top of this file, so calling this function as-is raises
    NameError; the call site in run_lp_experiment is commented out as well.

    Args:
        features: (N, d) node feature tensor.
        num_iter: number of propagation steps to time.

    Returns:
        Elapsed seconds for the propagation loop only, or -1 if N > 50,000
        or can_build_dense_graph reports the dense matrix as infeasible.
    """
    n_nodes = features.size(0)
    if n_nodes > 50000 or not can_build_dense_graph(n_nodes):
        return -1

    # Compute full dense similarity on CPU using normalized cosine similarity
    feats_cpu = features.detach().cpu().numpy().astype(np.float32)
    norms = np.linalg.norm(feats_cpu, axis=1, keepdims=True) + 1e-10
    feats_norm = feats_cpu / norms
    sim = feats_norm @ feats_norm.T
    sim = (sim + 1.0) / 2.0  # rescale from [-1, 1] to [0, 1]
    np.fill_diagonal(sim, 0.0)  # drop self-similarity

    # Convert the dense similarity matrix to a sparse CSR matrix
    row_idx, col_idx = np.nonzero(sim)
    vals = sim[row_idx, col_idx]
    adjacency = sp.csr_matrix((vals, (row_idx, col_idx)), shape=(n_nodes, n_nodes))

    # Build a DGL graph from the sparse adjacency
    g = dgl.from_scipy(adjacency)

    # Remove any preexisting self-loops and then add self-loops
    g = dgl.remove_self_loop(g)
    g = dgl.add_self_loop(g)

    # Attempt to move the graph to GPU; if it fails, fallback to CPU.
    try:
        g = g.to(device)
    except Exception as e:
        print("Failed to move DGL graph to GPU, using CPU instead:", e)
        g = g.to(torch.device("cpu"))

    # Initialize node labels (one-hot) on the same device as g.
    # NOTE(review): the label count is hard-coded to 5 here, while the other
    # baselines in this file draw from 50 labels - confirm which is intended.
    Y_init = assign_initial_labels(n_nodes, 5).to(g.device)
    g.ndata['h'] = Y_init.clone()

    start_t = time.time()
    for _ in range(num_iter):
        # Use the standard update_all pattern with a temporary field:
        # each node's new 'h' is the mean of the 'h' values it receives.
        # No edge-weight field is referenced in this update.
        g.update_all(fn.copy_u('h', 'm'), fn.mean('m', 'h_new'))
        g.ndata['h'] = g.ndata.pop('h_new')
    elapsed = time.time() - start_t

    # Drop references to the large intermediates before returning.
    del sim, row_idx, col_idx, adjacency
    gc.collect()
    return elapsed


# Graph Generation

In [None]:
def construct_similarity_graph(features, top_k=None):
    """
    Construct a cosine-similarity graph from node features.

    Args:
        features: (N, d) node feature tensor.
        top_k: if given, build a k-NN graph in which every node points to its
            `top_k` most similar other nodes; if None, build the full graph
            with one edge per non-zero off-diagonal similarity.

    Returns:
        (edge_index_src, edge_index_dst): two 1-D long tensors on the
        module-level `device`.
    """
    N, _ = features.shape
    features_norm = torch.nn.functional.normalize(features, p=2, dim=1)

    if top_k is None:
        # Full dense graph: similarity rescaled to [0, 1], diagonal zeroed,
        # then every remaining non-zero entry becomes an edge.
        with torch.no_grad():
            S = torch.matmul(features_norm, features_norm.T)
            S = (S + 1) / 2
            S.fill_diagonal_(0)
            edge_indices = torch.nonzero(S, as_tuple=False)
        return edge_indices[:, 0].to(device), edge_indices[:, 1].to(device)

    # Sparse k-NN graph.
    if N == 0:
        # Nothing to connect; also avoids dividing by N below.
        empty = torch.empty(0, dtype=torch.long, device=device)
        return empty, empty.clone()

    # Size the row batches so one (batch x N) float32 similarity block uses
    # at most a quarter of the currently free memory.
    if device.type == 'cuda':
        free_mem = torch.cuda.mem_get_info()[0]
    else:
        free_mem = psutil.virtual_memory().available
    target_bytes = int(free_mem * 0.25)
    batch_size = min(N, max(1, target_bytes // (N * 4)))
    if batch_size < N:
        print(f"Using batch size {batch_size} for similarity computation to fit memory.")

    # Ask for one extra neighbour because a node is usually its own best match.
    k = min(top_k + 1, N)
    src_chunks = []
    dst_chunks = []
    with torch.no_grad():
        for start in range(0, N, batch_size):
            end = min(start + batch_size, N)
            sim_matrix = features_norm[start:end] @ features_norm.T
            idxs = torch.topk(sim_matrix, k=k, dim=1).indices
            node_ids = torch.arange(start, end, device=idxs.device).unsqueeze(1)
            # Drop the node itself, then keep the first `top_k` survivors per
            # row (the cumulative count of survivors gives each one's rank).
            not_self = idxs != node_ids
            keep = not_self & (not_self.cumsum(dim=1) <= top_k)
            src_chunks.append(node_ids.expand_as(idxs)[keep])
            dst_chunks.append(idxs[keep])

    edge_index_src = torch.cat(src_chunks).to(device=device, dtype=torch.long)
    edge_index_dst = torch.cat(dst_chunks).to(device=device, dtype=torch.long)
    return edge_index_src, edge_index_dst


# Load Dataset

In [None]:
def load_flickr_dataset():
    """Load the Torch Geometric Flickr dataset (stored under data/Flickr) and return its node feature matrix."""
    from torch_geometric.datasets import Flickr
    dataset = Flickr(root='data/Flickr')
    # Index the dataset to get its graph instead of using the discouraged
    # `dataset.data` accessor.
    return dataset[0].x

def load_amazon_dataset():
    """Load the Torch Geometric AmazonProducts dataset (stored under data/AmazonProducts) and return its node feature matrix."""
    from torch_geometric.datasets import AmazonProducts
    dataset = AmazonProducts(root='data/AmazonProducts')
    # Index the dataset to get its graph instead of using the discouraged
    # `dataset.data` accessor.
    return dataset[0].x

def load_yelp_dataset():
    """Load the Torch Geometric Yelp dataset (stored under data/Yelp) and return its node feature matrix."""
    from torch_geometric.datasets import Yelp
    dataset = Yelp(root='data/Yelp')
    # Index the dataset to get its graph instead of using the discouraged
    # `dataset.data` accessor.
    return dataset[0].x

def load_taobao_dataset():
    """
    Build a user-by-category TF-IDF feature matrix from the Taobao dataset.

    Pipeline:
      1. Join user->item and item->category edges into (user, category)
         pairs and count how often each pair occurs.
      2. Keep only categories whose number of rows in that count table is at
         least 15% of the most frequent category's.
      3. Pivot to a dense user x category count matrix (missing pairs -> 0).
      4. Apply TF-IDF weighting.

    Returns:
        float32 tensor of shape (num_users, num_kept_categories) on the
        module-level `device`.
    """
    from torch_geometric.datasets import Taobao
    dataset = Taobao(root='data/Taobao')
    # Index the dataset to get its graph instead of using the discouraged
    # `dataset.data` accessor.
    graph = dataset[0]
    # Edge indices for user-item and item-category.
    u_t_i = graph['user', 'to', 'item']['edge_index']
    i_t_c = graph['item', 'to', 'category']['edge_index']

    def join_relationships_count(a_to_i, i_to_c):
        """Join a->item and item->category edges; count each (a, category) pair."""
        a_ids = a_to_i[0].tolist()
        inter_ids = a_to_i[1].tolist()
        # item id -> list of category ids it links to
        mapping = {}
        for inter, cid in zip(i_to_c[0].tolist(), i_to_c[1].tolist()):
            mapping.setdefault(inter, []).append(cid)
        joined = [
            (a, cid)
            for a, inter in zip(a_ids, inter_ids)
            for cid in mapping.get(inter, ())
        ]
        df = pd.DataFrame(joined, columns=['user', 'category'])
        return df.groupby(['user', 'category']).size().reset_index(name='count')

    def filter_by_threshold(df, threshold_percentage=15, col=1):
        """Keep rows whose value in column `col` occurs often enough."""
        col_counts = df.iloc[:, col].value_counts()
        threshold = col_counts.max() * (threshold_percentage / 100.0)
        frequent_values = col_counts[col_counts >= threshold].index
        return df[df.iloc[:, col].isin(frequent_values)]

    def df_to_matrix(df):
        """Pivot a three-column (row, column, value) frame into a dense matrix."""
        c1, c2, c3 = df.columns.tolist()
        return df.pivot(index=c1, columns=c2, values=c3).fillna(0)

    def convert_to_tfidf(pivot_df):
        """Apply TF-IDF weighting, keeping the frame's index and columns."""
        from sklearn.feature_extraction.text import TfidfTransformer
        tfidf_matrix = TfidfTransformer().fit_transform(pivot_df)
        return pd.DataFrame(
            tfidf_matrix.toarray(), index=pivot_df.index, columns=pivot_df.columns
        )

    taobao_df = join_relationships_count(u_t_i, i_t_c)
    filtered_df = filter_by_threshold(taobao_df, threshold_percentage=15)
    pivot_df = df_to_matrix(filtered_df)
    tfidf_df = convert_to_tfidf(pivot_df)
    # Convert to torch tensor
    tfidf_matrix = torch.tensor(tfidf_df.values, dtype=torch.float32, device=device)
    return tfidf_matrix

# Initial Label Assignment

In [None]:
def assign_initial_labels(num_nodes, num_labels, seed=42):
    """
    Randomly assign each node one of `num_labels` classes.

    Args:
        num_nodes: number of rows of the result.
        num_labels: number of classes (columns of the result).
        seed: value passed to np.random.seed before drawing. Note that this
            reseeds NumPy's global random generator on every call.

    Returns:
        float32 one-hot tensor of shape (num_nodes, num_labels) on the
        module-level `device`.
    """
    np.random.seed(seed)
    rand_labels = np.random.randint(0, num_labels, size=num_nodes)
    Y_init = np.zeros((num_nodes, num_labels), dtype=np.float32)
    # Set one entry per row in a single vectorised assignment instead of a
    # Python loop over every node.
    Y_init[np.arange(num_nodes), rand_labels] = 1.0
    return torch.from_numpy(Y_init).to(device)


# Experiment function

In [None]:
def _timed_call(fn, *args, **kwargs):
    """Call fn(*args, **kwargs) once; return (result, wall-clock seconds)."""
    gc.collect()
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        torch.cuda.empty_cache()
        # Finish pending GPU work so it is not charged to this call.
        torch.cuda.synchronize()
    start = time.perf_counter()
    result = fn(*args, **kwargs)
    if use_cuda:
        # CUDA kernels are asynchronous; wait for them before reading the clock.
        torch.cuda.synchronize()
    return result, time.perf_counter() - start


def run_lp_experiment(dataset_name, thresh_list, num_labels=5, num_iter=1, n_runs=5,
                      max_baseline_nodes=20000):
    """
    Benchmark VLP against the library baselines on growing dataset prefixes.

    For every fraction in `thresh_list` the first int(total_nodes * fraction)
    nodes are used. Each method is called `n_runs` times and the mean
    wall-clock time of the whole call (setup included) is recorded.

    Args:
        dataset_name: "Flickr", "AmazonProducts", "Yelp" or "Taobao".
        thresh_list: fractions of the dataset's node count to evaluate.
        num_labels: number of classes used by VLP.
        num_iter: number of propagation iterations for every method.
        n_runs: repetitions per method and threshold.
        max_baseline_nodes: baselines are skipped (recorded as -1) above this
            node count (default 20000, the value previously hard-coded here).

    Returns:
        DataFrame with one row per threshold; the same table is written to
        "<dataset_name>_lp_experiment_results.csv". A baseline time of -1
        means the method was skipped or reported itself infeasible.

    Raises:
        ValueError: if `dataset_name` is not one of the known datasets.
    """
    results = []

    # Load dataset features based on dataset_name
    if dataset_name == "Flickr":
        full_features = load_flickr_dataset()
    elif dataset_name == "AmazonProducts":
        full_features = load_amazon_dataset()
    elif dataset_name == "Yelp":
        full_features = load_yelp_dataset()
    elif dataset_name == "Taobao":
        full_features = load_taobao_dataset()
    else:
        raise ValueError("Unknown dataset name.")

    full_features = full_features.to(device)
    total_nodes = full_features.size(0)
    feat_dim = full_features.size(1)
    print(f"{dataset_name}: total nodes = {total_nodes}, feature dim = {feat_dim}")

    baseline_names = ("sklearn", "skn", "pyg")  # the DGL baseline is disabled

    def _rounded(value):
        # Leave the -1 sentinel (and non-positive values) untouched.
        return round(value, 4) if value > 0 else value

    for thresh in thresh_list:
        num_nodes = int(total_nodes * thresh)
        if num_nodes < 1:
            continue
        print(f"\nProcessing {dataset_name} with threshold {thresh} ({num_nodes} nodes)")
        sub_features = full_features[:num_nodes]

        # Method 1: VLP (vector-based LP, always runs)
        vlp_times = [
            _timed_call(vlp_run, sub_features, num_iter=num_iter, num_labels=num_labels)[1]
            for _ in range(n_runs)
        ]
        time_vlp = np.mean(vlp_times)

        # Baselines: -1 unless at least one run actually executed.
        non_vlp_time = {name: -1 for name in baseline_names}
        if num_nodes <= max_baseline_nodes:
            runners = (
                ("sklearn", sklearn_lp_run),
                ("skn", skn_lp_run),
                ("pyg", pyg_lp_run),
            )
            samples = {name: [] for name in baseline_names}
            for _ in range(n_runs):
                for name, runner in runners:
                    outcome, wall = _timed_call(runner, sub_features, num_iter=num_iter)
                    # A runner returns -1 when it refuses the problem size;
                    # the near-zero time of that early exit is not a result.
                    if outcome != -1:
                        samples[name].append(wall)
            for name, walls in samples.items():
                if walls:
                    non_vlp_time[name] = np.mean(walls)

        results.append({
            "Dataset": dataset_name,
            "Threshold": thresh,
            "NodesUsed": num_nodes,
            "Time_VLP": round(time_vlp, 4),
            "Time_sklearn": _rounded(non_vlp_time['sklearn']),
            "Time_skn": _rounded(non_vlp_time['skn']),
            "Time_pyg": _rounded(non_vlp_time['pyg']),
        })

        del sub_features
        torch.cuda.empty_cache()
        gc.collect()

    df = pd.DataFrame(results)
    df.to_csv(f"{dataset_name}_lp_experiment_results.csv", index=False)
    return df


# Run Experiments for All Datasets

In [None]:
# Shared experiment settings. The threshold fractions (fractions of each
# dataset's total node count) are passed per dataset in the cells below.
num_labels = 50   # Number of initial labels/classes
num_iter = 100     # Number of LP iterations per timed run

In [None]:
# Flickr: sweep from 5% of the nodes up to the full dataset.
flickr_fractions = [0.05, 0.1, 0.15, 0.2, 0.5, 1]
print("Running experiments on Flickr")
results_flickr = run_lp_experiment(
    "Flickr",
    flickr_fractions,
    num_labels=num_labels,
    num_iter=num_iter,
)
print(results_flickr)

Running experiments on Flickr




Flickr: total nodes = 89250, feature dim = 500

Processing Flickr with threshold 0.05 (4462 nodes)

Processing Flickr with threshold 0.1 (8925 nodes)

Processing Flickr with threshold 0.15 (13387 nodes)

Processing Flickr with threshold 0.2 (17850 nodes)

Processing Flickr with threshold 0.5 (44625 nodes)

Processing Flickr with threshold 1 (89250 nodes)
  Dataset  Threshold  NodesUsed  Time_VLP  Time_sklearn  Time_skn  Time_pyg
0  Flickr       0.05       4462    0.1273        0.2012    1.2401    0.7766
1  Flickr       0.10       8925    0.0199        0.6553    4.2275    7.6131
2  Flickr       0.15      13387    0.0279        1.3853    9.2052    0.0000
3  Flickr       0.20      17850    0.0387        2.6493   17.1691    0.0000
4  Flickr       0.50      44625    0.0819       -1.0000   -1.0000   -1.0000
5  Flickr       1.00      89250    0.1377       -1.0000   -1.0000   -1.0000


In [None]:
# AmazonProducts: small fractions for the baselines, then 50% and 100%.
amazon_fractions = [0.002, 0.004, 0.006, 0.008, 0.01, 0.5, 1]
print("Running experiments on AmazonProducts")
results_amazon = run_lp_experiment(
    "AmazonProducts",
    amazon_fractions,
    num_labels=num_labels,
    num_iter=num_iter,
)
print(results_amazon)

Running experiments on AmazonProducts


Downloading https://drive.usercontent.google.com/download?id=17qhNA8H1IpbkkR-T2BmPQm8QNW5do-aa&confirm=t
Downloading https://drive.usercontent.google.com/download?id=10SW8lCvAj-kb6ckkfTOC5y0l8XXdtMxj&confirm=t
Downloading https://drive.usercontent.google.com/download?id=1LIl4kimLfftj4-7NmValuWyCQE8AaE7P&confirm=t
Downloading https://drive.usercontent.google.com/download?id=1npK9xlmbnjNkV80hK2Q68wTEVOFjnt4K&confirm=t
Processing...
Done!


AmazonProducts: total nodes = 1569960, feature dim = 200

Processing AmazonProducts with threshold 0.002 (3139 nodes)

Processing AmazonProducts with threshold 0.004 (6279 nodes)

Processing AmazonProducts with threshold 0.006 (9419 nodes)

Processing AmazonProducts with threshold 0.008 (12559 nodes)

Processing AmazonProducts with threshold 0.01 (15699 nodes)

Processing AmazonProducts with threshold 0.5 (784980 nodes)

Processing AmazonProducts with threshold 1 (1569960 nodes)
          Dataset  Threshold  NodesUsed  Time_VLP  Time_sklearn  Time_skn  \
0  AmazonProducts      0.002       3139    0.0181        0.0749    0.7098   
1  AmazonProducts      0.004       6279    0.0191        0.2580    2.2355   
2  AmazonProducts      0.006       9419    0.0219        0.6032    5.0631   
3  AmazonProducts      0.008      12559    0.0226        1.1324    9.2570   
4  AmazonProducts      0.010      15699    0.0254        1.8379   13.4113   
5  AmazonProducts      0.500     784980    0.6848     

In [None]:
# Yelp: small fractions for the baselines, then 50% and 100%.
yelp_fractions = [0.004, 0.008, 0.012, 0.016, 0.02, 0.5, 1]
print("Running experiments on Yelp")
results_yelp = run_lp_experiment(
    "Yelp",
    yelp_fractions,
    num_labels=num_labels,
    num_iter=num_iter,
)
print(results_yelp)

Running experiments on Yelp




Yelp: total nodes = 716847, feature dim = 300

Processing Yelp with threshold 0.004 (2867 nodes)

Processing Yelp with threshold 0.008 (5734 nodes)

Processing Yelp with threshold 0.012 (8602 nodes)

Processing Yelp with threshold 0.016 (11469 nodes)

Processing Yelp with threshold 0.02 (14336 nodes)

Processing Yelp with threshold 0.5 (358423 nodes)

Processing Yelp with threshold 1 (716847 nodes)
  Dataset  Threshold  NodesUsed  Time_VLP  Time_sklearn  Time_skn  Time_pyg
0    Yelp      0.004       2867    0.0590        0.0797    0.6447    0.4634
1    Yelp      0.008       5734    0.0172        0.2879    1.8705    1.7989
2    Yelp      0.012       8602    0.0190        0.5437    3.8604    7.0309
3    Yelp      0.016      11469    0.0195        0.9487    6.7375    0.0000
4    Yelp      0.020      14336    0.0252        1.4439   11.4248    0.0000
5    Yelp      0.500     358423    0.3960       -1.0000   -1.0000   -1.0000
6    Yelp      1.000     716847    0.7716       -1.0000   -1.0000 

In [None]:
# Taobao: small fractions for the baselines, then 50% and 100%.
taobao_fractions = [0.005, 0.01, 0.015, 0.02, 0.025, 0.03, 0.5, 1]
print("Running experiments on Taobao")
results_taobao = run_lp_experiment(
    "Taobao",
    taobao_fractions,
    num_labels=num_labels,
    num_iter=num_iter,
)
print(results_taobao)

Running experiments on Taobao


Downloading https://alicloud-dev.oss-cn-hangzhou.aliyuncs.com/UserBehavior.csv.zip
Extracting data/Taobao/raw/UserBehavior.csv.zip
Processing...
Done!


Taobao: total nodes = 936946, feature dim = 66

Processing Taobao with threshold 0.005 (4684 nodes)

Processing Taobao with threshold 0.01 (9369 nodes)

Processing Taobao with threshold 0.015 (14054 nodes)

Processing Taobao with threshold 0.02 (18738 nodes)

Processing Taobao with threshold 0.025 (23423 nodes)

Processing Taobao with threshold 0.03 (28108 nodes)

Processing Taobao with threshold 0.5 (468473 nodes)

Processing Taobao with threshold 1 (936946 nodes)
  Dataset  Threshold  NodesUsed  Time_VLP  Time_sklearn  Time_skn  Time_pyg
0  Taobao      0.005       4684    0.0173        0.1692    1.2994    1.2526
1  Taobao      0.010       9369    0.0190        0.5787    4.6596   10.8839
2  Taobao      0.015      14054    0.0206        1.2624    9.9202    0.0000
3  Taobao      0.020      18738    0.0233        2.2510   17.4895    0.0000
4  Taobao      0.025      23423    0.0221       -1.0000   -1.0000   -1.0000
5  Taobao      0.030      28108    0.0235       -1.0000   -1.0000   -1.000

In [None]:
# Stack the per-dataset tables into one frame, save it, and show it.
per_dataset_frames = [
    results_flickr,
    results_amazon,
    results_yelp,
    results_taobao,
]
all_results = pd.concat(per_dataset_frames, ignore_index=True)
all_results.to_csv("all_lp_experiment_results.csv", index=False)
print("\n*** Final LP Experiment Results ***")
display(all_results)