# Exploring the `Yelp` Dataset


In [None]:
%matplotlib inline

from pathlib import Path
from typing import Tuple

import numpy as np
from scipy import sparse as sp
from sklearn.metrics import matthews_corrcoef
from sklearn.utils import check_consistent_length


# `CORE` and `SEED` select which processed dataset directory is read:
# `<parent of cwd>/data/processed/yelp/core_{CORE}_seed_{SEED}`.
CORE = 10
SEED = 0
DATA_DIR = (
    Path.cwd().parent / "data" / "processed" / "yelp" / f"core_{CORE}_seed_{SEED}"
)
# fail early if the notebook is run from an unexpected working directory
assert DATA_DIR.is_dir()


## Statistics

Edges between users are undirected, and `edges_uu.txt` only stores the indices of the upper triangular part of the adjacency matrix.


In [None]:
# Every file is read as an integer table; duplicated rows are dropped
# (`np.unique(..., axis=0)` also sorts the rows).

# `ratings`: 2d `numpy.ndarray`, one `[uid, iid, label]` triplet per row.
ratings = np.loadtxt(DATA_DIR / "ratings.txt", dtype=np.int64)
ratings = np.unique(ratings, axis=0)

# `triplets_kg`: 2d `numpy.ndarray`, one `[eid_h, rid, eid_t]` triplet per row.
triplets_kg = np.loadtxt(DATA_DIR / "triplets_kg.txt", dtype=np.int64)
triplets_kg = np.unique(triplets_kg, axis=0)

# `edges_user`: 2d `numpy.ndarray`, one unordered `[uid_u, uid_v]` pair per row.
edges_user = np.loadtxt(DATA_DIR / "edges_uu.txt", dtype=np.int64)
edges_user = np.unique(edges_user, axis=0)

# sanity checks on the expected table layouts
assert ratings.ndim == 2
assert ratings.shape[1] == 3
assert triplets_kg.ndim == 2
assert triplets_kg.shape[1] == 3
assert edges_user.ndim == 2
assert edges_user.shape[1] == 2
# only the strict upper triangle of the adjacency matrix is stored
assert np.all(edges_user[:, 0] < edges_user[:, 1])

print(
    f"num_ratings = {ratings.shape[0]}",
    f"num_triplets = {triplets_kg.shape[0]}",
    f"num_edges_user = {edges_user.shape[0]}",
    sep="\n",
)


In [None]:
# Counts are derived as "largest id + 1"
# (assumes ids are zero-based and contiguous -- TODO confirm).
num_users, num_items = ratings[:, :2].max(axis=0) + 1
num_entities = max(triplets_kg[:, 0].max(), triplets_kg[:, 2].max()) + 1
num_relations = triplets_kg[:, 1].max() + 1

# presumably items share the id space of the knowledge-graph entities
assert num_items < num_entities
# every user appearing in a social edge must also appear in `ratings`
assert edges_user.max() < num_users

# fraction of observed entries among all possible user-item pairs
sparsity_ui = len(ratings) / num_users / num_items
# fraction of observed edges among all unordered user-user pairs
sparsity_uu = 2 * len(edges_user) / num_users / (num_users - 1)

print(
    f"num_users = {num_users}",
    f"num_items = {num_items}",
    f"num_entities = {num_entities}",
    f"num_relations = {num_relations}",
    f"sparsity_ui = {sparsity_ui}",
    f"sparsity_uu = {sparsity_uu}",
    sep="\n",
)


## User-Item Interaction Matrix


In [None]:
# Encode each user's positive interactions as one row of a sparse matrix.
# `user_history` is a `scipy.sparse.csr_matrix` of shape
# `[num_users, num_items]` with `user_history[uid, iid] = 1` for every
# positive sample `(uid, iid)` (rows of `ratings` whose label equals 1).
ratings_pos = ratings[ratings[:, 2] == 1]
user_history = sp.csr_matrix(
    ([1.0] * len(ratings_pos), tuple(ratings_pos[:, :2].T)),
    shape=(num_users, num_items),
    dtype=np.float32,
)
# number of stored entries, shown as the cell output
user_history.nnz


In [None]:
# `deg_u[uid]`: number of positive items of user `uid` (row sums)
# `deg_i[iid]`: number of users with a positive sample on item `iid`
# (column sums)
deg_u = user_history.sum(axis=1).A1
deg_i = user_history.sum(axis=0).A1
print(
    f"deg_u: mean = {np.mean(deg_u)}, std = {np.std(deg_u)}",
    f"deg_i: mean = {np.mean(deg_i)}, std = {np.std(deg_i)}, ",
    sep="\n",
)


In [None]:
# distinct degree values together with how often each occurs,
# first for users and then for items
tuple(np.unique(deg, return_counts=True) for deg in (deg_u, deg_i))


## Similarity between Users Connected by Social Edges


### Number of Common Neighbors & Jaccard Measure


In [None]:
def common_neighbors_jaccard(
    y_true: sp.spmatrix, y_pred: sp.spmatrix
) -> Tuple[np.ndarray, np.ndarray]:
    """Row-wise number of common neighbors and Jaccard measure.

    Row ``i`` of ``y_true`` and row ``i`` of ``y_pred`` are each interpreted
    as a set: a column belongs to the set iff its entry is non-zero.

    Args:
        y_true: 2d sparse matrix; one set per row.
        y_pred: 2d sparse matrix with the same shape as ``y_true``.

    Returns:
        Two ``float32`` arrays of shape ``[num_rows, 1]``:
        the size of the intersection of each row pair, and the Jaccard
        measure ``|A & B| / |A | B|``. Row pairs whose union is empty get a
        Jaccard measure of ``0.0`` instead of ``NaN``.

    Raises:
        ValueError: if the two matrices do not have the same shape.
    """
    assert y_true.ndim == 2 and y_pred.ndim == 2
    if y_true.shape != y_pred.shape:
        raise ValueError(
            "Found input variables with inconsistent shapes: "
            f"{y_true.shape} vs {y_pred.shape}"
        )
    # binarize: any non-zero value counts as membership
    true_sets = y_true.astype(np.bool_).astype(np.int8)
    pred_sets = y_pred.astype(np.bool_).astype(np.int8)
    # elementwise product is 1 only where both rows contain the column
    intersection = true_sets.multiply(pred_sets)
    # the sum is 1 or 2 wherever at least one row contains the column
    union = (true_sets + pred_sets).astype(np.bool_).astype(np.int8)
    num_intersection = np.asarray(
        intersection.sum(axis=1), dtype=np.float32
    ).reshape(-1, 1)
    num_union = np.asarray(union.sum(axis=1), dtype=np.float32).reshape(-1, 1)
    # divide only where the union is non-empty; other rows keep the 0.0 fill
    jaccard = np.divide(
        num_intersection,
        num_union,
        out=np.zeros_like(num_intersection),
        where=num_union > 0,
    )
    return num_intersection, jaccard


In [None]:
# Similarity statistics over user pairs that ARE connected by a social edge.
# `common_nbrs_pos`: number of common neighbors of the two endpoints
# `jaccard_pos`: jaccard measure of the two endpoints
common_nbrs_pos, jaccard_pos = common_neighbors_jaccard(
    user_history[edges_user[:, 0]], user_history[edges_user[:, 1]]
)
print(
    f"common_nbrs_pos: mean = {np.mean(common_nbrs_pos)}, "
    f"std = {np.std(common_nbrs_pos)}, "
    f"median = {np.median(common_nbrs_pos)}",
    f"jaccard_pos: mean = {np.mean(jaccard_pos)}, "
    f"std = {np.std(jaccard_pos)}, "
    f"median = {np.median(jaccard_pos)}",
    sep="\n",
)


In [None]:
# In the Yelp dataset, edges are undirected.
# The number of possible edges is N = `(num_users - 1) * num_users / 2`
def encode_indices_batch(rows: np.ndarray, cols: np.ndarray) -> np.ndarray:
    """Map upper-triangular `(row, col)` pairs to integers in `[0, N - 1]`.

    Pairs are numbered column by column: column `c` owns the `c` consecutive
    codes starting at `c * (c - 1) / 2`, one for each `row < c`.
    Every pair must satisfy `row < col`; this is checked with an assertion.
    """
    assert np.all(rows < cols)
    # first code belonging to each column
    col_offsets = cols * (cols - 1) // 2
    return rows + col_offsets


def decode_indices_batch(
    indices: np.ndarray, size: int
) -> Tuple[np.ndarray, np.ndarray]:
    """Map integers in `[0, N - 1]` back to upper-triangular `(row, col)` pairs.

    `size` is the number of nodes; the returned pair is `(rows, cols)`.
    """
    # triangular numbers 0, 1, 3, 6, ...: entry `c` is one past the last code
    # owned by column `c`
    col_bounds = np.arange(size).cumsum()
    # the column is the number of bounds that are <= the code
    cols = np.digitize(indices, col_bounds)
    # the row is the position of the code inside its column
    col_offsets = cols * (cols - 1) // 2
    return indices - col_offsets, cols


In [None]:
# Encode every observed (positive) edge as one integer so that sampled
# candidates can be checked against them with `np.isin`.
indices_pos = encode_indices_batch(edges_user[:, 0], edges_user[:, 1])
assert np.unique(indices_pos).size == indices_pos.size
population = (num_users) * (num_users - 1) // 2

# The population of unconnected user pairs is too large to enumerate, so the
# number of common neighbors and the jaccard measure of users that are not
# connected are estimated on sampled negative edges (as many as there are
# positive edges).
num_neg = indices_pos.size
# draw 20% more candidates than needed to make up for duplicates and for
# candidates that turn out to be positive edges
num_samples = int(1.2 * num_neg)

max_trial = 3
num_runs = 100

common_nbrs_neg_per_run = []
jaccard_neg_per_run = []
for _ in range(num_runs):
    # sample negative edges
    indices_neg = None
    for _ in range(max_trial):
        samples = np.unique(
            np.random.randint(population, size=num_samples, dtype=np.int64)
        )
        # drop candidates that are positive edges or were already accepted
        mask = np.isin(samples, indices_pos, invert=True)
        if indices_neg is not None:
            mask = np.logical_and(
                mask, np.isin(samples, indices_neg, invert=True)
            )
        # `np.unique` returns sorted values. Shuffle the accepted candidates
        # so that the truncation below removes a random subset instead of
        # always discarding the largest codes (i.e. the pairs whose `col` is
        # among the highest user ids).
        samples = np.random.permutation(samples[mask])
        indices_neg = (
            samples
            if indices_neg is None
            else np.hstack([indices_neg, samples])
        )
        if indices_neg.size >= num_neg:
            indices_neg = indices_neg[:num_neg]
            break
    # fails if `max_trial` rounds did not yield enough negative edges
    assert indices_neg.size == num_neg
    assert np.unique(indices_neg).size == indices_neg.size
    assert np.all(np.isin(indices_neg, indices_pos, invert=True))

    rows, cols = decode_indices_batch(indices_neg, size=num_users)
    assert np.all(rows < cols)

    # `common_nbrs_neg`: the number of common neighbors between users
    # that are not connected
    # `jaccard_neg`: the jaccard measure between users that are not connected
    common_nbrs_neg, jaccard_neg = common_neighbors_jaccard(
        user_history[rows, :], user_history[cols, :]
    )
    common_nbrs_neg_per_run.append(np.mean(common_nbrs_neg))
    jaccard_neg_per_run.append(np.mean(jaccard_neg))

# mean and standard deviation of the per-run averages
print(
    "\n".join(
        [
            f"common_nbrs_neg: mean = {np.mean(common_nbrs_neg_per_run)}, "
            f"std = {np.std(common_nbrs_neg_per_run)}",
            f"jaccard_neg: mean = {np.mean(jaccard_neg_per_run)}, "
            f"std = {np.std(jaccard_neg_per_run)}",
        ]
    )
)
