In [None]:
import numpy as np
from scipy.sparse import csr_matrix
from collections import defaultdict

: 

In [3]:
# Step 1: Load the user-movie ratings data from the npy file.
# Each record is read as (user id, movie id, rating) in columns 0, 1 and 2.
data = np.load('../../temp_data/user_movie_rating.npy')
data_array = data.astype(int)

# Take the ids from the integer copy so they are always valid sparse-matrix
# indices, whatever dtype the file was saved with. Ratings keep their raw values.
user_ids, movie_ids = data_array[:, 0], data_array[:, 1]
ratings = data[:, 2]

# Build a CSR matrix with users as rows and movies as columns. No shape is
# passed, so SciPy infers it from the index arrays.
user_movie_matrix = csr_matrix((ratings, (user_ids, movie_ids)))

# Dimensions of the resulting user x movie matrix
num_users = user_movie_matrix.shape[0]
num_movies = user_movie_matrix.shape[1]

In [10]:
def minhash(matrix, num_permutations, seed=None):
    """Compute a MinHash signature matrix for the rows of a user x movie matrix.

    For every random permutation of the movie indices, a user's hash value is
    the smallest permuted index among the movies that user has a non-zero
    entry for.

    Parameters
    ----------
    matrix : 2-D NumPy array or SciPy sparse matrix
        Rows are users, columns are movies; a non-zero entry marks a rated
        movie.
    num_permutations : int
        Number of random permutations, i.e. rows of the signature matrix.
    seed : int or None, optional
        When given, permutations come from a dedicated
        ``np.random.default_rng(seed)`` so the result is reproducible. When
        ``None`` (default) the global ``np.random`` state is used.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(num_permutations, num_users)``. Users without
        any non-zero entry keep ``np.inf`` in every row.
    """
    num_users, num_movies = matrix.shape

    # Coordinates of all non-zero entries, gathered once. Both dense arrays and
    # SciPy sparse matrices return a (row_indices, column_indices) pair here;
    # the column indices are the movie ids that must be permuted.
    user_idx, movie_idx = matrix.nonzero()

    if seed is None:
        permute = np.random.permutation
    else:
        permute = np.random.default_rng(seed).permutation

    # Initialize the signature matrix with infinity
    signature_matrix = np.full((num_permutations, num_users), np.inf)

    for i in range(num_permutations):
        permutation = permute(num_movies)
        permuted_movies = permutation[movie_idx].astype(signature_matrix.dtype)

        # Unbuffered per-user minimum: every (user, movie) entry lowers that
        # user's slot to the smallest permuted movie index seen so far.
        np.minimum.at(signature_matrix[i], user_idx, permuted_movies)

    return signature_matrix

In [11]:
def banding(signature_matrix, num_bands):
    """Split a signature matrix into bands and bucket users per band (LSH).

    Parameters
    ----------
    signature_matrix : numpy.ndarray
        Array of shape ``(num_hash_functions, num_users)``.
    num_bands : int
        Number of bands. Each band spans
        ``num_hash_functions // num_bands`` consecutive rows; rows left over
        when the division is not exact are not used.

    Returns
    -------
    list of dict
        One dict per band, mapping the hash of a user's band signature to the
        list of user indices (in ascending order) that share it.

    Raises
    ------
    ValueError
        If ``num_bands`` is smaller than 1 or larger than the number of hash
        functions. In the latter case every band would be empty and all users
        would collapse into a single bucket.
    """
    num_hash_functions, num_users = signature_matrix.shape

    if num_bands < 1:
        raise ValueError("num_bands must be at least 1")

    rows_per_band = num_hash_functions // num_bands
    if rows_per_band == 0:
        raise ValueError(
            "num_bands must not exceed the number of hash functions"
        )

    # One bucket table per band
    buckets = [{} for _ in range(num_bands)]

    for band, band_buckets in enumerate(buckets):
        # Rows of the signature matrix that belong to this band
        start = band * rows_per_band
        rows = signature_matrix[start:start + rows_per_band]

        for user in range(num_users):
            # Users whose band signatures are equal get the same bucket key
            bucket = hash(tuple(rows[:, user]))
            band_buckets.setdefault(bucket, []).append(user)

    return buckets

In [None]:
from itertools import combinations

# Collect the candidate pairs implied by the LSH buckets
def calculate_pairs_from_buckets(buckets):
    """Return the set of user pairs that share a bucket in at least one band.

    ``buckets`` is an iterable of per-band dicts whose values are lists of
    users. Every 2-combination of a list with more than one user becomes a
    candidate pair; a pair found in several bands is stored only once.
    """
    return {
        pair
        for band_buckets in buckets
        for members in band_buckets.values()
        if len(members) > 1
        for pair in combinations(members, 2)
    }

# Build the candidate pairs from the LSH buckets.
# NOTE(review): `buckets` is assigned by the banding cell that appears further
# down in this notebook, so on a fresh kernel this cell only works when it is
# run after that one.
candidate_pairs = calculate_pairs_from_buckets(buckets)

# Report only how many pairs were found; the full set can be far too large to
# print usefully.
print(len(candidate_pairs))

In [7]:
def jaccard_similarity(set1, set2):
    """Return the Jaccard similarity |set1 & set2| / |set1 | set2| of two sets.

    When both sets are empty the ratio is undefined; 0.0 is returned in that
    case instead of raising ZeroDivisionError, so two users without any items
    are never reported as similar.
    """
    union = len(set1 | set2)
    if union == 0:
        return 0.0
    return len(set1 & set2) / union

In [13]:
num_hash_functions = 100

# minhash draws its permutations from NumPy's global random state, so fix the
# seed to get the same signatures on every Restart & Run All.
np.random.seed(42)
signature_matrix = minhash(user_movie_matrix, num_hash_functions)

In [29]:
num_bands = 20

# banding uses (number of signature rows // num_bands) rows per band, so any
# remainder rows would be ignored silently; require an exact split.
assert signature_matrix.shape[0] % num_bands == 0, \
    "num_bands must divide the number of hash functions"
buckets = banding(signature_matrix, num_bands)

In [None]:
# Each candidate pair holds two user indices (rows of user_movie_matrix), not
# movie sets, so first look up the set of movies every involved user rated.
rated_movies = {}
for pair in candidate_pairs:
    for user in pair:
        if user not in rated_movies:
            # Column indices of the non-zero entries in this user's row
            rated_movies[user] = set(user_movie_matrix[user].nonzero()[1].tolist())

# Exact Jaccard similarity of the rated-movie sets for every candidate pair
similarities = [
    (pair, jaccard_similarity(rated_movies[pair[0]], rated_movies[pair[1]]))
    for pair in candidate_pairs
]

# Keep only the pairs whose similarity is strictly above the threshold
threshold = 0.5
similar_pairs = [(pair, similarity) for pair, similarity in similarities if similarity > threshold]