## Discrete cosine similiarity


In [1]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import coo_matrix
from scipy.sparse import csr_matrix
from scipy.sparse import save_npz
from scipy.sparse import load_npz
from datasketch import MinHash
from itertools import combinations

In [2]:
# data load and conversion to csr_matrix format
data = np.load('data/user_movie_rating.npy')
sparse_matrix = csr_matrix(data)

# Save the CSR format sparse matrix to a .npz file
save_npz('csr_sparse_matrix.npz', sparse_matrix)

# Print the CSR format sparse matrix
print("CSR format sparse matrix:")
print(sparse_matrix)


CSR format sparse matrix:
  (0, 0)	1
  (0, 1)	30
  (0, 2)	3
  (1, 0)	1
  (1, 1)	157
  (1, 2)	3
  (2, 0)	1
  (2, 1)	173
  (2, 2)	4
  (3, 0)	1
  (3, 1)	175
  (3, 2)	5
  (4, 0)	1
  (4, 1)	191
  (4, 2)	2
  (5, 0)	1
  (5, 1)	197
  (5, 2)	3
  (6, 0)	1
  (6, 1)	241
  (6, 2)	3
  (7, 0)	1
  (7, 1)	295
  (7, 2)	4
  (8, 0)	1
  :	:
  (65225497, 2)	3
  (65225498, 0)	103703
  (65225498, 1)	17330
  (65225498, 2)	2
  (65225499, 0)	103703
  (65225499, 1)	17346
  (65225499, 2)	4
  (65225500, 0)	103703
  (65225500, 1)	17424
  (65225500, 2)	4
  (65225501, 0)	103703
  (65225501, 1)	17479
  (65225501, 2)	2
  (65225502, 0)	103703
  (65225502, 1)	17621
  (65225502, 2)	4
  (65225503, 0)	103703
  (65225503, 1)	17622
  (65225503, 2)	2
  (65225504, 0)	103703
  (65225504, 1)	17627
  (65225504, 2)	4
  (65225505, 0)	103703
  (65225505, 1)	17764
  (65225505, 2)	4


In [3]:
# create a user_movie matrix

data = np.load('data/user_movie_rating.npy')
data_array = data.astype(int)

# Extract user, movie, and rating data from the loaded records
user_ids, movie_ids, ratings = data[:, 0], data[:, 1], data[:, 2]

# Create a CSR 
user_movie_matrix = csr_matrix((ratings, (user_ids, movie_ids)))


# Load the user-movie ratings data from the npz file
num_users = user_movie_matrix.shape[0]
num_movies = user_movie_matrix.shape[1]



## Minhashing

In [26]:
# user_movie_matrix = csr_matrix((ratings, (user_ids, movie_ids)))

# Number of hash functions for MinHashing
n = 150

# Create an array to store MinHash signatures for each user
minhash_signatures = []

# Function to generate MinHash signatures for a set of movie ratings
def generate_minhash_signature(ratings):
    minhash = MinHash(num_perm=n)
    for movie_id in ratings.nonzero()[1]:
        minhash.update(str(movie_id).encode('utf-8'))
    return minhash

# Generate MinHash signatures for each user
for user_id in range(num_users):
    user_ratings = user_movie_matrix.getrow(user_id)
    minhash_signature = generate_minhash_signature(user_ratings)
    minhash_signatures.append(minhash_signature)




In [111]:
#  LSH with Minhash Signatures

# number of bands (b) and rows per band (r)
b = 10  # number of partitions
r = 15 # ensure that r * b = n

# initialize a dictionary to store buckets
buckets = {}

# hash Minhash signatures into bands
for user_id in range(num_users):
    minhash_signature = minhash_signatures[user_id]
    for band_id in range(b):
        band_start = band_id * r
        band_end = (band_id + 1) * r
        band_signature = minhash_signature.hashvalues[band_start:band_end]
        
        # convert the band signature to a hashable string using hash function
        band_signature_str = str(band_signature)
        
        # add the user to the corresponding bucket
        if band_signature_str not in buckets:
            buckets[band_signature_str] = []
        buckets[band_signature_str].append(user_id)


In [112]:
#pair generation
user_ids, movie_ids = user_movie_matrix.nonzero()

# Initialize a list to store candidate pairs of users
candidate_pairs = []

# Iterate through the buckets created in LSH
for bucket in buckets.values():
    # Generate pairs of users within each bucket
    for i in range(len(bucket)):
        for j in range(i + 1, len(bucket)):
            user1 = bucket[i]
            user2 = bucket[j]

            # Check for common movie ratings
            common_movies = set(movie_ids[user_movie_matrix[user1].nonzero()[1]]) & set(movie_ids[user_movie_matrix[user2].nonzero()[1]])
            
            # Include the pair only if there is at least one common movie
            if common_movies:
                candidate_pairs.append((user1, user2))

# Remove duplicate pairs
candidate_pairs = list(set(candidate_pairs))

In [113]:
#  DCS Calculation and Threshold Check

# define the threshold for DCS
threshold_dcs = 0.73  

# initialize a list to store pairs of users with high DCS
users_dcs= []

# Calculate DCS between two Minhash signatures
def calculate_dcs(minhash_signature1, minhash_signature2):
    # Extract the hash values from the Minhash signatures
    hash_values1 = minhash_signature1.hashvalues
    hash_values2 = minhash_signature2.hashvalues

    # Ensure the hash values have the same length
    min_length = min(len(hash_values1), len(hash_values2))
    hash_values1 = hash_values1[:min_length]
    hash_values2 = hash_values2[:min_length]

    # Check for zero-length vectors
    if min_length == 0:
        return 0.0

    # Calculate the discrete cosine similarity
    dot_product = np.dot(hash_values1, hash_values2)
    norm1 = np.linalg.norm(hash_values1)
    norm2 = np.linalg.norm(hash_values2)

    # Calculate DCS value
    dcs = dot_product / (norm1 * norm2) if (norm1 * norm2) > 0 else 0.0

    return dcs

    
# Iterate through the candidate pairs
for user1, user2 in candidate_pairs:
    # Retrieve the Minhash signatures for user1 and user2
    minhash_signature1 = minhash_signatures[user1]
    minhash_signature2 = minhash_signatures[user2]

    # Calculate DCS between the Minhash signatures
    dcs = calculate_dcs(minhash_signature1, minhash_signature2)

    # Debugging: Print user IDs, DCS value, and Minhash signatures
    # print(f"User IDs: {user1}, {user2}, DCS: {dcs}")
    # print(f"Minhash Signature 1: {minhash_signature1}")
    # print(f"Minhash Signature 2: {minhash_signature2}")

    # Check if DCS exceeds the threshold
    if dcs > threshold_dcs:
        users_dcs.append((user1, user2))

# Print the total number of pairs with DCS > threshold
print(f"Total pairs with DCS > {threshold_dcs}: {len(users_dcs)}")




Total pairs with DCS > 0.73: 36


In [114]:
# # Output for DCS

# # Define the output file name
# output_file = "similar_user_pairs_dcs.txt"

# # write similar user pairs to the output file
# with open(output_file, "w") as file:
#     for user1, user2 in users_dcs:
#         # write the user pair (u1, u2) to the file
#         file.write(f"{user1},{user2}\n")



In [118]:
# Values for b*r corresponding to different hash function totals
br_equivalents = {
    80: (7, 11),
    90: (8, 11),
    100: (9, 7),
    110: (9,12 ),
    120: (10, 11),
    130: (11, 11),
    140: (12, 11),
    150: (13, 11)
}

# Iterate through the specified values of b*r
for total_hash_functions, br_value in br_equivalents.items():
    b, r = br_value
    
    # Ensure r is at least 1
    r = max(1, r)
    
    # Initialize a dictionary to store buckets
    buckets = {}

    # Hash Minhash signatures into bands
    for user_id in range(num_users):
        minhash_signature = minhash_signatures[user_id]
        for band_id in range(b):
            band_start = band_id * r
            band_end = (band_id + 1) * r
            band_signature = minhash_signature.hashvalues[band_start:band_end]
            
            # Convert the band signature to a hashable string using hash function
            band_signature_str = str(band_signature)
            
            # Add the user to the corresponding bucket
            if band_signature_str not in buckets:
                buckets[band_signature_str] = []
            buckets[band_signature_str].append(user_id)

    # Pair generation
    user_ids, movie_ids = user_movie_matrix.nonzero()

    # Initialize a list to store candidate pairs of users
    candidate_pairs = []

    # Iterate through the buckets created in LSH
    for bucket in buckets.values():
        # Generate pairs of users within each bucket
        for i in range(len(bucket)):
            for j in range(i + 1, len(bucket)):
                user1 = bucket[i]
                user2 = bucket[j]

                # Check for common movie ratings
                common_movies = set(movie_ids[user_movie_matrix[user1].nonzero()[1]]) & set(movie_ids[user_movie_matrix[user2].nonzero()[1]])
                
                # Include the pair only if there is at least one common movie
                if common_movies:
                    candidate_pairs.append((user1, user2))

    # Remove duplicate pairs
    candidate_pairs = list(set(candidate_pairs))

    # DCS Calculation and Threshold Check
    threshold_dcs = 0.73  # Define the threshold for DCS
    users_dcs = []  # Initialize a list to store pairs of users with high DCS

    # Iterate through the candidate pairs
    for user1, user2 in candidate_pairs:
        # Retrieve the Minhash signatures for user1 and user2
        minhash_signature1 = minhash_signatures[user1]
        minhash_signature2 = minhash_signatures[user2]

        # Calculate DCS between the Minhash signatures
        dcs = calculate_dcs(minhash_signature1, minhash_signature2)

        # Check if DCS exceeds the threshold
        if dcs > threshold_dcs:
            users_dcs.append((user1, user2))

    # Print the total number of pairs with DCS > threshold for each (b, r) combination
    print(f"For {total_hash_functions} hash functions, b={b} and r={r}, total pairs with DCS > {threshold_dcs}: {len(users_dcs)}")


For 80 hash functions, b=7 and r=11, total pairs with DCS > 0.73: 731
For 90 hash functions, b=8 and r=11, total pairs with DCS > 0.73: 737
For 100 hash functions, b=9 and r=7, total pairs with DCS > 0.73: 101575
For 110 hash functions, b=9 and r=12, total pairs with DCS > 0.73: 501
For 120 hash functions, b=10 and r=11, total pairs with DCS > 0.73: 913
For 130 hash functions, b=11 and r=11, total pairs with DCS > 0.73: 1206
For 140 hash functions, b=12 and r=11, total pairs with DCS > 0.73: 1299
For 150 hash functions, b=13 and r=11, total pairs with DCS > 0.73: 1311


In [130]:
from itertools import combinations

# Number of bands and rows
b = 13
r = 11

# Initialize a dictionary to store buckets
buckets = {}

# Hash Minhash signatures into bands
for user_id in range(num_users):
    minhash_signature = minhash_signatures[user_id]
    for band_id in range(b):
        band_start = band_id * r
        band_end = (band_id + 1) * r
        band_signature = minhash_signature.hashvalues[band_start:band_end]
        
        # Convert the band signature to a hashable string using hash function
        band_signature_str = str(band_signature)
        
        # Add the user to the corresponding bucket
        if band_signature_str not in buckets:
            buckets[band_signature_str] = []
        buckets[band_signature_str].append(user_id)

# Pair generation
user_ids, movie_ids = user_movie_matrix.nonzero()

# Initialize a list to store candidate pairs of users
candidate_pairs = []

# Set the threshold for Jaccard Similarity
threshold_jaccard = 0.5

# Iterate through the buckets created in LSH
for bucket in buckets.values():
    # Generate pairs of users within each bucket
    for user1, user2 in combinations(bucket, 2):
        # Check for common movie ratings
        common_movies = set(movie_ids[user_movie_matrix[user1].nonzero()[1]]) & set(movie_ids[user_movie_matrix[user2].nonzero()[1]])
        
        # Calculate Jaccard Similarity
        denominator = len(set(movie_ids[user_movie_matrix[user1].nonzero()[1]]) | set(movie_ids[user_movie_matrix[user2].nonzero()[1]]))
        jaccard_similarity = len(common_movies) / denominator if denominator != 0 else 0.0
        
        # Include the pair only if there is at least one common movie and Jaccard Similarity is above the threshold
        if common_movies and jaccard_similarity > threshold_jaccard:
            candidate_pairs.append((user1, user2, jaccard_similarity))

# Print the total number of pairs with Jaccard Similarity above the threshold
print(f"Total pairs with Jaccard Similarity > {threshold_jaccard}: {len(candidate_pairs)}")


Total pairs with Jaccard Similarity > 0.5: 1333


In [133]:
# Number of bands and rows
b = 13
r = 11

# Initialize a dictionary to store buckets
buckets = {}

# Hash Minhash signatures into bands
for user_id in range(num_users):
    minhash_signature = minhash_signatures[user_id]
    for band_id in range(b):
        band_start = band_id * r
        band_end = (band_id + 1) * r
        band_signature = minhash_signature.hashvalues[band_start:band_end]
        
        # Convert the band signature to a hashable string using hash function
        band_signature_str = str(band_signature)
        
        # Add the user to the corresponding bucket
        if band_signature_str not in buckets:
            buckets[band_signature_str] = []
        buckets[band_signature_str].append(user_id)

# Pair generation
user_ids, movie_ids = user_movie_matrix.nonzero()

# Initialize a list to store candidate pairs of users
candidate_pairs = []

# Set the threshold for Cosine Similarity
threshold_cosine = 0.73

# Iterate through the buckets created in LSH
for bucket in buckets.values():
    # Generate pairs of users within each bucket
    for user1, user2 in combinations(bucket, 2):
        # Check for common movie ratings
        common_movies = set(movie_ids[user_movie_matrix[user1].nonzero()[1]]) & set(movie_ids[user_movie_matrix[user2].nonzero()[1]])
        
        # Calculate Cosine Similarity
        vector1 = np.zeros(len(movie_ids))
        vector2 = np.zeros(len(movie_ids))
        vector1[list(common_movies)] = 1
        vector2[list(common_movies)] = 1
        
        cosine_similarity = np.dot(vector1, vector2) / (np.linalg.norm(vector1) * np.linalg.norm(vector2)) if (np.linalg.norm(vector1) * np.linalg.norm(vector2)) != 0 else 0.0
        
        # Include the pair only if there is at least one common movie and Cosine Similarity is above the threshold
        if common_movies and cosine_similarity > threshold_cosine:
            candidate_pairs.append((user1, user2, cosine_similarity))

# Print the total number of pairs with Cosine Similarity above the threshold
print(f"Total pairs with Cosine Similarity > {threshold_cosine}: {len(candidate_pairs)}")


Total pairs with Cosine Similarity > 0.73: 5315
