Scaling Score:
- Min-Max Scaling (0-100) = \frac{score - \min(scores)}{\max(scores) - \min(scores)} \times 100
- Cosine Similarity rescaled from [-1, 1] to [0, 1] = \frac{cosine\_similarity + 1}{2}

Considerations:
- If the similarity scores come out too low, users won't like it => how can we artificially boost them?


In [None]:
import numpy as np
import pandas as pd
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import accuracy

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

In [None]:


# Step 1: Create sample data (User-Content interactions)
data = pd.DataFrame({
    'user': [1, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5],
    'content': [1, 2, 5, 1, 4, 3, 4, 2, 3, 1, 4],
    'rating':  [5, 3, 4, 4, 1, 0, 5, 1, 5, 2, 5]  # Listening frequency or favorites
})

# Step 2: Prepare Data for Surprise
# The rating scale has to cover the sample ratings above, which run from 0 to 5.
reader = Reader(rating_scale=(0, 5))
dataset = Dataset.load_from_df(data[['user', 'content', 'rating']], reader)

# Step 3: Train an SVD Model
# 20% of the ratings are held out for evaluation; the model is fit on the rest only.
trainset, testset = train_test_split(dataset, test_size=0.2)
model = SVD(n_factors=10)  # Latent factors
model.fit(trainset)

# Step 4: Predict & Evaluate
predictions = model.test(testset)
rmse = accuracy.rmse(predictions)

# Step 5: Get Content Similarity from Matrix
# NOTE(review): model.qi presumably holds one row per content seen in the
# train split, ordered by the model's inner ids rather than the raw content
# ids in `data` - confirm against the surprise documentation.
content_factors = model.qi  # Content latent factors
# A plain dot product of the factors is not cosine similarity because it is
# not divided by the vector norms. cosine_similarity normalises each row, so
# the entries are bounded to [-1, 1] and the diagonal is 1.
similarity_matrix = cosine_similarity(content_factors)

print("Content Similarity Matrix:\n", similarity_matrix)


In [None]:
def get_content_similarity(model, user_id: int):
    """
    Score every content row for one user and rank the rows best-first.

    The raw score of a content row is the dot product of that row of
    ``model.qi`` with the user's row ``model.pu[user_id]``. The raw scores
    are then rescaled with a ``MinMaxScaler`` configured for the 0-100 range.

    Returns a tuple ``(scores, ranking)``: the rescaled score of each content
    row, and the content row indices ordered by descending score.

    NOTE(review): ``user_id`` is used directly as a row index into
    ``model.pu`` - presumably an inner id rather than a raw user id; confirm
    with callers.
    """
    user_factors = model.pu[user_id]
    item_factors = model.qi

    # Raw affinity per content row, shaped as a single column because the
    # scaler expects 2-D input.
    raw_column = np.dot(item_factors, user_factors).reshape(-1, 1)

    # Rescale to 0-100, then flatten back to one score per content row.
    to_percent = MinMaxScaler(feature_range=(0, 100))
    scaled_column = to_percent.fit_transform(raw_column)
    normalized_scores = scaled_column.flatten()

    # Alternative left disabled: cosine similarity between the user vector and
    # the content vectors, mapped from [-1, 1] to [0, 1] via (cos + 1) / 2.

    # TODO: recommend content user hasn't listened to yet
    ranking = np.flip(np.argsort(normalized_scores))
    return normalized_scores, ranking

In [None]:
# Score and rank all content for the user stored in row 0 of the model's user
# factors. get_content_similarity returns (scores, ranking) in that order.
content_score, recommended_contents = get_content_similarity(model, 0)

In [None]:
def get_content_recommendations(model, liked_contents: "list[int]"):
    """
    Rank all content by its average cosine similarity to the liked content.

    1. Get content the user liked (from listening or favorites)
    2. Retrieve latent vector V from those liked contents
    3. Compute cosine similarity between each liked content and every content
       row in ``model.qi``
    4. Average over the liked contents and return the content row indices
       sorted from most to least similar

    Args:
        model: Object exposing the content latent factors as ``model.qi``.
        liked_contents: Row indices into ``model.qi`` of the liked contents.
            Must not be empty.

    Returns:
        Array with one entry per row of ``model.qi``: the row indices ordered
        from most to least similar. The liked contents are not filtered out,
        so they appear in the ranking as well.

    Raises:
        ValueError: If ``liked_contents`` is empty.
    """
    # Fail early with a clear message: with nothing liked there is no vector
    # to compare against, and the similarity call below would otherwise fail
    # on the empty input with a much less helpful error.
    if len(liked_contents) == 0:
        raise ValueError("liked_contents must contain at least one content index")

    content_vectors = model.qi
    liked_vectors = content_vectors[liked_contents]
    # Shape (n_liked, n_content): one row of similarities per liked content.
    similarities = cosine_similarity(liked_vectors, content_vectors)
    # Collapse the liked axis so every content row gets a single score.
    content_similarity_scores = np.mean(similarities, axis=0)
    # argsort is ascending; reverse it to put the most similar content first.
    similar_content = np.argsort(content_similarity_scores)[::-1]
    return similar_content

In [None]:
# Returns one index per row of model.qi, ordered from most to least similar to
# rows 0 and 3.
# NOTE(review): the length can be smaller than the number of distinct contents
# in `data` - presumably because the model was fit on the train split only, so
# a content whose ratings all landed in the test split gets no row in model.qi.
# The returned indices would then be the model's inner ids, not raw content
# ids. Confirm against the surprise documentation.
get_content_recommendations(model, [0, 3]) # why not same shape as num of content?