<a href="https://colab.research.google.com/github/the-datastrategist/colab-notebooks/blob/main/tds_memory_collaborative_filtering_01_load_matrix.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Memory-Based Collaborative Filtering

In this notebook, I build a memory-based collaborative filtering model using a dataset of 25M movie ratings.

__Resources__
- [Towards Data Science: How Collaborative Filtering Works](https://towardsdatascience.com/how-does-collaborative-filtering-work-da56ea94e331)
- [IMDB Non-Commercial Datasets](https://developer.imdb.com/non-commercial-datasets/)
- [GCS: butterstick2023](https://console.cloud.google.com/storage/browser/butterstick2023/ml-25m;tab=objects?project=the-data-strategist&prefix=&forceOnObjectsSortingFiltering=false)

## Project Setup

In [None]:
# Provide your Google credentials to the Colab runtime.
# Presumably required before the gs:// ratings file can be read and before
# results can be uploaded to Cloud Storage -- confirm for your environment.
from google.colab import auth
auth.authenticate_user()

In [None]:
# Import commonly-used libraries
import pandas as pd
from datetime import datetime as dt
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
import random
from google.cloud import storage
import json


# Declare global variables
GOOGLE_PROJECT_ID = 'the-data-strategist'  # GCP project name (not referenced by the visible cells)
OBJECTIVE_METRIC = 'rating'       # column used as the values of the user-item matrix
USER_VARIABLE = 'userId'          # column identifying users (matrix rows)
PRODUCT_VARIABLE = 'movieId'      # column identifying items (matrix columns)
N_MOVIES = 1_000                  # default number of movies kept by get_sample_df
N_USERS = 100                     # default number of users kept by get_sample_df
RANDOM_SEED = 42                  # seed for the random user/movie sampling

# Seed Python's RNG so the random samples drawn below are the same on every
# Restart & Run All.
random.seed(RANDOM_SEED)

## Data Import

In [None]:
def import_ratings(ratings_filename='gs://butterstick2023/ml-25m/ratings.csv'):
    """Import ratings data and add per-user derived columns.

    Parameters
    ----------
    ratings_filename : str or file-like
        Anything accepted by ``pd.read_csv``. The data must contain the
        columns ``userId``, ``rating`` and ``timestamp`` (epoch seconds).

    Returns
    -------
    pd.DataFrame
        The ratings plus the following columns:

        - ``date_time``: ``timestamp`` converted to a datetime
        - ``avg_rating``: mean of all ratings given by the same user
        - ``wt_rating``: ``rating / avg_rating``
        - ``liked``: True where ``rating`` is strictly above ``avg_rating``
    """
    # Import ratings dataset
    df_ratings = pd.read_csv(ratings_filename)

    # Convert timestamp to a datetime
    df_ratings['date_time'] = pd.to_datetime(df_ratings['timestamp'], unit='s')

    # Each user's average rating, broadcast back onto that user's own rows.
    # transform() aligns on the original row index, so every rating is paired
    # with the average of the user who gave it. (The previous
    # join(..., on='userId') matched userId against the *positional* index of
    # the averages table, which attached a different user's average and
    # produced a stray 'userId_' column.)
    df_ratings['avg_rating'] = (
        df_ratings.groupby('userId')['rating'].transform('mean')
    )

    # Weight individual ratings by the user's average rating
    df_ratings['wt_rating'] = df_ratings['rating'] / df_ratings['avg_rating']
    df_ratings['liked'] = df_ratings['rating'] > df_ratings['avg_rating']

    return df_ratings


In [None]:
def get_sample_df(df, n_movie_sample = N_MOVIES, n_user_sample = N_USERS, seed=None):
    """Get a sample of df_ratings based on a certain number of userIds
    and movieIds.

    Parameters
    ----------
    df : pd.DataFrame
        Ratings with ``userId`` and ``movieId`` columns.
    n_movie_sample : int or None
        Number of distinct movies to keep. ``None`` (or 0) keeps all movies.
        Values larger than the number of distinct movies keep all of them
        instead of raising.
    n_user_sample : int or None
        Number of distinct users to keep; same rules as ``n_movie_sample``.
    seed : int or None
        When given, sampling uses a private ``random.Random(seed)`` so the
        result is reproducible. When ``None``, the module-level ``random``
        state is used (so a prior ``random.seed(...)`` still applies).

    Returns
    -------
    pd.DataFrame
        Rows of ``df`` whose user AND movie were both selected.
    """
    rng = random if seed is None else random.Random(seed)

    keep = np.ones(len(df), dtype=bool)
    # Movies are sampled before users, matching the original draw order.
    for column, n_sample in (('movieId', n_movie_sample), ('userId', n_user_sample)):
        if not n_sample:
            # No sampling requested on this axis: keep every ID.
            continue
        # unique() preserves order of appearance, so a fixed seed yields the
        # same sample regardless of set/hash ordering.
        ids = df[column].unique().tolist()
        chosen = rng.sample(ids, min(n_sample, len(ids)))
        keep &= df[column].isin(chosen).to_numpy()

    return df[keep]

In [None]:
def get_user_item_matrix(
    df_ratings,
    user_variable=USER_VARIABLE,
    product_variable=PRODUCT_VARIABLE
    ):
    """Pivot long-format ratings into a user x item matrix.

    Parameters
    ----------
    df_ratings : pd.DataFrame
        Long-format ratings containing ``user_variable``,
        ``product_variable`` and the ``OBJECTIVE_METRIC`` column.
    user_variable : str
        Column whose values become the matrix rows.
    product_variable : str
        Column whose values become the matrix columns.

    Returns
    -------
    pd.DataFrame
        One row per user, one column per item, holding the objective metric.
        User/item pairs with no rating are filled with 0.
    """
    # Use the arguments rather than the module-level constants so callers
    # can actually override the row/column variables.
    user_item_matrix = df_ratings.pivot(
        index=user_variable,
        columns=product_variable,
        values=OBJECTIVE_METRIC
        ).fillna(0)
    return user_item_matrix

def get_item_similarity_matrix(df_ratings):
    """Compute pairwise cosine similarity between items.

    Builds the user-item matrix from ``df_ratings``, transposes it so each
    row is an item, and returns the result of ``cosine_similarity`` on its
    sparse form.
    """
    item_user_matrix = get_user_item_matrix(df_ratings).T
    sparse_items = csr_matrix(item_user_matrix)
    return cosine_similarity(sparse_items)

def get_user_similarity_matrix(df_ratings):
    """Compute pairwise cosine similarity between users.

    Builds the user-item matrix from ``df_ratings`` (one row per user) and
    returns the result of ``cosine_similarity`` on its sparse form.
    """
    sparse_users = csr_matrix(get_user_item_matrix(df_ratings))
    return cosine_similarity(sparse_users)


In [None]:
def read_json_data(data, index_name='userId', column_name='movieId'):
    """Read JSON into a DataFrame and label its two axes.

    ``data`` is passed straight to ``pd.read_json``. The resulting frame's
    index is named ``index_name`` and its columns axis ``column_name``.
    """
    frame = pd.read_json(data)
    frame.columns.name = column_name
    frame.index.name = index_name
    return frame

def load_json_to_gcs(
    json_data,
    bucket_name,
    blob_name
):
    """Upload a JSON string to an object in a Google Cloud Storage bucket.

    Parameters
    ----------
    json_data : str
        The JSON payload to upload.
    bucket_name : str
        Name of the destination bucket.
    blob_name : str
        Name (path) of the object inside the bucket.
    """
    # Resolve the bucket with a fresh client, then address the target object
    target_blob = storage.Client().get_bucket(bucket_name).blob(blob_name)

    # Write the payload, tagging it as JSON
    target_blob.upload_from_string(json_data, content_type='application/json')

    print(f"File {blob_name} uploaded to {bucket_name}.")



## Load similarity matrices

Generate the following matrices and load to Google Storage:
- user_item_matrix
- item_similarity_matrix
- user_similarity_matrix

In [None]:
# Load the complete ratings table
df_ratings = import_ratings()

# Count the distinct users and movies it contains
df_ratings.agg(dict.fromkeys(['userId', 'movieId'], 'nunique'))

userId     162541
movieId     59047
dtype: int64

In [None]:
# Keep every movie, but only a random sample of 5,000 users
df_ratings_sample = get_sample_df(df_ratings, n_movie_sample=None, n_user_sample=5_000)

# The full ratings frame is not needed past this point; drop the reference
del df_ratings

df_ratings_sample.head()

Unnamed: 0,userId,movieId,rating,timestamp,date_time,userId_,avg_rating,wt_rating,liked
4386,27,110,3.0,974256534,2000-11-15 02:48:54,28.0,4.693878,0.63913,False
4387,27,260,2.0,974513448,2000-11-18 02:10:48,28.0,4.693878,0.426087,False
4388,27,356,5.0,974257923,2000-11-15 03:12:03,28.0,4.693878,1.065217,True
4389,27,408,5.0,974256887,2000-11-15 02:54:47,28.0,4.693878,1.065217,True
4390,27,527,4.0,974256577,2000-11-15 02:49:37,28.0,4.693878,0.852174,False


In [None]:
# Distinct users and movies remaining in the sample
df_ratings_sample.agg(dict.fromkeys(['userId', 'movieId'], 'nunique'))

userId      5000
movieId    20208
dtype: int64

### Get user-item matrix

In [None]:
# Get User-Item Matrix and serialize it to JSON for upload
user_item_matrix = get_user_item_matrix(df_ratings_sample)
user_item_matrix_json = user_item_matrix.to_json()

# Show only the dimensions: echoing the JSON string would write the entire
# matrix into the cell output.
user_item_matrix.shape

In [None]:
# Upload the serialized user-item matrix to Cloud Storage
load_json_to_gcs(
    json_data=user_item_matrix_json,
    bucket_name='butterstick2023',
    blob_name='recommendation_system/user_item_matrix_json.json',
)


### Get similarity matrices

In [None]:
# Get item-item cosine similarities.
# df_ratings was deleted right after sampling, so use the sampled ratings
# (the same frame the user-item matrix was built from).
item_sim_matrix = get_item_similarity_matrix(df_ratings_sample)

# cosine_similarity returns a plain NumPy array, which has no .to_json().
# Wrap it in a DataFrame labelled by the user-item matrix's columns (the item
# IDs, in the same order as the similarity rows/columns) before serializing.
item_sim_matrix_json = pd.DataFrame(
    item_sim_matrix,
    index=user_item_matrix.columns,
    columns=user_item_matrix.columns,
).to_json()

# Show only the dimensions instead of echoing the whole JSON string
item_sim_matrix.shape

In [None]:
# Upload the serialized item similarity matrix to Cloud Storage
load_json_to_gcs(
    json_data=item_sim_matrix_json,
    bucket_name='butterstick2023',
    blob_name='recommendation_system/item_sim_matrix_json.json',
)


## Get Recommendations

In [None]:
# Generating Recommendations
def recommend_movies(movie_id, item_similarity, matrix, k=5):
    """Return the IDs of the ``k`` movies most similar to ``movie_id``.

    Parameters
    ----------
    movie_id : hashable
        A label present in ``matrix.columns``.
    item_similarity : array-like
        Square item-item similarity matrix whose row/column order matches
        ``matrix.columns``.
    matrix : pd.DataFrame
        User-item matrix; only its ``columns`` (the movie IDs) are used.
    k : int
        Maximum number of recommendations to return.

    Returns
    -------
    pd.Index
        Up to ``k`` movie IDs, most similar first, never including
        ``movie_id`` itself.

    Raises
    ------
    ValueError
        If ``movie_id`` is not one of ``matrix.columns``.
    """
    # Find the movie's position among the user-item matrix columns
    movie_idx = list(matrix.columns).index(movie_id)
    # Similarity of this movie to every movie
    movie_similarities = np.asarray(item_similarity[movie_idx])
    # Rank all movies by descending similarity (stable, so ties keep column order)
    ranked_idxs = np.argsort(-movie_similarities, kind='stable')
    # Drop the query movie explicitly rather than assuming it ranks first:
    # with tied similarities or an all-zero row it may not be at position 0.
    similar_movie_idxs = ranked_idxs[ranked_idxs != movie_idx][:k]
    # Translate positions back to movie IDs
    return matrix.columns[similar_movie_idxs]


In [None]:
# Ten nearest neighbours of movieId 5 under item-item similarity
recommendations = recommend_movies(5, item_sim_matrix, user_item_matrix, k=10)


In [None]:
# Display the recommended movie IDs returned by recommend_movies
recommendations

Int64Index([1680, 830, 628, 3591, 2581, 275, 719, 224, 1541, 3705], dtype='int64', name='movieId')

In [None]:
def predict_ratings(similarity_matrix, matrix):
    """Predict ratings as a similarity-weighted average.

    Multiplies ``similarity_matrix`` by ``matrix`` and divides each row of
    the product by the sum of absolute similarities in the corresponding
    row of ``similarity_matrix``.
    """
    # Numerator: similarity-weighted sum of ratings
    weighted_sum = similarity_matrix.dot(matrix)
    # Denominator: per-row total absolute similarity, shaped as a column
    # so it divides row-wise
    row_totals = np.abs(similarity_matrix).sum(axis=1)
    normalizer = np.array([row_totals]).T
    return weighted_sum / normalizer


In [None]:
# user_sim_matrix is not created by any earlier visible cell, so build it here
# from the sampled ratings (the same frame user_item_matrix came from).
user_sim_matrix = get_user_similarity_matrix(df_ratings_sample)

user_rating_predictions = predict_ratings(
    similarity_matrix=user_sim_matrix,
    matrix=user_item_matrix
    )

In [None]:
# Dimensions of the user similarity matrix. A single .shape shows both
# lengths; with two bare len() expressions only the last one is displayed.
user_sim_matrix.shape


100