# Hybrid Modeling

In this notebook, I attempt to combine collaborative filtering and content-based filtering models to generate a better recommendation system.

In [1]:
# Import necessary libraries and submodules
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 

from surprise import Dataset, Reader
from surprise import SVD # implementation of Funk's SVD (gradient descent-based matrix factorization)
from surprise import accuracy # metric
from surprise.model_selection import train_test_split, GridSearchCV #train/test splits, crossval
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

#### Load and prepare data

In [3]:
# Load datasets
ratings = pd.read_csv('../data/ratings.csv')
movies = pd.read_csv('../data/movies.csv')
tags = pd.read_csv('../data/tags.csv')

# Fill missing values in genres and tags
movies['genres'] = movies['genres'].fillna('')
tags['tag'] = tags['tag'].fillna('')

# Split genres into lists for one-hot encoding
movies['genres_split'] = movies['genres'].str.split('|')

# Map movieId to matrix indices
movie_id_to_index = {movie_id: idx for idx, movie_id in enumerate(movies['movieId'])}
index_to_movie_id = {idx: movie_id for movie_id, idx in movie_id_to_index.items()}

#### Content-based feature engineering

In [4]:
# One-hot encode genres
mlb = MultiLabelBinarizer()
genre_encoded = mlb.fit_transform(movies['genres_split'])
genre_df = pd.DataFrame(genre_encoded, columns=mlb.classes_)

# Vectorize tags using TF-IDF
tag_vectorizer = TfidfVectorizer(max_features=500)
tag_features = tag_vectorizer.fit_transform(tags['tag'])

# Combine genres and tags
content_features = pd.concat([pd.DataFrame(genre_encoded), pd.DataFrame(tag_features.toarray())], axis=1).fillna(0)

# Compute cosine similarity matrix
similarity_matrix = cosine_similarity(content_features)

#### Collaborative filtering using SVD

In [None]:
# Prepare data for Surprise
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# Split into train and test sets
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Train SVD model
svd_model = SVD()
svd_model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x156ee0350>

#### Hybrid recommender

In [None]:
# Function for getting hybrid recommendations
def hybrid_recommendations(user_id, train_data, svd_model, similarity_matrix, movies, top_n=10, alpha=0.7):
    """
    Generate hybrid recommendations using SVD (collaborative filtering) and content-based filtering.
    """
    # Get all movies
    all_movie_ids = movies['movieId'].tolist()

    # Get movies the user has already rated
    rated_movies = train_data[train_data['userId'] == user_id]['movieId'].tolist()

    # Filter out rated movies
    unrated_movies = [movie_id for movie_id in all_movie_ids if movie_id not in rated_movies]

    recommendations = []
    for movie_id in unrated_movies:
        # Collaborative Filtering Score
        cf_score = svd_model.predict(user_id, movie_id).est

        # Content-Based Score
        if rated_movies:
            cb_score = np.mean(
                [similarity_matrix[movie_id_to_index[movie_id], movie_id_to_index[other_movie_id]]
                 for other_movie_id in rated_movies if other_movie_id in movie_id_to_index]
            )
        else:
            cb_score = 0  # No content-based score if no rated movies

        # Combine scores
        hybrid_score = alpha * cf_score + (1 - alpha) * cb_score
        recommendations.append((movie_id, hybrid_score))

    # Sort recommendations by score
    recommendations = sorted(recommendations, key=lambda x: x[1], reverse=True)[:top_n]

    # Retrieve movie details
    recommended_movie_ids = [rec[0] for rec in recommendations]
    recommended_movies = movies[movies['movieId'].isin(recommended_movie_ids)].copy()
    recommended_movies['score'] = [rec[1] for rec in recommendations]

    return recommended_movies

#### Generate recommendations

In [None]:
# Generate recommendations for user 1
user_id = 1
recommended_movies = hybrid_recommendations(
    user_id=user_id,
    train_data=ratings,
    svd_model=svd_model,
    similarity_matrix=similarity_matrix,
    movies=movies,
    top_n=10,
    alpha=0.7  # Adjust alpha to balance CF and CB contributions
)

# Display the recommendations
print(recommended_movies[['title', 'genres', 'score']])

                                                  title  \
690                           North by Northwest (1959)   
903   Good, the Bad and the Ugly, The (Buono, il bru...   
906                           Lawrence of Arabia (1962)   
924     Grand Day Out with Wallace and Gromit, A (1989)   
933                        Boot, Das (Boat, The) (1981)   
960                  Evil Dead II (Dead by Dawn) (1987)   
1494        Seven Samurai (Shichinin no samurai) (1954)   
4176                City of God (Cidade de Deus) (2002)   
7060                            Hurt Locker, The (2008)   
7214                             Sherlock Holmes (2009)   

                                          genres     score  
690    Action|Adventure|Mystery|Romance|Thriller  3.600033  
903                     Action|Adventure|Western  3.574812  
906                          Adventure|Drama|War  3.573116  
924   Adventure|Animation|Children|Comedy|Sci-Fi  3.572606  
933                             Action|Drama|

### Evaluate hybrid model