# Table of Contents
1. [Introduction](#introduction)
2. [Setup and Data Loading](#setup-and-data-loading)
3. [Data Preprocessing](#data-preprocessing)
4. [Sentiment Analysis](#sentiment-analysis)
5. [Collaborative Filtering](#collaborative-filtering)
6. [Content-Based Filtering](#content-based-filtering)
7. [Hybrid Recommendation System](#hybrid-recommendation-system)
8. [Results Compilation and Analysis](#results-compilation-and-analysis)
9. [Performance Metrics](#performance-metrics)
10. [Conclusions and Future Work](#conclusions-and-future-work)


# Introduction
This Jupyter Notebook documents the process of building a hybrid movie recommendation system. The system integrates collaborative and content-based filtering approaches, enhanced with sentiment analysis and natural language processing. Our objective is to explore various recommendation strategies and evaluate their performance based on a sample dataset.


In [1]:
# Setup and Data Loading
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import seaborn as sns
from surprise import Reader, Dataset, SVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
import gc  # Garbage Collector interface

# Load the data
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')


# Data Preprocessing
In this section, we perform initial data cleaning, handle missing values, and prepare the data for further analysis.


In [2]:
# Combine datasets for further analysis if needed
combined_data = pd.merge(movies, ratings, on='movieId')

## Content-Based

In [3]:
# Define function for genre-based recommendations
def content_based_recommendations_genres(movie_index, cosine_sim, n=10):
    sim_scores = list(enumerate(cosine_sim[movie_index]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:n+1]  # Exclude self
    movie_indices = [i[0] for i in sim_scores]
    return movies.iloc[movie_indices][['title', 'genres']]

In [4]:
# Convert genres to TF-IDF features and calculate cosine similarity matrix
tfidf_vectorizer = TfidfVectorizer(token_pattern='(?u)\\b\\w+\\b')
tfidf_matrix = tfidf_vectorizer.fit_transform(movies['genres'])
cosine_sim_genres = cosine_similarity(tfidf_matrix, tfidf_matrix)

MemoryError: Unable to allocate 29.0 GiB for an array with shape (62423, 62423) and data type float64

In [None]:
# Define function for NER-based recommendations
nlp = spacy.load('en_core_web_sm')  # Load SpaCy English model
def extract_named_entities(title):
    doc = nlp(title)
    return ' '.join([ent.text for ent in doc.ents])

In [None]:
# Apply NER function to the movies dataset
temp_movies = movies.copy()
temp_movies['title_entities'] = temp_movies['title'].apply(extract_named_entities)

In [None]:
# Convert named entities to TF-IDF features and calculate cosine similarity matrix
tfidf_vectorizer_entities = TfidfVectorizer()
tfidf_matrix_entities = tfidf_vectorizer_entities.fit_transform(temp_movies['title_entities'])
cosine_sim_entities = cosine_similarity(tfidf_matrix_entities, tfidf_matrix_entities)


In [None]:
# Define function to generate recommendations based on NER
def content_based_recommendations_entities(movie_index, cosine_sim, n=10):
    sim_scores = list(enumerate(cosine_sim[movie_index]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:n+1]  # Exclude self
    movie_indices = [i[0] for i in sim_scores]
    return temp_movies.iloc[movie_indices][['title', 'title_entities']]

In [None]:
# Function to combine genre and NER recommendations with weights
def weighted_combined_recommendations(movie_index, cosine_sim_genres, cosine_sim_ner, movies, temp_movies, weight_genre=0.8, weight_ner=0.2, n=10):
    # Genre-based recommendations
    genre_indices = content_based_recommendations_genres(movie_index, cosine_sim_genres, n*2).index
    
    # NER-based recommendations
    ner_indices = content_based_recommendations_entities(movie_index, cosine_sim_ner, n).index
    
    # Initialize dictionary to keep track of scores
    combined_scores = {}
    
    # Assign scores based on genre similarity
    for idx in genre_indices:
        combined_scores[idx] = combined_scores.get(idx, 0) + weight_genre
    
    # Update scores based on NER similarity
    for idx in ner_indices:
        if idx in combined_scores:
            # If the movie is already recommended by genre, enhance its score
            combined_scores[idx] += weight_ner
        else:
            # If not, add it with the NER weight
            combined_scores[idx] = weight_ner
    
    # Sort movies by combined score
    sorted_scores = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)
    
    # Select top n movies
    top_indices = [idx for idx, score in sorted_scores[:n]]
    
    # Fetch top movies from the original movies DataFrame
    top_movies = movies.loc[top_indices]
    
    return top_movies[['movieId', 'title', 'genres']]


In [None]:
movie_index = 0  # Index of a sample movie

# Recommendations based on genres
genre_recommendations = content_based_recommendations_genres(movie_index, cosine_sim_genres)
print("Recommendations based on genres:")
print(genre_recommendations)

In [None]:
movie_index = 0  # Index of a sample movie from the original 'movies' DataFrame

# Recommendations based on NER (using the temporary dataset)
ner_recommendations = content_based_recommendations_entities(movie_index, cosine_sim_entities)
print("Recommendations based on NER extracted from titles:")
print(ner_recommendations)

In [None]:
movie_index = 0  # Index of a sample movie from the original 'movies' DataFrame

# Generate weighted combined recommendations
weighted_recommendations = weighted_combined_recommendations(movie_index, cosine_sim_genres, cosine_sim_entities, movies, temp_movies, n=10)
print("Weighted combined recommendations (80% genre, 20% NER):")
for index, row in weighted_recommendations.iterrows():
    print(f"{row['title']} - Genres: {row['genres']}")

Collaborative Filering with Seniment Analysis included

In [None]:
# Define sentiment categorization function
def categorize_sentiment(rating):
    if rating >= 4.0:
        return 'positive'
    elif rating <= 2.0:
        return 'negative'
    else:
        return 'neutral'

In [None]:
# Apply sentiment categorization to ratings
ratings['sentiment'] = ratings['rating'].apply(categorize_sentiment)

In [None]:
# Load the Surprise dataset for collaborative filtering
reader = Reader(rating_scale=(0.5, 5))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

In [None]:
# Train the collaborative filtering model using SVD
trainset = data.build_full_trainset()
algo = SVD()
algo.fit(trainset)

In [None]:
# Define the SVD recommendation function
def svd_recommendations(user_id, movies, algo, n=10):
    predictions = []
    for movie_id in movies['movieId'].unique():
        prediction = algo.predict(user_id, movie_id)
        predictions.append((movie_id, prediction.est))
    predictions.sort(key=lambda x: x[1], reverse=True)
    top_movie_ids = [movie_id for movie_id, _ in predictions[:n]]
    top_recommendations = [(movies[movies['movieId'] == movie_id]['title'].iloc[0], score) for movie_id, score in predictions[:n]]
    return top_recommendations

# Define a function to train the RandomForestClassifier for sentiment analysis
def train_classifier_in_batches(data, batch_size=10000):
    classifier = RandomForestClassifier(n_estimators=50, max_depth=10, random_state=42)
    for start in range(0, data.shape[0], batch_size):
        end = start + batch_size
        batch_data = data.iloc[start:end]
        classifier.fit(batch_data[['userId', 'movieId']], batch_data['sentiment'])
    return classifier

In [None]:
# Train the classifier in batches
classifier = train_classifier_in_batches(ratings[['userId', 'movieId', 'sentiment']])

In [None]:
# Hybrid Recommendations Function
def hybrid_recommendation_with_sentiment(user_id, movies, algo, classifier, n=10):
    svd_predictions = []
    for movie_id in movies['movieId'].unique():
        prediction = algo.predict(uid=str(user_id), iid=str(movie_id))
        svd_predictions.append((movie_id, prediction.est))
    svd_predictions.sort(key=lambda x: x[1], reverse=True)
    top_svd_movie_ids = [movie_id for movie_id, _ in svd_predictions[:n]]

    adjusted_scores = []
    for movie_id in top_svd_movie_ids:
        predicted_sentiment = classifier.predict([[user_id, movie_id]])[0]
        original_score = next(score for mid, score in svd_predictions if mid == movie_id)
        adjusted_score = original_score + 0.1 if predicted_sentiment == 'positive' else original_score - 0.1
        adjusted_scores.append((movie_id, adjusted_score))

    adjusted_scores.sort(key=lambda x: x[1], reverse=True)
    top_recommendations = [(movies[movies['movieId'] == movie_id]['title'].iloc[0], score) for movie_id, score in adjusted_scores[:n]]
    return top_recommendations



In [None]:
# Example usage
user_id = random.randint(0, 7000)  # Random user ID for demonstration

# SVD Recommendations
top_svd_recommendations = svd_recommendations(user_id, movies, algo, n=10)
print("Top recommendations based on SVD:")
for movie_title, estimated_rating in top_svd_recommendations:
    print(f"Movie Title: {movie_title}, Estimated Rating: {estimated_rating}")

In [None]:
# # Example usage
# user_id = random.randint(65001, 65011)  # Random user ID for demonstration

# Hybrid Recommendations
top_hybrid_recommendations = hybrid_recommendation_with_sentiment(user_id, movies, algo, classifier, n=10)
print("\nTop recommendations based on Hybrid model with sentiment adjustment:")
for movie_title, score in top_hybrid_recommendations:
    print(f"Movie Title: {movie_title}, Score: {score}")


In [None]:
#experiment to see if random sentiment score included to SVD. Under preview and can be ignored 
def hybrid_recommendation_with_sentiment1(user_id, movies, algo, n=10):
    # Generate SVD recommendations
    svd_predictions = svd_recommendations(user_id, movies, algo, n)

    # Adjust recommendations based on sentiment
    adjusted_recommendations = []
    for movie_id, svd_score in svd_predictions:
        # Simulate sentiment score retrieval
        sentiment_score = np.random.rand()
        sentiment_weight = 0.8 + (sentiment_score * 0.4)
        adjusted_score = svd_score * sentiment_weight
        adjusted_recommendations.append((movie_id, adjusted_score))

    # Sort by adjusted scores and select top N recommendations
    adjusted_recommendations.sort(key=lambda x: x[1], reverse=True)
    top_adjusted_recommendations = adjusted_recommendations[:n]

#     # Fetch movie titles for the top recommendations
#     top_recommendations = []
#     for movie_id, score in top_adjusted_recommendations:
#         filtered_movies = movies[movies['movieId'] == movie_id]
#         if not filtered_movies.empty:
#             movie_title = filtered_movies['title'].iloc[0]
#             top_recommendations.append((movie_title, score))

    return top_adjusted_recommendations



In [None]:
# # Example usage
# user_id = random.randint(1, 7000)  # Assuming user IDs are between 1 and 610
top_hybrid_recommendations = hybrid_recommendation_with_sentiment1(user_id, movies, algo, n=10)

print("Top recommendations based on Hybrid model with sentiment adjustment:")
for movie_title, score in top_hybrid_recommendations:
    print(f"Movie Title: {movie_title}, Adjusted Score: {score:.2f}")

In [None]:
# Function to compile all recommendation results
def compile_recommendation_results(user_id, movie_idx, movies, algo, classifier, cosine_sim_genres, cosine_sim_entities, n=10):
    results = pd.DataFrame(index=range(n), columns=[
         'Genre_Based', 'NER_Based', 'Weighted_Genre_NER', 'Final_Hybrid','SVD', 'SVD_Sentiment'
    ])
    

    
    # Genre-Based Recommendations
    genre_based_results = content_based_recommendations_genres(movie_idx, cosine_sim_genres, n)
    results['Genre_Based'] = genre_based_results['title'].tolist()
    
    # NER-Based Recommendations
    ner_based_results = content_based_recommendations_entities(movie_idx, cosine_sim_entities, n)
    results['NER_Based'] = ner_based_results['title'].tolist()
    
    # Weighted Genre + NER Recommendations
    weighted_genre_ner_results = weighted_combined_recommendations(movie_idx, cosine_sim_genres, cosine_sim_entities, movies, temp_movies, n=n)
    results['Weighted_Genre_NER'] = weighted_genre_ner_results['title'].tolist()
    
    # SVD Recommendations
    svd_results = svd_recommendations(user_id, movies, algo, n)
    results['SVD'] = [r[0] for r in svd_results]
    
    # SVD with Sentiment Adjustments
    svd_sentiment_results = hybrid_recommendation_with_sentiment(user_id, movies, algo, classifier, n)
    results['SVD_Sentiment'] = [r[0] for r in svd_sentiment_results]
    
    # Final Hybrid Recommendations
    # This is a conceptual step that could potentially be based on a more complex logic
    # For simplicity, we'll take an average of the SVD scores and the content-based scores for each movie
    final_hybrid_scores = {movie_id: (svd_score + content_score) / 2
                           for movie_id, svd_score, content_score in zip(
                               weighted_genre_ner_results['movieId'],
                               [r[1] for r in svd_results],
                               range(1, n+1)  # Mock scores for content-based recommendations
                           )}
    # Sort movies by the final hybrid score
    sorted_final_hybrid = sorted(final_hybrid_scores.items(), key=lambda x: x[1], reverse=True)
    # Fetch the top n recommendations
    final_hybrid_top_n = [movies[movies['movieId'] == movie_id]['title'].iloc[0] for movie_id, _ in sorted_final_hybrid][:n]
    results['Final_Hybrid'] = final_hybrid_top_n
    
    return results



In [None]:
# Example usage
user_id = random.randint(0, 7400)  # Random user ID for demonstration
movie_idx = 0  # Index for "Toy Story" in the movies DataFrame

# Assuming cosine_sim_genres and cosine_sim_entities are already defined from Part 1
recommendation_results = compile_recommendation_results(user_id, movie_idx, movies, algo, classifier, cosine_sim_genres, cosine_sim_entities, n=10)
recommendation_results.head(10)