### Import and Data Loading

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import re


In [None]:
# Load the movies dataset
movies = pd.read_csv('movies.csv')
print("Movies dataset preview:")
# Actually show the first rows announced by the message above.
print(movies.head())


In [None]:
# Load the ratings dataset
ratings = pd.read_csv('ratings.csv')
print("Ratings dataset preview:")
# Actually show the first rows announced by the message above.
print(ratings.head())


### Data Preprocessing

In [None]:
# Extract release year from 'title'
def extract_year(title):
    """Return the first four-digit year written in parentheses in *title*.

    Args:
        title: Movie title, e.g. ``'Toy Story (1995)'``.

    Returns:
        The year as an ``int``, or ``np.nan`` when *title* is not a string
        or contains no ``(YYYY)`` group.
    """
    if not isinstance(title, str):
        # Missing titles reach us from pandas as float NaN (or None);
        # re.search would raise TypeError on them.
        return np.nan
    year_match = re.search(r'\((\d{4})\)', title)
    return int(year_match.group(1)) if year_match else np.nan

movies['release_year'] = movies['title'].apply(extract_year)
# Drop titles without a parsable year. The explicit .copy() makes the
# filtered frame independent, so the column assignment below does not
# trigger pandas' SettingWithCopyWarning.
movies = movies.dropna(subset=['release_year']).copy()
# With the NaN rows gone the column can be stored as integers.
movies['release_year'] = movies['release_year'].astype(int)


In [None]:
# Flag recent movies: is_new is 1 when release_year is 2015 or later, else 0
movies['is_new'] = movies['release_year'].ge(2015).astype('int64')


In [None]:
# Process genres: turn the '|'-delimited string into a list of genre names
movies['genres'] = movies['genres'].str.split('|')

# Collect the unique genres across all movies. Sorted so the list (and the
# one-hot columns later created from it) comes out in the same order on
# every run; plain set iteration order changes with string hash randomization.
genre_list = sorted({genre for genres in movies['genres'] for genre in genres})


In [None]:
# One-hot encode genres: add one 0/1 indicator column per genre name
for genre in genre_list:
    movies[genre] = [int(genre in movie_genres) for movie_genres in movies['genres']]


In [None]:
# Attach each rating to its movie's metadata (default inner join on movieId)
data = ratings.merge(movies, on='movieId')

# Lookup table from movie title to movieId
title_to_movieId = dict(zip(movies['title'], movies['movieId']))


### Building the Item Similarity Matrix

In [None]:
# Build the rating matrix: one row per movieId, one column per userId.
# (movie, user) pairs without a rating are filled with 0.
user_item_matrix = (
    data
    .pivot_table(values='rating', index='movieId', columns='userId')
    .fillna(0)
)


In [None]:
# Cosine similarity between every pair of rows (movies) of the rating matrix
item_similarity = cosine_similarity(user_item_matrix)

# Label both axes with movieId so similarities can be looked up by id
item_similarity_df = pd.DataFrame(
    data=item_similarity,
    index=user_item_matrix.index,
    columns=user_item_matrix.index,
)


### Recommendation System

In [None]:
def get_recommendations(movies_seen, preferred_genres=None, preferred_age='New', num_recommendations=10):
    """Recommend unseen movies most similar to the ones already seen.

    Uses the module-level ``movies`` DataFrame, the ``item_similarity_df``
    movie-by-movie similarity matrix and ``genre_list``.

    Args:
        movies_seen: Collection of titles, spelled exactly as in
            ``movies['title']``, that the user has already watched.
        preferred_genres: Optional list of genre names. Names not present in
            ``genre_list`` are ignored; a movie is kept when it belongs to at
            least one of the remaining genres. ``None`` or an empty list
            disables the genre filter.
        preferred_age: ``'New'`` keeps movies with ``is_new == 1``, ``'Old'``
            keeps movies with ``is_new == 0``; any other value applies no
            age filter.
        num_recommendations: Maximum number of rows to return.

    Returns:
        DataFrame with columns ``title``, ``genres``, ``release_year`` and
        ``similarity_score``, sorted by descending score. An empty
        ``pd.DataFrame()`` is returned (after printing a message) when no
        entered title is found, when none of the found movies has a row in
        the similarity matrix, or when no preferred genre is known.
    """
    # Validate movies_seen
    valid_movies = movies[movies['title'].isin(movies_seen)]
    if valid_movies.empty:
        print("None of the movies you entered were found in the dataset.")
        return pd.DataFrame()

    # Get the movieIds of the movies seen
    movies_seen_ids = valid_movies['movieId'].values
    print(f"\nMovies found in dataset: {valid_movies['title'].tolist()}")

    # A movie can exist in `movies` without appearing in the similarity
    # matrix (the matrix only covers movies that have ratings). Looking such
    # an id up with .loc would raise KeyError, so keep only the known ids.
    has_similarity = valid_movies['movieId'].isin(item_similarity_df.index)
    rated_seen_ids = valid_movies.loc[has_similarity, 'movieId'].values
    if len(rated_seen_ids) == 0:
        print("None of the movies you entered have ratings, so no similarity scores are available.")
        return pd.DataFrame()

    # Average, per candidate movie, its similarity to each seen movie
    similarity_scores = item_similarity_df.loc[rated_seen_ids].mean(axis=0)

    # Create a DataFrame with similarity scores
    similarity_df = pd.DataFrame({
        'movieId': similarity_scores.index,
        'similarity_score': similarity_scores.values
    })

    # Exclude movies already seen (once is enough: the inner merge below
    # cannot bring them back)
    similarity_df = similarity_df[~similarity_df['movieId'].isin(movies_seen_ids)]

    # Merge with movies DataFrame to attach title, genres and feature columns
    recommendations = pd.merge(similarity_df, movies.drop_duplicates('movieId'), on='movieId')

    # Filter based on preferred genres
    if preferred_genres:
        # Ensure genres are in the genre list
        preferred_genres = [genre for genre in preferred_genres if genre in genre_list]
        if not preferred_genres:
            print("None of the preferred genres are available in the dataset.")
            return pd.DataFrame()
        # Keep movies that have at least one of the preferred genre flags set
        genre_filter = recommendations[preferred_genres].sum(axis=1) > 0
        recommendations = recommendations[genre_filter]

    # Filter based on 'is_new' feature
    if preferred_age == 'New':
        recommendations = recommendations[recommendations['is_new'] == 1]
    elif preferred_age == 'Old':
        recommendations = recommendations[recommendations['is_new'] == 0]

    # Sort by similarity score
    recommendations = recommendations.sort_values(by='similarity_score', ascending=False)

    # Return top N recommendations
    return recommendations[['title', 'genres', 'release_year', 'similarity_score']].head(num_recommendations)


### User Input Placeholders

In [None]:
# Titles of the movies you have already seen.
# Edit this list; each title must match the dataset's 'title' column exactly.
movies_seen = [
    'Toy Story (1995)',
    'Jumanji (1995)',
    # Add more movie titles as needed
]

# Check which of the entered titles actually exist in the dataset
valid_movies = movies[movies['title'].isin(movies_seen)]
if not valid_movies.empty:
    print(f"\nMovies found in dataset: {valid_movies['title'].tolist()}")
else:
    print("None of the movies you entered were found in the dataset.")


In [None]:
# Placeholder for preferred genres
# Replace with the genres you like, or set to None to skip genre filtering.
# This name is read by the recommendation and genre-coverage cells below;
# without it those cells fail with a NameError.
preferred_genres = ['Adventure', 'Children', 'Comedy']
print(f"\nPreferred genres: {preferred_genres}")

# Placeholder for preferred age of movies
# Set to 'New' or 'Old'
preferred_age = 'New'  # 'New' or 'Old'
print(f"\nPreferred age of movies: {preferred_age}")


### Generating Recommendations

In [None]:
# Upper bound on how many recommendations to request
num_recommendations = 10

# Run the recommender with the user inputs
recommendations = get_recommendations(
    movies_seen,
    preferred_genres=preferred_genres,
    preferred_age=preferred_age,
    num_recommendations=num_recommendations,
)

# Show the result table, or explain that nothing matched
if not recommendations.empty:
    print(f"\nTop {num_recommendations} movie recommendations based on your preferences:")
    display(recommendations)
else:
    print("\nNo recommendations could be generated based on the inputs provided.")


### Testing

In [None]:
# Display similarity scores for the top recommendations
if recommendations.empty:
    # get_recommendations returns a column-less DataFrame when it finds
    # nothing; selecting columns from it would raise KeyError.
    print("\nNo recommendations available, so there are no similarity scores to show.")
else:
    print("\nSimilarity scores for the top recommendations:")
    display(recommendations[['title', 'similarity_score']])


In [None]:
# Plot the distribution of similarity scores
import matplotlib.pyplot as plt
import seaborn as sns

if recommendations.empty:
    # An empty result has no 'similarity_score' column to plot
    # (get_recommendations returns a column-less DataFrame in that case).
    print("\nNo recommendations available, so there is nothing to plot.")
else:
    plt.figure(figsize=(10, 6))
    sns.histplot(recommendations['similarity_score'], bins=20, kde=True)
    plt.title('Distribution of Similarity Scores')
    plt.xlabel('Similarity Score')
    plt.ylabel('Frequency')
    plt.show()


In [None]:
# Evaluate the coverage of preferred genres in the recommendations
if recommendations.empty or not preferred_genres:
    print("\nGenre coverage cannot be computed: no recommendations or no preferred genres.")
else:
    # The frame returned by get_recommendations only has the 'genres' list
    # column (no one-hot genre columns), so count genres from those lists:
    # explode to one genre per row, count, then report the preferred ones
    # (0 for a preferred genre that never appears).
    genre_counts = (
        recommendations['genres']
        .explode()
        .value_counts()
        .reindex(preferred_genres, fill_value=0)
    )
    print("\nGenre coverage in recommendations:")
    print(genre_counts)
