In [45]:
# Import Libraries
import pandas as pd
import numpy as np
from surprise import SVD, Dataset, Reader
from sklearn.preprocessing import MultiLabelBinarizer

In [46]:
# Load the Datasets
movies = pd.read_csv('movies.csv')
# Rows limited due to training size
ratings = pd.read_csv('ratings.csv', nrows=10000000)

In [47]:
# Preprocess the Movies Data
movies['genres'] = movies['genres'].apply(lambda x: x.split('|') if x != '(no genres listed)' else [])


In [48]:
# One-Hot Encode Genres
mlb = MultiLabelBinarizer()
genre_dummies = pd.DataFrame(mlb.fit_transform(movies['genres']), columns=mlb.classes_, index=movies.index)
movies = pd.concat([movies, genre_dummies], axis=1)


In [49]:
# Calculate Number of Ratings and Average Rating per Movie
movie_stats = ratings.groupby('movieId').agg(
    num_ratings=('rating', 'count'),
    avg_rating=('rating', 'mean')
).reset_index()

In [50]:
# Merge with Movies Data
movies = pd.merge(movies, movie_stats, on='movieId', how='left')


In [51]:
# Fill NaN Values with 0
movies[['num_ratings', 'avg_rating']] = movies[['num_ratings', 'avg_rating']].fillna(0)


In [52]:
# Calculate Weighted Rating Using IMDb Formula
C = movies['avg_rating'].mean()
m = movies['num_ratings'].quantile(0.75)

def weighted_rating(x, m=m, C=C):
    v = x['num_ratings']
    R = x['avg_rating']
    return (v / (v + m) * R) + (m / (v + m) * C)

movies['weighted_rating'] = movies.apply(weighted_rating, axis=1)


In [53]:
# Prepare Data for Surprise
ratings['userId'] = ratings['userId'].astype(str)
ratings['movieId'] = ratings['movieId'].astype(str)

In [54]:
# Prepare Data for Surprise
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

In [55]:
# Build the Trainset
trainset = data.build_full_trainset()

# Build and Train the SVD Model
model = SVD()
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x165858eb140>

In [56]:
# Create a Utility Matrix for Content-Based Filtering
genre_features = movies[mlb.classes_]
genre_features = genre_features.astype(int)

In [57]:
# Function to Recommend Movies Without Retraining the Model
def recommend_movies(user_ratings, preferred_genres, top_n=5):
    # Get IDs of movies the user has rated
    rated_movie_ids = [movie['movieId'] for movie in user_ratings]
    
    # Get unrated movies
    unrated_movies = movies[~movies['movieId'].isin(rated_movie_ids)]
    
    # Filter movies by preferred genres
    if preferred_genres:
        genre_filter = unrated_movies[preferred_genres].any(axis=1)
        candidate_movies = unrated_movies[genre_filter]
    else:
        candidate_movies = unrated_movies.copy()
    
    # Predict ratings for candidate movies
    candidate_movie_ids = candidate_movies['movieId'].tolist()
    predictions = []
    for mid in candidate_movie_ids:
        # Since the user is new, we cannot use the collaborative filtering prediction directly
        # We'll use the movie's average rating as a baseline
        wr = candidate_movies.loc[candidate_movies['movieId'] == mid, 'weighted_rating'].values[0]
        predictions.append((mid, wr))
    
    # Adjust predictions based on content similarity
    # Build a profile for the user based on their rated movies
    user_genre_matrix = movies[movies['movieId'].isin(rated_movie_ids)][mlb.classes_]
    user_profile = user_genre_matrix.mean(axis=0)
    
    # Calculate similarity between user profile and candidate movies
    candidate_genre_matrix = candidate_movies[mlb.classes_]
    similarities = cosine_similarity([user_profile], candidate_genre_matrix)[0]
    
    # Combine weighted ratings and similarities
    final_scores = []
    for idx, (mid, wr) in enumerate(predictions):
        similarity = similarities[idx]
        final_score = (wr * 0.7) + (similarity * 0.3)
        final_scores.append((mid, final_score))
    
    # Get top N recommendations
    final_scores.sort(key=lambda x: x[1], reverse=True)
    top_recommendations = final_scores[:top_n]
    recommended_movie_ids = [mid for (mid, _) in top_recommendations]
    recommended_movies = movies[movies['movieId'].isin(recommended_movie_ids)]
    
    return recommended_movies[['title', 'genres', 'weighted_rating']]

# User Input: Movies You've Seen and Your Ratings
all_titles = movies['title'].tolist()

In [62]:
print("Please input movies you've seen and your rating for each (1-5). Type 'done' when finished.\n")

user_ratings = []
while True:
    title = input("Movie Title: ")
    if title.lower() == 'done':
        break
    if title not in all_titles:
        print("Movie not found. Please try again.")
        continue
    rating = input("Your Rating (1-5): ")
    try:
        rating = float(rating)
        if rating < 1 or rating > 5:
            print("Rating must be between 1 and 5.")
            continue
    except ValueError:
        print("Invalid rating. Please enter a number between 1 and 5.")
        continue
    movie_id = movies[movies['title'] == title]['movieId'].values[0]
    user_ratings.append({'movieId': movie_id, 'rating': rating})

# User Input: Preferred Genres
print("\nAvailable Genres:")
print(", ".join(mlb.classes_))

preferred_genres = input("\nEnter the genres you're interested in, separated by commas: ")
preferred_genres = [genre.strip() for genre in preferred_genres.split(',')]

# Validate Genres
for genre in preferred_genres.copy():
    if genre not in mlb.classes_:
        print(f"Genre '{genre}' not found. Please make sure you typed it correctly.")
        preferred_genres.remove(genre)

# Generate Recommendations Without Retraining the Model
recommended_movies = recommend_movies(user_ratings, preferred_genres, top_n=5)

# Display Recommendations
print("\nTop Movie Recommendations:")
for idx, row in recommended_movies.iterrows():
    print(f"{row['title']} ({', '.join(row['genres'])}) - Weighted Rating: {row['weighted_rating']:.2f}")

Please input movies you've seen and your rating for each (1-5). Type 'done' when finished.


Available Genres:
Action, Adventure, Animation, Children, Comedy, Crime, Documentary, Drama, Fantasy, Film-Noir, Horror, IMAX, Musical, Mystery, Romance, Sci-Fi, Thriller, War, Western

Top Movie Recommendations:
Pulp Fiction (1994) (Comedy, Crime, Drama, Thriller) - Weighted Rating: 4.20
Shawshank Redemption, The (1994) (Crime, Drama) - Weighted Rating: 4.41
Matrix, The (1999) (Action, Sci-Fi, Thriller) - Weighted Rating: 4.16
Band of Brothers (2001) (Action, Drama, War) - Weighted Rating: 4.39
Parasite (2019) (Comedy, Drama) - Weighted Rating: 4.30


In [60]:
# Evaluation Code: Cross-Validation and Metrics

from surprise import accuracy
from surprise.model_selection import cross_validate, KFold

# Prepare data for cross-validation
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# Use 5-fold cross-validation
kf = KFold(n_splits=5, random_state=42, shuffle=True)

# Initialize the SVD algorithm
algo = SVD()

# Perform cross-validation and collect results
print("Performing cross-validation...")

cv_results = cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=kf, verbose=True)

# Calculate Mean Squared Error (MSE) manually
mse_values = []

for trainset, testset in kf.split(data):
    # Train the algorithm on the trainset
    algo.fit(trainset)
    # Test the algorithm on the testset
    predictions = algo.test(testset)
    # Compute MSE
    mse = accuracy.mse(predictions, verbose=False)
    mse_values.append(mse)

print("\nCross-Validation Results:")
print(f"Average RMSE: {np.mean(cv_results['test_rmse']):.4f}")
print(f"Average MAE: {np.mean(cv_results['test_mae']):.4f}")
print(f"Average MSE: {np.mean(mse_values):.4f}")


Performing cross-validation...
Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.7834  0.7833  0.7839  0.7833  0.7822  0.7832  0.0006  
MAE (testset)     0.5914  0.5910  0.5915  0.5910  0.5906  0.5911  0.0003  
Fit time          104.18  106.29  114.48  99.64   112.40  107.40  5.42    
Test time         39.07   38.46   34.43   41.13   32.20   37.06   3.26    

Cross-Validation Results:
Average RMSE: 0.7832
Average MAE: 0.5911
Average MSE: 0.6133
