## System 1

In [8]:
import pandas as pd

# Load datasets
ratings_path = 'ratings.dat'  # Path to ratings.dat
movies_path = 'movies.dat'  # Path to movies.dat

# Read the datasets with proper encoding and separator
ratings_df = pd.read_csv(ratings_path, sep='::', engine='python', header=None, encoding='latin-1')
movies_df = pd.read_csv(movies_path, sep='::', engine='python', header=None, encoding='latin-1')

# Assign proper column names
ratings_df.columns = ['UserID', 'MovieID', 'Rating', 'Timestamp']
movies_df.columns = ['MovieID', 'Title', 'Genres']

# Step 1: Compute Popularity Metrics
# Aggregate the number of ratings and average rating for each movie
movie_stats = ratings_df.groupby('MovieID').agg(
    num_ratings=('Rating', 'count'),
    avg_rating=('Rating', 'mean')
).reset_index()

# Step 2: Merge with Movie Titles
movie_stats = movie_stats.merge(movies_df[['MovieID', 'Title']], on='MovieID')

# Step 3: Sort by Popularity (number of ratings and average rating)
popular_movies = movie_stats.sort_values(by=['num_ratings', 'avg_rating'], ascending=False)

# Step 4: Select the Top 10 Movies
top_10_movies = popular_movies.head(10)

# Step 5: Save Results to an HTML File
output_html = top_10_movies[['MovieID', 'Title', 'num_ratings', 'avg_rating']].to_html(index=False)
with open("top_10_popular_movies.html", "w") as f:
    f.write(output_html)

# Print the top 10 movies
print("Top 10 Most Popular Movies:")
print(top_10_movies[['MovieID', 'Title', 'num_ratings', 'avg_rating']])


Top 10 Most Popular Movies:
      MovieID                                              Title  num_ratings  \
2651     2858                             American Beauty (1999)         3428   
253       260          Star Wars: Episode IV - A New Hope (1977)         2991   
1106     1196  Star Wars: Episode V - The Empire Strikes Back...         2990   
1120     1210  Star Wars: Episode VI - Return of the Jedi (1983)         2883   
466       480                               Jurassic Park (1993)         2672   
1848     2028                         Saving Private Ryan (1998)         2653   
575       589                  Terminator 2: Judgment Day (1991)         2649   
2374     2571                                 Matrix, The (1999)         2590   
1178     1270                          Back to the Future (1985)         2583   
579       593                   Silence of the Lambs, The (1991)         2578   

      avg_rating  
2651    4.317386  
253     4.453694  
1106    4.292977  
1120

## System 2

In [9]:
import numpy as np
import pandas as pd

# Step 1: Load and Preprocess the Data
ratings_path = 'ratings.dat'  # Path to ratings.dat
movies_path = 'movies.dat'  # Path to movies.dat

ratings_df = pd.read_csv(ratings_path, sep='::', engine='python', header=None, encoding='latin-1')
ratings_df.columns = ['UserID', 'MovieID', 'Rating', 'Timestamp']

# Create a pivot table (rating matrix) where rows are users and columns are movies
rating_matrix = ratings_df.pivot(index='UserID', columns='MovieID', values='Rating')

# Step 2: Normalize the Rating Matrix
# Subtract the mean of each user's ratings (ignoring NaNs)
rating_matrix_centered = rating_matrix.sub(rating_matrix.mean(axis=1), axis=0)

# Step 3: Compute Cosine Similarity
# Define a function to calculate cosine similarity
def cosine_similarity(matrix):
    norm = np.sqrt(np.nansum(matrix**2, axis=0))  # Compute norms for each column
    similarity = np.dot(matrix.T.fillna(0), matrix.fillna(0)) / (norm[:, None] * norm[None, :])
    return similarity

similarity_matrix = cosine_similarity(rating_matrix_centered)

# Step 4: Filter Top 30 Similarities
# For each movie, keep only the top 30 most similar movies
def filter_top_k(similarity_matrix, k=30):
    filtered_matrix = np.zeros_like(similarity_matrix)
    for i in range(similarity_matrix.shape[0]):
        top_k_indices = np.argsort(-similarity_matrix[i, :])[:k]
        filtered_matrix[i, top_k_indices] = similarity_matrix[i, top_k_indices]
    return filtered_matrix

top_k_similarity_matrix = filter_top_k(similarity_matrix)

# Step 5: Define the Recommendation Function
def myIBCF(new_user_ratings, top_k_similarity_matrix, rating_matrix, k=30):
    """
    Input:
        - new_user_ratings: A 1D array of the new user's ratings (aligned to movie IDs).
        - top_k_similarity_matrix: Precomputed similarity matrix with top-k filtering.
        - rating_matrix: Original rating matrix.
        - k: Number of similar movies to consider.
    Output:
        - A DataFrame of top 10 recommended movies.
    """
    weighted_sum = np.nansum(top_k_similarity_matrix * new_user_ratings[:, None], axis=0)
    normalization = np.nansum((top_k_similarity_matrix != 0) * ~np.isnan(new_user_ratings[:, None]), axis=0)
    predicted_ratings = weighted_sum / normalization

    # Sort and select top 10 recommendations
    recommended_indices = np.argsort(-predicted_ratings)[:10]
    recommendations = pd.DataFrame({
        'MovieID': recommended_indices + 1,  # Adjust index to MovieID
        'PredictedRating': predicted_ratings[recommended_indices]
    })
    return recommendations

# Test Case: Use a sample new user
new_user_ratings = np.zeros(rating_matrix.shape[1])  # Initialize with zeros (no ratings yet)
new_user_ratings[1609] = 5  # User rated MovieID 1610 with 5
new_user_ratings[1754] = 4  # User rated MovieID 1755 with 4

# Get recommendations for the new user
recommendations = myIBCF(new_user_ratings, top_k_similarity_matrix, rating_matrix)
print(recommendations)


   MovieID  PredictedRating
0     1610         0.161290
1     3613         0.087706
2     1235         0.073509
3     1755         0.067797
4     2140         0.047921
5     2252         0.045485
6     2395         0.044205
7     2552         0.043269
8      239         0.037381
9      113         0.036582


In [10]:
user_u1181_ratings = rating_matrix.loc[1181].fillna(0).values  # Replace NaN with 0 for unrated movies
recommendations_u1181 = myIBCF(user_u1181_ratings, top_k_similarity_matrix, rating_matrix)
print(recommendations_u1181)


   MovieID  PredictedRating
0     2902         2.000000
1      146         2.000000
2     3628         2.000000
3     3444         1.609998
4     2421         1.572573
5     2651         1.430507
6     2087         1.301591
7      404         1.188671
8     3531         1.173817
9     2739         1.167772


## App