# 1. Data collection and Preprocessing

In [20]:
import numpy as np
from sklearn.model_selection import train_test_split


In [1]:
import pandas as pd
path_to_movieslens = "ml-1m"
path_to_imdb = ""



# Load ratings.dat
ratings = pd.read_csv(path_to_movieslens+'/ratings.dat', sep='::', engine='python',
                      names=['UserID', 'MovieID', 'Rating', 'Timestamp'])

# Load users.dat
users = pd.read_csv(path_to_movieslens+'/users.dat', sep='::', engine='python',
                    names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'])

# Load movies.dat
movies = pd.read_csv(path_to_movieslens+'/movies.dat', sep='::', engine='python',
                     names=['MovieID', 'Title', 'Genres'], encoding='ISO-8859-1')


title_basics = pd.read_csv(path_to_imdb+'title.basics.tsv', sep='\t', encoding='latin')
title_crew = pd.read_csv(path_to_imdb+'title.crew.tsv', sep='\t', encoding='latin')
title_principals = pd.read_csv(path_to_imdb+'title.principals.tsv', sep='\t', encoding='latin')
title_akas = pd.read_csv(path_to_imdb+'title.akas.tsv', sep='\t', encoding='latin')
name_basics = pd.read_csv(path_to_imdb+'name.basics.tsv', sep='\t', encoding='latin')

  title_basics = pd.read_csv(path_to_imdb+'title.basics.tsv', sep='\t', encoding='latin')


In [2]:
print(movies.head())

   MovieID                               Title                        Genres
0        1                    Toy Story (1995)   Animation|Children's|Comedy
1        2                      Jumanji (1995)  Adventure|Children's|Fantasy
2        3             Grumpier Old Men (1995)                Comedy|Romance
3        4            Waiting to Exhale (1995)                  Comedy|Drama
4        5  Father of the Bride Part II (1995)                        Comedy


In [3]:
print("Ratings Data:")
print(ratings.head())
print("\nUsers Data:")
print(users.head())
print("\nMovies Data:")
print(movies.head())



Ratings Data:
   UserID  MovieID  Rating  Timestamp
0       1     1193       5  978300760
1       1      661       3  978302109
2       1      914       3  978301968
3       1     3408       4  978300275
4       1     2355       5  978824291

Users Data:
   UserID Gender  Age  Occupation Zip-code
0       1      F    1          10    48067
1       2      M   56          16    70072
2       3      M   25          15    55117
3       4      M   45           7    02460
4       5      M   25          20    55455

Movies Data:
   MovieID                               Title                        Genres
0        1                    Toy Story (1995)   Animation|Children's|Comedy
1        2                      Jumanji (1995)  Adventure|Children's|Fantasy
2        3             Grumpier Old Men (1995)                Comedy|Romance
3        4            Waiting to Exhale (1995)                  Comedy|Drama
4        5  Father of the Bride Part II (1995)                        Comedy


# 2. Feature Engineering

In [4]:
directors = title_principals[title_principals['category'] == 'director'][['tconst', 'nconst']]
directors = directors.merge(name_basics[['nconst', 'primaryName']], on='nconst', how='left')
# Replace NaN values with empty strings
directors['primaryName'] = directors['primaryName'].fillna('')

# Group by tconst and join the primaryName values
directors = directors.groupby('tconst')['primaryName'].apply(lambda x: ', '.join(x)).reset_index()
directors.columns = ['tconst', 'Directors']

In [5]:
print("Directors Data:")
print(directors.head())

Directors Data:
      tconst                            Directors
0  tt0000001                 William K.L. Dickson
1  tt0000002                       Ãmile Reynaud
2  tt0000003                       Ãmile Reynaud
3  tt0000004                       Ãmile Reynaud
4  tt0000007  William K.L. Dickson, William Heise


In [6]:
# Extract cast
cast = title_principals[title_principals['category'].isin(['actor', 'actress'])][['tconst', 'nconst']]
cast = cast.merge(name_basics[['nconst', 'primaryName']], on='nconst', how='left')

# Replace NaN values with empty strings
cast['primaryName'] = cast['primaryName'].fillna('')

# Group by tconst and join the primaryName values
cast = cast.groupby('tconst')['primaryName'].apply(lambda x: ', '.join(x)).reset_index()
cast.columns = ['tconst', 'Cast']

In [7]:
print("Cast Data:")
print(cast.head())

Cast Data:
      tconst                                               Cast
0  tt0000005                           Charles Kayser, John Ott
1  tt0000007                   James J. Corbett, Peter Courtney
2  tt0000008                                           Fred Ott
3  tt0000009  Blanche Bayliss, William Courtenay, Chauncey D...
4  tt0000011                                            Grunato


In [8]:
# Normalize titles for merging
title_basics['primaryTitle'] = title_basics['primaryTitle'].str.lower().str.strip()
movies['Title'] = movies['Title'].str.lower().str.strip()

In [9]:
# Merge IMDb movies with directors and cast
imdb_movies = title_basics[['tconst', 'primaryTitle']]
imdb_movies = imdb_movies.merge(directors, on='tconst', how='left')
imdb_movies = imdb_movies.merge(cast, on='tconst', how='left')

In [10]:
print(movies.head())

   MovieID                               Title                        Genres
0        1                    toy story (1995)   Animation|Children's|Comedy
1        2                      jumanji (1995)  Adventure|Children's|Fantasy
2        3             grumpier old men (1995)                Comedy|Romance
3        4            waiting to exhale (1995)                  Comedy|Drama
4        5  father of the bride part ii (1995)                        Comedy


In [11]:
movies = pd.read_csv(path_to_movieslens+'/movies.dat', sep='::', engine='python',
                     names=['MovieID', 'Title', 'Genres'], encoding='ISO-8859-1')

movies = movies.merge(imdb_movies, left_on='Title', right_on='primaryTitle', how='left')

In [12]:
# Merge ratings with movies and users
movielens_data = ratings.merge(movies, on='MovieID', how='left')
movielens_data = movielens_data.merge(users, on='UserID', how='left')

In [13]:
# One-Hot Encode Genres
genres = movielens_data['Genres'].str.get_dummies('|')
movielens_data = pd.concat([movielens_data, genres], axis=1)

In [14]:
final_merged_data = movielens_data[['UserID', 'MovieID', 'Rating', 'Directors', 'Cast'] + genres.columns.tolist()]

In [15]:
# Remove duplicate entries by averaging the ratings for each (UserID, MovieID) pair
unique_ratings = final_merged_data.groupby(['UserID', 'MovieID'])['Rating'].mean().reset_index()

In [16]:

# Create the user-item interaction matrix
user_item_matrix = unique_ratings.pivot(index='UserID', columns='MovieID', values='Rating')

print(user_item_matrix)

MovieID  1     2     3     4     5     6     7     8     9     10    ...  \
UserID                                                               ...   
1         5.0   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
2         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
3         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
4         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
5         NaN   NaN   NaN   NaN   NaN   2.0   NaN   NaN   NaN   NaN  ...   
...       ...   ...   ...   ...   ...   ...   ...   ...   ...   ...  ...   
6036      NaN   NaN   NaN   2.0   NaN   3.0   NaN   NaN   NaN   NaN  ...   
6037      NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
6038      NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
6039      NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
6040      3.0   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   

MovieID  39

In [17]:
def combine_user_data(user_id_1, user_id_2, ratings, user_item_matrix, movielens_data, genres):
    # Extract ratings for both users
    user1_ratings = ratings[ratings['UserID'] == user_id_1]
    user2_ratings = ratings[ratings['UserID'] == user_id_2]
    
    # Calculate the average rating for each user
    user1_avg_rating = user1_ratings['Rating'].mean()
    user2_avg_rating = user2_ratings['Rating'].mean()
    
    # Merge ratings to combine them
    combined_ratings = pd.merge(user1_ratings, user2_ratings, on='MovieID', how='outer', suffixes=('_user1', '_user2'))
    
    # Fill NaNs with the user's average rating
    combined_ratings['Rating_user1'] = combined_ratings['Rating_user1'].fillna(user1_avg_rating)
    combined_ratings['Rating_user2'] = combined_ratings['Rating_user2'].fillna(user2_avg_rating)
    
    # Calculate combined ratings
    combined_ratings['Combined_Rating'] = (combined_ratings['Rating_user1'] + combined_ratings['Rating_user2']) / 2
    
    # Create a combined user-item interaction matrix row
    combined_user_row = combined_ratings[['MovieID', 'Combined_Rating']].set_index('MovieID').T
    
    # Concatenate the combined row to the original user-item matrix
    combined_user_row.index = ['combined']
    user_item_matrix_combined = pd.concat([user_item_matrix, combined_user_row])
    
    # Combine genre preferences
    user1_genres = movielens_data[movielens_data['UserID'] == user_id_1][genres.columns].sum()
    user2_genres = movielens_data[movielens_data['UserID'] == user_id_2][genres.columns].sum()
    combined_genres = (user1_genres + user2_genres) / 2
    
    # Combine directors and cast preferences (for simplicity, we'll just concatenate the lists and remove duplicates)
    user1_movies = movielens_data[movielens_data['UserID'] == user_id_1]
    user2_movies = movielens_data[movielens_data['UserID'] == user_id_2]
    
    combined_directors = pd.concat([user1_movies['Directors'], user2_movies['Directors']]).dropna().unique()
    combined_cast = pd.concat([user1_movies['Cast'], user2_movies['Cast']]).dropna().unique()
    
    combined_preferences = {
        'Genres': combined_genres,
        'Directors': combined_directors,
        'Cast': combined_cast
    }
    
    return user_item_matrix_combined, combined_preferences


In [18]:
# Example usage:
user_id_1 = 1  # Replace with actual user ID
user_id_2 = 2  # Replace with actual user ID

user_item_matrix_combined, combined_preferences = combine_user_data(user_id_1, user_id_2, ratings, user_item_matrix, movielens_data, genres)
print("Combined User-Item Matrix:")
print(user_item_matrix_combined)
print("\nCombined Preferences:")
print(combined_preferences)

Combined User-Item Matrix:
MovieID       1     2     3     4     5     6     7     8     9     10    ...  \
1         5.000000   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
2              NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
3              NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
4              NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
5              NaN   NaN   NaN   NaN   NaN   2.0   NaN   NaN   NaN   NaN  ...   
...            ...   ...   ...   ...   ...   ...   ...   ...   ...   ...  ...   
6037           NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
6038           NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
6039           NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
6040      3.000000   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
combined  4.356589   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   



In [29]:
def svd_recommendation(user_item_matrix, k=50):
    # Fill NaNs with 0s for the SVD computation
    user_item_matrix_filled = user_item_matrix.fillna(0).values
    
    # Perform SVD
    U, sigma, Vt = np.linalg.svd(user_item_matrix_filled, full_matrices=False)
    
    # Select the top k features
    U_k = U[:, :k]
    sigma_k = np.diag(sigma[:k])
    Vt_k = Vt[:k, :]
    
    # Reconstruct the user-item matrix
    user_item_matrix_reconstructed = np.dot(np.dot(U_k, sigma_k), Vt_k)
    
    # Clip the values to be within the rating scale (1 to 5)
    user_item_matrix_reconstructed = np.clip(user_item_matrix_reconstructed, 1, 5)
    
    return user_item_matrix_reconstructed, U_k, sigma_k, Vt_k

In [30]:
# Apply SVD to the combined user-item matrix
user_item_matrix_combined, _ = combine_user_data(user_id_1, user_id_2, ratings, user_item_matrix, movielens_data, genres)
reconstructed_matrix, U_k, sigma_k, Vt_k = svd_recommendation(user_item_matrix_combined)

# Get the combined user index
combined_user_index = user_item_matrix_combined.index.get_loc('combined')

# Predicted ratings for the combined user
predicted_ratings_combined_user = reconstructed_matrix[combined_user_index, :]


In [31]:
predicted_ratings_df = pd.DataFrame(predicted_ratings_combined_user, index=user_item_matrix.columns, columns=['Predicted_Rating'])
predicted_ratings_df = predicted_ratings_df.reset_index().rename(columns={'index': 'MovieID'})

In [32]:
# Merge predicted ratings with movie metadata
predicted_ratings_with_metadata = predicted_ratings_df.merge(movies, on='MovieID', how='left')

# Sort movies by predicted rating
recommended_movies = predicted_ratings_with_metadata.sort_values(by='Predicted_Rating', ascending=False)

# Display top 10 recommended movies
top_10_recommended_movies = recommended_movies.head(10)
print("Top 10 Recommended Movies:")
print(top_10_recommended_movies[['Title', 'Predicted_Rating']])

Top 10 Recommended Movies:
                                       Title  Predicted_Rating
1848              Saving Private Ryan (1998)          5.000000
309         Shawshank Redemption, The (1994)          5.000000
106                        Braveheart (1995)          5.000000
0                           Toy Story (1995)          4.693533
513                  Schindler's List (1993)          4.547838
2651                  American Beauty (1999)          4.501239
466                     Jurassic Park (1993)          4.445061
1104  One Flew Over the Cuckoo's Nest (1975)          4.130427
2374                      Matrix, The (1999)          4.100194
443                     Fugitive, The (1993)          3.989186


In [33]:
# Split the data into training and testing sets
train_data, test_data = train_test_split(ratings, test_size=0.2, random_state=42)

# Create the user-item interaction matrix for training data
train_user_item_matrix = train_data.pivot(index='UserID', columns='MovieID', values='Rating')

# Apply SVD to the training user-item matrix
reconstructed_train_matrix, _, _, _ = svd_recommendation(train_user_item_matrix)

# Evaluate the model using RMSE
def rmse(predictions, targets):
    return np.sqrt(((predictions - targets) ** 2).mean())

In [34]:

# Get the predicted ratings for the test set
test_user_item_matrix = test_data.pivot(index='UserID', columns='MovieID', values='Rating')
test_user_item_matrix_filled = test_user_item_matrix.fillna(0).values

# Get the common movies and users in both test and train sets
common_users = test_user_item_matrix.index.intersection(train_user_item_matrix.index)
common_movies = test_user_item_matrix.columns.intersection(train_user_item_matrix.columns)

In [35]:
# Get the predicted ratings for the common users and movies
predicted_ratings_test = reconstructed_train_matrix[np.ix_(
    train_user_item_matrix.index.get_indexer(common_users),
    train_user_item_matrix.columns.get_indexer(common_movies)
)]

# Get the actual ratings for the common users and movies
actual_ratings_test = test_user_item_matrix_filled[np.ix_(
    test_user_item_matrix.index.get_indexer(common_users),
    test_user_item_matrix.columns.get_indexer(common_movies)
)]

In [36]:

# Calculate RMSE
model_rmse = rmse(predicted_ratings_test, actual_ratings_test)
print(f"Model RMSE: {model_rmse}")

Model RMSE: 1.0840169879617894
