In [1]:
import pandas as pd

In [2]:
# Read the two raw input tables: the movie list and the per-user ratings.
movies, ratings = (
    pd.read_csv('../orig_data/movies.csv'),
    pd.read_csv('../orig_data/ratings.csv'),
)

In [3]:
# Drop the columns that the cells below do not use (`genres`, `timestamp`).
# The frames are re-assigned rather than mutated with inplace=True, and
# errors='ignore' makes the cell safe to re-run: a second execution no longer
# raises KeyError because the column has already been removed.
movies = movies.drop(columns='genres', errors='ignore')
ratings = ratings.drop(columns='timestamp', errors='ignore')

In [4]:
# Attach every rating to its movie row. This is a left join on `movieId`, so
# movies without any rating are kept with NaN `userId`/`rating`, which is why
# `userId` shows up as a float (1.0, 266.0, ...) in the outputs below.
df = movies.merge(ratings, on='movieId', how='left')

In [5]:
# Sanity check: first few rated movies of user 1.
df.loc[df['userId'] == 1].head()

Unnamed: 0,movieId,title,userId,rating
0,1,Toy Story (1995),1.0,4.0
325,3,Grumpier Old Men (1995),1.0,4.0
433,6,Heat (1995),1.0,4.0
2107,47,Seven (a.k.a. Se7en) (1995),1.0,5.0
2379,50,"Usual Suspects, The (1995)",1.0,5.0


In [6]:
# Load the per-user evaluation rows (presumably the test split, going by the
# file name `fin_tst` -- TODO confirm). As the preview shows, `target` is a
# stringified list of the form ['<title>-<genres>', <number>] and `samples` is
# a stringified dict keyed by '<title>-<genres>'.
df1 = pd.read_csv('../orig_data/fin_tst.csv')
df1.head()

Unnamed: 0,userId,target,samples,id
0,1,"[""Pete's Dragon (1977)-Adventure|Animation|Chi...",{'Desperado (1995)-Action|Romance|Western': 5....,0_US1
1,2,"['Shawshank Redemption, The (1994)-Crime|Drama...",{'Good Will Hunting (1997)-Drama|Romance': 4.5...,1_US2
2,3,['Galaxy of Terror (Quest) (1981)-Action|Horro...,{'Conan the Barbarian (1982)-Action|Adventure|...,2_US3
3,4,['Series 7: The Contenders (2001)-Action|Drama...,{'Maverick (1994)-Adventure|Comedy|Western': 4...,3_US4
4,5,"['Batman (1989)-Action|Crime|Thriller', 3.0]",{'Dances with Wolves (1990)-Adventure|Drama|We...,4_US5


In [7]:
import ast


def _title_from_target(raw_target):
    """Turn a stringified ``['<title>-<genres>', ...]`` list into the bare title.

    The string is parsed with ``ast.literal_eval``; the first element is cut at
    the first ``')-'`` and the closing parenthesis is put back.

    NOTE(review): if the first element contains no ``')-'`` the whole string is
    returned with an extra ``')'`` appended -- confirm that every target has
    the ``'<title> (<year>)-<genres>'`` form.
    """
    parsed_target = ast.literal_eval(raw_target)
    title_with_genres = str(parsed_target[0])
    return title_with_genres.split(')-')[0] + ')'


# NOTE(review): this overwrites df1['target'] in place, so re-running the cell
# feeds already-extracted titles to literal_eval, which cannot parse them.
df1['target'] = df1['target'].apply(_title_from_target)

In [8]:
# Preview df1 to check that `target` now holds the bare movie title.
df1.head()

Unnamed: 0,userId,target,samples,id
0,1,Pete's Dragon (1977),{'Desperado (1995)-Action|Romance|Western': 5....,0_US1
1,2,"Shawshank Redemption, The (1994)",{'Good Will Hunting (1997)-Drama|Romance': 4.5...,1_US2
2,3,Galaxy of Terror (Quest) (1981),{'Conan the Barbarian (1982)-Action|Adventure|...,2_US3
3,4,Series 7: The Contenders (2001),{'Maverick (1994)-Adventure|Comedy|Western': 4...,3_US4
4,5,Batman (1989),{'Dances with Wolves (1990)-Adventure|Drama|We...,4_US5


In [9]:
# Collect every target title of a user into one list (one row per user).
# Lists keep duplicates if the same title appears in several rows of a user.
targets_by_user = df1.groupby('userId')['target'].agg(list)
grouped = targets_by_user.reset_index()
grouped.head()

Unnamed: 0,userId,target
0,1,"[Pete's Dragon (1977), Teenage Mutant Ninja Tu..."
1,2,"[Shawshank Redemption, The (1994), The Drop (2..."
2,3,"[Galaxy of Terror (Quest) (1981), On Golden Po..."
3,4,"[Series 7: The Contenders (2001), Living in Ob..."
4,5,"[Batman (1989), Four Weddings and a Funeral (1..."


User-based recommendation system

In [10]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder

# Step 1: users x titles rating table. (user, title) pairs without a rating
# are filled with 0, so 0 means "not rated" from here on.
user_item_matrix = df.pivot_table(
    values='rating',
    index='userId',
    columns='title',
    fill_value=0,
)

# Step 2: pairwise cosine similarity between the users' rating rows.
user_similarity = cosine_similarity(user_item_matrix)

# Label both axes with the user ids so a similarity can be looked up by id.
user_ids = user_item_matrix.index
user_similarity_df = pd.DataFrame(user_similarity, index=user_ids, columns=user_ids)

In [11]:
# Similarity of every other user to user 1, most similar first.
# User 1 is removed by label instead of with `.iloc[1:]` after sorting: if
# another user ties with the self-similarity, the positional slice can drop
# that user and keep user 1 in the list.
similar_users = (
    user_similarity_df[1]
    .loc[lambda sims: sims.index != 1]
    .sort_values(ascending=False)
)
similar_users

userId
266.0    0.357408
313.0    0.351562
368.0    0.345127
57.0     0.345034
91.0     0.334727
           ...   
578.0    0.000000
506.0    0.000000
175.0    0.000000
556.0    0.000000
306.0    0.000000
Name: 1.0, Length: 609, dtype: float64

In [12]:
def predict_ratings(user_id, movie_list, user_item_matrix, user_similarity_df):
    """Predict ratings for ``user_id`` as a similarity-weighted average.

    For each requested title, the ratings given by all *other* users are
    averaged, weighted by each user's similarity to ``user_id``. Only users
    whose stored rating for the title is greater than 0 take part.

    Args:
        user_id: Label of the target user; must be a column of
            ``user_similarity_df``.
        movie_list: Iterable of titles (column labels of ``user_item_matrix``)
            to predict.
        user_item_matrix: DataFrame indexed by user label with one column per
            title; values <= 0 are treated as "not rated".
        user_similarity_df: DataFrame of user-to-user similarities whose index
            holds user labels that are also present in ``user_item_matrix``.

    Returns:
        dict mapping each title in ``movie_list`` to its predicted rating.
        The prediction is 0.0 when no other user has rated the title, when the
        summed similarity of the raters is not positive, or when the title is
        not a column of ``user_item_matrix``.

    Raises:
        KeyError: If ``user_id`` is not a column of ``user_similarity_df``.
    """
    similarities = user_similarity_df[user_id]

    # Exclude the target user by label, not by position. Sorting and slicing
    # with `.iloc[1:]` assumes the user's self-similarity sorts first; when
    # another user ties with it, that slice drops a real neighbour and lets
    # the user's own ratings leak into the prediction. The order of the
    # neighbours does not matter for a weighted average, so no sort is needed.
    neighbour_sims = similarities[similarities.index != user_id]
    neighbour_ids = neighbour_sims.index
    weights = neighbour_sims.to_numpy()

    predicted_ratings = {}
    for movie in movie_list:
        # Unknown title: nobody can have rated it, so use the same fallback
        # as for "no neighbour rated this movie" instead of raising KeyError.
        if movie not in user_item_matrix.columns:
            predicted_ratings[movie] = 0.0
            continue

        # One column lookup per movie instead of one scalar lookup per user.
        ratings = user_item_matrix.loc[neighbour_ids, movie].to_numpy()
        rated = ratings > 0

        total_weight = weights[rated].sum()
        if total_weight > 0:
            weighted_sum = (weights[rated] * ratings[rated]).sum()
            predicted_ratings[movie] = float(weighted_sum / total_weight)
        else:
            predicted_ratings[movie] = 0.0

    return predicted_ratings


In [13]:
# Spot-check the predictor on a single user.
user_id = 7
is_selected_user = grouped['userId'] == user_id
movie_list = grouped.loc[is_selected_user, 'target'].iloc[0]
print(movie_list)

predicted_ratings = predict_ratings(
    user_id=user_id,
    movie_list=movie_list,
    user_item_matrix=user_item_matrix,
    user_similarity_df=user_similarity_df,
)
print(f"Predicted ratings for user {user_id}:")
for movie, rating in predicted_ratings.items():
    print(f"{movie}: {rating:.2f}")

['Planet of the Apes (1968)', 'Pearl Harbor (2001)', 'Star Wars: Episode VI - Return of the Jedi (1983)', 'Aladdin (1992)', 'Aviator, The (2004)', 'Hostage (2005)', 'Psycho (1960)', 'X-Men: The Last Stand (2006)', 'Star Wars: Episode VI - Return of the Jedi (1983)', 'Calendar Girls (2003)']
Predicted ratings for user 7:
Planet of the Apes (1968): 3.77
Pearl Harbor (2001): 2.99
Star Wars: Episode VI - Return of the Jedi (1983): 4.17
Aladdin (1992): 3.80
Aviator, The (2004): 3.57
Hostage (2005): 3.20
Psycho (1960): 4.03
X-Men: The Last Stand (2006): 3.37
Calendar Girls (2003): 2.39


In [14]:
# Predict a rating for every (user, target title) pair and export the result.
# The file name lives in one place so the export and the confirmation message
# cannot drift apart.
OUTPUT_CSV = 'trad_user_ratings.csv'

predictions = []

# `grouped` has exactly one row per user, so walk its rows directly instead of
# re-filtering the whole frame for every user id.
for user_id, target_titles in zip(grouped['userId'], grouped['target']):
    # De-duplicate while keeping first-seen order. list(set(...)) ordered the
    # titles arbitrarily, which made the row order of the CSV differ per run.
    movie_list = list(dict.fromkeys(target_titles))
    predicted_ratings = predict_ratings(user_id, movie_list, user_item_matrix, user_similarity_df)

    predictions.extend(
        {'userId': user_id, 'title': movie, 'predicted_rating': rating}
        for movie, rating in predicted_ratings.items()
    )

# Explicit columns keep the header in the CSV even when there are no rows.
predictions_df = pd.DataFrame(predictions, columns=['userId', 'title', 'predicted_rating'])

# Save the DataFrame to a CSV file
predictions_df.to_csv(OUTPUT_CSV, index=False)

# Print confirmation
print(f"Predicted ratings have been saved to '{OUTPUT_CSV}'.")

Predicted ratings have been saved to 'trad_user_ratings.csv'.
