In [1]:
import pandas as pd
import numpy as np
import scipy.sparse as sp

In [2]:
path_to_data = r"../../data/ml-25m/ratings.csv"
fraction_to_sample = 0.1
chunk_size = 10000
unique_users = []

In [3]:
for chunk in pd.read_csv(path_to_data, chunksize=chunk_size):
    chunk.drop('timestamp', axis=1)
    unique_users.extend(chunk['userId'].unique())
    # Break the loop if we have collected enough user IDs
    if len(unique_users) >= fraction_to_sample * chunk_size:
        break

In [4]:

subset_users = np.random.choice(unique_users, size=int(0.1 * len(unique_users)), replace=False)
ratings = pd.read_csv(path_to_data, usecols=['userId', 'movieId', 'rating'], dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'})
ratings = ratings[ratings['userId'].isin(subset_users)]


In [5]:
ratings.shape

(20458, 3)

In [6]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20458 entries, 1152 to 139920
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   userId   20458 non-null  int32  
 1   movieId  20458 non-null  int32  
 2   rating   20458 non-null  float32
dtypes: float32(1), int32(2)
memory usage: 399.6 KB


In [7]:
ratings.nunique()

userId       99
movieId    6560
rating       10
dtype: int64

In [8]:
user_movie_matrix = sp.csr_matrix((ratings['rating'], (ratings['userId'], ratings['movieId'])))


In [9]:
from surprise import Dataset, Reader, KNNBasic
from surprise.model_selection import train_test_split

In [10]:
reader = Reader(rating_scale=(0.5, 5))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

In [11]:
trainset, testset = train_test_split(data, test_size=0.2)

In [12]:
sim_options = {'name': 'cosine', 'user_based': True}
model = KNNBasic(sim_options=sim_options)
model.fit(trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7f67817de850>

In [13]:
target_user_id = ratings['userId'].sample().iloc[0]  # Randomly select a user
target_user_movies = ratings[ratings['userId'] == target_user_id]['movieId']
unseen_movies = ratings[~ratings['movieId'].isin(target_user_movies)]['movieId']

In [17]:
unseen_movies.head()

1153     19
1154     32
1159     88
1160     95
1162    113
Name: movieId, dtype: int32

In [18]:
predictions = []
for movie_id in unseen_movies:
    prediction = model.predict(target_user_id, movie_id)
    predictions.append((movie_id, prediction.est))

In [27]:
predictions

[(19, 2.8608515364836493),
 (32, 4.209369036208582),
 (88, 3.302442808751949),
 (95, 3.281784644720325),
 (113, 3.0),
 (122, 2.330573984241989),
 (147, 3.74626496574684),
 (150, 4.062328078481422),
 (153, 2.9055205503801784),
 (170, 2.8707974546718367),
 (191, 2.0),
 (216, 3.3489214832053102),
 (218, 2.673067238910552),
 (219, 3.0),
 (224, 3.893649977335716),
 (238, 2.0),
 (248, 2.4924550048318186),
 (253, 3.4207788236229373),
 (260, 4.1219562504164555),
 (267, 2.4766933315442077),
 (277, 3.3040166664174193),
 (292, 3.4479804689694813),
 (312, 2.0),
 (340, 4.494422016263445),
 (344, 3.040359843901658),
 (350, 3.727454637435944),
 (364, 3.8368403088518686),
 (372, 3.362641000035108),
 (416, 2.6604551241042915),
 (432, 3.0412020444400008),
 (440, 3.333906374742646),
 (441, 3.5973455251135866),
 (477, 4.00712753725699),
 (487, 3.0),
 (491, 3.0),
 (524, 3.3792550738307248),
 (542, 3.507539514817302),
 (553, 4.073828594218329),
 (588, 3.8095642020257308),
 (590, 3.6752519050058177),
 (592, 

In [28]:
recommendations = sorted(predictions, key=lambda x: x[1], reverse=False)[:10]


In [22]:
movies = pd.read_csv(r'../../data/ml-25m/movies.csv')

In [24]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62423 entries, 0 to 62422
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  62423 non-null  int64 
 1   title    62423 non-null  object
 2   genres   62423 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.4+ MB


In [25]:
movie_id_to_title = dict(zip(movies['movieId'], movies['title']))

In [29]:
for movie_id, score in recommendations:
    movie_name = movie_id_to_title.get(movie_id, "Unknown")
    print(f"Movie: {movie_name}, Score: {score}")

Movie: Death Wish 2 (1982), Score: 0.5
Movie: High School Musical 3: Senior Year (2008), Score: 0.5
Movie: Henry Poole is Here (2008), Score: 0.5
Movie: Scary Movie 5 (Scary MoVie) (2013), Score: 0.5
Movie: Tarantella (1995), Score: 0.5
Movie: Battlefield Earth (2000), Score: 0.5
Movie: Next Best Thing, The (2000), Score: 0.5
Movie: Battlefield Earth (2000), Score: 0.5
Movie: Cat in the Hat, The (2003), Score: 0.5
Movie: Dust Factory, The (2004), Score: 0.5
