In [3]:
import pandas as pd
import numpy as np
import scipy.sparse as sp

In [4]:
# --- Configuration -----------------------------------------------------------
# Location of the ratings CSV, relative to this notebook.
path_to_data = "../../data/ml-25m/ratings.csv"
# Rows parsed per chunk when the CSV is streamed.
chunk_size = 10_000
# Sampling fraction used when picking the subset of users.
fraction_to_sample = 0.1
# Accumulator for the user IDs seen while streaming the file.
unique_users = list()

In [5]:
# Collect candidate user IDs from the start of the ratings file, streaming it in
# chunks so the whole file is never loaded for this step.
# - Only `userId` is parsed; the other columns are not needed here.
# - A dict is used as an insertion-ordered set, so a user whose rows straddle a
#   chunk boundary is recorded once.
# - The collection is rebuilt from scratch inside this cell, so re-running the
#   cell does not keep appending to a list left over from an earlier run.
# - The `with` block closes the file handle even though the loop exits early.
# NOTE(review): the stop threshold is `fraction_to_sample * chunk_size`, i.e. a
# row-chunk size scaled by the sampling fraction is used as a user count —
# confirm this is the intended size of the candidate pool.
min_users_needed = fraction_to_sample * chunk_size
seen_users = {}
with pd.read_csv(path_to_data, usecols=['userId'], chunksize=chunk_size) as chunk_reader:
    for chunk in chunk_reader:
        seen_users.update(dict.fromkeys(chunk['userId'].unique()))
        # Stop as soon as enough distinct user IDs have been collected
        if len(seen_users) >= min_users_needed:
            break
unique_users = list(seen_users)

In [6]:

# Draw a reproducible random sample of the candidate users and keep only the
# ratings that belong to them.
# - A seeded Generator makes Restart & Run All select the same users each time.
# - np.unique removes repeated IDs in `unique_users`, so sampling without
#   replacement really yields distinct users.
# - The sample size is driven by `fraction_to_sample` instead of a second,
#   independent hard-coded 0.1.
rng = np.random.default_rng(42)
candidate_users = np.unique(unique_users)
subset_users = rng.choice(
    candidate_users,
    size=int(fraction_to_sample * len(candidate_users)),
    replace=False,
)
# `timestamp` is not read and narrow dtypes are requested to keep the
# full-file load as small as possible.
ratings = pd.read_csv(
    path_to_data,
    usecols=['userId', 'movieId', 'rating'],
    dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'},
)
ratings = ratings[ratings['userId'].isin(subset_users)]


In [7]:
# (rows, columns) remaining after filtering to the sampled users.
ratings.shape

(16597, 3)

In [8]:
# Column dtypes, non-null counts and memory footprint of the filtered frame.
ratings.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16597 entries, 2963 to 139988
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   userId   16597 non-null  int32  
 1   movieId  16597 non-null  int32  
 2   rating   16597 non-null  float32
dtypes: float32(1), int32(2)
memory usage: 324.2 KB


In [9]:
# Number of distinct users, movies and rating values in the sample.
ratings.nunique()

userId      100
movieId    5756
rating       10
dtype: int64

In [10]:
# Sparse user x movie rating matrix in CSR format.
# Raw userId / movieId values are used directly as row / column indices, and no
# explicit shape is passed, so the dimensions are inferred from the largest IDs.
user_movie_matrix = sp.csr_matrix(
    (
        ratings['rating'].to_numpy(),
        (ratings['userId'].to_numpy(), ratings['movieId'].to_numpy()),
    )
)


In [11]:
from surprise import Dataset, Reader, KNNBasic
from surprise.model_selection import train_test_split

In [12]:
# Wrap the ratings frame as a Surprise dataset. The columns are passed in
# (user, item, rating) order and the Reader declares a 0.5-5 rating scale.
reader = Reader(rating_scale=(0.5, 5))
data = Dataset.load_from_df(
    ratings.loc[:, ['userId', 'movieId', 'rating']],
    reader,
)

In [13]:
# Hold out 20% of the ratings for evaluation. A fixed random_state makes the
# split identical on every Restart & Run All, so results stay comparable
# between runs.
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

In [14]:
# User-based k-NN: similarities are computed between users, using cosine similarity.
sim_options = dict(name='cosine', user_based=True)
model = KNNBasic(sim_options=sim_options)
# Left as the final expression so the fitted model's repr is displayed.
model.fit(trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7f3931df02d0>