In [1]:
# Mount Google Drive at /content/drive (Colab runtime only); the CSV paths read
# later in this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
import pickle
from scipy.sparse import csr_matrix, save_npz, load_npz

In [3]:
def build_sparse_matrix(df, unique_users, unique_items, user_name_col= 'userId', item_name_col='movieId'):
    """Build a CSR user x item rating matrix from a long-format ratings table.

    Args:
        df: DataFrame holding one rating per row. Must contain the columns
            named by ``user_name_col`` and ``item_name_col`` plus ``'rating'``.
            The frame is NOT modified.
        unique_users: Ordered iterable of user ids; position i becomes row i.
        unique_items: Ordered iterable of item ids; position j becomes column j.
        user_name_col: Name of the user-id column in ``df``.
        item_name_col: Name of the item-id column in ``df``.

    Returns:
        Tuple ``(matrix, user_to_idx, item_to_idx)`` where ``matrix`` is a
        ``csr_matrix`` of shape ``(len(unique_users), len(unique_items))`` and
        the two dicts map original ids to row / column positions.

    Raises:
        ValueError: If ``df`` contains a user or item id that is missing from
            ``unique_users`` / ``unique_items``.

    Note:
        scipy sums the values of repeated (row, column) pairs, so duplicated
        (user, item) rows in ``df`` end up added together.
    """
    user_to_idx = {user_id: idx for idx, user_id in enumerate(unique_users)}
    item_to_idx = {item_id: idx for idx, item_id in enumerate(unique_items)}

    # Keep the positional indices in local Series instead of writing helper
    # columns into the caller's DataFrame (which may be a slice of another frame).
    row_positions = df[user_name_col].map(user_to_idx)
    col_positions = df[item_name_col].map(item_to_idx)

    # An id absent from the vocabulary maps to NaN; fail early with a readable
    # message rather than letting scipy choke on a NaN index.
    unmapped = row_positions.isna() | col_positions.isna()
    if unmapped.any():
        raise ValueError(
            f"{int(unmapped.sum())} row(s) reference a user/item id that is not in "
            "unique_users/unique_items."
        )

    user_item_matrix_sparse = csr_matrix(
        (
            df['rating'].to_numpy(),
            (
                row_positions.to_numpy(dtype=np.int64),
                col_positions.to_numpy(dtype=np.int64),
            ),
        ),
        shape=(len(unique_users), len(unique_items)),
    )
    return user_item_matrix_sparse, user_to_idx, item_to_idx

In [10]:
# Load only the first 30,000 ratings. nrows stops parsing after that many rows
# instead of reading the whole CSV and slicing it afterwards.
rating_df_full = pd.read_csv(
    '/content/drive/MyDrive/Project_AIL303_GROUP2/Final_dataset_in_here!!!!!/100k_final_dataset.csv',
    nrows=30000,
)
# 80/20 train/test split with a fixed seed so the split is reproducible.
train_df, test_df = train_test_split(rating_df_full, test_size=0.2, random_state=42)
# The user/item vocabularies (and therefore the matrix axes) come from the training split only.
unique_users= sorted(train_df['userId'].unique())
unique_items= sorted(train_df['movieId'].unique())
userId_movieId_sparse, user_to_idx, movie_to_idx = build_sparse_matrix(train_df, unique_users, unique_items)
user_similarity_sparse_matrix = cosine_similarity(userId_movieId_sparse, dense_output=False) # dense_output=False keeps the result sparse if needed

# Convert the similarity matrix to a DataFrame labelled with the original userIds
user_similarity_df = pd.DataFrame(user_similarity_sparse_matrix.toarray(),
                                  index=unique_users,
                                  columns=unique_users)

print("Tạo sparse matrix và ma trận tương đồng hoàn tất!")
print("Kích thước ma trận thưa (người dùng x phim):", userId_movieId_sparse.shape)
print("Kích thước ma trận tương đồng (người dùng x người dùng):", user_similarity_df.shape)

Tạo sparse matrix và ma trận tương đồng hoàn tất!
Kích thước ma trận thưa (người dùng x phim): (1531, 8903)
Kích thước ma trận tương đồng (người dùng x người dùng): (1531, 1531)


In [11]:
# Display the user-user cosine-similarity table; rows and columns are labelled with the original userIds.
user_similarity_df

Unnamed: 0,22,34,55,58,81,101,109,132,144,174,...,19239,19247,19264,19283,19314,19315,19347,19358,19366,19387
22,1.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
34,0.0,1.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
55,0.0,0.0,1.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
58,0.0,0.0,0.0,1.0,0.0,0.0,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
81,0.0,0.0,0.0,0.0,1.0,0.0,0.091026,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19315,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.000000,1.0,0.0,0.0,0.0,0.0
19347,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.000000,0.0,1.0,0.0,0.0,0.0
19358,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,1.0,0.0,0.0
19366,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,1.0,0.0


In [12]:
# Print the stored (row index, column index) -> rating entries of the sparse training matrix.
print(userId_movieId_sparse)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 24000 stored elements and shape (1531, 8903)>
  Coords	Values
  (0, 2798)	3.5
  (0, 8531)	3.0
  (1, 867)	4.0
  (1, 2619)	4.0
  (2, 2012)	4.0
  (3, 2470)	0.5
  (3, 3826)	3.5
  (3, 4742)	5.0
  (3, 5876)	3.5
  (3, 8355)	4.5
  (4, 27)	4.0
  (4, 1598)	5.0
  (4, 2063)	5.0
  (4, 3651)	5.0
  (5, 670)	4.0
  (6, 25)	4.5
  (6, 27)	5.0
  (6, 244)	5.0
  (6, 368)	3.5
  (6, 370)	4.5
  (6, 421)	4.5
  (6, 452)	4.5
  (6, 654)	4.5
  (6, 821)	4.5
  (6, 1159)	4.0
  :	:
  (1530, 2975)	4.0
  (1530, 3012)	4.0
  (1530, 3053)	3.5
  (1530, 3077)	3.0
  (1530, 3115)	3.5
  (1530, 3193)	3.5
  (1530, 3207)	3.5
  (1530, 3239)	4.5
  (1530, 3244)	3.5
  (1530, 3292)	4.0
  (1530, 3356)	4.5
  (1530, 3415)	3.5
  (1530, 3468)	4.5
  (1530, 3545)	4.0
  (1530, 3566)	4.0
  (1530, 3601)	4.0
  (1530, 3606)	3.0
  (1530, 3684)	3.5
  (1530, 3767)	3.5
  (1530, 3779)	4.0
  (1530, 3884)	4.5
  (1530, 3907)	4.0
  (1530, 3927)	4.0
  (1530, 4007)	4.0
  (1530, 4037)	4.0


In [13]:
# Count rows whose (userId, movieId) pair repeats an earlier row; 0 means each pair is rated once.
rating_df_full.duplicated(subset=['userId', 'movieId']).sum()

np.int64(0)

In [14]:
def get_similar_vector(user_sim_df, userId):
    """Return one user's similarity scores to every other user.

    Looks up the row labelled ``userId`` in ``user_sim_df`` and removes the
    entry labelled ``userId`` itself (if present), so the user is never
    returned as their own neighbour.

    Raises:
        ValueError: If ``userId`` is not a row label of ``user_sim_df``.
    """
    is_known_user = userId in user_sim_df.index
    if is_known_user:
        similarity_row = user_sim_df.loc[userId]
        # errors='ignore': tolerate a frame whose columns lack this user's label.
        return similarity_row.drop(labels=userId, errors='ignore')
    raise ValueError(f"User ID {userId} không tồn tại trong ma trận tương đồng.")

def predict_rating_sparse(movieid_to_predict, k, sim_vector, userId_movieId_sparse, user_to_idx, movie_to_idx):
    """
    Predict a rating for one movie using the sparse user-item matrix.

    The prediction is the similarity-weighted average of the ratings that the
    ``k`` most similar users gave to the movie. Whenever that average cannot
    be formed, the mean of all stored ratings in the matrix is returned.

    Args:
        movieid_to_predict: Original id of the movie to score.
        k: Number of most-similar neighbours to consider.
        sim_vector: pandas Series of similarity scores indexed by user id
            (the target user already removed).
        userId_movieId_sparse: Sparse user x movie rating matrix.
        user_to_idx: Mapping from user id to matrix row.
        movie_to_idx: Mapping from movie id to matrix column.

    Returns:
        The predicted rating, or the global mean rating when the movie is
        unknown, there are no neighbours, no neighbour rated the movie, or
        the neighbours that rated it carry no positive total similarity.
    """
    # Unknown movie: checked first so no neighbour selection is wasted on it.
    if movieid_to_predict not in movie_to_idx:
        return userId_movieId_sparse.data.mean()

    top_k_neighbors_sim = sim_vector.nlargest(k)
    if len(top_k_neighbors_sim) == 0:
        return userId_movieId_sparse.data.mean()

    movie_idx = movie_to_idx[movieid_to_predict]
    neighbor_user_idx = [user_to_idx[uid] for uid in top_k_neighbors_sim.index]
    neighbor_ratings = userId_movieId_sparse[neighbor_user_idx, movie_idx].toarray().ravel()

    # A value of 0 in the matrix means "not rated", so only positive entries count.
    valid_mask = neighbor_ratings > 0
    valid_ratings = neighbor_ratings[valid_mask]
    valid_sims = top_k_neighbors_sim.to_numpy()[valid_mask]

    # nlargest happily returns neighbours with similarity 0; if those are the
    # only ones that rated the movie the weights sum to 0 and the weighted
    # average would be 0/0 = NaN. Fall back to the global mean instead.
    # (This also covers the case where nobody rated the movie: empty sum == 0.)
    total_weight = valid_sims.sum()
    if not total_weight > 0:
        return userId_movieId_sparse.data.mean()

    return np.dot(valid_sims, valid_ratings) / total_weight

def eval_sparse(test_df, user_sim_df, userId_movieId_sparse, k, user_to_idx, movie_to_idx):
    """
    Evaluate the model on the test set using the sparse matrix.

    Every test row whose user is present in ``user_to_idx`` is scored with
    ``predict_rating_sparse``; rows for other users are skipped.

    Args:
        test_df: DataFrame with ``'userId'``, ``'movieId'`` and ``'rating'`` columns.
        user_sim_df: User-user similarity DataFrame indexed by user id.
        userId_movieId_sparse: Sparse user x movie training matrix.
        k: Number of neighbours used for each prediction.
        user_to_idx: Mapping from user id to matrix row.
        movie_to_idx: Mapping from movie id to matrix column.

    Returns:
        Root-mean-squared error between the true and predicted ratings.

    Raises:
        ValueError: If no test row belongs to a user known to ``user_to_idx``,
            so there is nothing to evaluate.
    """
    predictions = []
    ground_truth = []
    # One similarity vector per user: a user usually has several test rows and
    # the row lookup + drop does not depend on the movie.
    sim_vector_by_user = {}

    # Iterate the three columns directly: unlike iterrows this does not upcast
    # the integer ids to float and avoids building a Series per row.
    rows = zip(test_df['userId'], test_df['movieId'], test_df['rating'])
    for user_id, movie_id, true_rating in rows:
        if user_id not in user_to_idx:
            continue
        sim_vector = sim_vector_by_user.get(user_id)
        if sim_vector is None:
            sim_vector = get_similar_vector(user_sim_df, user_id)
            sim_vector_by_user[user_id] = sim_vector
        pred_rating = predict_rating_sparse(movie_id, k, sim_vector, userId_movieId_sparse, user_to_idx, movie_to_idx)
        predictions.append(pred_rating)
        ground_truth.append(true_rating)

    if not predictions:
        raise ValueError("No test rows belong to users present in user_to_idx; cannot compute RMSE.")
    return np.sqrt(mean_squared_error(ground_truth, predictions))

In [17]:
# Score the held-out ratings with the 5 most similar users per prediction.
rmse_sparse = eval_sparse(
    test_df=test_df,
    user_sim_df=user_similarity_df,
    userId_movieId_sparse=userId_movieId_sparse,
    k=5,
    user_to_idx=user_to_idx,
    movie_to_idx=movie_to_idx,
)
print(f"RMSE of testset: {rmse_sparse}")

RMSE of testset: 1.1422990120273757


Create the `userId_movieId` sparse matrix for new users coming from the web


In [19]:
# Rebuild the id mappings and the sparse rating matrix from the full dataset and
# write them to disk.
# NOTE: this rebinds unique_users, unique_items, userId_movieId_sparse, user_to_idx
# and movie_to_idx, which the train/test cell above also defines; after running
# this cell they no longer match user_similarity_df.
full_dt = pd.read_csv('/content/drive/MyDrive/Project_AIL303_GROUP2/ml-32m/final_dataset_with_poster.csv')
unique_users = sorted(full_dt['userId'].unique())
unique_items = sorted(full_dt['movieId'].unique())
userId_movieId_sparse, user_to_idx, movie_to_idx = build_sparse_matrix(full_dt, unique_users, unique_items)

# Persist both id -> position mappings as pickles in the working directory.
for pkl_name, id_mapping in (("user_to_idx.pkl", user_to_idx), ("movie_to_idx.pkl", movie_to_idx)):
    with open(pkl_name, "wb") as f:
        pickle.dump(id_mapping, f)

# Persist the matrix itself in scipy's .npz sparse format.
save_npz("user_item_sparse_matrix.npz", userId_movieId_sparse)
