In [16]:
import pandas as pd
import numpy as np
from surprise import Dataset, NormalPredictor, Reader, SVD, accuracy
from train_valid_test_loader import load_train_valid_test_datasets

In [17]:
ratings_set_df = pd.read_csv("./data_movie_lens_100k/ratings_all_development_set.csv")
users_df = pd.read_csv("./data_movie_lens_100k/user_info.csv")
movies_df = pd.read_csv("./data_movie_lens_100k/movie_info.csv")
train_tuple, valid_tuple, test_tuple, n_users, n_items = \
        load_train_valid_test_datasets()

In [7]:
print(ratings_set_df)
print(users_df)
print(movies_df)

       user_id  item_id  rating
0          772       36       3
1          471      228       5
2          641      401       4
3          312       98       4
4           58      504       5
...        ...      ...     ...
89987      415      813       4
89988      842      120       3
89989      574      505       2
89990      757      472       5
89991      503      204       3

[89992 rows x 3 columns]
     user_id  age  is_male  orig_user_id
0          0   24        1             1
1          1   53        0             2
2          2   23        1             3
3          3   24        1             4
4          4   33        0             5
..       ...  ...      ...           ...
938      938   26        0           939
939      939   32        1           940
940      940   20        1           941
941      941   48        0           942
942      942   22        1           943

[943 rows x 4 columns]
      item_id                                      title  release_year  \


In [18]:

def tuple_to_surprise_dataset(tupl):
    """
    This function convert a subset in the tuple form to a `surprise` dataset. 
    """
    ratings_dict = {
        "userID": tupl[0],
        "itemID": tupl[1],
        "rating": tupl[2],
    }

    df = pd.DataFrame(ratings_dict)

    # A reader is still needed but only the rating_scale param is requiered.
    reader = Reader(rating_scale=(1, 5))

    # The columns must correspond to user id, item id and ratings (in that order).
    dataset = Dataset.load_from_df(df[["userID", "itemID", "rating"]], reader)

    return dataset

In [19]:
surprise_train = tuple_to_surprise_dataset(train_tuple).build_full_trainset()

In [46]:
svd = SVD(
    n_factors = 10,
    n_epochs = 50,
)

svd.fit(surprise_train)
user_ids = [surprise_train.to_inner_uid(uid) for uid in train_tuple[0]]
user_factors = svd.pu[user_ids]
item_ids = [surprise_train.to_inner_iid(iid) for iid in train_tuple[1]]
item_factors = svd.qi[item_ids]

In [50]:
print("users_df shape:", users_df.shape)
print("movies_df shape:", movies_df.shape)
print("item_factors shape:", item_factors.shape)
print("user_factors shape:", user_factors.shape)
print("item id train tuple length:", len(train_tuple[1]))
print("unique items in train tuple:", len(set(train_tuple[1])))

users_df shape: (943, 4)
movies_df shape: (1681, 4)
item_factors shape: (70000, 10)
user_factors shape: (70000, 10)
item id train tuple length: 70000
unique items in train tuple: 1626


In [43]:
user_vectors = np.hstack((svd.pu, users_df))
movies_vectors = np.hstack((item_factors, movies_df))

ValueError: all the input array dimensions except for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 70000 and the array at index 1 has size 1681

In [4]:
combined_vectors = np.hstack([users_df, movies_df])

ValueError: all the input array dimensions except for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 943 and the array at index 1 has size 1681