In [1]:
import numpy as np
import pandas as pd
from surprise import Dataset, NormalPredictor, Reader, SVD, accuracy, Trainset



In [3]:
ratings_set_df = pd.read_csv("./data_movie_lens_100k/ratings_all_development_set.csv")
users_df = pd.read_csv("./data_movie_lens_100k/user_info.csv")
movies_df = pd.read_csv("./data_movie_lens_100k/movie_info.csv")
masked_test_df = pd.read_csv("data_movie_lens_100k/ratings_masked_leaderboard_set.csv")

In [4]:
from sklearn.model_selection import train_test_split

ratings_features = ratings_set_df[["user_id", "item_id"]]
ratings = ratings_set_df["rating"]
X_train, X_test, y_train, y_test = train_test_split(ratings_features, ratings)

In [5]:
from train_valid_test_loader import load_train_valid_test_datasets

# Load the dataset in the same way as the main problem 
train_tuple, valid_tuple, test_tuple, n_users, n_items = \
        load_train_valid_test_datasets()


def tuple_to_surprise_dataset(tupl):
    """
    This function convert a subset in the tuple form to a `surprise` dataset. 
    """
    ratings_dict = {
        "userID": tupl[0],
        "itemID": tupl[1],
        "rating": tupl[2],
    }

    df = pd.DataFrame(ratings_dict)

    # A reader is still needed but only the rating_scale param is requiered.
    reader = Reader(rating_scale=(1, 5))

    # The columns must correspond to user id, item id and ratings (in that order).
    dataset = Dataset.load_from_df(df[["userID", "itemID", "rating"]], reader)

    return dataset

## Below we train an SVD model and get its vectors 

# train an SVD model using the training set
trainset = tuple_to_surprise_dataset(train_tuple).build_full_trainset()
test_trainset = tuple_to_surprise_dataset(test_tuple).build_full_trainset()
valid_trainset = tuple_to_surprise_dataset(valid_tuple).build_full_trainset()

In [6]:

def get_features_for_user(user_id):
    return np.array(users_df[users_df["user_id"] == user_id].iloc[0][["age", "is_male"]])

def get_features_for_item(item_id):
    return np.array(movies_df[movies_df["item_id"] == item_id].iloc[0][["release_year"]])

def get_rating(user_id, item_id, features_df: pd.DataFrame):
    return features_df[
        (features_df["user_id"] == user_id) &
            (features_df["item_id"] == item_id)
    ].iloc[0]["rating"]

def get_feature_vectors(data_tuple, svd: SVD, trainset: Trainset):
    user_vectors = svd.pu
    item_vectors = svd.qi
    
    features = np.zeros(
        (
            len(data_tuple[0]),
            user_vectors.shape[1] + item_vectors.shape[1] + len(["age", "is_male"]) + len(["release_year"])
        )
    )
    ratings = []
    for index in range(0, len(data_tuple[0])):
        user_id = data_tuple[0][index]
        item_id = data_tuple[1][index]
        u_i = np.zeros((user_vectors.shape[1]))
        if trainset.knows_user(user_id):
            u_i = user_vectors[trainset.to_inner_uid(user_id)]
        m_j = np.zeros((item_vectors.shape[1]))
        if trainset.knows_item(item_id):
            m_j = item_vectors[trainset.to_inner_iid(item_id)]
        user_features = get_features_for_user(user_id)
        item_features = get_features_for_item(item_id)
        feature_vector = np.concatenate(
            (
                u_i,
                m_j,
                user_features,
                item_features,
            ),
            axis=0
        )
        features[index] = feature_vector
        
        ratings.append(1 if data_tuple[2][index] > 4.5 else 0)
    
    return features, ratings


In [23]:
print(features)

[array([ 3.84378515e-01, -2.21529416e-01, -4.96168104e-02,  5.04572098e-01,
        1.49852513e-01,  2.07844971e-01, -5.50292888e-01,  3.40940441e-03,
       -1.63083755e-01,  1.99219073e-02, -2.53394544e-01, -2.60270931e-01,
        2.57444678e-01, -1.97167757e-01, -4.63887770e-02, -3.31551019e-01,
        5.88415838e-01,  3.03273451e-01, -4.07789306e-01, -1.80403094e-01,
        4.00000000e+01,  0.00000000e+00,  1.97000000e+03]), array([-1.66552871e-01,  5.22586017e-02,  1.96975221e-01, -7.51198418e-02,
       -5.01271334e-01, -4.03720129e-01, -2.16197326e-01,  2.08825154e-02,
        2.42598207e-01,  4.40865486e-01,  1.15191812e-01,  7.75981787e-02,
       -9.27767360e-02, -1.87501144e-01,  3.69192791e-01, -3.27608550e-01,
        1.22131633e-01,  1.67425335e-01, -5.21763625e-01,  3.35876907e-01,
        3.20000000e+01,  0.00000000e+00,  1.98000000e+03]), array([-3.65721914e-01,  9.48441856e-02, -1.52001137e-01, -4.58860777e-01,
       -1.41034089e-01,  1.07545542e-01,  2.55807956e-

In [25]:
svd = SVD(
    n_factors = 10,
    n_epochs=50
)

svd.fit(trainset)


features_train, ratings_train = get_feature_vectors(train_tuple, svd, trainset)
features_test, ratings_test = get_feature_vectors(test_tuple, svd, test_trainset)
features_val, ratings_val = get_feature_vectors(valid_tuple, svd, valid_trainset)

ValueError: all the input arrays must have same number of dimensions, but the array at index 0 has 1 dimension(s) and the array at index 2 has 2 dimension(s)

In [26]:
user_ids = masked_test_df["user_id"].values
item_ids = masked_test_df["item_id"].values

def get_features_for_user(user_id):
    user_info = users_df[users_df["user_id"] == user_id]
    return user_info[["age", "is_male"]].values


def get_features_for_item(item_id):
    item_info = movies_df[movies_df["item_id"] == item_id]
    return item_info[["release_year"]].values

features = []
user_vectors = svd.pu
item_vectors = svd.qi

for i in range(len(user_ids)):
    user_id = user_ids[i]
    item_id = item_ids[i]

    try:
        u_i = user_vectors[user_id]
    except IndexError:
        u_i = np.zeros_like(user_vectors[0])
    try:
        m_j = item_vectors[item_id]
    except IndexError:
        m_j = np.zeros_like(item_vectors[0])
    
    user_features = get_features_for_user(user_id)
    item_features = get_features_for_item(item_id)
    
    feature_vector = np.concatenate(
            (
                u_i,
                m_j,
                user_features.flatten(),
                item_features.flatten(),
            )
        )
    features.append(feature_vector)
    

    
    

In [27]:
print(features)

[array([-4.23360619e-02,  2.73976628e-01, -1.70040177e-01,  4.56341514e-01,
        1.35939603e-01,  7.60973998e-03,  4.22706538e-02, -1.37805960e-01,
       -7.42775662e-02, -1.87251307e-01,  3.75033229e-02, -5.09822719e-01,
       -2.62551837e-01, -9.36881367e-02, -1.58689136e-02,  6.12925471e-01,
        7.41448371e-02,  2.81304563e-01,  5.94760290e-02,  1.26968379e-01,
        4.00000000e+01,  0.00000000e+00,  1.97000000e+03]), array([-1.39983628e-02,  2.65207478e-01,  3.49383024e-02, -3.03554621e-02,
       -2.11780999e-01, -2.28319814e-01,  5.65347416e-01, -4.54469632e-02,
       -1.14541272e-01, -2.86489396e-01,  5.25365467e-01,  1.57135043e-01,
       -1.41249761e-02, -8.97821110e-02, -8.44049080e-02,  1.83191385e-01,
       -5.08880630e-02, -5.86230725e-02,  1.09659166e-02,  1.49796177e-02,
        3.20000000e+01,  0.00000000e+00,  1.98000000e+03]), array([-3.77414460e-01,  4.60497864e-01, -2.60585801e-01, -2.20882422e-01,
        3.64065234e-01,  1.58378442e-02, -2.08247143e-

In [30]:
from sklearn.ensemble import RandomForestClassifier
new_forest = RandomForestClassifier(
    max_depth=64,
    max_features=3,
    min_samples_leaf=1,
    n_estimators = 200,
    random_state=101)

new_forest.fit(features_train, ratings_train) 

In [51]:
predict_probas = new_forest.predict_proba(features)[:, 1]
print(predict_probas)
#print(len(predict_ratings))


[0.23  0.265 0.07  ... 0.085 0.235 0.17 ]


ValueError: Only one class present in y_true. ROC AUC score is not defined in that case.