In [20]:
import numpy as np
import pandas as pd
from surprise import Dataset, NormalPredictor, Reader, SVD, accuracy, Trainset

In [21]:
ratings_set_df = pd.read_csv("./data_movie_lens_100k/ratings_all_development_set.csv")
users_df = pd.read_csv("./data_movie_lens_100k/user_info.csv")
movies_df = pd.read_csv("./data_movie_lens_100k/movie_info.csv")

In [22]:
print(ratings_set_df)
print(users_df)
#print(movies_df)

       user_id  item_id  rating
0          772       36       3
1          471      228       5
2          641      401       4
3          312       98       4
4           58      504       5
...        ...      ...     ...
89987      415      813       4
89988      842      120       3
89989      574      505       2
89990      757      472       5
89991      503      204       3

[89992 rows x 3 columns]
     user_id  age  is_male  orig_user_id
0          0   24        1             1
1          1   53        0             2
2          2   23        1             3
3          3   24        1             4
4          4   33        0             5
..       ...  ...      ...           ...
938      938   26        0           939
939      939   32        1           940
940      940   20        1           941
941      941   48        0           942
942      942   22        1           943

[943 rows x 4 columns]


In [23]:
from sklearn.model_selection import train_test_split

ratings_features = ratings_set_df[["user_id", "item_id"]]
ratings = ratings_set_df["rating"]
X_train, X_test, y_train, y_test = train_test_split(ratings_features, ratings)

In [24]:
from train_valid_test_loader import load_train_valid_test_datasets

# Load the dataset in the same way as the main problem 
train_tuple, valid_tuple, test_tuple, n_users, n_items = \
        load_train_valid_test_datasets()


def tuple_to_surprise_dataset(tupl):
    """
    This function convert a subset in the tuple form to a `surprise` dataset. 
    """
    ratings_dict = {
        "userID": tupl[0],
        "itemID": tupl[1],
        "rating": tupl[2],
    }

    df = pd.DataFrame(ratings_dict)

    # A reader is still needed but only the rating_scale param is requiered.
    reader = Reader(rating_scale=(1, 5))

    # The columns must correspond to user id, item id and ratings (in that order).
    dataset = Dataset.load_from_df(df[["userID", "itemID", "rating"]], reader)

    return dataset

## Below we train an SVD model and get its vectors 

# train an SVD model using the training set
trainset = tuple_to_surprise_dataset(train_tuple).build_full_trainset()
test_trainset = tuple_to_surprise_dataset(test_tuple).build_full_trainset()
valid_trainset = tuple_to_surprise_dataset(valid_tuple).build_full_trainset()

In [25]:

def get_features_for_user(user_id):
    return np.array(users_df[users_df["user_id"] == user_id].iloc[0][["age", "is_male"]])

def get_features_for_item(item_id):
    return np.array(movies_df[movies_df["item_id"] == item_id].iloc[0][["release_year"]])

def get_rating(user_id, item_id, features_df: pd.DataFrame):
    return features_df[
        (features_df["user_id"] == user_id) &
            (features_df["item_id"] == item_id)
    ].iloc[0]["rating"]

def get_feature_vectors(data_tuple, svd: SVD, trainset: Trainset):
    user_vectors = svd.pu
    item_vectors = svd.qi
    
    features = np.zeros(
        (
            len(data_tuple[0]),
            user_vectors.shape[1] + item_vectors.shape[1] + len(["age", "is_male"]) + len(["release_year"])
        )
    )
    ratings = []
    for index in range(0, len(data_tuple[0])):
        user_id = data_tuple[0][index]
        item_id = data_tuple[1][index]
        u_i = np.zeros((user_vectors.shape[1]))
        if trainset.knows_user(user_id):
            u_i = user_vectors[trainset.to_inner_uid(user_id)]
        m_j = np.zeros((item_vectors.shape[1]))
        if trainset.knows_item(item_id):
            m_j = item_vectors[trainset.to_inner_iid(item_id)]
        user_features = get_features_for_user(user_id)
        item_features = get_features_for_item(item_id)
        feature_vector = np.concatenate(
            (
                u_i,
                m_j,
                user_features,
                item_features,
            ),
            axis=0
        )
        features[index] = feature_vector
        
        ratings.append(1 if data_tuple[2][index] > 4.5 else 0)
    
    return features, ratings


In [26]:

svd = SVD(
    n_factors = 10,
    n_epochs=50
)

svd.fit(trainset)


features_train, ratings_train = get_feature_vectors(train_tuple, svd, trainset)
features_test, ratings_test = get_feature_vectors(test_tuple, svd, test_trainset)
features_val, ratings_val = get_feature_vectors(valid_tuple, svd, valid_trainset)


In [27]:

# from sklearn.neural_network import MLPClassifier
# from sklearn.model_selection import GridSearchCV
# from sklearn.preprocessing import StandardScaler

# mlp = MLPClassifier(
# )

# scaler = StandardScaler()
# features_train_scaled = scaler.fit_transform(features_train)

# param_grid = {
#     "solver": ["adam", "sgd"],
#     "activation": ["relu", "tanh"],
#     "hidden_layer_sizes": [(100,), (50,)],
# }

# gs = GridSearchCV(
#     estimator=mlp,
#     param_grid=param_grid,
#     refit=True,
#     return_train_score=True, 
    
# )

# gs.fit(features_train, ratings_train)



In [28]:
# from sklearn.metrics import RocCurveDisplay

 
# roc_disp = RocCurveDisplay.from_estimator(gs.best_estimator_, features_train, ratings_train)

In [29]:
# from sklearn.svm import SVC

# svm = SVC(probability=True)

# param_grid = {
#     'C': [0.1, 1, 10],
#     'gamma': [0.01, 0.1, 1],
#     'kernel': ['rbf']
# }

# gs = GridSearchCV(
#     estimator=svm,
#     param_grid=param_grid,
#     cv=3,
#     scoring='roc_auc',
#     refit=True
# )

# gs.fit(features_train, ratings_train)

# print(gs.best_estimator_)
# print(gs.best_score_)

# # best_svm = SVC(**best_params, probability=True)
# # best_svm.fit(features_train, ratings_train)



In [36]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV


base_forest = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    max_depth=16,
    min_samples_split=2,
    min_samples_leaf=1)

forest_hyperparameter_grid_by_name = dict(
    max_features=[3, 10, 33, 100, 333],
    max_depth=[16, 32, 64],
    min_samples_leaf=[1],
    n_estimators=[50, 100, 200],
    random_state=[101],
    )

forest_searcher = GridSearchCV(estimator=base_forest,
                                param_grid=forest_hyperparameter_grid_by_name,
                                scoring='roc_auc',
                                cv=3,
                                return_train_score=True,
                                refit=False) 

forest_searcher.fit(features_train, ratings_train)

best_estimator = forest_searcher.best_estimator_ 
best_Score = forest_searcher.best_score_



AttributeError: 'GridSearchCV' object has no attribute 'best_estimator_'

In [38]:
print(forest_searcher.best_params_)
print(forest_searcher.best_score_)

{'max_depth': 64, 'max_features': 3, 'min_samples_leaf': 1, 'n_estimators': 200, 'random_state': 101}
0.8083570788158362


In [2]:
from sklearn.ensemble import RandomForestClassifier
new_forest = RandomForestClassifier(
    max_depth=64,
    max_features=3,
    min_samples_leaf=1,
    n_estimators = 200,
    random_state=101)



In [24]:
masked_test_data = pd.read_csv("data_movie_lens_100k/ratings_masked_leaderboard_set.csv")

In [27]:
user_ids = masked_test_data["user_id"].values
item_ids = masked_test_data["item_id"].values



KeyError: 0

943
