In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import math
import time
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings('ignore')


In [2]:
def initialization(x, mean=0., std=1.):
    """Fill ``x`` in place with clipped-normal noise and return it.

    The tensor is overwritten with standard-normal samples, each sample is
    reduced with ``fmod(2)`` (so its magnitude stays below 2), and the result
    is scaled by ``std`` and shifted by ``mean``.

    Args:
        x: Tensor to overwrite; every step is an in-place operation.
        mean: Offset added after scaling.
        std: Scale applied to the clipped samples.

    Returns:
        The same tensor object ``x``.
    """
    x.normal_()   # draw N(0, 1) samples
    x.fmod_(2)    # fold the tails back into (-2, 2)
    x.mul_(std)
    x.add_(mean)
    return x

In [3]:
class FM(nn.Module):
    """Second-order factorization machine over categorical feature indices.

    Every feature index owns a scalar weight (``bias``) and a ``k``-dimensional
    latent vector (``embeddings``). The score of a row is the sum of its
    scalar weights plus the sum of all pairwise dot products between its
    latent vectors.

    Args:
        n: Number of distinct feature indices (rows of both embedding tables).
        k: Size of each latent vector.
    """

    def __init__(self, n, k):
        super().__init__()
        self.bias = nn.Embedding(n, 1)
        self.embeddings = nn.Embedding(n, k)

        # Replace the default embedding init with small clipped-normal noise.
        with torch.no_grad():
            initialization(self.embeddings.weight, std=0.01)
            initialization(self.bias.weight, std=0.01)

    def forward(self, X):
        """Score each row of ``X``.

        Args:
            X: Integer index tensor; the reductions below treat it as 2-D,
                ``(batch, fields)``.

        Returns:
            A 1-D tensor with one score per row of ``X``.
        """
        dense_emb = self.embeddings(X)  # (batch, fields, k)

        # Pairwise interactions via 0.5 * ((sum v)^2 - sum(v^2)), summed over k.
        p1 = dense_emb.sum(dim=1).pow(2)
        p2 = dense_emb.pow(2).sum(dim=1)
        interaction_layer = 0.5 * (p1 - p2).sum(1)

        # squeeze(-1) removes only the trailing size-1 embedding axis. A bare
        # squeeze() would also drop the batch (or field) axis whenever it has
        # size 1, and the following sum(1) would then raise IndexError.
        linear_layer = self.bias(X).squeeze(-1).sum(1)

        return linear_layer + interaction_layer

In [4]:
# Sizes of the embedding tables; load_state_dict below requires them to match
# the tensor shapes stored in the checkpoint.
N_FEATURES = 10052
EMBEDDING_DIM = 100

model = FM(N_FEATURES, EMBEDDING_DIM)
# map_location='cpu' lets a checkpoint that was saved from a GPU run load on a
# host without CUDA.
# NOTE(review): torch.load unpickles the file - only load checkpoints from a
# trusted source.
model.load_state_dict(torch.load('./model/model_v6.pth', map_location='cpu'))

<All keys matched successfully>

In [5]:
class MovieLens(Dataset):
    """Index-aligned (feature, label) dataset.

    Args:
        feature: Indexable collection of feature rows.
        label: Optional indexable collection of targets aligned with
            ``feature``. When omitted, items are returned without a label.
    """

    def __init__(self, feature, label=None):
        self.feature = feature
        self.label = label

    def __len__(self):
        """Return the number of feature rows."""
        return len(self.feature)

    def __getitem__(self, idx):
        """Return ``(feature, label)`` at ``idx``, or just the feature when unlabeled."""
        x = self.feature[idx]
        # `label` defaults to None in __init__; indexing None would raise
        # TypeError, so an unlabeled dataset yields the feature row alone.
        if self.label is None:
            return x
        return x, self.label[idx]
    
def data_tensor(dataset):
    """Convert a DataFrame with a ``rating`` column into a ``MovieLens`` dataset.

    Args:
        dataset: DataFrame whose ``rating`` column is the target and whose
            remaining columns are integer-coded features. It is not modified.

    Returns:
        ``MovieLens(features, label)`` where ``features`` is an int32 tensor
        built from every non-rating column and ``label`` is a float32 tensor
        built from ``rating``.
    """
    label = torch.tensor(dataset['rating'].values).float()

    # Drop on a copy: an in-place drop would strip `rating` from the caller's
    # frame, so a second call on the same frame would raise KeyError.
    feature_frame = dataset.drop(columns=['rating'])
    features = torch.tensor(feature_frame.values).int()

    return MovieLens(features, label)

In [6]:
# Mini-batch size handed to the evaluation DataLoader.
# TODO: tune this value (original note: "Needed to be modified!").
batch_size = 4096

# Test-set CSV locations keyed by variant; the "full" variant has no path yet.
test_set_path = {
    "initial": './data/revised_test_set.csv',
    "full": '',
    "sampled": "./data/bpr_sampled_test_set.csv",
}

test_set = pd.read_csv(test_set_path['initial'], header=0)

# Keep only the rows whose rating equals 1.
is_positive = test_set['rating'] == 1
positive_test_set = test_set.loc[is_positive]


In [7]:
# Use the GPU when PyTorch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Number of top-ranked movies kept per ranking.
K = 10
def full_ranking(model, test_dataloader, k=None):
    """Score every candidate row and return the ids of the best-scoring movies.

    Scores are gathered across all batches before ranking, so the result is
    the top-k over the whole loader rather than over its last batch only.

    Args:
        model: Scoring module; called as ``model(feature)`` and expected to
            return one score per row.
        test_dataloader: Iterable of ``(feature, label)`` batches. Column 3 of
            ``feature`` is read as the movie id; ``label`` is ignored.
        k: Length of the returned list. Defaults to the notebook-level ``K``.

    Returns:
        A list of 0-dim tensors holding the movie ids, best score first
        (empty when the loader yields no batches).
    """
    top_k = K if k is None else k

    # Send the inputs to wherever the model's weights live, so model and
    # features can never end up on different devices.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    batch_scores = []
    batch_movie_ids = []
    with torch.no_grad():
        for feature, _label in test_dataloader:
            feature = feature.to(model_device)
            batch_movie_ids.append(feature[:, 3])  # to select the movieId
            batch_scores.append(model(feature).reshape(-1))

    # Nothing to rank: return an empty list instead of failing on an
    # unassigned result variable.
    if not batch_scores:
        return []

    scores = torch.cat(batch_scores)
    movie_ids = torch.cat(batch_movie_ids)
    ranked_indices = torch.argsort(scores, descending=True)[:top_k]

    ranked_movie = list(movie_ids[ranked_indices])
    print("ranked movie id: ", ranked_movie)
    return ranked_movie

In [8]:
# Parsed feature CSVs keyed by path, so repeated calls do not re-read the files.
_FEATURE_TABLES = {}


def _read_feature_table(path):
    """Parse a feature CSV on first use and return the cached frame afterwards."""
    if path not in _FEATURE_TABLES:
        _FEATURE_TABLES[path] = pd.read_csv(path, header=0)
    return _FEATURE_TABLES[path]


def feature_load(user, movies_id, user_features=None, item_features=None):
    """Build one candidate row per requested movie for a single user.

    Args:
        user: Value looked up in the ``userId`` column of the user table.
        movies_id: Movie ids to keep from the item table.
        user_features: Optional preloaded user table (``userId``, ``age``,
            ``gender`` columns). Read from
            ``./data/revised_user_features.csv`` when omitted.
        item_features: Optional preloaded item table (``movieId``,
            ``category`` columns). Read from
            ``./data/revised_item_features.csv`` when omitted.

    Returns:
        A new DataFrame with columns ``userId, age, gender, movieId, category,
        rating``; ``rating`` is filled with 1 on every row.

    Raises:
        ValueError: If ``user`` has no row in the user table.
    """
    if user_features is None:
        user_features = _read_feature_table('./data/revised_user_features.csv')
    if item_features is None:
        item_features = _read_feature_table('./data/revised_item_features.csv')

    user_feature = user_features[user_features['userId'] == user]
    if user_feature.empty:
        raise ValueError(f"userId {user!r} not found in the user feature table")

    # Read scalars from the first matching row; int() on a whole Series is
    # deprecated in recent pandas and fails when the match is empty.
    profile = user_feature.iloc[0]
    userId = int(profile['userId'])
    age = int(profile['age'])
    gender = int(profile['gender'])

    # Work on an explicit copy so the shared (possibly cached) item table is
    # never written through a filtered view.
    filtered_item = item_features[item_features['movieId'].isin(movies_id)].copy()
    filtered_item['userId'] = userId
    filtered_item['age'] = age
    filtered_item['gender'] = gender
    filtered_item['rating'] = 1

    new_order = ['userId', 'age', 'gender', 'movieId', 'category', 'rating']
    return filtered_item[new_order]
        

In [9]:
def recall(ranked_list, ground_truth):
    """Count how many ground-truth items appear in the ranked list.

    Args:
        ranked_list: Recommended item ids.
        ground_truth: Relevant item ids.

    Returns:
        ``(hits, len(ground_truth), hits / len(ground_truth))``. The ratio is
        0.0 when ``ground_truth`` is empty instead of dividing by zero.
    """
    hits = len(set(ranked_list) & set(ground_truth))
    relevant = len(ground_truth)
    ratio = hits / relevant if relevant else 0.0
    return hits, relevant, ratio

def ndcg(ranked_list, ground_truth):
    """Compute DCG, ideal DCG and NDCG of a ranked list with binary relevance.

    A hit at 1-based position ``p`` contributes ``1 / log2(p + 1)``. The ideal
    DCG assumes the best achievable ordering: the first
    ``min(len(ranked_list), number of distinct relevant items)`` positions are
    all hits. A ranking that places every relevant item first therefore
    scores exactly 1.

    Args:
        ranked_list: Recommended item ids, best first.
        ground_truth: Relevant item ids.

    Returns:
        ``(dcg, idcg, dcg / idcg)``; the ratio is 0.0 when ``idcg`` is zero
        (empty ranking or empty ground truth).
    """
    relevant = set(ground_truth)  # constant-time membership checks

    dcg = 0.0
    for position, movie in enumerate(ranked_list, start=1):
        if movie in relevant:
            dcg += 1.0 / np.log2(position + 1)

    # The ideal list cannot hold more hits than there are relevant items.
    ideal_hits = min(len(ranked_list), len(relevant))
    idcg = 0.0
    for position in range(1, ideal_hits + 1):
        idcg += 1.0 / np.log2(position + 1)

    score = dcg / idcg if idcg > 0 else 0.0
    return dcg, idcg, score

def metric_model(total_numerator, total_denominator, total_dcg, total_idcg):
    """Print the pooled recall and NDCG computed from running totals.

    Args:
        total_numerator: Accumulated recall numerator.
        total_denominator: Accumulated recall denominator.
        total_dcg: Accumulated DCG.
        total_idcg: Accumulated ideal DCG.
    """
    # Local names differ from the `recall` / `ndcg` functions on purpose.
    pooled_recall = total_numerator / total_denominator
    pooled_ndcg = total_dcg / total_idcg

    report = (
        ("The recall of this model is: ", pooled_recall),
        ("The ndcg of this model is: ", pooled_ndcg),
    )
    for caption, value in report:
        print(caption, value)


In [10]:

def evaluate(model, test_set):
    """Rank the candidate movies for every user and print recall / NDCG.

    For each distinct ``userId`` in ``test_set`` the function builds one
    candidate row per distinct ``movieId`` in ``test_set``, ranks them with
    ``full_ranking`` and compares the result with that user's movies in
    ``test_set``. Per-user metrics are printed, running totals are printed
    every 50 users, and the final totals are printed at the end.

    Args:
        model: Scoring model passed through to ``full_ranking``.
        test_set: DataFrame with at least ``userId`` and ``movieId`` columns.
    """
    users_id = test_set['userId'].unique()
    movies_id = test_set['movieId'].unique()
    # Group once up front: the grouping does not depend on the current user,
    # so rebuilding it inside the loop only repeats the same work.
    grouped_test_set = test_set.groupby('userId')

    total_numerator = 0
    total_denominator = 0
    total_dcg = 0
    total_idcg = 0

    for idx, user in enumerate(users_id):
        user_data = data_tensor(feature_load(user, movies_id))
        test_dataloader = DataLoader(user_data, batch_size=batch_size)

        ranked_list = full_ranking(model, test_dataloader)
        ranked_list = [int(tensor.item()) for tensor in ranked_list]
        ground_truth = grouped_test_set.get_group(user)['movieId'].tolist()

        numerator, denominator, user_recall = recall(ranked_list, ground_truth)
        total_numerator += numerator
        total_denominator += denominator
        dcg, idcg, user_ndcg = ndcg(ranked_list, ground_truth)
        total_dcg += dcg
        total_idcg += idcg
        print("recall of user "+ str(user) + " is :" + str(user_recall))
        print("ndcg of user "+ str(user) + " is :" + str(user_ndcg))

        # Progress report after every 50th user.
        if idx % 50 == 49:
            print("Idx = ", idx)
            metric_model(total_numerator, total_denominator, total_dcg, total_idcg)

    metric_model(total_numerator, total_denominator, total_dcg, total_idcg)

    

In [11]:
# Run the ranking evaluation on the rating == 1 rows of the test set; prints
# per-user metrics, periodic running totals, and the final totals.
evaluate(model, positive_test_set)

ranked movie id:  [tensor(6780, dtype=torch.int32), tensor(6357, dtype=torch.int32), tensor(7279, dtype=torch.int32), tensor(7273, dtype=torch.int32), tensor(6071, dtype=torch.int32), tensor(8957, dtype=torch.int32), tensor(6647, dtype=torch.int32), tensor(8610, dtype=torch.int32), tensor(6089, dtype=torch.int32), tensor(8897, dtype=torch.int32)]
recall of user 0 is :0.0
ndcg of user 0 is :0.0
ranked movie id:  [tensor(6780, dtype=torch.int32), tensor(6357, dtype=torch.int32), tensor(7279, dtype=torch.int32), tensor(7273, dtype=torch.int32), tensor(8610, dtype=torch.int32), tensor(6071, dtype=torch.int32), tensor(8957, dtype=torch.int32), tensor(9763, dtype=torch.int32), tensor(6040, dtype=torch.int32), tensor(6647, dtype=torch.int32)]
recall of user 1 is :0.0
ndcg of user 1 is :0.0
ranked movie id:  [tensor(6780, dtype=torch.int32), tensor(9819, dtype=torch.int32), tensor(6357, dtype=torch.int32), tensor(7279, dtype=torch.int32), tensor(9763, dtype=torch.int32), tensor(6590, dtype=tor