In [1]:
import os
import sys
import numpy as np
import pandas as pd
from numpy import random as npr

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)


In [2]:
SEED = 2021
BOOKS_DATASET_PATH = "books_dataset_cleaned.csv"

## Load and transform restaurants data

In [3]:
real_dataset = pd.read_csv(BOOKS_DATASET_PATH)
real_dataset = real_dataset.drop(["Location"], axis=1)
real_dataset = real_dataset.sample(int(len(real_dataset) * 0.05))
real_dataset.head()

Unnamed: 0,user_id,Age,Country,ISBN,book_rating,rating_Avg,rating_sum,Count_All_Rate,Book_Title,Book_Author,Year_Of_Publication,Publisher
141584,243305,41.0,usa,1400031311,7,8.0,24,7,The O. Henry Prize Stories 2003 (Prize Stories...,LAURA FURMAN,2003.0,Anchor
218550,37950,35.311347,canada,874067952,5,5.0,5,4,The Monkey's Paw,W.W. Jacobs,1995.0,Worthington Publishing Company
378008,259260,56.0,usa,897162021,6,6.0,6,1,Dining In-New Orleans Cookbook (Dining in - Ne...,Phyllis Dennery,1988.0,Peanut Butter Publishing
228596,185233,31.0,usa,912333006,5,7.0,21,3,America's Historic Trails: With Tom Bodett,J. Kingston Pierce,1997.0,Pub Group West
79489,6313,35.0,mexico,8433914871,8,7.333333,22,3,Pulp,Charles Bukowski,1997.0,Anagrama


In [4]:
real_dataset["ISBN"] = real_dataset["ISBN"].astype('category').cat.codes

In [5]:
real_dataset["user_id"] = real_dataset["user_id"].astype('category').cat.codes

In [6]:
real_dataset["book_rating"] = real_dataset["book_rating"] / 10

In [7]:
real_dataset.dtypes

user_id                  int16
Age                    float64
Country                 object
ISBN                     int16
book_rating            float64
rating_Avg             float64
rating_sum               int64
Count_All_Rate           int64
Book_Title              object
Book_Author             object
Year_Of_Publication    float64
Publisher               object
dtype: object

## Create rating matrix

In [8]:
long_table = real_dataset[["user_id", "ISBN", "book_rating"]]
long_table["item_id"] = long_table["ISBN"]
long_table["rating"] = long_table["book_rating"]
long_table = long_table.drop(["book_rating", "ISBN"], axis=1)
long_table

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,user_id,item_id,rating
141584,8807,12579,0.7
218550,1393,11846,0.5
378008,9388,12232,0.6
228596,6772,12275,0.5
79489,213,14779,0.8
...,...,...,...
172512,4231,8322,0.7
314839,4668,6686,1.0
204278,1150,6186,0.7
144270,387,14257,0.8


In [9]:
rating_matrix = long_table.pivot(index="user_id", columns="item_id", values="rating")

In [10]:
rating_matrix

item_id,0,1,2,3,4,5,6,7,8,9,...,15020,15021,15022,15023,15024,15025,15026,15027,15028,15029
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10050,,,,,,,,,,,...,,,,,,,,,,
10051,,,,,,,,,,,...,,,,,,,,,,
10052,,,,,,,,,,,...,,,,,,,,,,
10053,,,,,,,,,,,...,,,,,,,,,,


In [11]:
rating_matrix = rating_matrix.fillna(0)
rating_matrix

item_id,0,1,2,3,4,5,6,7,8,9,...,15020,15021,15022,15023,15024,15025,15026,15027,15028,15029
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10050,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10051,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10052,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10053,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Evaluate

In [12]:
import os
import sys
import importlib
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from modules import models, evaluator, trainers, utils
importlib.reload(models)
importlib.reload(evaluator)
importlib.reload(trainers)


<module 'modules.trainers' from '/Users/vldpro/Workspace/university/recsys/modules/trainers.py'>

In [13]:
import datetime

def evaluate_on_real_dataset(rating_matrix, test_size, sample_size, n_iterations=1):
    trainer_list = [
        trainers.KnnTrainTestExecutor(),
        trainers.SvdTrainTestExecutor(),
        trainers.AutoRecTrainTestExecutor(config={"epoch": 50}),
    ]
    errors = []
    for trainer in trainer_list:
        for iteration in range(n_iterations): 
            start_time = datetime.datetime.utcnow()
            error = trainer(rating_matrix, test_size=test_size, sample_size=sample_size)
            duration = datetime.datetime.utcnow() - start_time
            eval_result = {"model_name": trainer.model_name, "rmse": error[0], "mae": error[1], "duration": duration, "iteration": iteration}
            errors.append(eval_result)
            print(f"{eval_result}")
    return pd.DataFrame(errors)
        

In [None]:
eval_results = evaluate_on_real_dataset(rating_matrix, test_size=0.1, sample_size=0.1)

DeepCTR-PyTorch version 0.2.6 detected. Your version is 0.2.5.
Use `pip install -U deepctr-torch` to upgrade.Changelog: https://github.com/shenweichen/DeepCTR-Torch/releases/tag/v0.2.6
