<h1 id="tocheading">Table of Contents</h1>
<div id="toc"></div>

In [None]:
%%javascript
// Download and execute the remote table-of-contents helper script through `$.getScript`.
// NOTE(review): presumably the script fills the <div id="toc"> placeholder of this notebook — confirm.
// This needs network access and a front end that exposes the `$` global.
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')

## Imports

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib as plt

from pathlib import Path
from datetime import datetime as dt
from surprise import Dataset
from surprise import SVD, KNNWithMeans, NMF
from surprise import Dataset, accuracy, Reader
from surprise.model_selection import train_test_split
from surprise.model_selection import cross_validate

# customized imports
from scripts.knn import get_knn
from scripts.local_accuracy import NeighborhoodAccuracy
from scripts.local_utility import compute_neighborhood_ndcg
from scripts.group_list import group_list

## Main test variables

In [2]:
# dataset location: every path is derived from DATASET_PATH so that changing
# DATASET_NAME is enough to point the notebook at another dataset
DATASET_NAME = "movie-tweetings"
DATASET_PATH = "../datasets/" + DATASET_NAME
RATINGS_PATH = DATASET_PATH + "/ratings.csv"
MOVIES_PATH = DATASET_PATH + "/items.csv"

# fraction of the ratings removed from the target dataset (0.01 = 1%)
n = 0.01

# whether a fixed, pre-split train/test pair is used; the optional split cell
# sets this to True. Defining the default here keeps the training cell from
# failing with a NameError when the optional cell is skipped.
fixed_data = False

## Load Data

In [4]:
# read the ratings and the item metadata, harmonising the id column names
column_aliases = {'movieId': 'item_id', 'userId': 'user_id'}
ratings = pd.read_csv(RATINGS_PATH).rename(columns=column_aliases)
movies = pd.read_csv(MOVIES_PATH).rename(columns=column_aliases)

# derive a calendar date from every unix timestamp
# (dt.fromtimestamp without a tz argument uses the machine's local timezone)
ratings['date'] = ratings['timestamp'].map(lambda ts: dt.fromtimestamp(ts).date())

# attach the item details to every rating (left join on item_id)
ratings_by_item = ratings.set_index('item_id')
movies_by_item = movies.set_index('item_id')
dataset = ratings_by_item.join(movies_by_item, how='left').reset_index()

# number of ratings to remove: the fraction n of all rows
remove_n = round(len(dataset.index) * n)
print("total ratings to be removed from the dataset in the below test: ", remove_n)

dataset

total ratings to be removed from the dataset in the below test:  9172


Unnamed: 0,item_id,user_id,rating,timestamp,date,title,genres
0,8,43197,5,1396981211,2014-04-08,Edison Kinetoscopic Record of a Sneeze (1894),Documentary|Short
1,10,71052,10,1412878553,2014-10-09,La sortie des usines Lumière (1895),Documentary|Short
2,12,70002,10,1439248579,2015-08-11,The Arrival of a Train (1896),Documentary|Short
3,25,37892,8,1488189899,2017-02-27,The Oxford and Cambridge University Boat Race ...,
4,91,5855,6,1385233195,2013-11-23,Le manoir du diable (1896),Short|Horror
...,...,...,...,...,...,...,...
917188,15085802,15522,7,1629045523,2021-08-15,Untold: Malice at the Palace (2021),
917189,15085802,17316,7,1628842720,2021-08-13,Untold: Malice at the Palace (2021),
917190,15085802,43296,8,1628730742,2021-08-12,Untold: Malice at the Palace (2021),
917191,15085802,63323,10,1628647596,2021-08-11,Untold: Malice at the Palace (2021),


### Optional - Run this if we want a fixed, pre-split train and test sets

In [5]:
fixed_data = True

# share of the ratings kept for training; the remaining 25% is held out as the test set
TRAIN_FRACTION = 0.75
# seed of the split, so that a regenerated split is identical across runs
SPLIT_SEED = 42

split_dir = DATASET_PATH + '/local-impact/'
trainset_file = split_dir + 'custom_trainset.csv'
testset_file = split_dir + 'custom_testset.csv'
os.makedirs(split_dir, exist_ok=True)

# reuse a saved split only when BOTH files exist: the training cell reads the
# held-out test file later, so a lone trainset file is not a usable split
if os.path.isfile(trainset_file) and os.path.isfile(testset_file):
    print("custom split previously done, files loaded")
    dataset = pd.read_csv(trainset_file)
else:
    # a dedicated generator leaves numpy's global random state untouched
    msk = np.random.default_rng(SPLIT_SEED).random(len(dataset)) < TRAIN_FRACTION
    dataset[msk].to_csv(trainset_file, index=False)
    dataset[~msk].to_csv(testset_file, index=False)
    # explicit copy: later cells drop rows from `dataset`
    dataset = dataset[msk].copy()

# total ratings to be removed from the (training) dataset based on the defined variable n
remove_n = round(len(dataset) * n)

dataset

Unnamed: 0,item_id,user_id,rating,timestamp,date,title,genres
0,8,43197,5,1396981211,2014-04-08,Edison Kinetoscopic Record of a Sneeze (1894),Documentary|Short
1,10,71052,10,1412878553,2014-10-09,La sortie des usines Lumière (1895),Documentary|Short
3,25,37892,8,1488189899,2017-02-27,The Oxford and Cambridge University Boat Race ...,
5,91,37503,5,1532347349,2018-07-23,Le manoir du diable (1896),Short|Horror
6,91,54841,7,1562928526,2019-07-12,Le manoir du diable (1896),Short|Horror
...,...,...,...,...,...,...,...
917188,15085802,15522,7,1629045523,2021-08-15,Untold: Malice at the Palace (2021),
917189,15085802,17316,7,1628842720,2021-08-13,Untold: Malice at the Palace (2021),
917190,15085802,43296,8,1628730742,2021-08-12,Untold: Malice at the Palace (2021),
917191,15085802,63323,10,1628647596,2021-08-11,Untold: Malice at the Palace (2021),


# Prepare the data and split into train/test

This section offers three alternative scenarios for removing ratings from the dataset.

We select only one scenario for each test

## Option 1 - 1% random ratings of N

In [6]:
# remember the size before the removal: the removed share is reported relative to it
original_size = len(dataset)

# pick remove_n distinct row labels at random and drop them
drop_indices = np.random.choice(dataset.index, remove_n, replace=False)
dataset = dataset.drop(drop_indices)

# guard the division for an empty dataset
removed_percent = round((remove_n / original_size) * 100) if original_size else 0

print("new dataset size is: ", len(dataset))
print(" total removed ratings: ", remove_n)
print(" removed: ", removed_percent, "% of the dataset")

new dataset size is:  680746
 total removed ratings:  6876
 removed:  1 % of the dataset


## Option 2 - 1% of N from a certain Genre
The goal is to remove 1% of N from a specific genre (one that has the most appearances in items)

There are two scenarios in this option: 
* Remove random ratings from the target genre 
* Eliminate only the high ratings of the items in the target genre (ratings greater than 4; note that ratings in this dataset are on a 1–10 scale)

In [None]:
# number of ratings per exact genre string: a combination such as "Short|Horror"
# is its own group here (groupby leaves out rows whose genres value is missing)
grouped_genres = dataset.groupby('genres').genres.count().sort_values(ascending=False)
print("Genre with the highest number of stand-alone occurrences is: ", \
      grouped_genres.index[0], "-", \
      grouped_genres.iloc[0], "ratings")

# flatten the '|'-separated genre strings into one list of single genres;
# rows without genres are NaN (a float) and have no .split, so they are skipped
genres_list = []
for genre_list in dataset.genres.dropna().to_list():
    genres_list.extend(genre_list.split('|'))

# group_list is a project helper; each returned element is indexed as
# [0] = genre and [1] = count below — presumably (genre, count) pairs, TODO confirm
grouped_genres_2 = sorted(group_list(genres_list), key=lambda genre: genre[1], reverse=True)
print("Genre with the highest number of occurrences is: ", grouped_genres_2[0][0], "-", grouped_genres_2[0][1], "ratings")

### Case 1 - Random removal

In [None]:
# candidate rows: ratings whose genres value is exactly 'Comedy'
target_genre = dataset[ dataset['genres'] == 'Comedy' ]

# sampling without replacement needs at least remove_n candidates; fail with a
# readable message instead of numpy's generic "larger sample than population" error
if len(target_genre) < remove_n:
    raise ValueError(
        f"cannot remove {remove_n} ratings: only {len(target_genre)} ratings match the target genre"
    )

drop_indices = np.random.choice(target_genre.index, remove_n, replace=False)
# reassign instead of dropping in place, like the random-removal cell of Option 1
dataset = dataset.drop(drop_indices)

print(" new dataset size is: ", len(dataset))
print(" total removed ratings: ", len(drop_indices))

### Case 2 - Targeted removal

In [None]:
# candidate rows: ratings above 4 whose genres value is exactly 'Comedy'
# NOTE(review): the Reader in the training cell uses rating_scale=(1, 10), so
# "> 4" covers most of the scale — confirm this is the intended "high rating" cut-off
target_genre = dataset[ (dataset['genres'] == 'Comedy') & (dataset['rating'] > 4) ]

# sampling without replacement needs at least remove_n candidates; fail with a
# readable message instead of numpy's generic "larger sample than population" error
if len(target_genre) < remove_n:
    raise ValueError(
        f"cannot remove {remove_n} ratings: only {len(target_genre)} high ratings match the target genre"
    )

drop_indices = np.random.choice(target_genre.index, remove_n, replace=False)
# reassign instead of dropping in place, like the random-removal cell of Option 1
dataset = dataset.drop(drop_indices)

print(" new dataset size is: ", len(dataset))
print(" total removed ratings: ", len(drop_indices))

# Train the algorithm

In [11]:
# the three columns Surprise reads, in the order it expects them
rating_columns = ['user_id', 'item_id', 'rating']

# Surprise dataset built once from the (possibly reduced) ratings in `dataset`
data = Dataset.load_from_df(
                dataset[rating_columns],
                Reader(rating_scale=(1, 10))
            )

if fixed_data:
    print("fixed data option was selected, loading held-out testset and creating trainset...")
    # the trainset is the whole "dataset" variable which we eliminated ratings from;
    # `data` already wraps exactly these rows, so it is reused instead of rebuilt
    train_data = data
    trainset = train_data.build_full_trainset()

    # load the testset that the optional split cell held out at the beginning
    test_data = pd.read_csv(DATASET_PATH + '/local-impact/custom_testset.csv')
    testset = list(test_data[rating_columns].to_records(index=False))
    print("trainset and testset successfully created.")

else:
    # no fixed split: random 80/20 split of the current data
    trainset, testset = train_test_split(data, test_size=.20)

fixed data option was selected, loading held-out testset and creating trainset...
trainset and testset successfully created.


In [12]:
# load a recommender algorithm: SVD, NMF or KNNWithMeans algorithm
sim_options = {
    'name': 'pearson',
    'user_based': True
    }
algo = KNNWithMeans(sim_options = sim_options)

In [13]:
# fit the algorithm on the trainset, then predict the ratings of the testset.
# The recorded run of this cell stopped with a MemoryError while allocating the
# 62627 x 62627 float64 similarity matrix (29.2 GiB).
predictions = algo.fit(trainset).test(testset)

Computing the pearson similarity matrix...


MemoryError: Unable to allocate 29.2 GiB for an array with shape (62627, 62627) and data type float64

# Evaluation

Evaluate at the neighborhood level. The value of k defines the size of the neighborhood considered in the evaluation.

## Neighborhood Evaluation Method

In [None]:
# size of the neighborhood requested for every user
NEIGHBORHOOD_K = 40

# wrap the predictions of the trained model in the neighborhood accuracy helper
local_eval = NeighborhoodAccuracy(predictions)

# neighborhoods as returned by the project helper get_knn
# (presumably one neighborhood of size k per user in the dataset — TODO confirm)
neighbors = get_knn(dataset, algo, k=NEIGHBORHOOD_K)

# accuracy at the neighborhood (user) level; RMSE is the metric computed here.
# Alternatives left disabled:
#   local_eval.compute_neighborhood_mae(neighbors)
#   compute_neighborhood_ndcg(predictions, neighbors)
rmse = local_eval.compute_neighborhood_rmse(neighbors)

# persist the result for later analysis
rmse.to_csv('rmse-data.csv', index=False)

## Normal Evaluation Method

In [None]:
# global accuracy over all predictions: MAE, RMSE and FCP
# (verbose=True is the library default, spelled out because the printed values are the cell's output)
accuracy.mae(predictions, verbose=True)
accuracy.rmse(predictions, verbose=True)
accuracy.fcp(predictions, verbose=True)