In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
# Anime catalogue: read the CSV, report its (rows, columns) shape, and preview the first rows.
anime_csv_path = './anime/anime.csv'
df = pd.read_csv(anime_csv_path)
print(df.shape)
df.head(n=5)

(12294, 7)


Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [3]:
# User ratings: one row per (user_id, anime_id, rating).
# The file has ~7.8M rows, and pandas would otherwise store every column as int64.
# Narrow integer dtypes shrink the frame considerably, which matters because
# training on this data is already memory-bound.
# NOTE(review): assumes ids fit in int32 and ratings in int8 (the previews show
# ids in the tens of thousands and ratings between -1 and 10) — confirm on the full file.
RATING_DTYPES = {'user_id': 'int32', 'anime_id': 'int32', 'rating': 'int8'}
df2 = pd.read_csv('./anime/rating.csv', dtype=RATING_DTYPES)
print(df2.shape)
df2.head()

(7813737, 3)


Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


1. What kind of preprocessing is necessary for the ratings dataset?

> We remove rows with a rating of -1, because -1 means the user has not rated the anime; keeping those rows would distort the results.

In [4]:
# Keep only explicit ratings: drop every row whose rating is the -1 placeholder.
has_rating = df2['rating'].ne(-1)
rated_animes = df2.loc[has_rating]
rated_animes

Unnamed: 0,user_id,anime_id,rating
47,1,8074,10
81,1,11617,10
83,1,11757,10
101,1,15451,10
153,2,11771,10
...,...,...,...
7813732,73515,16512,7
7813733,73515,17187,9
7813734,73515,22145,10
7813735,73516,790,9


2. How do the recommendation algorithms (e.g. KNN and SVD) perform with a data set of this magnitude? Do you encounter hardware limitations? If yes, how can you circumvent some of the limitations to be able to carry on with the experiment?

> **KNNBasic**: Initially we had memory issues, but after closing other programs we were able to train on the full dataset. According to Task Manager, Python uses ~3.5 GB of RAM during training, which may cause issues if the computer lacks enough RAM.

> **SVD**: Used less RAM than KNNBasic at ~2.6 GB. Training took five times longer.

In [7]:
from surprise import Reader, KNNBasic, Dataset, SVD
from surprise.model_selection import cross_validate

# Ratings handed to Surprise are declared to lie on a 1-10 scale
# (the -1 placeholder rows are filtered out before loading).
reader = Reader(rating_scale=(1, 10))

# SVD starts from randomly initialised factors; pinning the seed makes the
# trained model, and therefore the recommendations, reproducible across runs.
SVD_RANDOM_STATE = 42

# Item-based KNN alternative used for the memory/time comparison in the notes above.
#algo = KNNBasic(sim_options={'user_based': False})
algo = SVD(random_state=SVD_RANDOM_STATE)

In [8]:
%%time
data = Dataset.load_from_df(rated_animes, reader)

Wall time: 11.2 s


In [9]:
%%time
# Turn the loaded ratings into a Surprise trainset built from all of `data`;
# nothing is split off for evaluation in this cell.
trainset = data.build_full_trainset()

In [10]:
%%time
# Train the selected algorithm on the full trainset. This is the slow step of
# the notebook: the recorded run below took roughly 11.5 minutes of wall time.
algo.fit(trainset)

Wall time: 11min 27s


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1ae37abac88>

3. Can you combine the information in the two files in a meaningful way to have the recommender display the titles of the recommended movies?

> We print the top 3 recommended, not-yet-watched animes for 10 random users.

In [11]:
%%time
from random import choices

all_animes = df['anime_id']

for user_id in choices(trainset.all_users(), k=10):
    watched_animes = df2[df2['user_id']==user_id]['anime_id']
    not_watched_animes = list(set(all_animes) - set(watched_animes))
    
    ratings = {}
    for anime_id in not_watched_animes:
        ratings[anime_id] = algo.predict(user_id, anime_id)
        
    print(f'Anime recomendations for user {user_id}:')
    for key, value in list(sorted(ratings.items(), key=lambda item: item[1][3], reverse=1))[:3]:    
        name = df[df['anime_id'] == key]['name'].values[0]
        print(f'\tEstimated rating: {round(value[3]):.0f}, Anime: {name}')

Anime recomendations for user 31539:
	Estimated rating: 10, Anime: Ginga Eiyuu Densetsu
	Estimated rating: 10, Anime: Aria The Origination
	Estimated rating: 9, Anime: Shouwa Genroku Rakugo Shinjuu
Anime recomendations for user 66470:
	Estimated rating: 10, Anime: Mahou Shoujo Madoka★Magica
	Estimated rating: 10, Anime: Steins;Gate
	Estimated rating: 10, Anime: Shigatsu wa Kimi no Uso
Anime recomendations for user 58589:
	Estimated rating: 10, Anime: Clannad: After Story
	Estimated rating: 9, Anime: Gintama°
	Estimated rating: 9, Anime: Gintama
Anime recomendations for user 8025:
	Estimated rating: 10, Anime: One Punch Man
	Estimated rating: 10, Anime: Boku dake ga Inai Machi
	Estimated rating: 10, Anime: Kimi no Na wa.
Anime recomendations for user 42810:
	Estimated rating: 10, Anime: Gintama°
	Estimated rating: 10, Anime: Gintama&#039;
	Estimated rating: 10, Anime: Gintama&#039;: Enchousen
Anime recomendations for user 11487:
	Estimated rating: 10, Anime: xxxHOLiC
	Estimated rating: 