# A very simple Collaborative Filtering Using LightFM library for a anime recommendation system with the dataset from MyAnimeList

In [135]:
import pandas as pd
import numpy as np
from lightfm import LightFM
from scipy.sparse import csr_matrix
from tqdm import tqdm

In [136]:
anime = pd.read_csv('anime.csv')
ratings = pd.read_csv('rating.csv')

In [137]:
anime

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175


In [138]:
anime.drop('rating', axis=1, inplace=True)

In [139]:
ratings.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


##### I deleted those entries with negative ratings

In [140]:
ratings = ratings[ratings['rating']!=-1]

In [173]:
ratings.head()

Unnamed: 0,user_id,anime_id,rating
47,1,8074,10
81,1,11617,10
83,1,11757,10
101,1,15451,10
153,2,11771,10


##### I make sure there's only ratings from 1 to 10


In [174]:
sorted(ratings['rating'].unique())

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

#### I merge the two tables so i can make matrix later


In [143]:
dataRating = pd.merge(ratings, anime, on='anime_id')

In [144]:
dataRating.head()

Unnamed: 0,user_id,anime_id,rating,name,genre,type,episodes,members
0,1,8074,10,Highschool of the Dead,"Action, Ecchi, Horror, Supernatural",TV,12,535892
1,3,8074,6,Highschool of the Dead,"Action, Ecchi, Horror, Supernatural",TV,12,535892
2,5,8074,2,Highschool of the Dead,"Action, Ecchi, Horror, Supernatural",TV,12,535892
3,12,8074,6,Highschool of the Dead,"Action, Ecchi, Horror, Supernatural",TV,12,535892
4,14,8074,6,Highschool of the Dead,"Action, Ecchi, Horror, Supernatural",TV,12,535892


In [145]:
interactions = dataRating[['user_id', 'anime_id', 'rating']]
interactions = interactions.dropna()
interactions = interactions.drop_duplicates()
interactions.head()

Unnamed: 0,user_id,anime_id,rating
0,1,8074,10
1,3,8074,6
2,5,8074,2
3,12,8074,6
4,14,8074,6


#### Let's create a dummy user to make predictions for it.. Let's say userid 13

#### I make sure the user doesn't exist in the table

In [175]:
dataRating[dataRating['user_id']==13]

Unnamed: 0,user_id,anime_id,rating,name,genre,type,episodes,members


#### I created a list with a bunch of animes so later on I can make the dummy user

In [146]:
movieList = ['Shingeki no kyojin', 'Boku No Hero Academia', 'Kimi no na wa', 'madoka']
index_list = []
for x in movieList:
    index = list(dataRating[dataRating['name'].str.contains(x, case=False)]['anime_id'])[0]
    index_list.append(index)
    

In [147]:
index_list

[16498, 31964, 32281, 9756]

#### checking if the those numbers are exactly the anime_id

In [148]:
dataRating[dataRating['anime_id'].isin(index_list)].name.unique()

array(['Shingeki no Kyojin', 'Mahou Shoujo Madoka★Magica',
       'Boku no Hero Academia', 'Kimi no Na wa.'], dtype=object)

#### I created a dictionary with the needed data. (dummy user)

In [150]:
new_user = {'1':{'user_id':13, 'anime_id': 16498, 'rating': 10},
             '2':{'user_id':13,'anime_id': 31964, 'rating': 6}, 
             '3':{'user_id':13,'anime_id': 32281, 'rating': 9},
             '4':{'user_id':13,'anime_id': 9756, 'rating': 8}}

In [151]:
pd.DataFrame(new_user).T

Unnamed: 0,user_id,anime_id,rating
1,13,16498,10
2,13,31964,6
3,13,32281,9
4,13,9756,8


In [152]:
interactions = interactions.append(pd.DataFrame(new_user).T, ignore_index=True)


  interactions = interactions.append(pd.DataFrame(new_user).T, ignore_index=True)


In [153]:
interactions

Unnamed: 0,user_id,anime_id,rating
0,1,8074,10
1,3,8074,6
2,5,8074,2
3,12,8074,6
4,14,8074,6
...,...,...,...
6337237,73188,8749,6
6337238,13,16498,10
6337239,13,31964,6
6337240,13,32281,9


#### Now I create the interaction matrix.

In [154]:
interMatrix = pd.pivot_table(interactions, index='user_id', columns='anime_id', values='rating')

In [155]:
interMatrix.fillna(0, inplace=True)
interMatrix

anime_id,1,5,6,7,8,15,16,17,18,19,...,34238,34239,34240,34252,34283,34324,34325,34349,34367,34475
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,8.0,0.0,0.0,6.0,0.0,6.0,6.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73512,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
73513,9.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
73514,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
73515,10.0,10.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [156]:
sparseMatrix = csr_matrix(interMatrix.values)

#### I created a dictionary to easily work with the indexes, because the user are in there

In [157]:
user_ids = list(interMatrix.index)
user_dict = {}
counter = 0 
for i in user_ids:
    user_dict[i] = counter
    counter += 1

In [158]:
user_dict[13]

10

##### let's make some recommendations!

In [159]:
model = LightFM(no_components=3, random_state=100, learning_rate=0.03, loss='warp')
model = model.fit(sparseMatrix, epochs=10)

In [160]:
user_x = user_dict[13]
n_users, n_items = interMatrix.shape
item_ids = np.arange(n_items)
preds = model.predict(user_ids=user_x, item_ids = item_ids)

In [161]:
scores = pd.Series(preds)
scores.index = interMatrix.columns
recomms_13 = list(pd.Series(scores.sort_values(ascending=False).index))[:15]

In [162]:
recomms_13

[11757,
 16498,
 1535,
 6547,
 10620,
 19815,
 22319,
 11111,
 9919,
 5114,
 9253,
 20507,
 4224,
 1575,
 8074]

In [163]:
result=pd.DataFrame(dataRating[dataRating['anime_id'].isin(recomms_613)].name.unique())

In [164]:
result

Unnamed: 0,0
0,Sword Art Online
1,Death Note
2,Fullmetal Alchemist: Brotherhood
3,Ao no Exorcist
4,Ano Hi Mita Hana no Namae wo Bokutachi wa Mada...
5,Another
6,Shingeki no Kyojin
7,No Game No Life
8,Tokyo Ghoul
9,Toradora!


In [166]:

interactions[(interactions.user_id==13) & (interactions.anime_id.isin(recomms_13))].anime_id.unique()

array([16498])

#### Now I created recommendations to all users! 

In [25]:
recomms = {
    'account_id': [],
    'recomms': []
}

n_users, n_items = interMatrix.shape
item_ids = np.arange(n_items)

for account in tqdm(dataRating.user_id.unique()):
    if account in list(interMatrix.index):
        acc_x = user_dict[account]

        preds = model.predict(user_ids=acc_x, item_ids = item_ids)

        scores = pd.Series(preds)
        scores.index = interMatrix.columns
        scores = list(pd.Series(scores.sort_values(ascending=False).index))[:20] 
        watched_contents = dataRating[dataRating.user_id == account].name.unique()
        scores = [x for x in scores if x not in watched_contents]
        scores = [x for x in scores if x not in dataRating]
        scores = scores[:20]

        recomms['account_id'].append(account)
        recomms['recomms'].append(scores)


100%|█████████████████████████████████████████████████████████████████████████████| 69600/69600 [12:50<00:00, 90.39it/s]


In [181]:
recomms = pd.DataFrame(recomms)
recomms = recomms.sort_values(by='user_id', ascending=True)


In [183]:
recomms.head()

Unnamed: 0,user_id,recomms
0,1,"[11757, 6547, 4224, 1535, 8074, 16498, 2167, 1..."
33936,2,"[16498, 11757, 1535, 6547, 10620, 4224, 9919, ..."
1,3,"[16498, 11757, 1535, 22319, 10620, 11111, 5114..."
2,5,"[11757, 6547, 4224, 11617, 8074, 10719, 16498,..."
19488,7,"[11757, 19815, 15583, 11617, 14813, 13759, 654..."
