This will be similar to User Based Collaborative Filtering.

In [1]:
import numpy as np
import pandas as pd
import warnings
from sklearn.metrics.pairwise import cosine_similarity
warnings.filterwarnings('ignore')

# Read the data

In [2]:
# Load the anime catalogue from "anime.csv" (path is relative to the working directory)
# and preview the first rows.
anime = pd.read_csv("anime.csv")
anime.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [3]:
# keep only TV series and movies; the shape is printed before and after the filter
print(anime.shape)
anime = anime[anime['type'].isin(['TV', 'Movie'])]
print(anime.shape)

(12294, 7)
(6135, 7)


In [4]:
# keep only popular anime: member count at or above the 75th percentile of `members`
m = anime['members'].quantile(0.75)
anime = anime.loc[anime['members'].ge(m)]
anime.shape

(1534, 7)

In [5]:
# Load the user ratings from "rating.csv" (path is relative to the working directory)
# and preview the first rows.
rating = pd.read_csv("rating.csv")
rating.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


In [6]:
rating.shape

(7813737, 3)

In [7]:
# keep only ratings made by users with user_id <= 10000, so the data fits in memory
rating = rating.loc[rating['user_id'].le(10000)]

In [8]:
rating.shape

(1042697, 3)

# Replacing missing rating with NaN

In [9]:
# A rating of -1 stands for a missing rating; turn those cells into NaN.
# `Series.mask` blanks the cells where the condition holds and `assign` returns a
# new frame, so this does not write into a filtered view of `rating` and the cell
# can be re-run safely. (The old `np.NaN` alias no longer exists in NumPy 2.0.)
rating = rating.assign(rating=rating['rating'].mask(rating['rating'] == -1))
rating.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,
1,1,24,
2,1,79,
3,1,226,
4,1,241,


# Create index for anime name

In [10]:
# lookup table: anime name -> row label of that anime in the `anime` frame
anime_index = pd.Series(anime.index, index=anime['name'])
anime_index.head()

name
Kimi no Na wa.                      0
Fullmetal Alchemist: Brotherhood    1
Gintama°                            2
Steins;Gate                         3
Gintama&#039;                       4
dtype: int64

# Join the data

In [11]:
# inner join on anime_id; the two clashing `rating` columns come out as
# rating_x (from `anime`) and rating_y (from `rating`)
joined = pd.merge(anime, rating, how='inner', on='anime_id')
joined.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating_x,members,user_id,rating_y
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,99,5.0
1,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,152,10.0
2,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,244,10.0
3,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,271,10.0
4,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,278,


# Create a pivot table

In [12]:
# keep only the columns needed for the item x user matrix
joined = joined.loc[:, ['user_id', 'name', 'rating_y']]

# rows are anime names, columns are user ids, cells hold the user's rating
pivot = joined.pivot_table(index='name', columns='user_id', values='rating_y')
pivot.head()

user_id,1,2,3,5,7,8,9,10,11,12,...,9991,9992,9993,9994,9995,9996,9997,9998,9999,10000
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
&quot;Bungaku Shoujo&quot; Movie,,,,,,,,,,,...,,,,,,,,,,
.hack//Roots,,,,,,,,,,,...,,,,,,,,,,
.hack//Sign,,,,,,,,,,,...,,,,,,,,,,
.hack//Tasogare no Udewa Densetsu,,,,,,,,,,,...,,,,,,,,,,
07-Ghost,,,,,,,,,,,...,,,,,,,6.0,,,


In [13]:
pivot.shape

(1490, 9412)

# Drop all users that never rated an anime

In [14]:
# discard user columns that do not contain a single rating
pivot = pivot.dropna(axis=1, how='all')
pivot.head()

user_id,1,2,3,5,7,8,9,10,11,12,...,9991,9992,9993,9994,9995,9996,9997,9998,9999,10000
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
&quot;Bungaku Shoujo&quot; Movie,,,,,,,,,,,...,,,,,,,,,,
.hack//Roots,,,,,,,,,,,...,,,,,,,,,,
.hack//Sign,,,,,,,,,,,...,,,,,,,,,,
.hack//Tasogare no Udewa Densetsu,,,,,,,,,,,...,,,,,,,,,,
07-Ghost,,,,,,,,,,,...,,,,,,,6.0,,,


In [15]:
pivot.shape

(1490, 9412)

# Center the mean around 0 (centered cosine / pearson)

In [16]:
# Subtract every anime's mean rating from its own row (axis=0 aligns the per-row
# means with the row labels). `DataFrame.mean` skips NaN by default, so unrated
# cells stay NaN. This is the vectorized form of a row-wise
# `apply(lambda x: x - np.nanmean(x))`.
pivot_norm = pivot.sub(pivot.mean(axis=1), axis=0)
pivot_norm.head()

user_id,1,2,3,5,7,8,9,10,11,12,...,9991,9992,9993,9994,9995,9996,9997,9998,9999,10000
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
&quot;Bungaku Shoujo&quot; Movie,,,,,,,,,,,...,,,,,,,,,,
.hack//Roots,,,,,,,,,,,...,,,,,,,,,,
.hack//Sign,,,,,,,,,,,...,,,,,,,,,,
.hack//Tasogare no Udewa Densetsu,,,,,,,,,,,...,,,,,,,,,,
07-Ghost,,,,,,,,,,,...,,,,,,,-1.598527,,,


# Item Based Collaborative Filtering

In [17]:
# replace the remaining NaN cells (no rating) with 0
pivot_norm = pivot_norm.fillna(0)
pivot_norm.head()

user_id,1,2,3,5,7,8,9,10,11,12,...,9991,9992,9993,9994,9995,9996,9997,9998,9999,10000
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
&quot;Bungaku Shoujo&quot; Movie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
.hack//Roots,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
.hack//Sign,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
.hack//Tasogare no Udewa Densetsu,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
07-Ghost,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-1.598527,0.0,0.0,0.0


## Calculate Similar Items

In [18]:
# pairwise cosine similarity between the anime rows, wrapped in a DataFrame whose
# rows and columns are both labelled by anime name
item_sim_df = pd.DataFrame(
    cosine_similarity(pivot_norm),
    index=pivot_norm.index,
    columns=pivot_norm.index,
)
item_sim_df.head()

name,&quot;Bungaku Shoujo&quot; Movie,.hack//Roots,.hack//Sign,.hack//Tasogare no Udewa Densetsu,07-Ghost,11eyes,30-sai no Hoken Taiiku,91 Days,A-Channel,AKB0048,...,Zoku Natsume Yuujinchou,Zoku Sayonara Zetsubou Sensei,Zombie-Loan,Zutto Mae kara Suki deshita.: Kokuhaku Jikkou Iinkai,ef: A Tale of Melodies.,ef: A Tale of Memories.,s.CRY.ed,xxxHOLiC,xxxHOLiC Kei,xxxHOLiC Movie: Manatsu no Yoru no Yume
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
&quot;Bungaku Shoujo&quot; Movie,1.0,0.029269,0.021465,0.054258,0.029828,0.017774,-0.024699,-0.040539,0.032808,0.015381,...,0.022924,0.067584,0.029914,0.057437,0.092676,0.053568,-0.00782,0.028904,0.062385,0.030419
.hack//Roots,0.029269,1.0,0.27877,0.336105,0.099,0.089652,0.015528,0.003461,0.068265,0.065131,...,0.0115,0.039758,0.072426,0.008782,0.04992,0.053877,0.045409,0.038875,0.066537,0.058233
.hack//Sign,0.021465,0.27877,1.0,0.304976,0.079774,0.044298,0.04437,0.005705,0.046192,0.015933,...,0.031619,0.045066,0.079867,0.015682,0.085431,0.055246,0.074836,0.064887,0.037908,0.064085
.hack//Tasogare no Udewa Densetsu,0.054258,0.336105,0.304976,1.0,0.069433,0.063063,-0.008267,0.018654,0.058033,0.046196,...,0.03698,0.043281,0.091346,0.0,0.064953,0.039185,0.085909,0.021216,0.067155,0.060521
07-Ghost,0.029828,0.099,0.079774,0.069433,1.0,0.122958,0.034273,0.020435,0.066753,0.04778,...,0.047591,0.024697,0.118567,0.01462,0.046009,0.033981,0.074668,0.019609,0.033613,0.031543


In [19]:
def get_similar_anime(anime_name):
    """Rank every other anime by its similarity to ``anime_name``.

    Reads the ``anime_name`` column of the module-level ``item_sim_df`` and sorts
    it in descending order, with the queried anime itself removed.

    Parameters
    ----------
    anime_name : str
        Name of the anime to look up; must be a label of ``pivot_norm.index``.

    Returns
    -------
    tuple
        ``(sim_animes, sim_score)`` where ``sim_animes`` is a pandas Index of the
        other anime names (most similar first) and ``sim_score`` is the list of
        the matching similarity values. ``(None, None)`` when ``anime_name`` is
        not in ``pivot_norm.index``.
    """
    if anime_name not in pivot_norm.index:
        return None, None

    # Sort a single column once instead of sorting the whole matrix twice.
    # The queried anime is dropped by label, not by position: it is not guaranteed
    # to sort first (its self-similarity is 0 when its row is all zeros, and other
    # items may tie with it), so slicing off element 0 could drop the wrong row.
    ranked = item_sim_df[anime_name].drop(anime_name).sort_values(ascending=False)
    return ranked.index, ranked.tolist()

In [20]:
# show the ten anime most similar to "Steins;Gate" together with their scores
animes, score = get_similar_anime("Steins;Gate")
for x, y in zip(animes[:10], score[:10]):
    line = "{} with similarity of {}".format(x, y)
    print(line)

Steins;Gate Movie: Fuka Ryouiki no Déjà vu with similarity of 0.25714456940512986
Fullmetal Alchemist: Brotherhood with similarity of 0.18664806845268003
Code Geass: Hangyaku no Lelouch R2 with similarity of 0.16413003201158918
Suzumiya Haruhi no Shoushitsu with similarity of 0.16231289638293878
Angel Beats! with similarity of 0.16095385954159877
Tengen Toppa Gurren Lagann with similarity of 0.15634872712986464
Fate/Zero 2nd Season with similarity of 0.15520331915668556
Psycho-Pass with similarity of 0.15476024287681137
Ano Hi Mita Hana no Namae wo Bokutachi wa Mada Shiranai. with similarity of 0.15111260584785854
Fate/Zero with similarity of 0.15052446938924285


## Helper Function

In [21]:
# predict the rating of anime x by user y
def predict_rating(user_id, anime_name, max_neighbor=10):
    """Predict the rating ``user_id`` would give to ``anime_name``.

    The prediction is the similarity-weighted average of the user's own ratings
    (taken from the module-level ``pivot``) over the most similar anime that the
    user has already rated.

    Parameters
    ----------
    user_id
        Column label of ``pivot`` identifying the user.
    anime_name : str
        Name of the anime to predict a rating for.
    max_neighbor : int, default 10
        Maximum number of rated, similar anime used in the average.

    Returns
    -------
    float
        The predicted rating, or NaN when no prediction can be made (unknown
        anime, or the user has rated no anime with a positive similarity).
    """
    animes, scores = get_similar_anime(anime_name)
    if animes is None:
        # get_similar_anime signals an unknown anime with (None, None)
        return np.nan

    sims = np.asarray(scores, dtype=float)
    user_ratings = pivot[user_id].loc[animes]

    # "Already rated" is read from the raw ratings: in the centered matrix a rating
    # equal to the item's mean becomes 0 and cannot be told apart from "not rated".
    # Only positively similar neighbours are kept, so the result is a proper
    # weighted average and the denominator can never be zero or negative.
    usable = user_ratings.notna().values & (sims > 0)

    # `animes` is ordered by decreasing similarity, so the first entries that
    # survive the mask are the closest usable neighbours
    top_sims = sims[usable][:max_neighbor]
    top_ratings = user_ratings.values[usable][:max_neighbor]

    total_weight = top_sims.sum()
    if total_weight == 0:
        # no usable neighbour at all
        return np.nan

    return np.dot(top_sims, top_ratings) / total_weight

In [22]:
predict_rating(3, "Steins;Gate")

8.4173726792034813

In [23]:
predict_rating(3, "Cowboy Bebop")

8.523652293486963

## Get Recommendation

In [24]:
# recommend top n_anime for user x based on item collaborative filtering algorithm
def get_recommendation(user_id, n_anime=10):
    """Recommend the ``n_anime`` anime with the highest predicted rating.

    A rating is predicted with ``predict_rating`` for every anime in the
    module-level ``pivot``; anime the user has already rated are excluded and the
    rest are ranked by predicted rating (anime without a prediction rank last).

    Parameters
    ----------
    user_id
        Column label of ``pivot`` identifying the user.
    n_anime : int, default 10
        Number of anime to recommend.

    Returns
    -------
    pandas.DataFrame
        The rows of ``anime`` for the recommended titles, best prediction first.
    """
    names = pivot.index

    # build the predictions in one pass; np.append inside a loop copies the whole
    # array on every iteration
    predicted = [predict_rating(user_id, name) for name in names]
    candidates = pd.DataFrame({'predicted': predicted, 'name': names})

    # don't recommend something that user has already rated; "rated" is read from
    # the raw ratings because a centered value of 0 also occurs for a rating that
    # equals the item's mean
    unrated = pivot[user_id].isna().values
    ranked = candidates.loc[unrated].sort_values(by='predicted', ascending=False)

    # map the top names back to the row labels of `anime`
    top_names = ranked['name'].head(n_anime)
    return anime.loc[anime_index.loc[top_names.values].values]

In [25]:
get_recommendation(3)

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
53,6114,Rainbow: Nisha Rokubou no Shichinin,"Drama, Historical, Seinen, Thriller",TV,26,8.64,139474
108,7655,Major S6,"Comedy, Drama, Sports",TV,25,8.49,24788
44,19647,Hajime no Ippo: Rising,"Comedy, Drama, Shounen, Sports",TV,25,8.68,66756
5,32935,Haikyuu!!: Karasuno Koukou VS Shiratorizawa Ga...,"Comedy, Drama, School, Shounen, Sports",TV,10,9.15,93351
254,18689,Diamond no Ace,"Comedy, School, Shounen, Sports",TV,75,8.25,81384
475,28,Yakitate!! Japan,"Comedy, Shounen",TV,69,8.05,50876
357,18179,Yowamushi Pedal,"Comedy, Drama, Shounen, Sports",TV,38,8.16,91138
906,26,Texhnolyze,"Action, Drama, Psychological, Sci-Fi",TV,22,7.78,95859
237,24277,Yowamushi Pedal: Grande Road,"Comedy, Drama, Shounen, Sports",TV,24,8.28,50778
88,5941,Cross Game,"Comedy, Drama, Romance, School, Sports",TV,50,8.53,53108


In [26]:
get_recommendation(5)

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
65,7472,Gintama Movie: Shinyaku Benizakura-hen,"Action, Comedy, Historical, Parody, Samurai, S...",Movie,1,8.59,51185
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
170,513,Tenkuu no Shiro Laputa,"Adventure, Fantasy, Romance, Sci-Fi",Movie,1,8.38,151061
983,584,Noein: Mou Hitori no Kimi e,"Adventure, Drama, Sci-Fi, Slice of Life",TV,24,7.74,56244
496,416,Kurenai no Buta,"Action, Adventure, Comedy, Drama, Historical, ...",Movie,1,8.03,74028
148,558,Major S2,"Comedy, Drama, Shounen, Sports",TV,26,8.41,30689
71,578,Hotaru no Haka,"Drama, Historical",Movie,1,8.58,174878
834,2593,Kara no Kyoukai 1: Fukan Fuukei,"Action, Mystery, Supernatural, Thriller",Movie,1,7.81,178880
46,10379,Natsume Yuujinchou San,"Drama, Fantasy, Shoujo, Slice of Life, Superna...",TV,13,8.67,102322
6,11061,Hunter x Hunter (2011),"Action, Adventure, Shounen, Super Power",TV,148,9.13,425855


Compared to user-based collaborative filtering, the recommendations given here are different. As we know, in the user-based approach many users did not rate a given anime. In the item-based approach, however, I think the result is more robust, because there is no anime that was never rated by any user.