## Building a simple recommender system to gauge the similarity between shows and between users, and to help predict whether a user will enjoy a particular anime.

In [19]:


# Import relevant libraries 

import pandas as pd
import numpy as np
import scipy as sp
from sklearn.metrics.pairwise import cosine_similarity
import operator
%matplotlib inline

In [20]:
# Read the anime catalogue and the user ratings from the working directory
anime, rating = (pd.read_csv(csv_path) for csv_path in ('anime.csv', 'rating.csv'))

In its raw form the ratings dataset uses "-1" to represent a missing rating.
These placeholders are replaced with a null value because the average rating per user
is calculated later, and the placeholders would otherwise distort that average.

In [21]:
# -1 is the dataset's placeholder for a missing rating; turn it into NaN so it
# is skipped by the per-user statistics computed later.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on `rating.rating`: an in-place call on a Series
# pulled out of the frame is chained assignment and is not guaranteed to reach
# the parent DataFrame. `regex=True` is dropped because this is a plain
# numeric replacement, not a pattern match.
rating['rating'] = rating['rating'].replace(-1, np.nan)
rating.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,
1,1,24,
2,1,79,
3,1,226,
4,1,241,


In [22]:
# Keep only the catalogue rows whose `type` is 'TV'
anime_tv = anime.loc[anime['type'].eq('TV')]
anime_tv.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
5,32935,Haikyuu!!: Karasuno Koukou VS Shiratorizawa Ga...,"Comedy, Drama, School, Shounen, Sports",TV,10,9.15,93351


In [23]:
# Attach the TV catalogue details to every rating via the shared anime_id key.
# Both frames carry a `rating` column: the one coming from `rating` is
# suffixed to `rating_user` and then renamed `user_rating`, while the
# catalogue's own `rating` column keeps its name.

merged = (
    rating
    .merge(anime_tv, on='anime_id', suffixes=('_user', ''))
    .rename(columns={'rating_user': 'user_rating'})
)

In [24]:
# Keep just the three columns used downstream, then restrict the data to
# users whose user_id is at most 10000

merged = merged.loc[:, ['user_id', 'name', 'user_rating']]
merged_sub = merged.loc[merged['user_id'].le(10000)]
merged_sub.head()

Unnamed: 0,user_id,name,user_rating
0,1,Naruto,
1,3,Naruto,8.0
2,5,Naruto,6.0
3,6,Naruto,
4,10,Naruto,


For collaborative filtering we'll need to create a pivot table of users on one axis and tv show names along the other. The pivot table will help us in defining the similarity between users and shows to better predict who will like what.

In [25]:
# One row per user_id, one column per show name; each cell holds the user's
# rating for that show (repeated user/show pairs are averaged)
piv = merged_sub.pivot_table(values='user_rating', index='user_id', columns='name', aggfunc='mean')

In [26]:
# Print the pivot's (users, shows) dimensions, then preview its first rows
print(piv.shape)
piv.head()

(9387, 2708)


name,.hack//Roots,.hack//Sign,.hack//Tasogare no Udewa Densetsu,009-1,07-Ghost,11eyes,12-sai.: Chicchana Mune no Tokimeki,3 Choume no Tama: Uchi no Tama Shirimasenka?,30-sai no Hoken Taiiku,91 Days,...,"Zone of the Enders: Dolores, I",Zukkoke Knight: Don De La Mancha,ef: A Tale of Melodies.,ef: A Tale of Memories.,gdgd Fairies,gdgd Fairies 2,iDOLM@STER Xenoglossia,s.CRY.ed,xxxHOLiC,xxxHOLiC Kei
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,2.0,
7,,,,,,,,,,,...,,,,,,,,,,


In [27]:
# Normalize each user's ratings: subtract that user's mean rating and divide
# by that user's rating range (max - min).
# Users with a single rating, or who rated everything the same, have a range
# of zero; their values come out as NaN (0/0) and they are dropped below.
# Whole-frame arithmetic is used instead of a per-row Python lambda; missing
# ratings are skipped by mean/max/min in both forms.
user_mean = piv.mean(axis=1)
user_range = piv.max(axis=1) - piv.min(axis=1)
piv_norm = piv.sub(user_mean, axis=0).div(user_range, axis=0)

# Missing values (unrated shows and zero-range users) become 0. The filled
# frame is a new object rather than an in-place mutation, so re-running this
# cell always starts from `piv`.
# Transpose so that rows are shows and columns are users.
piv_norm = piv_norm.fillna(0).T

# Drop every user column that contains only zeros
piv_norm = piv_norm.loc[:, (piv_norm != 0).any(axis=0)]
piv_norm.head()


user_id,3,5,7,8,10,11,12,14,16,17,...,9991,9992,9993,9994,9995,9996,9997,9998,9999,10000
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
.hack//Roots,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
.hack//Sign,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
.hack//Tasogare no Udewa Densetsu,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
009-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
07-Ghost,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.519231,0.0,0.0,0.0


In [28]:
# Convert the normalized shows-by-users matrix to SciPy's compressed sparse
# row format before passing it to the similarity functions

piv_sparse = sp.sparse.csr_matrix(piv_norm.to_numpy())


These matrices show us the computed cosine similarity values 
between each user/user array pair and item/item array pair.

In [29]:
# Rows of piv_sparse are shows and columns are users, so pairwise cosine
# similarity over the rows gives show-to-show similarity, and over the
# transposed matrix gives user-to-user similarity.
item_similarity = cosine_similarity(piv_sparse)
user_similarity = cosine_similarity(piv_sparse.T)

In [30]:
# Wrap the similarity matrices in labelled DataFrames: show names label the
# item matrix on both axes, user ids label the user matrix on both axes

show_names = piv_norm.index
user_ids = piv_norm.columns
item_sim_df = pd.DataFrame(data=item_similarity, index=show_names, columns=show_names)
user_sim_df = pd.DataFrame(data=user_similarity, index=user_ids, columns=user_ids)
item_sim_df.head()

name,.hack//Roots,.hack//Sign,.hack//Tasogare no Udewa Densetsu,009-1,07-Ghost,11eyes,12-sai.: Chicchana Mune no Tokimeki,3 Choume no Tama: Uchi no Tama Shirimasenka?,30-sai no Hoken Taiiku,91 Days,...,"Zone of the Enders: Dolores, I",Zukkoke Knight: Don De La Mancha,ef: A Tale of Melodies.,ef: A Tale of Memories.,gdgd Fairies,gdgd Fairies 2,iDOLM@STER Xenoglossia,s.CRY.ed,xxxHOLiC,xxxHOLiC Kei
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
.hack//Roots,1.0,0.174484,0.252684,-0.005793,0.049693,0.045349,0.007642,0.0,0.03334,-0.031345,...,-0.015856,0.0,-0.022097,-0.011505,0.0,0.0,0.027313,0.025449,-0.046627,-0.04133
.hack//Sign,0.174484,1.0,0.159591,0.020331,0.052251,0.028638,-0.002524,0.0,0.035391,-0.035378,...,-0.00776,0.0,-0.011179,-0.015013,-0.009433,-0.016342,-0.010947,0.025278,-0.008932,-0.03574
.hack//Tasogare no Udewa Densetsu,0.252684,0.159591,1.0,0.046787,0.026241,0.039369,-0.009706,0.0,0.001175,0.004135,...,0.009735,0.0,-0.021021,-0.02429,0.0,0.0,0.007509,0.020202,-0.04699,-0.028177
009-1,-0.005793,0.020331,0.046787,1.0,0.000486,0.011115,0.06593,0.0,0.057343,-0.074427,...,0.007241,0.0,0.003136,0.007257,0.0,0.0,0.0,-0.032581,0.007404,0.015191
07-Ghost,0.049693,0.052251,0.026241,0.000486,1.0,0.083305,0.009228,-0.009119,0.066026,-0.027582,...,0.007254,0.0,-0.032382,-0.033779,6e-05,4.4e-05,0.016763,0.017641,-0.032393,-0.035078


In [31]:
def top_animes(anime_name):
    """Print the 10 shows with the highest cosine similarity to `anime_name`.

    Similarities are read from the module-level `item_sim_df`.

    Parameters
    ----------
    anime_name : str
        Title of the show; must be a column of `item_sim_df`.

    Returns
    -------
    str or None
        A 'No data available' message when the title is unknown (mirroring
        `top_users`), otherwise None after printing the ranking.
    """
    if anime_name not in item_sim_df.columns:
        return 'No data available on anime {}'.format(anime_name)

    # Remove the show itself by label instead of assuming it sorts first:
    # when several shows tie with it (e.g. an all-zero similarity column),
    # skipping "position 0" can hide a real neighbour and list the show itself.
    neighbours = (
        item_sim_df[anime_name]
        .drop(labels=anime_name)
        .sort_values(ascending=False)
        .head(10)
    )

    print('Similar shows to {} include:\n'.format(anime_name))
    for rank, title in enumerate(neighbours.index, start=1):
        print('No. {}: {}'.format(rank, title))

In [32]:
def top_users(user):
    """Print the 10 users most similar to `user`, with their similarity values.

    Similarities are read from the module-level `user_sim_df`.

    Parameters
    ----------
    user : int
        A user id; must be a column of `user_sim_df`.

    Returns
    -------
    str or None
        A 'No data available' message when the user is unknown, otherwise
        None after printing the ranking.
    """
    # Check the frame that is actually queried below.
    if user not in user_sim_df.columns:
        return 'No data available on user {}'.format(user)

    # Sort once, and remove the user by label rather than assuming the user
    # is always the first entry after sorting.
    neighbours = (
        user_sim_df[user]
        .drop(labels=user)
        .sort_values(ascending=False)
        .head(10)
    )

    print('Most Similar Users:\n')
    for other_user, sim in neighbours.items():
        print('User #{0}, Similarity value: {1:.2f}'.format(other_user, sim))

In [33]:
def similar_user_recs(user):
    """Return the shows most often top-rated by the 10 users most similar to `user`.

    For each of the 10 most similar users (from `user_sim_df`), every show
    that attains that user's highest normalized score in `piv_norm` is
    collected; shows are then ranked by how many similar users they were
    collected from.

    Parameters
    ----------
    user : int
        A user id; must be a column of `user_sim_df`.

    Returns
    -------
    list of (str, int) or str
        Up to 5 `(show name, count)` pairs, highest count first, or a
        'No data available' message when the user is unknown.
    """
    if user not in user_sim_df.columns:
        return 'No data available on user {}'.format(user)

    # Remove the user by label rather than assuming the user sorts first.
    sim_users = (
        user_sim_df[user]
        .drop(labels=user)
        .sort_values(ascending=False)
        .index[:10]
    )

    favourite_counts = {}
    for neighbour in sim_users:
        scores = piv_norm[neighbour]
        # A user may have several shows tied at their maximum score;
        # all of them count as favourites.
        for title in scores[scores == scores.max()].index:
            favourite_counts[title] = favourite_counts.get(title, 0) + 1

    # sorted() is stable, so shows with equal counts keep first-seen order.
    ranked = sorted(favourite_counts.items(), key=operator.itemgetter(1), reverse=True)
    return ranked[:5]

In [34]:
def predicted_rating(anime_name, user, n_neighbors=999):
    """Predict `user`'s rating of `anime_name` from similar users' ratings.

    The prediction is the similarity-weighted average of the raw ratings
    (from `piv`) given to the show by the `n_neighbors` users most similar
    to `user` (from `user_sim_df`). Neighbours who have not rated the show
    are ignored.

    Parameters
    ----------
    anime_name : str
        Show title; must be a column of `piv`.
    user : int
        User id; must be a column of `user_sim_df`.
    n_neighbors : int, optional
        How many of the most similar users to consider. The default of 999
        matches the previously hard-coded `[1:1000]` slice.

    Returns
    -------
    float
        The weighted-average rating, or NaN when no neighbour has rated the
        show or the weights sum to zero (this used to raise
        ZeroDivisionError).
    """
    # Sort once; remove the user by label rather than assuming the user is
    # the first entry after sorting.
    similarities = (
        user_sim_df[user]
        .drop(labels=user)
        .sort_values(ascending=False)
        .iloc[:n_neighbors]
    )

    neighbour_ratings = piv.loc[similarities.index, anime_name]
    rated = neighbour_ratings.notna()
    weights = similarities[rated]

    total_weight = weights.sum()
    if total_weight == 0:
        return np.nan

    # NOTE(review): similarities can be negative, so the weights may partly
    # cancel and push the result outside the rating scale - confirm whether
    # negative neighbours should be excluded.
    return (neighbour_ratings[rated] * weights).sum() / total_weight

In [45]:
# List the ten shows with the highest cosine similarity to 'Naruto'
top_animes('Naruto')

Similar shows to Naruto include:

No. 1: Bleach
No. 2: Yu☆Gi☆Oh! Duel Monsters
No. 3: Dragon Ball GT
No. 4: Pokemon Advanced Generation
No. 5: Arashi no Yoru ni: Himitsu no Tomodachi
No. 6: Dragon Ball Z
No. 7: Domo TV
No. 8: Green Green
No. 9: Lemon Angel (1988/II)
No. 10: Mizuiro Jidai


In [46]:
# List the users most similar to user 5, with their similarity values
top_users(5)

Most Similar Users:

User #4512, Similarity value: 0.28
User #2300, Similarity value: 0.27
User #4064, Similarity value: 0.25
User #1682, Similarity value: 0.25
User #7407, Similarity value: 0.23
User #3117, Similarity value: 0.23
User #3056, Similarity value: 0.23
User #529, Similarity value: 0.22
User #771, Similarity value: 0.22
User #9319, Similarity value: 0.22


In [47]:
# Shows most frequently top-rated by the users most similar to user 7
similar_user_recs(7)

[('Steins;Gate', 8),
 ('Clannad: After Story', 7),
 ('Fullmetal Alchemist: Brotherhood', 6),
 ('Shigatsu wa Kimi no Uso', 5),
 ('Code Geass: Hangyaku no Lelouch', 4)]

In [44]:
# Similarity-weighted prediction of user 3's rating for 'Naruto'
predicted_rating('Naruto', 3)

7.970772057795193

Below we'll see how the `predicted_rating` function performs compared to the ratings actually given by user 3.

In [39]:
# Collect the names of all shows for which user 3 has a rating above 0

watched = piv.columns[(piv.loc[3, :] > 0).to_numpy()].tolist()

In [40]:
# Squared difference between user 3's actual rating and the predicted
# rating, for every show in `watched`

errors = [(piv.loc[3, title] - predicted_rating(title, 3)) ** 2 for title in watched]

In [41]:
# Mean squared error between actual and predicted ratings for user 3
np.mean(errors)

0.8848212679397977