### Data Importing

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns

anime = pd.read_csv('anime.csv')
rating = pd.read_csv('rating.csv')

In [2]:
anime['anime_id'] = anime['anime_id'].astype(object)

### Data Cleaning

In [3]:
anime.episodes.replace({'Unknown':np.nan},inplace = True)


In [4]:
anime.dropna(inplace=True)

In [5]:
anime['episodes'] = anime['episodes'].astype('int64')

In [6]:
import re
import string

In [7]:
def text_cleaning(text):
    text = re.sub(r'&quot;', '', text)
    text = re.sub(r'.hack//', '', text)
    text = re.sub(r'&#039;', '', text)
    text = re.sub(r'A&#039;s', '', text)
    text = re.sub(r'I&#039;', 'I\'', text)
    text = re.sub(r'&amp;', 'and', text)
    
    return text
anime['name'] = anime['name'].apply(text_cleaning)

In [8]:
df = pd.merge(anime,rating,how="inner",on='anime_id')

In [9]:
df.rename(columns = {'rating_x':'average_rating'},inplace=True)
df.rename(columns = {'rating_y':'user_rating'},inplace=True)

In [10]:
df['user_id'] = df['user_id'].astype(object)

In [11]:
df.drop_duplicates(subset=['user_id','anime_id'],inplace=True)

In [12]:
df.reset_index(drop=True,inplace=True)

### Baseline RMSE for user rating prediction model 

In [13]:
np.sqrt(np.square(df.user_rating - df.average_rating).sum()/len(df))

3.9512856687796396

### Collaborative filtering - User rating prediction

In [15]:
min_anime_ratings = 500
filter_anime = df['anime_id'].value_counts() > min_anime_ratings
filter_anime = filter_anime[filter_anime].index.tolist()

min_user_ratings = 500
filter_users = df['user_id'].value_counts() > min_user_ratings
filter_users = filter_users[filter_users].index.tolist()

df_new = df[(df['anime_id'].isin(filter_anime)) & (df['user_id'].isin(filter_users))]
df_new["user_rating"].replace({-1: np.nan}, inplace=True)
df_new.dropna(inplace=True)

df_full = df[(df['anime_id'].isin(filter_anime)) & (df['user_id'].isin(filter_users))]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new.dropna(inplace=True)


In [16]:
from surprise import Reader
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise import SVD, SlopeOne, NMF, KNNWithZScore
from surprise.accuracy import rmse
from surprise import accuracy
from surprise.model_selection import train_test_split

In [17]:
reader = Reader(rating_scale=(0, 10))
data = Dataset.load_from_df(df_new[['user_id', 'anime_id', 'user_rating']], reader)

In [21]:
b = []
for algo in [SVD(biased=True), SlopeOne(), NMF(), KNNWithZScore()]:
    results = cross_validate(algo, data, measures=['RMSE'], cv=4, verbose=False)
    temp = pd.DataFrame.from_dict(results).mean(axis=0)
    temp = temp.append(pd.Series([str(algo).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    b.append(temp)

KeyboardInterrupt: 

In [22]:
surprise_results = pd.DataFrame(b).set_index('Algorithm').sort_values('test_rmse')
surprise_results

Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SVD,1.091011,34.832547,1.935018
SlopeOne,1.152421,16.317017,98.17671


In [23]:
trainset, testset = train_test_split(data, test_size=0.25)
algo = SVD()
pred = algo.fit(trainset).test(testset)
accuracy.rmse(pred)

RMSE: 1.0877


1.087743107488587

In [24]:
df_ = pd.DataFrame(pred, columns=['uid', 'iid', 'rui', 'est', 'details'])
df_['err'] = abs(df_.est - df_.rui)
np.sqrt(np.square(df_.err).sum()/len(df_))

1.087743107488587

In [25]:
data_pred = []
for row in df_full[df_full['user_rating'] == -1].itertuples():
    temp_pred = algo.predict(row[8], row[0])
    data_pred.append((temp_pred[0], temp_pred[1], round(temp_pred[3])))
data_pred = pd.DataFrame(data_pred, columns=['user_id', 'anime_id', 'rating'])

In [26]:
data_pred

Unnamed: 0,user_id,anime_id,rating
0,917,18,8.0
1,2243,63,7.0
2,6164,184,7.0
3,6525,189,9.0
4,11594,295,8.0
...,...,...,...
221898,56971,7812481,8.0
221899,57995,7812520,7.0
221900,58736,7812556,7.0
221901,66118,7812792,8.0


### Content-based filtering

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer


tfv = TfidfVectorizer(min_df=3,  max_features=None, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3),
            stop_words = 'english')

anime['genre'] = anime['genre'].fillna('')
genres_str = anime['genre'].str.split(',').astype(str)
tfv_matrix = tfv.fit_transform(genres_str)

In [28]:
from sklearn.metrics.pairwise import cosine_similarity

sig = cosine_similarity(tfv_matrix, tfv_matrix)
indices = pd.Series(anime.index, index=anime['name']).drop_duplicates()

In [29]:
def recommend(title, sig=sig):
    idx = indices[title]
    sig_scores = list(enumerate(sig[idx]))
    sig_scores = sorted(sig_scores, key=lambda x: x[1], reverse=True)
    sig_scores = sig_scores[1:11]
    anime_indices = [i[0] for i in sig_scores]
    scores = [i[1] for i in sig_scores]

    return pd.DataFrame({'Anime name': anime['name'].iloc[anime_indices].values,
                                 'Rating': anime['rating'].iloc[anime_indices].values, 
                        'Similarity': scores, 'Genre': anime['genre'].iloc[anime_indices].values})

In [30]:
pd.DataFrame(sig)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11820,11821,11822,11823,11824,11825,11826,11827,11828,11829
0,1.000000,0.022913,0.000000,0.000000,0.000000,0.072212,0.000000,0.032772,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.022913,1.000000,0.025350,0.000000,0.025350,0.040403,0.085953,0.072335,0.025350,0.025350,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.025350,1.000000,0.068169,1.000000,0.028188,0.033215,0.056933,1.000000,1.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.000000,0.068169,1.000000,0.068169,0.000000,0.000000,0.106298,0.068169,0.068169,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.025350,1.000000,0.068169,1.000000,0.028188,0.033215,0.056933,1.000000,1.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11825,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
11826,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
11827,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
11828,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [33]:
recommend('Pokemon')

Unnamed: 0,Anime name,Rating,Similarity,Genre
0,Hanasaku Iroha: Home Sweet Home,7.99,1.0,"Comedy, Drama, Slice of Life"
1,Tamayura: Sotsugyou Shashin Part 3 - Akogare,7.88,1.0,"Comedy, Drama, Slice of Life"
2,Tamayura: Sotsugyou Shashin Part 2 - Hibiki,7.85,1.0,"Comedy, Drama, Slice of Life"
3,Tamayura: Sotsugyou Shashin Part 1 - Kizashi,7.69,1.0,"Comedy, Drama, Slice of Life"
4,Tamayura: More Aggressive,7.54,1.0,"Comedy, Drama, Slice of Life"
5,Tamayura: More Aggressive - Tsuitachi dake no ...,7.49,1.0,"Comedy, Drama, Slice of Life"
6,"Tamayura: Hitotose - Attakai Kaze no Omoide, N...",7.44,1.0,"Comedy, Drama, Slice of Life"
7,Tamayura: Hitotose,7.32,1.0,"Comedy, Drama, Slice of Life"
8,Jarinko Chie,7.26,1.0,"Comedy, Drama, Slice of Life"
9,Tamayura,7.23,1.0,"Comedy, Drama, Slice of Life"


### Collaborative KNN recommendation

In [34]:
anime_rating1=df.copy()
anime_rating1.drop_duplicates(subset=['user_id','name'],inplace=True)
anime_rating1.reset_index(drop=True,inplace=True)
counts = anime_rating1['user_id'].value_counts()
anime_feature = anime_rating1[anime_rating1['user_id'].isin(counts[counts >= 500].index)]

anime_rating_pivot = anime_feature.pivot(index='name', columns='user_id', values='user_rating')
anime_rating_pivot = anime_rating_pivot.fillna(0)


In [35]:
from sklearn.neighbors import NearestNeighbors

model_knn = NearestNeighbors(metric='cosine')
model_knn.fit(anime_rating_pivot)
distances, indices = model_knn.kneighbors(anime_rating_pivot, n_neighbors=11)

In [36]:
recommend = pd.DataFrame(indices, columns=['anime0', 'anime1', 'anime2', 'anime3', 'anime4', 'anime5','anime6', 'anime7', 'anime8', 'anime9', 'anime10'])
recommend.head()

Unnamed: 0,anime0,anime1,anime2,anime3,anime4,anime5,anime6,anime7,anime8,anime9,anime10
0,0,7284,1806,2122,10584,6767,1746,6304,2302,7076,7137
1,8741,6877,6885,6883,2490,9144,2502,6895,6870,10216,6869
2,8741,6877,6885,6883,2490,9144,2502,6895,6870,10216,6869
3,3,2712,9369,9407,4345,4341,4324,4309,9368,9284,4252
4,4,1653,5994,9724,9899,571,8158,611,1802,1656,3846


In [37]:
recommend2 = recommend.copy()
for i in range(0, 11):
    animes = pd.DataFrame(anime_rating_pivot.index).reset_index()
    animes = animes.rename(columns={'index':f'anime{i}'})
    recommend2 = pd.merge(recommend2, animes, on=[f'anime{i}'], how='left')
    recommend2 = recommend2.drop(f'anime{i}', axis=1)
    recommend2 = recommend2.rename(columns={'name':f'anime{i}'})

In [38]:
knn_output = recommend2[recommend2.anime0=='Pokemon'].T
knn_output.index=np.arange(0,11)
knn_output.columns=['Anime name']
knn_output.iloc[1:11,:]

Unnamed: 0,Anime name
1,Pokemon Advanced Generation
2,Pokemon: Kesshoutou no Teiou Entei
3,Pokemon: Mewtwo no Gyakushuu
4,Pokemon: Maboroshi no Pokemon Lugia Bakutan
5,Pokemon: Celebi Toki wo Koeta Deai
6,Digimon Adventure
7,Pokemon Advanced Generation: Mew to Hadou no Y...
8,Pokemon: Mizu no Miyako no Mamorigami Latias t...
9,Pokemon Advanced Generation: Rekkuu no Houmons...
10,Pokemon Advanced Generation: Nanayo no Negaibo...


### Collaborative SVD recommender

In [39]:
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=1000)
svd.fit(anime_rating_pivot)

anime_rating_svd = svd.fit_transform(anime_rating_pivot)
corr = np.corrcoef(anime_rating_svd)

In [40]:
anime_title = anime_rating_pivot.index
anime_list = list(anime_title)

anime_pokemon = anime_list.index('Pokemon')
corr_anime_pokemon = corr[anime_pokemon]
svd_output = anime_title[(corr_anime_pokemon)>0.61]

In [41]:
svd_output = pd.DataFrame(svd_output)
svd_output = svd_output[svd_output.name != 'Pokemon']
svd_output.index=np.arange(1,11)
svd_output.columns=['Anime name']
svd_output

Unnamed: 0,Anime name
1,Digimon Adventure
2,Pokemon Advanced Generation
3,Pokemon Advanced Generation: Mew to Hadou no Y...
4,Pokemon Advanced Generation: Rekkuu no Houmons...
5,Pokemon Diamond and Pearl
6,Pokemon: Celebi Toki wo Koeta Deai
7,Pokemon: Kesshoutou no Teiou Entei
8,Pokemon: Maboroshi no Pokemon Lugia Bakutan
9,Pokemon: Mewtwo no Gyakushuu
10,Pokemon: Mizu no Miyako no Mamorigami Latias t...
