In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import tqdm as tqdm
import tensorflow as tf
import nltk
import re
import random
import unicodedata
import string

In [3]:
#read the csv files
anime = pd.read_csv('/content/drive/MyDrive/anime_recommendation/archive/anime.csv')
rating = pd.read_csv('/content/drive/MyDrive/anime_recommendation/archive/rating.csv')

In [4]:
def unicode_to_ascii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s)
      if unicodedata.category(c) != 'Mn')
def clean_text(text):
    text = unicode_to_ascii(text.lower().strip())
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"\r", "", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"it's", "it is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "that is", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"how's", "how is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"n'", "ng", text)
    text = re.sub(r"'bout", "about", text)
    text = re.sub(r"'til", "until", text)
    text = re.sub(r"[-()\"#/@;:<>{}`+=~|.!?,]", "", text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub("(\\W)"," ",text)
    text = re.sub('\S*\d\S*\s*','', text)
    #text =  "<sos> " +  text + " <eos>"
    return text

In [5]:
anime.dropna(inplace=True)

In [6]:
for i in range(len(anime)):
    anime['genre'].iloc[i] = clean_text(anime['genre'].iloc[i])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  anime['genre'].iloc[i] = clean_text(anime['genre'].iloc[i])


Our recommendation system actually provides recommendations based on users' anime watching habits and ratings.

So, we suggest
1. Movies that are liked by other users who have similar tastes
2. Movies having similar genre

If our user hasn't watched these anime series before, we definitely recommend them.

This data set contains information on user preference data from 73,516 users on 12,294 anime. Each user is able to add anime to their completed list and give it a rating and this data set is a compilation of those ratings.

In [7]:
# Finding the correlation between the movie genres
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [8]:
anime_tv = anime[anime['type'] == 'TV']
cv = CountVectorizer(ngram_range=(1,1))
tfidf_matrix = cv.fit_transform(anime_tv['genre']).toarray()

In [9]:
X_train = pd.DataFrame(tfidf_matrix)
similarity = sklearn.metrics.pairwise.cosine_similarity(X_train, Y=None, dense_output=True)

In [10]:
sim_df = pd.DataFrame(similarity)
genre_corr = pd.DataFrame(sim_df.unstack().sort_values(ascending=False), columns=["corr"])
genre_corr.index.names = ["animeid_1", "animeid_2"]
genre_corr = genre_corr[genre_corr['corr']>0.55]

In [11]:
genre_corr.shape

(1067312, 1)

In [12]:
genre_corr.tail(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,corr
animeid_1,animeid_2,Unnamed: 2_level_1
2381,682,0.555556
783,2248,0.555556
2941,116,0.555556
2381,710,0.555556
116,132,0.555556
2941,1256,0.555556
2941,1261,0.555556
1292,3207,0.555556
303,395,0.555556
343,116,0.555556


In [13]:
anime_tv.iloc[343]

anime_id                                                 6033
name                                          Dragon Ball Kai
genre       action adventure comedy fantasy martial arts s...
type                                                       TV
episodes                                                   97
rating                                                   7.95
members                                                116832
Name: 588, dtype: object

In [14]:
anime_tv.iloc[116]

anime_id                                                 6811
name                                   InuYasha: Kanketsu-hen
genre       action adventure comedy demons fantasy magic r...
type                                                       TV
episodes                                                   26
rating                                                   8.37
members                                                 99128
Name: 173, dtype: object

In [15]:
anime.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,drama romance school supernatural,Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,action adventure drama fantasy magic military ...,TV,64,9.26,793665
2,28977,Gintama°,action comedy historical parody samurai scifi ...,TV,51,9.25,114262
3,9253,Steins;Gate,scifi thriller,TV,24,9.17,673572
4,9969,Gintama&#039;,action comedy historical parody samurai scifi ...,TV,51,9.16,151266


In [16]:
rating.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


Some users have watched some anime shows but did not rate it. These show up as -1 in the dataset and we are taking them off because we do not want to recommend an anime that user already watched

In [17]:
watched_but_not_rated = rating.loc[rating["rating"] < 0,("user_id","anime_id")]
rating = rating[rating["rating"] >= 0]
rating.columns = ['user_id', 'anime_id', 'user_rating']

In [18]:
df = anime.merge(rating, how="left", on="anime_id")
df.dropna(inplace=True)

### Lets say we want to make predictions for TV type animes

In [19]:
df = df[df["type"]=="TV"]

In [20]:
df.head(10)

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members,user_id,user_rating
1961,5114,Fullmetal Alchemist: Brotherhood,action adventure drama fantasy magic military ...,TV,64,9.26,793665,3.0,10.0
1962,5114,Fullmetal Alchemist: Brotherhood,action adventure drama fantasy magic military ...,TV,64,9.26,793665,10.0,10.0
1963,5114,Fullmetal Alchemist: Brotherhood,action adventure drama fantasy magic military ...,TV,64,9.26,793665,11.0,8.0
1964,5114,Fullmetal Alchemist: Brotherhood,action adventure drama fantasy magic military ...,TV,64,9.26,793665,12.0,9.0
1965,5114,Fullmetal Alchemist: Brotherhood,action adventure drama fantasy magic military ...,TV,64,9.26,793665,17.0,10.0
1966,5114,Fullmetal Alchemist: Brotherhood,action adventure drama fantasy magic military ...,TV,64,9.26,793665,19.0,9.0
1967,5114,Fullmetal Alchemist: Brotherhood,action adventure drama fantasy magic military ...,TV,64,9.26,793665,21.0,9.0
1968,5114,Fullmetal Alchemist: Brotherhood,action adventure drama fantasy magic military ...,TV,64,9.26,793665,30.0,10.0
1969,5114,Fullmetal Alchemist: Brotherhood,action adventure drama fantasy magic military ...,TV,64,9.26,793665,38.0,10.0
1970,5114,Fullmetal Alchemist: Brotherhood,action adventure drama fantasy magic military ...,TV,64,9.26,793665,39.0,10.0


In [21]:
#Drop Anime movies have least votes
anime_watched_count = pd.DataFrame(df.anime_id.value_counts())
anime_watched_count.columns = ["total_vote"]   # it was ['anime_id']
anime_watched_count = anime_watched_count[anime_watched_count["total_vote"] > 250]

In [22]:
anime_watched_count

Unnamed: 0,total_vote
1535,34226
11757,26310
16498,25290
1575,24126
6547,23565
...,...
971,255
342,255
2041,254
21267,254


In [23]:
common_anime = anime_watched_count.index
df = df[df["anime_id"].isin(common_anime)]
df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members,user_id,user_rating
1961,5114,Fullmetal Alchemist: Brotherhood,action adventure drama fantasy magic military ...,TV,64,9.26,793665,3.0,10.0
1962,5114,Fullmetal Alchemist: Brotherhood,action adventure drama fantasy magic military ...,TV,64,9.26,793665,10.0,10.0
1963,5114,Fullmetal Alchemist: Brotherhood,action adventure drama fantasy magic military ...,TV,64,9.26,793665,11.0,8.0
1964,5114,Fullmetal Alchemist: Brotherhood,action adventure drama fantasy magic military ...,TV,64,9.26,793665,12.0,9.0
1965,5114,Fullmetal Alchemist: Brotherhood,action adventure drama fantasy magic military ...,TV,64,9.26,793665,17.0,10.0


In [24]:
user_movie = df.pivot_table(index="user_id", columns="anime_id", values="user_rating")
user_movie.head()

anime_id,1,6,7,8,15,16,17,18,19,20,...,32681,32696,32729,32828,32935,32947,32998,33028,33222,33421
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,,,,,,,,,,,...,,,,,,,,,,
2.0,,,,,,,,,,,...,,,,,,,,,,
3.0,,,,,,,,,,8.0,...,,,,,,,,,,
5.0,,8.0,,,6.0,,6.0,6.0,,6.0,...,,,,7.0,,,,,,
7.0,,,,,,,,,,,...,,,,,,,,,,


#### Lets take a random user from user_movie table in order to make recommendations to that person

In [25]:
rnd_user = int(pd.Series(user_movie.index).sample(1, random_state=28))
rnd_user_df = user_movie[user_movie.index == rnd_user]
rnd_user

47697

In [26]:
#The number of movies watched by user 47697
animes_watched = rnd_user_df.columns[~rnd_user_df.isnull().any()].tolist()
len(animes_watched)

70

In [27]:
#Lets take random movie from movies watched by user 47697
rnd_anime = int(pd.Series(rnd_user_df.columns[~rnd_user_df.isnull().any()]).sample(1, random_state=42).values)
rnd_anime

341

#### Lets predict anime of similar genre as 341

In [28]:
genre_corr.loc[341]

Unnamed: 0_level_0,corr
animeid_2,Unnamed: 1_level_1
341,1.000000
376,1.000000
236,1.000000
309,1.000000
895,0.894427
...,...
464,0.566947
379,0.566947
355,0.566947
767,0.566947


In [29]:
anime_tv.iloc[341]

anime_id                             1472
name                        City Hunter 3
genre       action comedy mystery shounen
type                                   TV
episodes                               13
rating                               7.95
members                              6723
Name: 584, dtype: object

In [30]:
anime_tv.iloc[236]

anime_id                             1471
name                        City Hunter 2
genre       action comedy mystery shounen
type                                   TV
episodes                               63
rating                               8.12
members                              8199
Name: 390, dtype: object

#### Now, lets predict based on other users who have same taste as you

In [31]:
animes_watched_df = user_movie[animes_watched]
animes_watched_df.iloc[0:5,0:5]

anime_id,45,50,52,60,66
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1.0,,,,,
2.0,,,,,
3.0,,,,,
5.0,7.0,,,,
7.0,,,,,


In [32]:
user_watched_count = animes_watched_df.T.notnull().sum()
user_watched_count = user_watched_count.reset_index()
user_watched_count.columns = ["user_id", "count"]
user_watched_count.head()

Unnamed: 0,user_id,count
0,1.0,0
1,2.0,0
2,3.0,4
3,5.0,20
4,7.0,14


In [34]:
same_anime = user_watched_count[user_watched_count["count"] > len(animes_watched)/2.5].\
    sort_values(by="count", ascending=False)

len(same_anime)

1344

In [35]:
same_anime.head()

Unnamed: 0,user_id,count
44643,47697.0,70
23406,24988.0,66
10699,11398.0,61
42730,45659.0,61
39886,42635.0,61


In [36]:
final_df = pd.concat([animes_watched_df[animes_watched_df.index.isin(same_anime["user_id"].values)],
                      rnd_user_df[animes_watched]])

In [37]:
final_df

anime_id,45,50,52,60,66,72,76,77,101,120,...,6045,6547,7054,7674,9989,10800,14397,14713,20971,23289
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
226.0,8.0,8.0,,7.0,,7.0,,,8.0,,...,7.0,9.0,7.0,,,,,9.0,,10.0
248.0,9.0,7.0,,,,,,,,,...,10.0,8.0,9.0,9.0,8.0,9.0,9.0,,,
294.0,7.0,8.0,,,,7.0,,,6.0,8.0,...,9.0,9.0,10.0,9.0,9.0,9.0,9.0,7.0,,8.0
317.0,,,,9.0,7.0,,8.0,8.0,,8.0,...,,8.0,9.0,7.0,7.0,,,,,7.0
478.0,,,9.0,,,,9.0,9.0,9.0,,...,10.0,10.0,,9.0,10.0,10.0,10.0,8.0,,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73356.0,7.0,7.0,7.0,9.0,,7.0,8.0,8.0,7.0,7.0,...,10.0,,9.0,,,7.0,,6.0,,
73378.0,9.0,8.0,,,,9.0,,,9.0,,...,8.0,8.0,8.0,9.0,10.0,9.0,9.0,,,9.0
73417.0,7.0,,,6.0,,8.0,,,7.0,7.0,...,8.0,9.0,9.0,,7.0,8.0,7.0,7.0,,7.0
73499.0,10.0,8.0,,,,,,,9.0,,...,9.0,9.0,9.0,9.0,10.0,9.0,9.0,,,7.0


In [38]:
corrdf = pd.DataFrame(final_df.T.corr().unstack().sort_values().drop_duplicates(), columns=["corr"])
corrdf.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,corr
user_id,user_id,Unnamed: 2_level_1
14959.0,55382.0,-1.0
7446.0,58874.0,-0.945343
10761.0,55094.0,-0.935971
28597.0,41889.0,-0.9259
59966.0,67489.0,-0.918947


In [39]:
corrdf.index.names = ["user_1", "user_2"]
corrdf.reset_index(inplace=True)
corrdf.head()

Unnamed: 0,user_1,user_2,corr
0,14959.0,55382.0,-1.0
1,7446.0,58874.0,-0.945343
2,10761.0,55094.0,-0.935971
3,28597.0,41889.0,-0.9259
4,59966.0,67489.0,-0.918947


In [40]:
corrdf = corrdf[(corrdf["user_1"] == rnd_user) & (corrdf["corr"] > 0.55)]
corrdf = corrdf.sort_values(by="corr", ascending=False)
corrdf = corrdf.drop(columns="user_1").rename(columns={"user_2": "user_id"})
corrdf.head()

Unnamed: 0,user_id,corr
859963,61389.0,0.661313
843738,44322.0,0.601413
842347,67006.0,0.597483
831291,25693.0,0.56997


In [41]:
top_user = corrdf.merge(rating[["user_id", "anime_id", "user_rating"]], how='inner')
top_user = top_user[top_user["user_id"] != rnd_user]
top_user["user_id"].nunique()

4

In [42]:
top_user

Unnamed: 0,user_id,corr,anime_id,user_rating
0,61389.0,0.661313,1,9
1,61389.0,0.661313,5,8
2,61389.0,0.661313,15,7
3,61389.0,0.661313,30,8
4,61389.0,0.661313,31,8
...,...,...,...,...
2033,25693.0,0.569970,24075,10
2034,25693.0,0.569970,24135,10
2035,25693.0,0.569970,24455,10
2036,25693.0,0.569970,25159,10


In [43]:
top_user["wr"] = top_user["corr"] * top_user["user_rating"]
recommend = top_user.sort_values(by="wr", ascending=False)["anime_id"]

In [44]:
#if user have alredy watched but havent give rate drop them
wched_l = watched_but_not_rated[watched_but_not_rated['user_id'] == rnd_user]["anime_id"].tolist()
out = [x for x in recommend.tolist() if x not in wched_l]

In [45]:
# Giving Output As Anime Names
def anime_id_to_name(ıd):
    qut = pd.DataFrame(anime[anime["anime_id"].isin(ıd)]["name"])
    return qut

Recommendations = anime_id_to_name(out)
Recommendations[:10]

Unnamed: 0,name
1,Fullmetal Alchemist: Brotherhood
3,Steins;Gate
4,Gintama&#039;
6,Hunter x Hunter (2011)
8,Gintama Movie: Kanketsu-hen - Yorozuya yo Eien...
9,Gintama&#039;: Enchousen
10,Clannad: After Story
12,Gintama
13,Code Geass: Hangyaku no Lelouch R2
15,Sen to Chihiro no Kamikakushi
