In [1]:
# Importing Libraries

In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re

In [12]:
# Reading Dataset

In [13]:
df = pd.read_csv('anime.csv')

In [14]:
df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


In [16]:
# We can drop 'anime_id' as it will not contribute to cosine similarity
df.drop('anime_id', inplace=True, axis=1)

In [17]:
df.shape

(12294, 6)

In [18]:
# The rows with null values must be dropped to avoid confusion while deriving consine similarity matrix
df.dropna(axis=0, inplace=True)

In [19]:
df['genre'] = df['genre'].map(lambda x: x.lower().split(','))

The anime name to be displayed in recommendations must be stripped off of all unnecessary characters

In [20]:
df['anime_name'] = df['name'].map(lambda name: re.sub('[.,@#$%^&*{}°;?!]',' ',name))

The 'name' field must have only alphabets and digits before passing it on for the cosine similarity matrix formation

In [21]:
df['name'] = df['name'].map(lambda name: re.sub(' ',''," ".join(re.findall('[a-zA-Z0-9]',name.lower()))))

In [22]:
df.head()

Unnamed: 0,name,genre,type,episodes,rating,members,anime_name
0,kiminonawa,"[drama, romance, school, supernatural]",Movie,1,9.37,200630,Kimi no Na wa
1,fullmetalalchemistbrotherhood,"[action, adventure, drama, fantasy, magic,...",TV,64,9.26,793665,Fullmetal Alchemist: Brotherhood
2,gintama,"[action, comedy, historical, parody, samur...",TV,51,9.25,114262,Gintama
3,steinsgate,"[sci-fi, thriller]",TV,24,9.17,673572,Steins Gate
4,gintama039,"[action, comedy, historical, parody, samur...",TV,51,9.16,151266,Gintama 039


In [23]:
# Converting object datatype to int
df['episodes'] = df['episodes'].map(lambda x: int(x) if x != 'Unknown' else None)

In [24]:
df['anime_size'] = ''

In [25]:
# new column 'anime_size' has information about the size of the anime based on number of episodes
df['anime_size'] = df['episodes'].map(lambda x: 'tiny' if (0<x<=1) 
                                      else ('small' if (1<x<=50) 
                                            else ('medium' if (50<x<=100) 
                                                  else ('large' if (100<x<=200) 
                                                        else ('verylarge' if (200<x<=500) 
                                                              else 'superlarge')))))

In [26]:
df['popularity'] = ''

In [27]:
# new column 'popularity' will have information about the fan following of each anime
df['popularity'] = df['members'].map(lambda x: 'low' if (0<x<=1000) 
                                      else ('moderate' if (1000<x<=10000) 
                                            else ('high' if (10000<x<=100000) 
                                                  else 'veryhigh')))

In [28]:
df.head()

Unnamed: 0,name,genre,type,episodes,rating,members,anime_name,anime_size,popularity
0,kiminonawa,"[drama, romance, school, supernatural]",Movie,1.0,9.37,200630,Kimi no Na wa,tiny,veryhigh
1,fullmetalalchemistbrotherhood,"[action, adventure, drama, fantasy, magic,...",TV,64.0,9.26,793665,Fullmetal Alchemist: Brotherhood,medium,veryhigh
2,gintama,"[action, comedy, historical, parody, samur...",TV,51.0,9.25,114262,Gintama,medium,veryhigh
3,steinsgate,"[sci-fi, thriller]",TV,24.0,9.17,673572,Steins Gate,small,veryhigh
4,gintama039,"[action, comedy, historical, parody, samur...",TV,51.0,9.16,151266,Gintama 039,medium,veryhigh


In [29]:
# We must consider only the data relevant for formulating cosine similarity matrix and drop all other columns.¶
to_be_dropped = ['episodes','rating','members']

In [30]:
str_data = df.drop(to_be_dropped, axis=1)

In [31]:
str_data.head()

Unnamed: 0,name,genre,type,anime_name,anime_size,popularity
0,kiminonawa,"[drama, romance, school, supernatural]",Movie,Kimi no Na wa,tiny,veryhigh
1,fullmetalalchemistbrotherhood,"[action, adventure, drama, fantasy, magic,...",TV,Fullmetal Alchemist: Brotherhood,medium,veryhigh
2,gintama,"[action, comedy, historical, parody, samur...",TV,Gintama,medium,veryhigh
3,steinsgate,"[sci-fi, thriller]",TV,Steins Gate,small,veryhigh
4,gintama039,"[action, comedy, historical, parody, samur...",TV,Gintama 039,medium,veryhigh


In [32]:
str_data['combined'] = ''
columns = str_data.columns
for index, row in str_data.iterrows():
    words = ''
    for col in columns:
        if col != 'genre' and col != 'anime_name':
            words = words + ' ' + row[col]
        elif col != 'anime_name':
            for genre in row[col]:
                words = words + ' ' + genre
    row['combined'] = words

In [33]:
str_data.head()

Unnamed: 0,name,genre,type,anime_name,anime_size,popularity,combined
0,kiminonawa,"[drama, romance, school, supernatural]",Movie,Kimi no Na wa,tiny,veryhigh,kiminonawa drama romance school supernatur...
1,fullmetalalchemistbrotherhood,"[action, adventure, drama, fantasy, magic,...",TV,Fullmetal Alchemist: Brotherhood,medium,veryhigh,fullmetalalchemistbrotherhood action adventu...
2,gintama,"[action, comedy, historical, parody, samur...",TV,Gintama,medium,veryhigh,gintama action comedy historical parody s...
3,steinsgate,"[sci-fi, thriller]",TV,Steins Gate,small,veryhigh,steinsgate sci-fi thriller TV small veryhigh
4,gintama039,"[action, comedy, historical, parody, samur...",TV,Gintama 039,medium,veryhigh,gintama039 action comedy historical parody...


In [34]:
from sklearn.feature_extraction.text import CountVectorizer

In [35]:
# Using CountVectorizer to convert contents of 'combined' column into vectors
cv = CountVectorizer()

In [36]:
matrix = cv.fit_transform(str_data['combined'])

In [37]:
matrix

<12017x12033 sparse matrix of type '<class 'numpy.int64'>'
	with 88435 stored elements in Compressed Sparse Row format>

In [38]:
from sklearn.metrics.pairwise import cosine_similarity

In [39]:
cosine_matrix = cosine_similarity(matrix)

In [40]:
cosine_matrix.shape

(12017, 12017)

In [42]:
# Creating new 'id' column for convenience 

In [43]:
str_data['id'] = [i for i in range(0,str_data.shape[0])]

In [44]:
# this function accepts a anime name as input as returns a list of ten most recommended animes 
def recommend_ten(title):
    title = re.sub(' ',''," ".join(re.findall('[a-zA-Z0-9]',title.lower())))
    anime_id = str_data[str_data.name==title]['id'].values[0]
    sorted_scores = sorted(list(enumerate(cosine_matrix[anime_id])), key=lambda x: x[1], reverse=True)
    sorted_ten = sorted_scores[1:11]
    top_ten = [str_data[anime[0]==str_data['id']]['anime_name'].values[0] for anime in sorted_ten]
    return top_ten

In [45]:
anime_name = 'Kimi no na wa'

In [46]:
recommendations = recommend_ten(anime_name)

In [47]:
recommendations

['Hotarubi no Mori e',
 'Aura: Maryuuin Kouga Saigo no Tatakai',
 'Kara no Kyoukai 5: Mujun Rasen',
 'Koe no Katachi',
 'Sen to Chihiro no Kamikakushi',
 'Kokoro ga Sakebitagatterunda ',
 'Clannad: After Story - Mou Hitotsu no Sekai  Kyou-hen',
 'Angel Beats : Another Epilogue',
 'Harmonie',
 'Air Movie']