# Import Dependencies

We begin by importing the necessary dependencies for our analyses. 

In [1]:
# System
import os

# Data Analysis
import numpy as np
import pandas as pd

# Cosine Similarity
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity

# Custom Scripts
from src.utils import check_nan

# Data Exploration

Next, we explore the data scraped through the [myanimelist.net](https://myanimelist.net) API by Kaggle user [Yonatan Rabinovich](https://www.kaggle.com/yonatanrabinovich).

In [2]:
# List the path of every file found beneath the data directory
data_filepaths = (
    os.path.join(dirpath, filename)
    for dirpath, _subdirs, filenames in os.walk('data')
    for filename in filenames
)
for filepath in data_filepaths:
    print(filepath)

data/anime.csv
data/rating.csv


In [3]:
# Locations of the two csv files listed under the data directory
anime_path, rating_path = 'data/anime.csv', 'data/rating.csv'

Notice that we have 2 csv files in our `data/` directory: 

* `anime.csv` contains **7 columns** with **12,294 entries**:

| Column Name | Description                                                                             |
|-------------|-----------------------------------------------------------------------------------------|
| `anime_id`  | [myanimelist.net](https://myanimelist.net) unique ID identifying an anime.              |
| `name`      | Full name of anime.                                                                     |
| `genre`     | Comma separated list of genres for this anime.                                          |
| `type`      | Movie, TV, OVA, etc.                                                                    |
| `episodes`  | Number of episodes in this anime (1 if movie).                                          |
| `rating`    | Average rating (out of 10) for this anime.                                              |
| `members`   | Number of community members that are in this anime's "group".                           |

* `rating.csv` contains **3 columns** with **7,813,737 entries**:

| Column Name | Description                                                                          |
|-------------|--------------------------------------------------------------------------------------|
| `user_id`   | Non-identifiable, randomly generated user ID.                                        |
| `anime_id`  | The anime that this user has rated.                                                  |
| `rating`    | Rating (out of 10) this user has assigned (-1 if the user watched without assigning).|


In [4]:
# Load anime.csv into a dataframe
anime_df = pd.read_csv(filepath_or_buffer=anime_path)

# Preview the first five rows
anime_df.head(n=5)

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [5]:
# Check the shape of anime_df as (number of rows, number of columns)
anime_df.shape

(12294, 7)

In [6]:
# Print a concise summary of anime_df: per-column non-null counts, dtypes, and memory usage
anime_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


In [7]:
# Report the percentage of missing values in each column of anime_df
# (check_nan is the custom helper imported from src.utils)
check_nan(anime_df)

Percentage of missing values:

rating      1.871
genre       0.504
type        0.203
anime_id    0.000
name        0.000
episodes    0.000
members     0.000
dtype: float64


There are several columns in `anime.csv` with missing values, which we'll have to handle accordingly later. For now, we repeat the same procedures for `rating.csv`.

In [8]:
# Load rating.csv into a dataframe
rating_df = pd.read_csv(filepath_or_buffer=rating_path)

# Preview the first five rows
rating_df.head(n=5)

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


In [9]:
# Check the shape of rating_df as (number of rows, number of columns)
rating_df.shape

(7813737, 3)

In [10]:
# Print a concise summary of rating_df: column dtypes and memory usage
rating_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7813737 entries, 0 to 7813736
Data columns (total 3 columns):
 #   Column    Dtype
---  ------    -----
 0   user_id   int64
 1   anime_id  int64
 2   rating    int64
dtypes: int64(3)
memory usage: 178.8 MB


In [11]:
# Report the percentage of missing values in each column of rating_df
# (check_nan is the custom helper imported from src.utils)
check_nan(rating_df)

Percentage of missing values:

user_id     0.0
anime_id    0.0
rating      0.0
dtype: float64


From an initial pass, it seems as though there are no missing values in `rating.csv`. However, recall from the column descriptions presented at the beginning of this section that an entry of `-1` in the `rating` column is equivalent to a `NaN` value. Hence, we'll treat them as such.

In [12]:
# Replace the -1 sentinel (watched without assigning a rating) with NaN along the rating column.
# Series.replace is vectorized, so it avoids calling a Python lambda once per row of this
# multi-million-row dataframe; the outcome is the same float column with NaN entries.
rating_df['rating'] = rating_df['rating'].replace(-1, np.nan)

# Re-check for missing values
check_nan(rating_df)

Percentage of missing values:

rating      18.896
user_id      0.000
anime_id     0.000
dtype: float64


Incredibly, we find that almost a fifth of the entries in `rating.csv` have no user ratings!

# Handling Missing Values

Due to the small percentage of missing values in `anime.csv`, we opt to simply drop the rows with `NaN` entries.

In [13]:
# Keep only the rows of anime_df that contain no missing entries
anime_df = anime_df.dropna(axis=0, how='any')

# Confirm that no missing values remain
check_nan(anime_df)

Percentage of missing values:

anime_id    0.0
name        0.0
genre       0.0
type        0.0
episodes    0.0
rating      0.0
members     0.0
dtype: float64


We actually won't bother with handling the missing values in `rating.csv`, as we're planning to drop that column in the following section, since we're less interested in which animes a given user has rated, and more so with the average rating an anime has received from the wider [myanimelist.net](https://myanimelist.net) community.

# Feature Engineering

We want to engineer a dataframe that contains only the relevant info for building our recommendation engine.

In [14]:
# Join rating_df with anime_df on their shared anime_id key (the clashing rating columns
# become rating_user and rating_average), then discard anime_id and rating_user,
# which are not needed downstream
feature_df = (
    rating_df
    .merge(anime_df, on='anime_id', suffixes=['_user', '_average'])
    .drop(columns=['anime_id', 'rating_user'])
)

# Preview the first five rows of the merged dataframe
feature_df.head()

Unnamed: 0,user_id,name,genre,type,episodes,rating_average,members
0,1,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220,7.81,683297
1,3,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220,7.81,683297
2,5,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220,7.81,683297
3,6,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220,7.81,683297
4,10,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220,7.81,683297


Next, we further engineer a smaller dataframe containing only `user_id`, `name`, and `rating_average`, such that we can construct a pivot table to help simplify our computations of pairwise similarities within our feature space.

In [15]:
# Restrict feature_df to the three columns needed for the pivot
pivot_columns = ['user_id', 'name', 'rating_average']
rated_anime = feature_df[pivot_columns]

# Pivot into a table with one row per user_id and one column per anime name,
# holding rating_average in each cell
pivot_table = pd.pivot_table(
    rated_anime,
    index=['user_id'],
    columns=['name'],
    values='rating_average',
    dropna=True,
)

# Preview the first five rows of the pivot table
pivot_table.head()

name,&quot;0&quot;,"&quot;Aesop&quot; no Ohanashi yori: Ushi to Kaeru, Yokubatta Inu",&quot;Bungaku Shoujo&quot; Kyou no Oyatsu: Hatsukoi,&quot;Bungaku Shoujo&quot; Memoire,&quot;Bungaku Shoujo&quot; Movie,&quot;Eiji&quot;,.hack//G.U. Returner,.hack//G.U. Trilogy,.hack//G.U. Trilogy: Parody Mode,.hack//Gift,...,makemagic,"on-chan, Yume Power Daibouken!",s.CRY.ed,vivi,xxxHOLiC,xxxHOLiC Kei,xxxHOLiC Movie: Manatsu no Yoru no Yume,xxxHOLiC Rou,xxxHOLiC Shunmuki,◯
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,8.11,,,,,


Since cosine similarity models (which we'll revisit in greater detail in the next section) are sensitive to the mean of the features, one must normalize the values along each row of our table to obtain the mean-removed features. We then take the transpose of the table, before dropping the columns with no ratings (`NaN`).

In [16]:
# Normalization: for every user (row), subtract the row mean and divide by the row range
# (max - min). Whole-frame arithmetic produces the same values as a row-wise apply without
# invoking a Python lambda once per user. mean/max/min skip NaN entries by default.
row_mean = pivot_table.mean(axis=1)
row_range = pivot_table.max(axis=1) - pivot_table.min(axis=1)
pivot_norm = pivot_table.sub(row_mean, axis=0).div(row_range, axis=0)

# Transpose so that each row is an anime and each column is a user
pivot_norm = pivot_norm.T

# Treat missing entries as 0. This also covers users whose row range is 0 (e.g. a single
# entry), for whom the division above evaluates to 0/0 = NaN.
pivot_norm = pivot_norm.fillna(0)

# Drop the users (columns) whose entries are all 0, since they carry no information
pivot_norm = pivot_norm.loc[:, (pivot_norm != 0).any(axis=0)]

# Display the first five rows of the pivot table
pivot_norm.head()

user_id,1,2,3,4,5,6,7,8,10,11,...,73505,73506,73507,73508,73510,73511,73512,73513,73515,73516
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
&quot;0&quot;,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"&quot;Aesop&quot; no Ohanashi yori: Ushi to Kaeru, Yokubatta Inu",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
&quot;Bungaku Shoujo&quot; Kyou no Oyatsu: Hatsukoi,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
&quot;Bungaku Shoujo&quot; Memoire,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.085488,0.0,0.0,0.0,0.0,0.0
&quot;Bungaku Shoujo&quot; Movie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Finally, we convert the feature engineered pivot table into compressed sparse row matrix format, with the goal of making the similarity computations more efficient.

In [17]:
# Store pivot_norm's underlying array in compressed sparse row (CSR) format
pivot_sparse = csr_matrix(pivot_norm.to_numpy())

# Inspect the dimensions of the sparse matrix
pivot_sparse.shape

(11161, 71116)

# Compute Pairwise Cosine Similarity

Mathematically, cosine similarity is a metric used to measure how similar two non-zero vectors are within an inner product space, by calculating the cosine of the angle between them in this higher dimensional space. 

$$\begin{equation*}
    \cos\theta=\frac{\vec{A}\cdot\vec{B}}{\left|\vec{A}\right|\left|\vec{B}\right|}=\frac{\sum_{i=1}^n A_i B_i}{\sqrt{\sum_{i=1}^n A_i^2}\sqrt{\sum_{i=1}^n B_i^2}}
\end{equation*}$$

This approach is optimal for our use-case, as even if two animes are far apart in our feature space by their Euclidean distance, their (normalized) feature vectors could still be oriented in similar directions.

In [18]:
# Compute the pairwise cosine similarity between the rows of pivot_sparse
model = cosine_similarity(pivot_sparse)

# Wrap the similarity matrix in a dataframe labelled on both axes by pivot_norm's index
anime_names = pivot_norm.index
results = pd.DataFrame(data=model, index=anime_names, columns=anime_names)

# Preview the first five rows of the similarity dataframe
results.head()

name,&quot;0&quot;,"&quot;Aesop&quot; no Ohanashi yori: Ushi to Kaeru, Yokubatta Inu",&quot;Bungaku Shoujo&quot; Kyou no Oyatsu: Hatsukoi,&quot;Bungaku Shoujo&quot; Memoire,&quot;Bungaku Shoujo&quot; Movie,&quot;Eiji&quot;,.hack//G.U. Returner,.hack//G.U. Trilogy,.hack//G.U. Trilogy: Parody Mode,.hack//Gift,...,makemagic,"on-chan, Yume Power Daibouken!",s.CRY.ed,vivi,xxxHOLiC,xxxHOLiC Kei,xxxHOLiC Movie: Manatsu no Yoru no Yume,xxxHOLiC Rou,xxxHOLiC Shunmuki,◯
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
&quot;0&quot;,1.0,0.032303,-0.005298,-0.051021,-0.048446,0.085629,0.000785,-0.015884,0.008047,0.007808,...,0.042764,0.106917,-0.006135,0.157764,-0.025491,-0.019763,-0.03629,-0.031706,-0.030208,0.069487
"&quot;Aesop&quot; no Ohanashi yori: Ushi to Kaeru, Yokubatta Inu",0.032303,1.0,-0.004424,-0.023227,-0.018584,0.0178,-0.002353,-0.007931,-0.000616,0.000934,...,0.0,0.302129,-0.006249,0.0,-0.005924,-0.005974,-0.009515,-0.009636,-0.009606,0.0
&quot;Bungaku Shoujo&quot; Kyou no Oyatsu: Hatsukoi,-0.005298,-0.004424,1.0,0.351169,0.066565,-0.000599,0.027631,0.020844,0.031297,0.026553,...,-0.011708,-0.014642,0.008181,-0.015823,-0.096288,-0.103965,-0.098889,-0.101104,-0.113685,-0.009421
&quot;Bungaku Shoujo&quot; Memoire,-0.051021,-0.023227,0.351169,1.0,0.488984,-0.010535,-0.010293,0.022338,-0.01824,-0.011489,...,-0.049318,-0.076877,0.01242,-0.034417,0.023816,0.009172,0.043891,0.013443,0.022813,-0.028374
&quot;Bungaku Shoujo&quot; Movie,-0.048446,-0.018584,0.066565,0.488984,1.0,-0.011582,-0.022151,0.019156,-0.035066,-0.025242,...,-0.038649,-0.06151,0.013562,-0.026972,0.052638,0.04037,0.074123,0.050288,0.060639,-0.022236


# Evaluate Performance

Once we've computed the pairwise similarities between our features, we define a pipeline to obtain the top 5 animes that are most similar to a given input.

In [36]:
def inference(anime, top_n=5):
    '''
    This function will take an anime title as input, and print the animes with the highest cosine similarity value, as well as the percentage of similarity.

    The similarities are read from the module-level `results` dataframe. The queried title itself is never recommended.

    Args:
    anime: The anime title you want to get recommendations for.
    top_n: The number of recommendations to print (defaults to 5).
    ---------------------------------------------------------------------------------------------------------------------

    Example:
    >>>Input: inference('Dragon Ball Z')

    >>>Output: Since you watched Dragon Ball Z, we recommend:

                Dragon Ball: 79.32% match
                Fullmetal Alchemist: 42.81% match
                Death Note: 42.6% match
                Code Geass: Hangyaku no Lelouch: 37.64% match
                Yuu☆Yuu☆Hakusho: 37.39% match
    '''
    if anime not in results:
        print('Anime title not found. Please check for typos, or perhaps use the anime\'s original (non-translated) name.')
        return

    # Remove the queried title by label instead of assuming it sorts into first place:
    # when another title ties with it, a positional skip could drop a genuine match
    # and recommend the queried title to itself.
    scores = results[anime].drop(labels=anime)

    print(f'Since you watched {anime}, we recommend:\n')
    # nlargest returns the top_n highest scores in descending order
    for rec, score in scores.nlargest(top_n).items():
        print(f'{rec}: {round(score * 100, 2)}% match')

In [34]:
inference('Naruto')

Since you watched Naruto, we recommend:

Sword Art Online: 29.19% match
Bleach: 28.06% match
Elfen Lied: 27.78% match
Ao no Exorcist: 26.8% match
Naruto: Shippuuden Movie 6 - Road to Ninja: 22.81% match


In [39]:
inference('Naruto: Shippuuden Movie 6 - Road to Ninja')

Since you watched Naruto: Shippuuden Movie 6 - Road to Ninja, we recommend:

The Last: Naruto the Movie: 43.64% match
Naruto: Shippuuden Movie 5 - Blood Prison: 34.41% match
Fairy Tail OVA: 27.49% match
Boruto: Naruto the Movie: 27.1% match
Bleach Movie 4: Jigoku-hen: 25.34% match
