# Import Dependencies

We begin by importing the necessary dependencies for our analyses.

In [1]:
# System
import os

# Data Analysis
import numpy as np
import pandas as pd

# Custom Scripts
from src.utils import check_nan

# Data Exploration

Next, we explore the data scraped through the [myanimelist.net](https://myanimelist.net) API by Kaggle user [Yonatan Rabinovich](https://www.kaggle.com/yonatanrabinovich).

In [2]:
# List every file found beneath the data directory
for dirpath, _subdirs, filenames in os.walk('data'):
    for filename in filenames:
        print(os.path.join(dirpath, filename))

data/anime.csv
data/rating.csv


In [3]:
# Locations of the two CSV files listed above
anime_path, rating_path = 'data/anime.csv', 'data/rating.csv'

Notice that we have 2 csv files in our `data/` directory: 

* `anime.csv` contains **7 columns** with **12,294 entries**:

| Column Name | Description                                                                             |
|-------------|-----------------------------------------------------------------------------------------|
| `anime_id`  | [myanimelist.net](https://myanimelist.net) unique ID identifying an anime.              |
| `name`      | Full name of anime.                                                                     |
| `genre`     | Comma separated list of genres for this anime.                                          |
| `type`      | Movie, TV, OVA, etc.                                                                    |
| `episodes`  | Number of episodes in this anime (1 if movie).                                          |
| `rating`    | Average rating (out of 10) for this anime.                                              |
| `members`   | Number of community members that are in this anime's "group".                           |

* `rating.csv` contains **3 columns** with **7,813,737 entries**:

| Column Name | Description                                                                          |
|-------------|--------------------------------------------------------------------------------------|
| `user_id`   | Non-identifiable, randomly generated user ID.                                        |
| `anime_id`  | The anime that this user has rated.                                                  |
| `rating`    | Rating (out of 10) this user has assigned (-1 if the user watched without assigning).|


In [4]:
# Load the anime metadata from disk into a dataframe
anime_df = pd.read_csv(anime_path)
# Preview the top five rows
anime_df.head(n=5)

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [5]:
# Check the shape of the dataframe as a (rows, columns) tuple
anime_df.shape

(12294, 7)

In [6]:
# Print a concise summary of the dataframe: column names, non-null counts, and dtypes
anime_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


In [7]:
# Report the percentage of missing values in each column of the dataframe
# (check_nan is the project helper imported from src.utils)
check_nan(anime_df)

Percentage of missing values:

rating      1.871
genre       0.504
type        0.203
anime_id    0.000
name        0.000
episodes    0.000
members     0.000
dtype: float64


Notice that there are several columns in `anime.csv` with missing values, which we'll have to handle accordingly later. 

We repeat the same procedures for `rating.csv`.

In [8]:
# Load the user ratings from disk into a dataframe
rating_df = pd.read_csv(rating_path)
# Preview the top five rows
rating_df.head(n=5)

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


In [9]:
# Check the shape of the dataframe as a (rows, columns) tuple
rating_df.shape

(7813737, 3)

In [10]:
# Print a concise summary of the dataframe: column names, dtypes, and memory usage
rating_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7813737 entries, 0 to 7813736
Data columns (total 3 columns):
 #   Column    Dtype
---  ------    -----
 0   user_id   int64
 1   anime_id  int64
 2   rating    int64
dtypes: int64(3)
memory usage: 178.8 MB


In [11]:
# Report the percentage of missing values in each column of the dataframe
# (check_nan is the project helper imported from src.utils)
check_nan(rating_df)

Percentage of missing values:

user_id     0.0
anime_id    0.0
rating      0.0
dtype: float64


From an initial pass, it seems as though there are no missing values in `rating.csv`. However, recall from the column descriptions presented at the beginning of this section that an entry of `-1` in the `rating` column is equivalent to a `NaN` value. Hence, we'll treat them as such.

In [12]:
# Replace the -1 sentinel (user watched without assigning a rating) with NaN
# along the rating column. Series.replace is vectorised, so it avoids calling
# a Python lambda once per row on the ~7.8 million entries; the int64 column is
# upcast to float64 so that it can hold NaN.
rating_df['rating'] = rating_df['rating'].replace(-1, np.nan)
# Re-check for missing values now that the sentinel has been converted
check_nan(rating_df)

Percentage of missing values:

rating      18.896
user_id      0.000
anime_id     0.000
dtype: float64


Incredibly, we find that almost a fifth of the entries in `rating.csv` have no user ratings!

# Handling Missing Values

Due to the small percentage of missing values in `anime.csv`, we opt to simply drop the rows with `NaN` entries.

In [13]:
# Discard every row that has at least one missing value
anime_df = anime_df.dropna()
# Confirm that no missing values remain
check_nan(anime_df)

Percentage of missing values:

anime_id    0.0
name        0.0
genre       0.0
type        0.0
episodes    0.0
rating      0.0
members     0.0
dtype: float64


We won't bother handling the missing values in `rating.csv`, because we plan to discard its user-level `rating` column in the following section: we're less interested in which anime a given user has rated, and more interested in the average rating an anime has received from the wider [myanimelist.net](https://myanimelist.net) community.

# Feature Engineering

We want to engineer a dataframe that contains only the relevant info for building our recommendation engine.

In [18]:
# Join the user ratings with the anime metadata on their shared anime_id key.
# Both frames have a rating column: the one from rating_df is suffixed to
# rating_user, while the one from anime_df keeps the plain name rating.
# Only the anime_id join key is dropped afterwards; rating_user is still
# present in feature_df.
feature_df = (
    rating_df
    .merge(anime_df, on='anime_id', suffixes=('_user', ''))
    .drop(columns=['anime_id'])
)

# Preview the top five rows
feature_df.head()

Unnamed: 0,user_id,rating_user,name,genre,type,episodes,rating,members
0,1,,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220,7.81,683297
1,3,8.0,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220,7.81,683297
2,5,6.0,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220,7.81,683297
3,6,,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220,7.81,683297
4,10,,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220,7.81,683297


In [19]:
# Count the number of unique entries along the name column
num_anime = feature_df['name'].nunique()
print(f'Total number of unique animes: {num_anime}')

Total number of unique animes: 11161


Next, we further engineer a smaller dataframe containing only `user_id`, `name`, and `rating`, so that we can construct a pivot table to help simplify our computations of pairwise similarities within our feature space.

In [23]:
# Drop all the columns from feature_df except for user_id, name, and rating
rated_anime = feature_df[['user_id', 'name', 'rating']]

# Construct a pivot table from our dataframe
pivot = rated_anime.pivot_table(index=['user_id'], columns=['name'], values='rating')

# Display the first five rows of the pivot table
pivot.head()

name,&quot;0&quot;,"&quot;Aesop&quot; no Ohanashi yori: Ushi to Kaeru, Yokubatta Inu",&quot;Bungaku Shoujo&quot; Kyou no Oyatsu: Hatsukoi,&quot;Bungaku Shoujo&quot; Memoire,&quot;Bungaku Shoujo&quot; Movie,&quot;Eiji&quot;,.hack//G.U. Returner,.hack//G.U. Trilogy,.hack//G.U. Trilogy: Parody Mode,.hack//Gift,...,makemagic,"on-chan, Yume Power Daibouken!",s.CRY.ed,vivi,xxxHOLiC,xxxHOLiC Kei,xxxHOLiC Movie: Manatsu no Yoru no Yume,xxxHOLiC Rou,xxxHOLiC Shunmuki,◯
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,8.11,,,,,


Notice that there are new missing values introduced into our pivot table, arising from the simple fact that not **every** user has rated **every** anime in our dataset.