## Import Dependencies

In [212]:
import os
import numpy as np
import pandas as pd

## Data Exploration

In [213]:
# Walk the data directory and print the path of every file found in it
data_files = (
    os.path.join(dirpath, filename)
    for dirpath, _subdirs, filenames in os.walk('data')
    for filename in filenames
)
for file_path in data_files:
    print(file_path)

data/anime.csv
data/rating.csv


In [214]:
# Relative locations of the two CSV inputs found under the data directory
rating_path = 'data/rating.csv'
anime_path = 'data/anime.csv'

In [215]:
# Define function to report the percentage of missing values per column
def check_nan(df, name=None):
    """Print the percentage of missing values in each column of ``df``.

    Columns are listed from most to least incomplete.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to inspect.
    name : str, optional
        Label for the table (e.g. ``'anime.csv'``) shown in the header line.
        When omitted, the header names no dataset, so the report is not
        mislabeled when the function is reused on a different table.

    Returns
    -------
    None
        The report is printed, not returned.
    """
    header = 'Percentage of missing values'
    if name:
        header += f' in {name}'
    # Scale to percent *before* rounding so the printed figures are rounded
    # to two decimals directly instead of carrying float artefacts.
    missing_pct = (
        df.isnull().sum().sort_values(ascending=False) / len(df.index) * 100
    ).round(2)
    print(f'{header}:\n\n{missing_pct}')

The rating dataset (`rating.csv`) contains 3 columns and 7,813,737 entries: 

* `user_id`: non-identifiable, randomly generated user ID.
* `anime_id`: the anime that this user has rated.
* `rating`: rating out of 10 this user has assigned (-1 if the user watched without assigning).

In [216]:
# Load the user ratings table from rating_path and display it
rating_df = pd.read_csv(rating_path)
rating_df

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1
...,...,...,...
7813732,73515,16512,7
7813733,73515,17187,9
7813734,73515,22145,10
7813735,73516,790,9


In [217]:
# (row count, column count) of the ratings table
rating_df.shape

(7813737, 3)

In [218]:
# Column dtypes and memory footprint of the ratings table
rating_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7813737 entries, 0 to 7813736
Data columns (total 3 columns):
 #   Column    Dtype
---  ------    -----
 0   user_id   int64
 1   anime_id  int64
 2   rating    int64
dtypes: int64(3)
memory usage: 178.8 MB


In [219]:
# Report the share of missing values per column in the ratings table
check_nan(rating_df)

Percentage of missing values in rating.csv:

user_id     0.0
anime_id    0.0
rating      0.0
dtype: float64


While there are no missing values in this dataset, an entry of -1 in the rating column is equivalent to a `NaN` value, so we'll treat them as such.

In [220]:
# Recode the -1 sentinel ("watched without assigning a rating") as NaN.
# A single vectorized replace avoids calling a Python lambda once per row;
# the column becomes float because NaN has no integer representation.
rating_df['rating'] = rating_df['rating'].replace(-1, np.nan)
rating_df

Unnamed: 0,user_id,anime_id,rating
0,1,20,
1,1,24,
2,1,79,
3,1,226,
4,1,241,
...,...,...,...
7813732,73515,16512,7.0
7813733,73515,17187,9.0
7813734,73515,22145,10.0
7813735,73516,790,9.0


The anime dataset (`anime.csv`) contains 7 columns and 12,294 entries: 

* `anime_id`: myanimelist.net's unique id identifying an anime.
* `name`: full name of anime.
* `genre`: comma separated list of genres for this anime.
* `type`: movie, TV, OVA, etc.
* `episodes`: number of episodes in this show (1 if a movie).
* `rating`: average rating out of 10 for this anime.
* `members`: number of community members that are in this anime's "group".

In [221]:
# Load the anime metadata table from anime_path and display it
anime_df = pd.read_csv(anime_path)
anime_df

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175


In [222]:
# (row count, column count) of the anime table
anime_df.shape

(12294, 7)

In [223]:
# Non-null counts and dtypes per column of the anime table
anime_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


There are several columns with missing values, which we'll have to handle accordingly.

In [224]:
# Report the share of missing values per column in the anime table
check_nan(anime_df)

Percentage of missing values in rating.csv:

rating      1.87
genre       0.50
type        0.20
anime_id    0.00
name        0.00
episodes    0.00
members     0.00
dtype: float64


Due to the small percentage of missing values, we opt to simply drop the rows with `NaN` entries.

In [225]:
# Keep only rows with no NaN in any column, then confirm none remain
anime_df = anime_df.dropna()
check_nan(anime_df)

Percentage of missing values in rating.csv:

anime_id    0.0
name        0.0
genre       0.0
type        0.0
episodes    0.0
rating      0.0
members     0.0
dtype: float64


In [226]:
# Display the anime table after the rows with NaN entries were dropped
anime_df

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175


## Feature Engineering

In [227]:
# Join every user rating row with the metadata of the anime it refers to.
# Both frames share the anime_id key; the per-user score takes the '_user'
# suffix so the anime's average keeps the plain 'rating' name.
rated_anime = (
    rating_df
    .merge(anime_df, on='anime_id', suffixes=('_user', ''))
    # Drop the irrelevant columns from the merged Dataframe
    .drop(columns=['anime_id', 'rating_user'])
)
rated_anime

Unnamed: 0,user_id,name,genre,type,episodes,rating,members
0,1,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220,7.81,683297
1,3,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220,7.81,683297
2,5,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220,7.81,683297
3,6,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220,7.81,683297
4,10,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220,7.81,683297
...,...,...,...,...,...,...,...
7813606,65682,Dr. Slump: Hoyoyo! Arale no Himitsu Dai Koukai...,"Comedy, Sci-Fi, Shounen",Special,1,6.17,248
7813607,69497,Shiroi Zou,"Action, Historical, Kids",Movie,1,4.71,45
7813608,70463,Kakinoki Mokkii,"Fantasy, Kids",Special,1,4.33,61
7813609,72404,Hashiri Hajimeta bakari no Kimi ni,Music,Music,1,6.76,239


In [182]:
# Count the distinct values in the name column of the merged table
# (nunique ignores NaN by default)
rated_anime['name'].nunique()

11193