In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
## Load the raw anime and rating tables from the data directory
data_path = './data'
anime_csv = os.path.join(data_path, 'anime.csv')
rating_csv = os.path.join(data_path, 'rating.csv')

df_anime = pd.read_csv(anime_csv)
df_rating = pd.read_csv(rating_csv)

In [3]:
## Display the anime table as loaded, before any cleaning
df_anime

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175


In [4]:
## anime_id values removed from df_anime are collected here so the
## matching rows can later be removed from df_rating as well
drop_anime_id = list()

In [5]:
## Clean df_anime

## Record anime_id of rows with empty entries, then remove them.
## A single row-level boolean mask is used instead of np.where(...) positions:
## np.where on the 2-D null mask yields one entry per null *cell* (so rows with
## several nulls are repeated), and those positions are only valid as labels
## for drop() while the index is the default 0..n-1 range.
has_missing = df_anime.isnull().any(axis=1)

## Remember the removed ids (deduplicated) for the later df_rating clean-up
drop_anime_id.extend(df_anime.loc[has_missing, 'anime_id'].tolist())
drop_anime_id = list(set(drop_anime_id))

## Keep only complete rows and renumber the index from 0
df_anime = df_anime[~has_missing].reset_index(drop=True)

df_anime

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12012,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12013,5543,Under World,Hentai,OVA,1,4.28,183
12014,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12015,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175


In [6]:
## Count how many anime carry each genre tag

genre_dict = {}
genres_all = np.array(df_anime['genre'])

for genres in genres_all:
    ## Tags are comma separated; all spaces are removed, so multi-word
    ## genres are stored without spaces (e.g. 'SliceofLife')
    for genre in genres.replace(' ', '').split(','):
        ## First occurrence inserts the key, which fixes the key order
        genre_dict[genre] = genre_dict.get(genre, 0) + 1

genre_dict

{'Drama': 1977,
 'Romance': 1437,
 'School': 1176,
 'Supernatural': 1001,
 'Action': 2768,
 'Adventure': 2316,
 'Fantasy': 2242,
 'Magic': 747,
 'Military': 416,
 'Shounen': 1684,
 'Comedy': 4575,
 'Historical': 798,
 'Parody': 403,
 'Samurai': 146,
 'Sci-Fi': 2036,
 'Thriller': 86,
 'Sports': 533,
 'SuperPower': 451,
 'Space': 377,
 'SliceofLife': 1204,
 'Mecha': 929,
 'Music': 842,
 'Mystery': 485,
 'Seinen': 532,
 'MartialArts': 264,
 'Vampire': 100,
 'Shoujo': 594,
 'Horror': 362,
 'Police': 195,
 'Psychological': 226,
 'Demons': 287,
 'Ecchi': 628,
 'Josei': 52,
 'ShounenAi': 62,
 'Game': 177,
 'Dementia': 238,
 'Harem': 313,
 'Cars': 72,
 'Kids': 1598,
 'ShoujoAi': 54,
 'Hentai': 1133,
 'Yaoi': 38,
 'Yuri': 41}

In [7]:
## Encode the genre column as one indicator column per genre

## Column position of each genre follows the key order of genre_dict
genre_idx_map = {key: value for value, key in enumerate(genre_dict)}
num_anime, num_genre = df_anime.shape[0], len(genre_dict)

genre_one_hot = np.zeros((num_anime, num_genre))
genres_all = np.array(df_anime['genre'])

for i, genres in enumerate(genres_all):
    ## Same tokenisation as when genre_dict was built: strip spaces, split on commas
    for genre in genres.replace(' ', '').split(','):
        genre_one_hot[i, genre_idx_map[genre]] += 1

## Name the encoded columns 'genre_<Name>'
genre_names = ['genre_' + genre_name for genre_name in genre_dict]
df_genre = pd.DataFrame(genre_one_hot, columns=genre_names)

## Replace original column with encoded columns
## (concat aligns on the index, so df_anime must still have its 0..n-1 index here)
df_anime = pd.concat([df_anime.drop(columns='genre'), df_genre], axis=1)
df_anime

Unnamed: 0,anime_id,name,type,episodes,rating,members,genre_Drama,genre_Romance,genre_School,genre_Supernatural,...,genre_ShounenAi,genre_Game,genre_Dementia,genre_Harem,genre_Cars,genre_Kids,genre_ShoujoAi,genre_Hentai,genre_Yaoi,genre_Yuri
0,32281,Kimi no Na wa.,Movie,1,9.37,200630,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5114,Fullmetal Alchemist: Brotherhood,TV,64,9.26,793665,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,28977,Gintama°,TV,51,9.25,114262,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,9253,Steins;Gate,TV,24,9.17,673572,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,9969,Gintama&#039;,TV,51,9.16,151266,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12012,9316,Toushindai My Lover: Minami tai Mecha-Minami,OVA,1,4.15,211,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
12013,5543,Under World,OVA,1,4.28,183,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
12014,5621,Violence Gekiga David no Hoshi,OVA,4,4.88,219,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
12015,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,OVA,1,4.98,175,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [8]:
## Record anime_id of inappropriate entries, then remove them
is_inappropriate1 = (df_anime['genre_Hentai'] == 1)
is_inappropriate2 = (df_anime['genre_Ecchi'] == 1)

## One combined row mask: a title tagged with both genres is recorded once,
## and rows are selected by mask rather than by np.where(...) positions passed
## to drop() as labels (which is only valid on a default 0..n-1 index).
is_inappropriate = is_inappropriate1 | is_inappropriate2

inappropriate_anime_id = df_anime.loc[is_inappropriate, 'anime_id'].tolist()
drop_anime_id.extend(inappropriate_anime_id)

## Remove the flagged rows, then the two genre columns (they would be all
## zero for the remaining rows), and renumber the index from 0
df_anime = (
    df_anime[~is_inappropriate]
    .drop(columns=['genre_Hentai', 'genre_Ecchi'])
    .reset_index(drop=True)
)

In [9]:
## Display df_anime after the Hentai/Ecchi rows and columns were removed
df_anime

Unnamed: 0,anime_id,name,type,episodes,rating,members,genre_Drama,genre_Romance,genre_School,genre_Supernatural,...,genre_Josei,genre_ShounenAi,genre_Game,genre_Dementia,genre_Harem,genre_Cars,genre_Kids,genre_ShoujoAi,genre_Yaoi,genre_Yuri
0,32281,Kimi no Na wa.,Movie,1,9.37,200630,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5114,Fullmetal Alchemist: Brotherhood,TV,64,9.26,793665,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,28977,Gintama°,TV,51,9.25,114262,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,9253,Steins;Gate,TV,24,9.17,673572,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,9969,Gintama&#039;,TV,51,9.16,151266,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10251,2218,Fujimi 2-choume Koukyougakudan,OVA,1,5.72,4981,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
10252,2169,Ice,OVA,3,5.68,5247,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
10253,2592,Kimera,OVA,1,5.29,3374,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
10254,730,Houkago no Shokuinshitsu,OVA,2,5.23,4550,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [10]:
## Encode the type column as 'type_<value>' indicator columns

## Surrounding white space is stripped from the type values before encoding
type_clean = df_anime['type'].str.strip()
one_hot = pd.get_dummies(type_clean, prefix='type')

## Swap the original column for the encoded columns
df_anime = pd.concat([df_anime.drop(columns='type'), one_hot], axis=1)

df_anime

Unnamed: 0,anime_id,name,episodes,rating,members,genre_Drama,genre_Romance,genre_School,genre_Supernatural,genre_Action,...,genre_Kids,genre_ShoujoAi,genre_Yaoi,genre_Yuri,type_Movie,type_Music,type_ONA,type_OVA,type_Special,type_TV
0,32281,Kimi no Na wa.,1,9.37,200630,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1,0,0,0,0,0
1,5114,Fullmetal Alchemist: Brotherhood,64,9.26,793665,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0,0,0,0,0,1
2,28977,Gintama°,51,9.25,114262,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0,0,0,0,0,1
3,9253,Steins;Gate,24,9.17,673572,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0,0,0,0,1
4,9969,Gintama&#039;,51,9.16,151266,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10251,2218,Fujimi 2-choume Koukyougakudan,1,5.72,4981,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0,0,0,1,0,0
10252,2169,Ice,3,5.68,5247,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0,0,0,1,0,0
10253,2592,Kimera,1,5.29,3374,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,1.0,0.0,0,0,0,1,0,0
10254,730,Houkago no Shokuinshitsu,2,5.23,4550,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0,0,0,1,0,0


In [11]:
## Replace 'Unknown' in episodes with 500
## The result is assigned back to the column instead of calling
## replace(..., inplace=True) on df_anime['episodes']: that form mutates a
## column selection, which under pandas Copy-on-Write does not update df_anime
## and would leave 'Unknown' in place for the numeric conversion that follows.
## NOTE(review): 500 is a placeholder that the later min-max scaling treats as
## a real episode count -- confirm this is the intended imputation.
df_anime['episodes'] = df_anime['episodes'].replace({'Unknown': '500'})

In [12]:
## Min-max scale the numerical attributes
for column in ['episodes', 'rating', 'members']:
    ## Parse to a numeric dtype, downcast to the smallest float type
    values = pd.to_numeric(df_anime[column], downcast="float")
    lowest, highest = values.min(), values.max()
    ## (x - min) / (max - min); a constant column would divide by zero here
    df_anime[column] = (values - lowest) / (highest - lowest)

In [19]:
## Display the current state of df_anime
## NOTE(review): this cell's execution count (19) is out of sequence with the
## cells around it; re-run the notebook top to bottom to confirm the output.
df_anime

Unnamed: 0,anime_id,name,episodes,rating,members,genre_Drama,genre_Romance,genre_School,genre_Supernatural,genre_Action,...,genre_Kids,genre_ShoujoAi,genre_Yaoi,genre_Yuri,type_Movie,type_Music,type_ONA,type_OVA,type_Special,type_TV
0,32281,Kimi no Na wa.,0.000000,0.924370,0.197867,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1,0,0,0,0,0
1,5114,Fullmetal Alchemist: Brotherhood,0.034673,0.911165,0.782769,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0,0,0,0,0,1
2,28977,Gintama°,0.027518,0.909964,0.112683,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0,0,0,0,0,1
3,9253,Steins;Gate,0.012658,0.900360,0.664323,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0,0,0,0,1
4,9969,Gintama&#039;,0.027518,0.899160,0.149180,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10251,2218,Fujimi 2-choume Koukyougakudan,0.000000,0.486194,0.004901,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0,0,0,1,0,0
10252,2169,Ice,0.001101,0.481393,0.005163,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0,0,0,1,0,0
10253,2592,Kimera,0.000000,0.434574,0.003316,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,1.0,0.0,0,0,0,1,0,0
10254,730,Houkago no Shokuinshitsu,0.000550,0.427371,0.004476,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0,0,0,1,0,0


In [13]:
## Write the cleaned anime table next to the raw data (index column omitted)
anime_cleaned_csv = os.path.join(data_path, 'anime_cleaned.csv')
df_anime.to_csv(anime_cleaned_csv, index=False)

In [14]:
## Display the rating table as loaded, before any cleaning
df_rating

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1
...,...,...,...
7813732,73515,16512,7
7813733,73515,17187,9
7813734,73515,22145,10
7813735,73516,790,9


In [15]:
## Clean df_rating: discard every rating whose anime_id was recorded in
## drop_anime_id while cleaning df_anime.
## Rows are filtered with a boolean mask rather than np.where(...) positions
## handed to drop(): drop() interprets its argument as index labels, so the
## positional form is only correct on a default 0..n-1 index and also has to
## build a very large label list for a table of this size.
is_dropped_anime = df_rating['anime_id'].isin(drop_anime_id)
df_rating = df_rating[~is_dropped_anime].reset_index(drop=True)
df_rating

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,226,-1
3,1,355,-1
4,1,356,-1
...,...,...,...
6757392,73515,14345,7
6757393,73515,16512,7
6757394,73515,17187,9
6757395,73515,22145,10


In [16]:
## Remove rows whose rating is -1
## NOTE(review): -1 presumably marks entries without an explicit score --
## confirm against the dataset documentation.
## A boolean mask is used instead of np.where(...) positions passed to drop(),
## which treats them as index labels and is only valid on a 0..n-1 index.
is_unrated = (df_rating['rating'] == -1)
df_rating = df_rating[~is_unrated].reset_index(drop=True)
df_rating

Unnamed: 0,user_id,anime_id,rating
0,1,11757,10
1,2,11771,10
2,3,20,8
3,3,154,6
4,3,170,9
...,...,...,...
5498518,73515,14345,7
5498519,73515,16512,7
5498520,73515,17187,9
5498521,73515,22145,10


In [17]:
## Number of remaining rows per rating value, most frequent first
rating_counts = df_rating['rating'].value_counts()
rating_counts

8     1448453
7     1156937
9     1137902
10     875282
6      516130
5      223078
4       80552
3       31273
2       17004
1       11912
Name: rating, dtype: int64

In [18]:
## Write the cleaned rating table next to the raw data (index column omitted)
rating_cleaned_csv = os.path.join(data_path, 'rating_cleaned.csv')
df_rating.to_csv(rating_cleaned_csv, index=False)