# Data Exploration on the Anime Dataset
#### By Matthew Gerardino
---

In [1]:
#imports
import pandas as pd

In [2]:
#load the anime dataset (CSV hosted in the course's GitHub repository) into a DataFrame
url='https://raw.githubusercontent.com/woz-u/DS-Student-Resources/main/DS105-Intermediate-Statistics/Data/anime.csv'

df = pd.read_csv(filepath_or_buffer=url)

In [3]:
#report the dimensions of the data as a (rows, columns) tuple
n_rows, n_cols = df.shape
print('The shape of your data is', (n_rows, n_cols))

The shape of your data is (6668, 33)


In [4]:
#preview the first five rows to get a feel for the columns and their values
df.head(n=5)

Unnamed: 0,anime_id,title,title_english,title_japanese,title_synonyms,image_url,type,source,episodes,status,...,broadcast,related,producer,licensor,studio,genre,opening_theme,ending_theme,duration_min,aired_from_year
0,11013,Inu x Boku SS,Inu X Boku Secret Service,妖狐×僕SS,Youko x Boku SS,https://myanimelist.cdn-dena.com/images/anime/...,TV,Manga,12,Finished Airing,...,Fridays at Unknown,"{'Adaptation': [{'mal_id': 17207, 'type': 'man...","Aniplex, Square Enix, Mainichi Broadcasting Sy...",Sentai Filmworks,David Production,"Comedy, Supernatural, Romance, Shounen","['""Nirvana"" by MUCC']","['#1: ""Nirvana"" by MUCC (eps 1, 11-12)', '#2: ...",24.0,2012
1,2104,Seto no Hanayome,My Bride is a Mermaid,瀬戸の花嫁,The Inland Sea Bride,https://myanimelist.cdn-dena.com/images/anime/...,TV,Manga,26,Finished Airing,...,Unknown,"{'Adaptation': [{'mal_id': 759, 'type': 'manga...","TV Tokyo, AIC, Square Enix, Sotsu",Funimation,Gonzo,"Comedy, Parody, Romance, School, Shounen","['""Romantic summer"" by SUN&LUNAR']","['#1: ""Ashita e no Hikari (明日への光)"" by Asuka Hi...",24.0,2007
2,5262,Shugo Chara!! Doki,Shugo Chara!! Doki,しゅごキャラ！！どきっ,"Shugo Chara Ninenme, Shugo Chara! Second Year",https://myanimelist.cdn-dena.com/images/anime/...,TV,Manga,51,Finished Airing,...,Unknown,"{'Adaptation': [{'mal_id': 101, 'type': 'manga...","TV Tokyo, Sotsu",,Satelight,"Comedy, Magic, School, Shoujo","['#1: ""Minna no Tamago (みんなのたまご)"" by Shugo Cha...","['#1: ""Rottara Rottara (ロッタラ ロッタラ)"" by Buono! ...",24.0,2008
3,721,Princess Tutu,Princess Tutu,プリンセスチュチュ,,https://myanimelist.cdn-dena.com/images/anime/...,TV,Original,38,Finished Airing,...,Fridays at Unknown,"{'Adaptation': [{'mal_id': 1581, 'type': 'mang...","Memory-Tech, GANSIS, Marvelous AQL",ADV Films,Hal Film Maker,"Comedy, Drama, Magic, Romance, Fantasy","['""Morning Grace"" by Ritsuko Okazaki']","['""Watashi No Ai Wa Chiisaikeredo"" by Ritsuko ...",16.0,2002
4,12365,Bakuman. 3rd Season,Bakuman.,バクマン。,Bakuman Season 3,https://myanimelist.cdn-dena.com/images/anime/...,TV,Manga,25,Finished Airing,...,Unknown,"{'Adaptation': [{'mal_id': 9711, 'type': 'mang...","NHK, Shueisha",,J.C.Staff,"Comedy, Drama, Romance, Shounen","['#1: ""Moshimo no Hanashi (もしもの話)"" by nano.RIP...","['#1: ""Pride on Everyday"" by Sphere (eps 1-13)...",24.0,2012


In [5]:
#list the distinct labels present in the 'source' column before recoding
df['source'].unique()

array(['Manga', 'Original', 'Light novel', '4-koma manga', 'Novel',
       'Visual novel', 'Other', 'Game', 'Picture book', 'Card game',
       'Web manga', 'Book', 'Music', 'Radio', 'Digital manga'],
      dtype=object)

In [6]:
#count how many distinct labels the 'source' column holds before recoding
df['source'].nunique()

15

In [7]:
#recode the raw 'source' labels into four broader groups using replace()
#the replacement is deliberately scoped to the 'source' column: calling
#df.replace(...) on the whole DataFrame rewrites every cell, in any column,
#that exactly equals one of these labels (e.g. a 'Music' or 'Game' value
#stored in another column would be silently changed as well)
#assigning the result back (instead of inplace=True) keeps the cell idempotent
df['source'] = (
    df['source']
    .replace(['Manga', '4-koma manga', 'Web manga', 'Digital manga'], 'Manga')
    .replace(['Light novel', 'Novel', 'Visual novel', 'Picture book', 'Book'], 'Book')
    .replace(['Game', 'Card game'], 'Game')
    .replace(['Original', 'Other', 'Music', 'Radio'], 'Listening')
)

In [8]:
#confirm which labels remain in the 'source' column once the recode has run
df['source'].unique()

array(['Manga', 'Listening', 'Book', 'Game'], dtype=object)

In [9]:
#confirm how many distinct labels remain in the 'source' column after the recode
df['source'].nunique()

4

In [10]:
#count the null/missing values in every column (one total per column)
df.isna().sum()

anime_id              0
title                 0
title_english      3230
title_japanese        5
title_synonyms     2187
image_url             2
type                  0
source                0
episodes              0
status                0
airing                0
aired_string          0
aired                 0
duration              0
rating                0
score                 0
scored_by             0
rank                356
popularity            0
members               0
favorites             0
background         5855
premiered          3702
broadcast          3688
related               0
producer           2266
licensor           3881
studio                0
genre                 4
opening_theme         0
ending_theme          0
duration_min          0
aired_from_year       0
dtype: int64

In [11]:
#list only the names of columns that contain at least one null/missing value
df.columns[df.isna().any()].tolist()

['title_english',
 'title_japanese',
 'title_synonyms',
 'image_url',
 'rank',
 'background',
 'premiered',
 'broadcast',
 'producer',
 'licensor',
 'genre']

In [12]:
#print how many rows the DataFrame holds in total
print('total number of rows are:', len(df))

total number of rows are: 6668


In [13]:
#print the number of missing (null) values in the 'background' column
print('total number of missing values in the "background" column is:', df['background'].isna().sum())

total number of missing values in the "background" column is: 5855


In [14]:
#store the figures the next cells need:
#total_rows - number of rows in the DataFrame
#max_null   - number of missing values in 'background', the column with the most nulls above
total_rows = len(df)
max_null = df['background'].isna().sum()

In [15]:
#rows that would remain if every row with a missing 'background' value were removed
#note: this is an upper bound for a full dropna(), which also removes rows
#that are missing a value in any other column
rows_after = int(total_rows - max_null)

print('Your total rows left over would be', rows_after)

Your total rows left over would be 813


In [16]:
#share of rows (in percent, two decimals) lost by dropping rows with a missing 'background'
#note: this is a lower bound for a full dropna(), since other columns contain nulls too
fraction_lost = int(max_null) / int(total_rows)
data_lost = round(fraction_lost * 100, 2)

print(f'If we drop our NAs we will lose {data_lost}% of our data, it is best not to drop our NAs in this case.')

If we drop our NAs we will lose 87.81% of our data, it is best not to drop our NAs in this case.
