### 3. Explore the data
1. Create a copy of the data for explorations (sampling it down to a manageable size if necessary)
2. Create a Jupyter notebook to keep a record of your data exploration
3. Study each feature and its characteristics:
    * Name
    * Type (categorical, int/float, bounded/unbounded, text, structured, etc)
    * Percentage of missing values
    * Check for outliers, rounding errors etc
4. For supervised learning tasks, identify the target(s)
5. Visualise the data
6. Study the correlations between features
7. Identify the promising transformations you may want to apply (e.g. convert skewed targets to normal via a log transformation)
8. Document what you have learned

In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the raw anime list; the relative path means the CSV must sit in the
# notebook's working directory.
df = pd.read_csv('AnimeList.csv')

# Bare last expression: rich preview of the loaded frame.
df

Unnamed: 0,anime_id,title,title_english,title_japanese,title_synonyms,image_url,type,source,episodes,status,...,background,premiered,broadcast,related,producer,licensor,studio,genre,opening_theme,ending_theme
0,11013,Inu x Boku SS,Inu X Boku Secret Service,妖狐×僕SS,Youko x Boku SS,https://myanimelist.cdn-dena.com/images/anime/...,TV,Manga,12,Finished Airing,...,Inu x Boku SS was licensed by Sentai Filmworks...,Winter 2012,Fridays at Unknown,"{'Adaptation': [{'mal_id': 17207, 'type': 'man...","Aniplex, Square Enix, Mainichi Broadcasting Sy...",Sentai Filmworks,David Production,"Comedy, Supernatural, Romance, Shounen","['""Nirvana"" by MUCC']","['#1: ""Nirvana"" by MUCC (eps 1, 11-12)', '#2: ..."
1,2104,Seto no Hanayome,My Bride is a Mermaid,瀬戸の花嫁,The Inland Sea Bride,https://myanimelist.cdn-dena.com/images/anime/...,TV,Manga,26,Finished Airing,...,,Spring 2007,Unknown,"{'Adaptation': [{'mal_id': 759, 'type': 'manga...","TV Tokyo, AIC, Square Enix, Sotsu",Funimation,Gonzo,"Comedy, Parody, Romance, School, Shounen","['""Romantic summer"" by SUN&LUNAR']","['#1: ""Ashita e no Hikari (明日への光)"" by Asuka Hi..."
2,5262,Shugo Chara!! Doki,Shugo Chara!! Doki,しゅごキャラ！！どきっ,"Shugo Chara Ninenme, Shugo Chara! Second Year",https://myanimelist.cdn-dena.com/images/anime/...,TV,Manga,51,Finished Airing,...,,Fall 2008,Unknown,"{'Adaptation': [{'mal_id': 101, 'type': 'manga...","TV Tokyo, Sotsu",,Satelight,"Comedy, Magic, School, Shoujo","['#1: ""Minna no Tamago (みんなのたまご)"" by Shugo Cha...","['#1: ""Rottara Rottara (ロッタラ ロッタラ)"" by Buono! ..."
3,721,Princess Tutu,Princess Tutu,プリンセスチュチュ,,https://myanimelist.cdn-dena.com/images/anime/...,TV,Original,38,Finished Airing,...,Princess Tutu aired in two parts. The first pa...,Summer 2002,Fridays at Unknown,"{'Adaptation': [{'mal_id': 1581, 'type': 'mang...","Memory-Tech, GANSIS, Marvelous AQL",ADV Films,Hal Film Maker,"Comedy, Drama, Magic, Romance, Fantasy","['""Morning Grace"" by Ritsuko Okazaki']","['""Watashi No Ai Wa Chiisaikeredo"" by Ritsuko ..."
4,12365,Bakuman. 3rd Season,Bakuman.,バクマン。,Bakuman Season 3,https://myanimelist.cdn-dena.com/images/anime/...,TV,Manga,25,Finished Airing,...,,Fall 2012,Unknown,"{'Adaptation': [{'mal_id': 9711, 'type': 'mang...","NHK, Shueisha",,J.C.Staff,"Comedy, Drama, Romance, Shounen","['#1: ""Moshimo no Hanashi (もしもの話)"" by nano.RIP...","['#1: ""Pride on Everyday"" by Sphere (eps 1-13)..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14473,26089,Gutchonpa Omoshiro Hanashi,,グッチョンパおもしろ話,,https://myanimelist.cdn-dena.com/images/anime/...,TV,Unknown,5,Finished Airing,...,,Fall 1987,Unknown,"{'Other': [{'mal_id': 26087, 'type': 'anime', ...",,,,Kids,[],[]
14474,21525,Geba Geba Shou Time!,,ゲバゲバ笑タイム!,,https://myanimelist.cdn-dena.com/images/anime/...,OVA,Unknown,1,Finished Airing,...,,,,[],Studio Lotus,,,Comedy,[],[]
14475,37897,Godzilla: Hoshi wo Kuu Mono,,GODZILLA -星を喰う者-,"Godzilla Part 3, Godzilla: Eater of Stars",https://myanimelist.cdn-dena.com/images/anime/...,Movie,Other,1,Not yet aired,...,,,,"{'Prequel': [{'mal_id': 36816, 'type': 'anime'...",,,,"Action, Sci-Fi, Adventure, Fantasy",[],[]
14476,34193,Nippon Mukashibanashi: Sannen Netarou,,日本昔ばなし 三ねん寝太郎,,https://myanimelist.cdn-dena.com/images/anime/...,OVA,Other,1,Finished Airing,...,,,,[],,,,"Fantasy, Kids",[],[]


# Study data

## Start by removing columns that are of no use for predicting the ranking

In [35]:
# Drop the columns that are of no use for the prediction:
#   anime_id                               - an identifier, carries no information
#   image_url                              - link to the cover image
#   status, airing, aired_string, aired    - we predict on upcoming anime
#   favorites, popularity, rank, scored_by - metrics that only appear after an anime has been ranked
#   members                                - dropped too (the original note marks this one as uncertain)
cols_to_drop = [
    'anime_id', 'image_url',
    'status', 'airing', 'aired_string', 'aired',
    'favorites', 'popularity', 'rank', 'scored_by',
    'members',
]

# Columns before the drop.
print(df.columns)

# Work on a copy so the raw frame stays untouched.
df_copy = df.copy().drop(columns=cols_to_drop)

# Columns after the drop.
column_names = df_copy.columns
print(column_names)

df_copy


Index(['anime_id', 'title', 'title_english', 'title_japanese',
       'title_synonyms', 'image_url', 'type', 'source', 'episodes', 'status',
       'airing', 'aired_string', 'aired', 'duration', 'rating', 'score',
       'scored_by', 'rank', 'popularity', 'members', 'favorites', 'background',
       'premiered', 'broadcast', 'related', 'producer', 'licensor', 'studio',
       'genre', 'opening_theme', 'ending_theme'],
      dtype='object')
Index(['title', 'title_english', 'title_japanese', 'title_synonyms', 'type',
       'source', 'episodes', 'duration', 'rating', 'score', 'background',
       'premiered', 'broadcast', 'related', 'producer', 'licensor', 'studio',
       'genre', 'opening_theme', 'ending_theme'],
      dtype='object')


Unnamed: 0,title,title_english,title_japanese,title_synonyms,type,source,episodes,duration,rating,score,background,premiered,broadcast,related,producer,licensor,studio,genre,opening_theme,ending_theme
0,Inu x Boku SS,Inu X Boku Secret Service,妖狐×僕SS,Youko x Boku SS,TV,Manga,12,24 min. per ep.,PG-13 - Teens 13 or older,7.63,Inu x Boku SS was licensed by Sentai Filmworks...,Winter 2012,Fridays at Unknown,"{'Adaptation': [{'mal_id': 17207, 'type': 'man...","Aniplex, Square Enix, Mainichi Broadcasting Sy...",Sentai Filmworks,David Production,"Comedy, Supernatural, Romance, Shounen","['""Nirvana"" by MUCC']","['#1: ""Nirvana"" by MUCC (eps 1, 11-12)', '#2: ..."
1,Seto no Hanayome,My Bride is a Mermaid,瀬戸の花嫁,The Inland Sea Bride,TV,Manga,26,24 min. per ep.,PG-13 - Teens 13 or older,7.89,,Spring 2007,Unknown,"{'Adaptation': [{'mal_id': 759, 'type': 'manga...","TV Tokyo, AIC, Square Enix, Sotsu",Funimation,Gonzo,"Comedy, Parody, Romance, School, Shounen","['""Romantic summer"" by SUN&LUNAR']","['#1: ""Ashita e no Hikari (明日への光)"" by Asuka Hi..."
2,Shugo Chara!! Doki,Shugo Chara!! Doki,しゅごキャラ！！どきっ,"Shugo Chara Ninenme, Shugo Chara! Second Year",TV,Manga,51,24 min. per ep.,PG - Children,7.55,,Fall 2008,Unknown,"{'Adaptation': [{'mal_id': 101, 'type': 'manga...","TV Tokyo, Sotsu",,Satelight,"Comedy, Magic, School, Shoujo","['#1: ""Minna no Tamago (みんなのたまご)"" by Shugo Cha...","['#1: ""Rottara Rottara (ロッタラ ロッタラ)"" by Buono! ..."
3,Princess Tutu,Princess Tutu,プリンセスチュチュ,,TV,Original,38,16 min. per ep.,PG-13 - Teens 13 or older,8.21,Princess Tutu aired in two parts. The first pa...,Summer 2002,Fridays at Unknown,"{'Adaptation': [{'mal_id': 1581, 'type': 'mang...","Memory-Tech, GANSIS, Marvelous AQL",ADV Films,Hal Film Maker,"Comedy, Drama, Magic, Romance, Fantasy","['""Morning Grace"" by Ritsuko Okazaki']","['""Watashi No Ai Wa Chiisaikeredo"" by Ritsuko ..."
4,Bakuman. 3rd Season,Bakuman.,バクマン。,Bakuman Season 3,TV,Manga,25,24 min. per ep.,PG-13 - Teens 13 or older,8.67,,Fall 2012,Unknown,"{'Adaptation': [{'mal_id': 9711, 'type': 'mang...","NHK, Shueisha",,J.C.Staff,"Comedy, Drama, Romance, Shounen","['#1: ""Moshimo no Hanashi (もしもの話)"" by nano.RIP...","['#1: ""Pride on Everyday"" by Sphere (eps 1-13)..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14473,Gutchonpa Omoshiro Hanashi,,グッチョンパおもしろ話,,TV,Unknown,5,8 min. per ep.,G - All Ages,5.50,,Fall 1987,Unknown,"{'Other': [{'mal_id': 26087, 'type': 'anime', ...",,,,Kids,[],[]
14474,Geba Geba Shou Time!,,ゲバゲバ笑タイム!,,OVA,Unknown,1,25 min.,G - All Ages,4.60,,,,[],Studio Lotus,,,Comedy,[],[]
14475,Godzilla: Hoshi wo Kuu Mono,,GODZILLA -星を喰う者-,"Godzilla Part 3, Godzilla: Eater of Stars",Movie,Other,1,Unknown,R - 17+ (violence & profanity),0.00,,,,"{'Prequel': [{'mal_id': 36816, 'type': 'anime'...",,,,"Action, Sci-Fi, Adventure, Fantasy",[],[]
14476,Nippon Mukashibanashi: Sannen Netarou,,日本昔ばなし 三ねん寝太郎,,OVA,Other,1,40 min.,G - All Ages,6.00,,,,[],,,,"Fantasy, Kids",[],[]


## Study the number of NaNs

In [34]:
# Missing values per column: isna() flags the missing cells, sum() totals them column by column.
nan_counts = df_copy.isna().sum()
nan_counts

title                 0
title_english      8754
title_japanese       35
title_synonyms     5541
type                  0
source                0
episodes              0
aired                 0
duration              0
rating              544
score                 0
background        13421
premiered         10382
broadcast         10207
related               0
producer           6190
licensor          11105
studio             5934
genre                64
opening_theme         0
ending_theme          0
dtype: int64

## Let's look more closely at the 8 features with a lot of NaNs

In [40]:
# The eight columns that the NaN count shows to be missing in a large share of rows.
values_to_keep = [
    'title_english', 'title_synonyms', 'background', 'premiered',
    'broadcast', 'producer', 'licensor', 'studio',
]

# Intersecting with the current columns skips any name that df_copy does not contain.
high_nan_cols = df_copy.columns.intersection(values_to_keep)
nan_df = df_copy.loc[:, high_nan_cols]
nan_df

Unnamed: 0,title_english,title_synonyms,background,premiered,broadcast,producer,licensor,studio
0,Inu X Boku Secret Service,Youko x Boku SS,Inu x Boku SS was licensed by Sentai Filmworks...,Winter 2012,Fridays at Unknown,"Aniplex, Square Enix, Mainichi Broadcasting Sy...",Sentai Filmworks,David Production
1,My Bride is a Mermaid,The Inland Sea Bride,,Spring 2007,Unknown,"TV Tokyo, AIC, Square Enix, Sotsu",Funimation,Gonzo
2,Shugo Chara!! Doki,"Shugo Chara Ninenme, Shugo Chara! Second Year",,Fall 2008,Unknown,"TV Tokyo, Sotsu",,Satelight
3,Princess Tutu,,Princess Tutu aired in two parts. The first pa...,Summer 2002,Fridays at Unknown,"Memory-Tech, GANSIS, Marvelous AQL",ADV Films,Hal Film Maker
4,Bakuman.,Bakuman Season 3,,Fall 2012,Unknown,"NHK, Shueisha",,J.C.Staff
...,...,...,...,...,...,...,...,...
14473,,,,Fall 1987,Unknown,,,
14474,,,,,,Studio Lotus,,
14475,,"Godzilla Part 3, Godzilla: Eater of Stars",,,,,,
14476,,,,,,,,


We can probably just remove these two title columns (`title_english` and `title_synonyms`), as we already have `title` and `title_japanese`.

Background has over 90% NaN and it doesn't look interesting

Premiered has a lot of NaNs; maybe aired is the better column to use.

Broadcast looked fun but over 2/3 is NaN

Let's look at how many different producers/licensors/studios there are, to see if they can be used.

In [46]:
# Frequency table for each of the three company columns, printed one after another.
# Note: the licensor and studio tables are printed again by the two cells that follow.
for company_col in ('producer', 'licensor', 'studio'):
    print(nan_df[company_col].value_counts())


producer
NHK                                                            427
Sanrio                                                         157
Pink Pineapple                                                 155
Bandai Visual                                                  117
Fuji TV                                                        102
                                                              ... 
Dentsu, Starchild Records, Imagica, Rakuonsha, King Records      1
Sotsu, Lantis, Warner Bros., Shogakukan                          1
Imagica, Mippei Eigeki Kiryuukan                                 1
Yomiko Advertising, Shueisha                                     1
Studio Lotus                                                     1
Name: count, Length: 3221, dtype: int64


In [47]:
# Frequency of each distinct licensor value. Rows with several licensors hold one
# comma-joined string, so every combination is counted as its own value.
print(nan_df['licensor'].value_counts())

licensor
Funimation                                                726
Sentai Filmworks                                          572
Media Blasters                                            208
ADV Films                                                 157
Viz Media                                                 134
                                                         ... 
Bandai Entertainment, Maiden Japan                          1
Viz Media, Flatiron Film Company                            1
Haoliners Animation League                                  1
ADV Films, Funimation, Kadokawa Pictures USA                1
Funimation, Sentai Filmworks, Geneon Entertainment USA      1
Name: count, Length: 193, dtype: int64


In [48]:
# Frequency of each distinct studio value. Rows with several studios hold one
# comma-joined string, so every combination is counted as its own value.
print(nan_df['studio'].value_counts())

studio
Toei Animation                         725
Sunrise                                447
J.C.Staff                              314
Madhouse                               311
Production I.G                         251
                                      ... 
Toei Animation, Bridge                   1
CUCURI, Digital Network Animation        1
Studio Unicorn                           1
Bee Train, Cookie Jar Entertainment      1
33 Collective                            1
Name: count, Length: 778, dtype: int64


There are some big studios and a ton of small independent producers - maybe we can keep the 10 most frequent studios and set everything below some threshold to "unknown", so that categories with very little data do not introduce bias (these are outliers we have to take care of).

### Cleaning up

In [52]:
# Second pass, starting again from the raw frame. Unlike the first pass,
# 'aired' is kept (in place of 'premiered'), and the sparse or redundant
# columns identified above are dropped as well.
cols_to_drop = [
    # identifier, image link and airing-state columns
    'anime_id', 'image_url', 'status', 'airing', 'aired_string',
    # mostly NaN; 'aired' is kept instead of 'premiered'
    'broadcast', 'premiered',
    # metrics that are only known after release
    'favorites', 'popularity', 'rank', 'scored_by', 'members',
    # redundant title variants and the mostly empty background text
    'title_english', 'title_synonyms', 'background',
]

df_copy = df.copy().drop(columns=cols_to_drop)
column_names = df_copy.columns
print(column_names)

# Missing values that remain per column after the clean-up.
remaining_nans = df_copy.isna().sum()
print(remaining_nans)
df_copy

Index(['title', 'title_japanese', 'type', 'source', 'episodes', 'aired',
       'duration', 'rating', 'score', 'related', 'producer', 'licensor',
       'studio', 'genre', 'opening_theme', 'ending_theme'],
      dtype='object')
title                 0
title_japanese       35
type                  0
source                0
episodes              0
aired                 0
duration              0
rating              544
score                 0
related               0
producer           6190
licensor          11105
studio             5934
genre                64
opening_theme         0
ending_theme          0
dtype: int64


Unnamed: 0,title,title_japanese,type,source,episodes,aired,duration,rating,score,related,producer,licensor,studio,genre,opening_theme,ending_theme
0,Inu x Boku SS,妖狐×僕SS,TV,Manga,12,"{'from': '2012-01-13', 'to': '2012-03-30'}",24 min. per ep.,PG-13 - Teens 13 or older,7.63,"{'Adaptation': [{'mal_id': 17207, 'type': 'man...","Aniplex, Square Enix, Mainichi Broadcasting Sy...",Sentai Filmworks,David Production,"Comedy, Supernatural, Romance, Shounen","['""Nirvana"" by MUCC']","['#1: ""Nirvana"" by MUCC (eps 1, 11-12)', '#2: ..."
1,Seto no Hanayome,瀬戸の花嫁,TV,Manga,26,"{'from': '2007-04-02', 'to': '2007-10-01'}",24 min. per ep.,PG-13 - Teens 13 or older,7.89,"{'Adaptation': [{'mal_id': 759, 'type': 'manga...","TV Tokyo, AIC, Square Enix, Sotsu",Funimation,Gonzo,"Comedy, Parody, Romance, School, Shounen","['""Romantic summer"" by SUN&LUNAR']","['#1: ""Ashita e no Hikari (明日への光)"" by Asuka Hi..."
2,Shugo Chara!! Doki,しゅごキャラ！！どきっ,TV,Manga,51,"{'from': '2008-10-04', 'to': '2009-09-25'}",24 min. per ep.,PG - Children,7.55,"{'Adaptation': [{'mal_id': 101, 'type': 'manga...","TV Tokyo, Sotsu",,Satelight,"Comedy, Magic, School, Shoujo","['#1: ""Minna no Tamago (みんなのたまご)"" by Shugo Cha...","['#1: ""Rottara Rottara (ロッタラ ロッタラ)"" by Buono! ..."
3,Princess Tutu,プリンセスチュチュ,TV,Original,38,"{'from': '2002-08-16', 'to': '2003-05-23'}",16 min. per ep.,PG-13 - Teens 13 or older,8.21,"{'Adaptation': [{'mal_id': 1581, 'type': 'mang...","Memory-Tech, GANSIS, Marvelous AQL",ADV Films,Hal Film Maker,"Comedy, Drama, Magic, Romance, Fantasy","['""Morning Grace"" by Ritsuko Okazaki']","['""Watashi No Ai Wa Chiisaikeredo"" by Ritsuko ..."
4,Bakuman. 3rd Season,バクマン。,TV,Manga,25,"{'from': '2012-10-06', 'to': '2013-03-30'}",24 min. per ep.,PG-13 - Teens 13 or older,8.67,"{'Adaptation': [{'mal_id': 9711, 'type': 'mang...","NHK, Shueisha",,J.C.Staff,"Comedy, Drama, Romance, Shounen","['#1: ""Moshimo no Hanashi (もしもの話)"" by nano.RIP...","['#1: ""Pride on Everyday"" by Sphere (eps 1-13)..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14473,Gutchonpa Omoshiro Hanashi,グッチョンパおもしろ話,TV,Unknown,5,"{'from': '1987-11-05', 'to': '1988-11-04'}",8 min. per ep.,G - All Ages,5.50,"{'Other': [{'mal_id': 26087, 'type': 'anime', ...",,,,Kids,[],[]
14474,Geba Geba Shou Time!,ゲバゲバ笑タイム!,OVA,Unknown,1,"{'from': '1986-03-21', 'to': '1986-03-21'}",25 min.,G - All Ages,4.60,[],Studio Lotus,,,Comedy,[],[]
14475,Godzilla: Hoshi wo Kuu Mono,GODZILLA -星を喰う者-,Movie,Other,1,"{'from': None, 'to': None}",Unknown,R - 17+ (violence & profanity),0.00,"{'Prequel': [{'mal_id': 36816, 'type': 'anime'...",,,,"Action, Sci-Fi, Adventure, Fantasy",[],[]
14476,Nippon Mukashibanashi: Sannen Netarou,日本昔ばなし 三ねん寝太郎,OVA,Other,1,"{'from': None, 'to': None}",40 min.,G - All Ages,6.00,[],,,,"Fantasy, Kids",[],[]


### Looking for outliers by ValueCounts on each feature

Also checked 'type', 'source' and 'rating' - these seem fine.
'genre' should probably be encoded.

In [57]:
# Episode counts have a long tail (196 distinct values, many seen only once) -
# a lot of outliers, so maybe bucket them with some range encoding.
print(df_copy['episodes'].value_counts())

episodes
1      6857
2      1187
12     1034
13      637
3       558
       ... 
161       1
150       1
220       1
167       1
132       1
Name: count, Length: 196, dtype: int64


In [58]:
# 301 distinct duration strings in mixed formats (per-episode vs. total, sec/min/hr).
# A lot of outliers, which the original note attributes to the many movies -
# maybe use some range encoding once the strings are parsed.
print(df_copy['duration'].value_counts())

duration
24 min. per ep.          1513
25 min. per ep.           808
23 min. per ep.           797
30 min. per ep.           749
2 min.                    476
                         ... 
46 sec. per ep.             1
1 hr. 17 min. per ep.       1
49 sec. per ep.             1
2 hr. 19 min.               1
12 sec. per ep.             1
Name: count, Length: 301, dtype: int64


In [62]:
# Distribution of the score column. Something needs to be done here: 0.00 is the
# single most frequent value.
# NOTE(review): the zeros look like placeholders for titles that have not been
# scored yet (e.g. not-yet-aired entries) rather than real ratings - confirm
# before modelling, since they pull the average down.
print(df_copy['score'].value_counts())

# Average score over all rows, zeros included (about 6.14).
print(df_copy['score'].mean())

score
0.00    354
6.00    120
5.00    100
6.50     90
6.33     87
       ... 
3.17      1
9.14      1
9.20      1
3.12      1
1.33      1
Name: count, Length: 630, dtype: int64
6.142481696366902


In [64]:
# Show the 'related' entry of the row labelled 0. It prints as a mapping from
# relation type to linked titles, which could be interesting if we want to see
# whether related media have any influence on the score.
print(df_copy.loc[0, 'related'])

{'Adaptation': [{'mal_id': 17207, 'type': 'manga', 'url': 'https://myanimelist.net/manga/17207/Inu_x_Boku_SS', 'title': 'Inu x Boku SS'}], 'Sequel': [{'mal_id': 13403, 'type': 'anime', 'url': 'https://myanimelist.net/anime/13403/Inu_x_Boku_SS_Special', 'title': 'Inu x Boku SS Special'}]}


In [93]:
# Preview the opening_theme column; entries print as list-like text, '[]' when there is no theme.
print(df_copy['opening_theme'])

def check_value(value):
    """Return True when a theme entry is empty.

    An entry counts as empty when it is at most two characters long, which
    covers the text of an empty list ('[]'). A missing value (None or float
    NaN) is also treated as empty instead of raising TypeError from len().

    Parameters
    ----------
    value : str or other sized object, None, or float NaN
        One cell of a theme column.

    Returns
    -------
    bool
        True if the entry is empty or missing, False otherwise.
    """
    # NaN is the only float that is not equal to itself; neither None nor NaN has a length.
    if value is None or (isinstance(value, float) and value != value):
        return True
    return len(value) <= 2

# Flag each opening_theme entry with check_value, then count the flagged rows.
is_empty = df_copy['opening_theme'].map(check_value)
count = is_empty.sum()
print(count)

0                                    ['"Nirvana" by MUCC']
1                       ['"Romantic summer" by SUN&LUNAR']
2        ['#1: "Minna no Tamago (みんなのたまご)" by Shugo Cha...
3                   ['"Morning Grace" by Ritsuko Okazaki']
4        ['#1: "Moshimo no Hanashi (もしもの話)" by nano.RIP...
                               ...                        
14473                                                   []
14474                                                   []
14475                                                   []
14476                                                   []
14477                                                   []
Name: opening_theme, Length: 14478, dtype: object
9784
