# Data Cleaning and Preprocessing

In [46]:
# Basic Libraries
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt # we only need pyplot
sb.set() # set the default Seaborn style for graphics

In [47]:
file_path = [
    'dataset/anime.csv',
    'dataset/rankings/ranking_airing.csv'
    # ADD the remaining ranking csv files here!!!
]

---

# Exploring and Cleaning `anime.csv` first

In [48]:
data = pd.read_csv(file_path[0])
pd.set_option('display.max_columns', None)
data.head(n=3)

Unnamed: 0,id,title,main_picture,start_date,end_date,synopsis,mean,rank,popularity,num_list_users,num_scoring_users,nsfw,created_at,updated_at,media_type,status,genres,num_episodes,start_season,broadcast,source,average_episode_duration,rating,background,studios,statistics
0,95,Turn A Gundam,{'medium': 'https://api-cdn.myanimelist.net/im...,1999-04-09,2000-04-14,"It is the Correct Century, two millennia after...",7.71,1049,2892,40743,13338,white,2005-11-11T13:54:05+00:00,2022-03-22T03:37:48+00:00,tv,finished_airing,"[{'id': 1, 'name': 'Action'}, {'id': 2, 'name'...",50,"{'year': 1999, 'season': 'spring'}","{'day_of_the_week': 'friday', 'start_time': '1...",original,1445,pg_13,,"[{'id': 14, 'name': 'Sunrise'}, {'id': 1260, '...","{'status': {'watching': '2735', 'completed': '..."
1,3665,Ginga Eiyuu Densetsu Gaiden (1999),{'medium': 'https://api-cdn.myanimelist.net/im...,1999-12-24,2000-07-21,Ginga Eiyuu Densetsu Gaiden (1999) is the seco...,8.07,472,4347,17849,6478,white,2008-01-02T21:05:12+00:00,2022-03-18T19:39:11+00:00,ova,finished_airing,"[{'id': 1, 'name': 'Action'}, {'id': 8, 'name'...",28,"{'year': 1999, 'season': 'fall'}",,novel,1560,r,,"[{'id': 8, 'name': 'Artland'}, {'id': 207, 'na...","{'status': {'watching': '814', 'completed': '8..."
2,2471,Doraemon (1979),{'medium': 'https://api-cdn.myanimelist.net/im...,1979-04-02,2005-03-18,Nobita Nobi is a normal fourth grade student. ...,7.74,976,2553,51255,23826,white,2007-05-11T22:02:20+00:00,2022-03-23T06:24:35+00:00,tv,finished_airing,"[{'id': 2, 'name': 'Adventure'}, {'id': 4, 'na...",1787,"{'year': 1979, 'season': 'spring'}",,manga,660,pg,,"[{'id': 247, 'name': 'Shin-Ei Animation'}]","{'status': {'watching': '4637', 'completed': '..."


---

# 1. Handling missing values & Unused Features

In [49]:
# Check missing values in data
print("Missing values in dataframe:")
print(data.isnull().sum())
print("data shape: ", data.shape)

Missing values in dataframe:
id                             0
title                          0
main_picture                   0
start_date                     0
end_date                     876
synopsis                     128
mean                         116
rank                           0
popularity                     0
num_list_users                 0
num_scoring_users              0
nsfw                           0
created_at                     0
updated_at                     0
media_type                     0
status                         0
genres                         8
num_episodes                   0
start_season                   0
broadcast                   5372
source                       901
average_episode_duration       0
rating                        61
background                  7067
studios                        0
statistics                     0
dtype: int64
data shape:  (8777, 26)


## 1a. Dropping Unused Features

- We are dropping main_picture which is the profile picture of the anime, creat_date and update_date which is the create and update dates of the anime's MAL page, and background which is the background of the anime. 
- Because anime photo, site creation and update date does not provide useful information about the anime itself, and there are too many null values (7067/8777) for background.

In [50]:
# Drop columns: main_picture, created_at, updated_at, background
data_clean = data.drop(columns=['main_picture', 'created_at', 'updated_at', 'background'])
data_clean.head(n=3)

Unnamed: 0,id,title,start_date,end_date,synopsis,mean,rank,popularity,num_list_users,num_scoring_users,nsfw,media_type,status,genres,num_episodes,start_season,broadcast,source,average_episode_duration,rating,studios,statistics
0,95,Turn A Gundam,1999-04-09,2000-04-14,"It is the Correct Century, two millennia after...",7.71,1049,2892,40743,13338,white,tv,finished_airing,"[{'id': 1, 'name': 'Action'}, {'id': 2, 'name'...",50,"{'year': 1999, 'season': 'spring'}","{'day_of_the_week': 'friday', 'start_time': '1...",original,1445,pg_13,"[{'id': 14, 'name': 'Sunrise'}, {'id': 1260, '...","{'status': {'watching': '2735', 'completed': '..."
1,3665,Ginga Eiyuu Densetsu Gaiden (1999),1999-12-24,2000-07-21,Ginga Eiyuu Densetsu Gaiden (1999) is the seco...,8.07,472,4347,17849,6478,white,ova,finished_airing,"[{'id': 1, 'name': 'Action'}, {'id': 8, 'name'...",28,"{'year': 1999, 'season': 'fall'}",,novel,1560,r,"[{'id': 8, 'name': 'Artland'}, {'id': 207, 'na...","{'status': {'watching': '814', 'completed': '8..."
2,2471,Doraemon (1979),1979-04-02,2005-03-18,Nobita Nobi is a normal fourth grade student. ...,7.74,976,2553,51255,23826,white,tv,finished_airing,"[{'id': 2, 'name': 'Adventure'}, {'id': 4, 'na...",1787,"{'year': 1979, 'season': 'spring'}",,manga,660,pg,"[{'id': 247, 'name': 'Shin-Ei Animation'}]","{'status': {'watching': '4637', 'completed': '..."


In [51]:
data_clean.shape

(8777, 22)

## 1b. Filling in NaN values with domain-specific values

In [52]:
# No synopsis information available
data_clean["synopsis"].fillna(value = "no_Synopsis", inplace = True)

# Anime still airing/ongoing
data_clean["end_date"].fillna(value = "airing", inplace = True)

# Anime not broadcasted, replace missing value with the same formet for easy sorting
data_clean["broadcast"].fillna(value = "{'day_of_the_week': 'NIL', 'start_time': 'NIL'}", inplace = True)

# Source not known
data_clean["source"].fillna(value = "unknown", inplace = True)

# Not rated
data_clean["rating"].fillna(value = "no_rating", inplace = True)

# No genre information
data_clean["genres"].fillna(value = "[{'id': -1, 'name': 'no_genre'}]", inplace = True)

# Animes that do not have enough user giving their scorings, so replace null with value -1
data_clean["mean"].fillna(value = "-1", inplace = True)
#data_clean = data_clean[~np.isnan(data["mean"])]

# Check null values after cleaning
data_clean.isnull().sum()

id                          0
title                       0
start_date                  0
end_date                    0
synopsis                    0
mean                        0
rank                        0
popularity                  0
num_list_users              0
num_scoring_users           0
nsfw                        0
media_type                  0
status                      0
genres                      0
num_episodes                0
start_season                0
broadcast                   0
source                      0
average_episode_duration    0
rating                      0
studios                     0
statistics                  0
dtype: int64

In [53]:
data_clean.head()

Unnamed: 0,id,title,start_date,end_date,synopsis,mean,rank,popularity,num_list_users,num_scoring_users,nsfw,media_type,status,genres,num_episodes,start_season,broadcast,source,average_episode_duration,rating,studios,statistics
0,95,Turn A Gundam,1999-04-09,2000-04-14,"It is the Correct Century, two millennia after...",7.71,1049,2892,40743,13338,white,tv,finished_airing,"[{'id': 1, 'name': 'Action'}, {'id': 2, 'name'...",50,"{'year': 1999, 'season': 'spring'}","{'day_of_the_week': 'friday', 'start_time': '1...",original,1445,pg_13,"[{'id': 14, 'name': 'Sunrise'}, {'id': 1260, '...","{'status': {'watching': '2735', 'completed': '..."
1,3665,Ginga Eiyuu Densetsu Gaiden (1999),1999-12-24,2000-07-21,Ginga Eiyuu Densetsu Gaiden (1999) is the seco...,8.07,472,4347,17849,6478,white,ova,finished_airing,"[{'id': 1, 'name': 'Action'}, {'id': 8, 'name'...",28,"{'year': 1999, 'season': 'fall'}","{'day_of_the_week': 'NIL', 'start_time': 'NIL'}",novel,1560,r,"[{'id': 8, 'name': 'Artland'}, {'id': 207, 'na...","{'status': {'watching': '814', 'completed': '8..."
2,2471,Doraemon (1979),1979-04-02,2005-03-18,Nobita Nobi is a normal fourth grade student. ...,7.74,976,2553,51255,23826,white,tv,finished_airing,"[{'id': 2, 'name': 'Adventure'}, {'id': 4, 'na...",1787,"{'year': 1979, 'season': 'spring'}","{'day_of_the_week': 'NIL', 'start_time': 'NIL'}",manga,660,pg,"[{'id': 247, 'name': 'Shin-Ei Animation'}]","{'status': {'watching': '4637', 'completed': '..."
3,21,One Piece,1999-10-20,airing,"Gol D. Roger was known as the ""Pirate King,"" t...",8.63,66,26,1812581,1020274,white,tv,currently_airing,"[{'id': 1, 'name': 'Action'}, {'id': 2, 'name'...",0,"{'year': 1999, 'season': 'fall'}","{'day_of_the_week': 'sunday', 'start_time': '0...",manga,1440,pg_13,"[{'id': 18, 'name': 'Toei Animation'}]","{'status': {'watching': '1227452', 'completed'..."
4,2397,Digimon Adventure: Bokura no War Game!,2000-03-04,2000-03-04,This movie takes place after the Adventure ser...,7.77,924,2135,70125,43599,white,movie,finished_airing,"[{'id': 2, 'name': 'Adventure'}, {'id': 4, 'na...",1,"{'year': 2000, 'season': 'winter'}","{'day_of_the_week': 'NIL', 'start_time': 'NIL'}",original,2460,pg,"[{'id': 18, 'name': 'Toei Animation'}]","{'status': {'watching': '653', 'completed': '6..."


---


# 2. Cleaning features by converting into json and splitting into different columns

In [54]:
# Import library for json manipulation
import json

In [55]:
# Splitting start_season column into individual year and season columns
def split_start_season(data_clean):
    # create NaN columns
    data_clean['start_season_year'] = np.nan
    data_clean['start_season_season'] = np.nan

    
    for row in range(0,len(data_clean)):
        #convert from string to json
        start_season = (json.loads(data_clean['start_season'][row].replace("'", "\"")))
        year = start_season['year']
        season = start_season['season']

        data_clean['start_season_year'][row] = year
        data_clean['start_season_season'][row] = season
    
    
    # drop original column
    data_clean.drop(columns=['start_season'], inplace=True)
    
    
    return data_clean

In [56]:
# Splitting broadcast column into individual day and time columns
def split_broadcast(data_clean):
    # create NaN columns
    data_clean['broadcast_day_of_the_week'] = np.nan
    data_clean['broadcast_start_time'] = np.nan

    
    for row in range(0,len(data_clean)):
        #convert from string to json
        broadcast = (json.loads(data_clean['broadcast'][row].replace("'", "\"")))


        data_clean['broadcast_day_of_the_week'][row] = broadcast['day_of_the_week']

        try:
            data_clean['broadcast_start_time'][row] = broadcast['start_time']
        except:
            data_clean['broadcast_start_time'][row] = 'NIL'
    
    
    # drop original column
    data_clean.drop(columns=['broadcast'], inplace=True)
    
    
    return data_clean

In [57]:
# Splitting statistics column into watching, completed, on hold, plan to watch and num of user columns

def split_statistics(data_clean):
    # create NaN columns
    data_clean['statistics_watching'] = np.nan
    data_clean['statistics_completed'] = np.nan
    data_clean['statistics_on_hold'] = np.nan
    data_clean['statistics_dropped'] = np.nan
    data_clean['statistics_plan_to_watch'] = np.nan
    data_clean['statistics_num_list_users'] = np.nan


    for row in range(0,len(data_clean)):
        # convert from string to json
        statistics = (json.loads(data_clean['statistics'][row].replace("'", "\"")))

        data_clean['statistics_watching'][row] = statistics['status']['watching']
        data_clean['statistics_completed'][row] = statistics['status']['completed']
        data_clean['statistics_on_hold'][row] = statistics['status']['on_hold']
        data_clean['statistics_dropped'][row] = statistics['status']['dropped']
        data_clean['statistics_plan_to_watch'][row] = statistics['status']['plan_to_watch']
        data_clean['statistics_num_list_users'][row] = statistics['num_list_users']
    
    
    # drop original column
    data_clean.drop(columns=['statistics'], inplace=True)
    
    
    return data_clean

In [58]:
# Convert studios into json format
def json_studios(data_clean):
    # Convert studios string to json
    for row in range(0, len(data_clean)):
        try:
            studios = (json.loads(data_clean['studios'][row].replace("'", "\"")))
        except:
            studios = (json.loads(data_clean['studios'][row].replace("'", "\"").replace("\"s", '\'s').replace('N\"', "N\'")))

        data_clean['studios'][row] = studios
    
    return data_clean

In [59]:
# Convert genres into json format
def json_genres(data_clean):
    #Convert genres string to json
    for row in range(0, len(data_clean)):
        genres = json.loads(data_clean['genres'][row].replace("'", "\""))

        data_clean['genres'][row] = genres
        
    return data_clean

**Summarising Statistics Feature:**
- Let's look at the various `statistics` to compare the viewership statistics:
  - statistics_watching
  - statistics_completed
  - statistics_on_hold
  - statistics_dropped
  - statistics_plan_to_watch
  - **statistics_num_list_users** (equal to sum of the previous statistics)
  
- We shall split the statistics into 2 groups to group the **positive/negative sentiment of the viewership statistics**:
  1. `Positive`
    - statistics_watching + statistics_completed + statistics_plan_to_watch
  2. `Negative`
    - statistics_on_hold + statistics_dropped

In [61]:
# Create percentage fraction positive/negative viewerships --> Range: [0, 1]
# *function to be called after split_statistics() function*

def get_pos_neg_viewership(anime, viewership_types_list):
    # single anime
    total_pos_neg_views = 0

    for viewership_type in viewership_types_list:
        total_pos_neg_views += data_clean[viewership_type][anime]
    
    return total_pos_neg_views

def create_viewership_fraction(data_clean):
    # create NaN columns
    data_clean['positive_viewership_fraction'] = np.nan
    data_clean['negative_viewership_fraction'] = np.nan
    
    positive_viewership = [
        'statistics_watching',
        'statistics_completed',
        'statistics_plan_to_watch'
    ]
    negative_viewership = [
        'statistics_on_hold',
        'statistics_dropped'
    ]
    
    for anime in range(0, len(data_clean)):
        total_views = data_clean['statistics_num_list_users'][anime]
        
        # calulating the total postive and total negative views respectively
        total_pos_views = get_pos_neg_viewership(anime, positive_viewership)
        total_neg_views = get_pos_neg_viewership(anime, negative_viewership)
        
        # calculate percentage fraction & create a new column
        data_clean['positive_viewership_fraction'][anime] = round(total_pos_views/total_views, 4)
        data_clean['negative_viewership_fraction'][anime] = round(total_neg_views/total_views, 4)
    
    return data_clean

In [62]:
# convert to json and split features
data_clean = split_start_season(data_clean)
data_clean = split_broadcast(data_clean)
data_clean = split_statistics(data_clean)
data_clean = json_studios(data_clean)
data_clean = json_genres(data_clean)
data_clean = create_viewership_fraction(data_clean)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_clean['start_season_year'][row] = year
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_clean['start_season_season'][row] = season
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_clean['broadcast_day_o

---

# 3. Convert `data_clean` to `csv file`

In [63]:
data_clean.head()

Unnamed: 0,id,title,start_date,end_date,synopsis,mean,rank,popularity,num_list_users,num_scoring_users,nsfw,media_type,status,genres,num_episodes,source,average_episode_duration,rating,studios,start_season_year,start_season_season,broadcast_day_of_the_week,broadcast_start_time,statistics_watching,statistics_completed,statistics_on_hold,statistics_dropped,statistics_plan_to_watch,statistics_num_list_users,positive_viewership_fraction,negative_viewership_fraction
0,95,Turn A Gundam,1999-04-09,2000-04-14,"It is the Correct Century, two millennia after...",7.71,1049,2892,40743,13338,white,tv,finished_airing,"[{'id': 1, 'name': 'Action'}, {'id': 2, 'name'...",50,original,1445,pg_13,"[{'id': 14, 'name': 'Sunrise'}, {'id': 1260, '...",1999.0,spring,friday,17:00,2735.0,16661.0,2538.0,1597.0,17292.0,40823.0,0.8987,0.1013
1,3665,Ginga Eiyuu Densetsu Gaiden (1999),1999-12-24,2000-07-21,Ginga Eiyuu Densetsu Gaiden (1999) is the seco...,8.07,472,4347,17849,6478,white,ova,finished_airing,"[{'id': 1, 'name': 'Action'}, {'id': 8, 'name'...",28,novel,1560,r,"[{'id': 8, 'name': 'Artland'}, {'id': 207, 'na...",1999.0,fall,NIL,NIL,814.0,8080.0,638.0,293.0,8048.0,17873.0,0.9479,0.0521
2,2471,Doraemon (1979),1979-04-02,2005-03-18,Nobita Nobi is a normal fourth grade student. ...,7.74,976,2553,51255,23826,white,tv,finished_airing,"[{'id': 2, 'name': 'Adventure'}, {'id': 4, 'na...",1787,manga,660,pg,"[{'id': 247, 'name': 'Shin-Ei Animation'}]",1979.0,spring,NIL,NIL,4637.0,29125.0,4569.0,7816.0,5227.0,51374.0,0.7589,0.2411
3,21,One Piece,1999-10-20,airing,"Gol D. Roger was known as the ""Pirate King,"" t...",8.63,66,26,1812581,1020274,white,tv,currently_airing,"[{'id': 1, 'name': 'Action'}, {'id': 2, 'name'...",0,manga,1440,pg_13,"[{'id': 18, 'name': 'Toei Animation'}]",1999.0,fall,sunday,09:30,1227452.0,0.0,234652.0,161273.0,189153.0,1812530.0,0.7816,0.2184
4,2397,Digimon Adventure: Bokura no War Game!,2000-03-04,2000-03-04,This movie takes place after the Adventure ser...,7.77,924,2135,70125,43599,white,movie,finished_airing,"[{'id': 2, 'name': 'Adventure'}, {'id': 4, 'na...",1,original,2460,pg,"[{'id': 18, 'name': 'Toei Animation'}]",2000.0,winter,NIL,NIL,653.0,63538.0,285.0,314.0,5432.0,70222.0,0.9915,0.0085


In [64]:
def export_df_to_csv(data_clean, file_path):
    new_file_path = file_path[0][:-4] + '_cleaned' + '.csv'
    data_clean.to_csv(new_file_path, index=False)

In [65]:
export_df_to_csv(data_clean, file_path)

---

# Clean and process rankings csv files

In [19]:
# 1. create a function for the handling of NaN and dropping of unused features (part 1)
# 2. Use the above functions to convert to json and split the features
# 3. convert to csv following the above method!!

# NOTE: Do this iteratively.

---

# Making Time Series Data
## 1. Genres Time Series [1999 to 2021]
- We will use `start_season_year` and `genres`

In [32]:
anime_df = pd.read_csv('dataset/anime_cleaned.csv')

In [33]:
genres_time_series_df = anime_df.loc[:, ['start_season_year', 'genres']]
genres_time_series_df.head()

Unnamed: 0,start_season_year,genres
0,1999.0,"[{'id': 1, 'name': 'Action'}, {'id': 2, 'name'..."
1,1999.0,"[{'id': 1, 'name': 'Action'}, {'id': 8, 'name'..."
2,1979.0,"[{'id': 2, 'name': 'Adventure'}, {'id': 4, 'na..."
3,1999.0,"[{'id': 1, 'name': 'Action'}, {'id': 2, 'name'..."
4,2000.0,"[{'id': 2, 'name': 'Adventure'}, {'id': 4, 'na..."


### Convert `genres` feature to json for exploration:

In [34]:
# convert to json
genres_time_series_df = json_genres(genres_time_series_df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_clean['genres'][row] = genres


In [35]:
# drop anime with no genres

print('Number of rows before:', len(genres_time_series_df))

for row in range(len(anime_time_series_df)):
    if genres_time_series_df['genres'][row][0]['name'] == 'no_genre':
        genres_time_series_df.drop(index=row, inplace=True)

# reset the index
genres_time_series_df = genres_time_series_df.reset_index(drop=True)

print('Number of rows left:', len(genres_time_series_df))

Number of rows before: 8777
Number of rows left: 8769


In [36]:
# for each anime, split the genres into individual genre, 
# and add as new row with associated start_season_year, 

index_count = 8769

for row in range(len(genres_time_series_df)):
    start_season_year = genres_time_series_df['start_season_year'][row]
    
    for genre in genres_time_series_df['genres'][row]:
        genres_time_series_df = genres_time_series_df.append(
            pd.DataFrame({"start_season_year": start_season_year, "genres": genre})
            .drop(index=['id'])
            .rename(index={'name': index_count}))
        index_count += 1


# drop previous rows
genres_time_series_df = genres_time_series_df[8769:]
genres_time_series_df = genres_time_series_df.reset_index(drop=True)
genres_time_series_df

Unnamed: 0,start_season_year,genres
0,1999.0,Action
1,1999.0,Adventure
2,1999.0,Drama
3,1999.0,Mecha
4,1999.0,Military
...,...,...
31924,2021.0,Music
31925,2021.0,Supernatural
31926,2021.0,Vampire
31927,2010.0,Comedy


In [37]:
# rename 'genres' as 'genre'
genres_time_series_df.rename({'genres': 'genre'}, axis="columns", inplace=True)

# drop rows with 'Award Winning' genre since it is not really a genre and it only has 3 rows
print(f"Number of rows before removing 'Award Winning': {len(genres_time_series_df)}")
genres_time_series_df.drop(genres_time_series_df[genres_time_series_df.genre == 'Award Winning'].index, inplace=True)
print(f"Number of rows after removing 'Award Winning': {len(genres_time_series_df)}")

genres_time_series_df

Number of rows before removing 'Award Winning': 31929
Number of rows after removing 'Award Winning': 31926


Unnamed: 0,start_season_year,genre
0,1999.0,Action
1,1999.0,Adventure
2,1999.0,Drama
3,1999.0,Mecha
4,1999.0,Military
...,...,...
31924,2021.0,Music
31925,2021.0,Supernatural
31926,2021.0,Vampire
31927,2010.0,Comedy


In [38]:
# group according to start_season_year and genres
genres_time_series_df = genres_time_series_df.groupby(['start_season_year', 'genre']).size().reset_index().rename(columns={0: 'count'})


In [39]:
genres_time_series_df

Unnamed: 0,start_season_year,genre,count
0,1969.0,Comedy,44
1,1969.0,Slice of Life,44
2,1979.0,Adventure,21
3,1979.0,Comedy,21
4,1979.0,Fantasy,21
...,...,...,...
924,2021.0,Sports,11
925,2021.0,Super Power,13
926,2021.0,Supernatural,57
927,2021.0,Suspense,8


In [40]:
# create a list for genres
genres_list = [genre for genre in set(genres_time_series_df['genre'])]
print('Number of unique genres:', len(genres_list))


# create a new dataframe for as timeseries
column_names = ['Start Season Year']
column_names.extend([genre for genre in genres_list])

new_genres_time_series_df = pd.DataFrame(columns=column_names)
new_genres_time_series_df.head()

Number of unique genres: 40


Unnamed: 0,Start Season Year,School,Suspense,Mystery,Adventure,Slice of Life,Sports,Martial Arts,Space,Comedy,...,Shounen,Game,Shoujo,Sci-Fi,Military,Girls Love,Seinen,Fantasy,Police,Demons


In [41]:
# add rows into the timeseries dataframe
#for row in test:


for row in range(len(genres_time_series_df)):
    skip = False
    
    single_year_row = {}

    start_season_year = genres_time_series_df['start_season_year'][row]
    
    # skip years earlier than 1999
    if start_season_year < 1999.0:
        continue
    
    # if start season year already exists in the dataframe, just add 
    for year in new_genres_time_series_df['Start Season Year']:
        if start_season_year == year:
            # add to dataframe
            genre = genres_time_series_df['genre'][row]
            genre_count = genres_time_series_df['count'][row]

            new_genres_time_series_df.loc[new_genres_time_series_df['Start Season Year'] == start_season_year, genre] = genre_count
            
            skip = True
            break
        
    if skip:
        continue
    
    single_year_row['Start Season Year'] = [start_season_year]
    
    for genre in genres_list:
        # add to dictionary the start season year and count
        if genre == genres_time_series_df['genre'][row]:
            single_year_row[genre] = [genres_time_series_df['count'][row]]
        else:
            single_year_row[genre] = [0]
    
    # add to dataframe
    new_genres_time_series_df = new_genres_time_series_df.append(pd.DataFrame(single_year_row), ignore_index=True)
    
new_genres_time_series_df

Unnamed: 0,Start Season Year,School,Suspense,Mystery,Adventure,Slice of Life,Sports,Martial Arts,Space,Comedy,...,Shounen,Game,Shoujo,Sci-Fi,Military,Girls Love,Seinen,Fantasy,Police,Demons
0,1999.0,3,1,20,127,35,0,0,6,167,...,103,0,1,38,8,0,6,108,0,6
1,2000.0,14,0,18,144,12,10,4,12,172,...,88,26,22,76,8,0,6,97,5,32
2,2001.0,34,0,14,110,36,41,8,15,161,...,95,6,25,88,10,1,10,85,5,4
3,2002.0,40,0,27,149,51,23,25,19,234,...,111,21,32,127,22,0,20,101,15,8
4,2003.0,18,0,20,148,26,26,6,8,168,...,110,12,20,126,14,1,13,77,10,19
5,2004.0,31,13,34,155,23,23,9,20,223,...,169,23,34,127,30,7,34,87,12,17
6,2005.0,24,3,24,102,30,23,5,9,202,...,139,16,32,124,27,0,25,98,5,3
7,2006.0,33,6,31,112,23,6,13,8,212,...,108,15,33,82,29,4,32,109,9,26
8,2007.0,41,8,35,112,24,19,48,10,166,...,86,8,23,58,13,0,27,81,9,6
9,2008.0,54,11,25,81,35,17,7,8,167,...,79,21,31,62,17,5,20,79,6,9


In [42]:
# Export as csv
new_genres_time_series_df.to_csv('dataset/genres_timeseries_cleaned.csv', index=False)