# Step 4: Final Data Cleaning and Fixes

**Metis Project 2, Andrew Zhou**

We do some final cleaning passes and add some columns to the dataframe.

In [255]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

from scipy import stats

In [256]:
# Input files for this step. Pickle files can execute arbitrary code when
# loaded, so only read pickles that were generated locally by this project.
MAL_INFO_PICKLE = "../data/mal_info_df.pickle"
SALES_LINKED_PICKLE = "../data/anime_sales_df_linked.pickle"

mal_info_df = pd.read_pickle(MAL_INFO_PICKLE)
anime_sales_df = pd.read_pickle(SALES_LINKED_PICKLE)

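Add the MyAnimeList (MAL) info by joining it to the sales data on the index, keeping only the rows present in both dataframes.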

In [257]:
# Column-wise inner join on the index: only rows whose index label appears in
# both the sales frame and the MAL info frame are kept.
anime_sales_df = pd.concat([anime_sales_df, mal_info_df], join="inner", axis=1)

Parse the `gross_1st_rls` strings into floats and store the result in a new `sales` column.

In [258]:
# Strip the leading one-character prefix (presumably a currency symbol -- TODO
# confirm against the scraping step) and the thousands separators, then cast to
# float. The vectorized `.str` accessor replaces a per-element lambda; a missing
# value becomes NaN here instead of raising.
gross_strings = anime_sales_df["gross_1st_rls"]
anime_sales_df["sales"] = (
    gross_strings.str[1:]
    .str.replace(",", "", regex=False)
    .astype(float)
)

Drop rows with unknown or missing data: zero sales, a rating of "None", a duration of "Unknown", or a missing score.

In [259]:
# A row is kept only when every one of these conditions holds.
has_sales = anime_sales_df["sales"] != 0
has_rating = anime_sales_df["rating"] != "None"
has_duration = anime_sales_df["duration"] != "Unknown"
has_score = anime_sales_df["score"].notna()

anime_sales_df = anime_sales_df[has_sales & has_rating & has_duration & has_score]

Modify and typecast some columns and make some new ones.

In [260]:
def _broadcast_or_blank(broadcast):
    """Return `broadcast` unchanged, or the pair (None, None) when it is null."""
    return (None, None) if pd.isnull(broadcast) else broadcast


def _hour_of(time_str):
    """Return the first two characters of `time_str` as an int, or None if it is falsy."""
    return int(time_str[:2]) if time_str else None


# Numeric casts.
cols_to_int = ["duration", "episodes", "year", "members"]
for col in cols_to_int:
    anime_sales_df[col] = anime_sales_df[col].astype(int)
anime_sales_df["score"] = anime_sales_df["score"].astype(float)
anime_sales_df["favorites"] = anime_sales_df["favorites"].astype(int)

# Split each broadcast pair into separate "day" and "time" columns; null
# broadcasts are first replaced by (None, None) so every row has two parts.
anime_sales_df["broadcast"] = anime_sales_df["broadcast"].apply(_broadcast_or_blank)
broadcast_parts = pd.DataFrame(
    anime_sales_df["broadcast"].tolist(),
    index=anime_sales_df.index,
)
anime_sales_df[["day", "time"]] = broadcast_parts
anime_sales_df["hour"] = anime_sales_df["time"].apply(_hour_of)

# Runtime is the episode count multiplied by the per-episode duration.
anime_sales_df["runtime"] = anime_sales_df["episodes"] * anime_sales_df["duration"]

Make a binary variable (0 or 1) for each genre and add these variables to the 
dataframe.

In [269]:
# One record per row, mapping each of that row's genres to 1. Building the frame
# from plain dicts keeps the genre columns in order of first appearance and
# avoids constructing a pandas Series for every row.
genre_records = [dict.fromkeys(genres, 1) for genres in anime_sales_df["genres"]]
genre_df = pd.DataFrame(genre_records, index=anime_sales_df.index)
# A genre that a row does not have comes out as NaN; turn those into 0.
genre_df = genre_df.fillna(0)

# Drop any genre columns left over from an earlier run of this cell so that
# re-running it does not append duplicate columns.
anime_sales_df = anime_sales_df.drop(columns=genre_df.columns, errors="ignore")
anime_sales_df = pd.concat([anime_sales_df, genre_df], axis=1)


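Drop certain outliers: rows whose sales value is 500 or more, and rows where the score, duration, members, or favorites value lies 2 or more standard deviations from the mean. These cutoffs were chosen after exploring the data and noting that the outliers cause significant instability in training and cross-validation scores across different train-val and k-fold CV splits.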

In [262]:
# Cutoffs used to discard outliers.
SALES_CEILING = 500
MAX_ABS_ZSCORE = 2
ZSCORE_COLUMNS = ["score", "duration", "members", "favorites"]

# First keep only the rows whose sales are strictly below the ceiling ...
anime_sales_df = anime_sales_df[anime_sales_df["sales"] < SALES_CEILING]

# ... then, with z-scores computed on the remaining rows only, keep the rows
# where every listed column is strictly within the z-score limit.
abs_zscores = np.abs(stats.zscore(anime_sales_df[ZSCORE_COLUMNS]))
within_limit = (abs_zscores < MAX_ABS_ZSCORE).all(axis=1)
anime_sales_df = anime_sales_df[within_limit]

Put sales at the end of all the columns for neatness's sake; also remove some unnecessary columns.

In [263]:
# Columns that are left out of the final frame.
unwanted_columns = ("avg_sales", "re_rls", "total", "gross_1st_rls", "Cars", "Dementia")

column_list = list(anime_sales_df.columns)
column_list.remove("sales")  # re-added below so that it ends up last
column_list = [col for col in column_list if col not in unwanted_columns]
column_list.append("sales")

anime_sales_df = anime_sales_df[column_list]

Replace uncommon source values with `None`, keeping only the five most frequent sources.

In [264]:
# Count the non-null titles for each source, most frequent source first.
titles_per_source = anime_sales_df.groupby("source")["title"].count()
sources_by_freq = titles_per_source.sort_values(ascending=False)
sources_by_freq

source
manga                         593
original                      262
novel                         254
visual novel                  105
game                           72
historical work                 4
pachinko                        3
other (visual combat book)      3
other (songs)                   3
other (figures)                 2
stage play                      1
spin-off (live action)          1
drama CD                        1
doujinshi                       1
comic book                      1
card game                       1
Name: title, dtype: int64

In [265]:
# The five most frequent sources are kept; every other source becomes None.
top_sources = sources_by_freq.sort_values(ascending=False).head(5)
common_sources = top_sources.index.tolist()


def _common_source_or_none(source):
    """Return `source` if it is one of the common sources, otherwise None."""
    return source if source in common_sources else None


anime_sales_df["source"] = anime_sales_df["source"].apply(_common_source_or_none)

In [266]:
# Preview the first rows of the cleaned dataframe.
anime_sales_df.head()

Unnamed: 0,title,year,season_qtr,studio,source,link,episodes,broadcast,genres,duration,...,Martial Arts,Psychological,Shoujo Ai,Space,Samurai,Horror,Thriller,Vampire,Shounen Ai,sales
.hack//SIGN,.hack//SIGN,2002,Spring,Bee Train,game,https://myanimelist.net/anime/48/hack__Sign,26,"(None, None)","[Game, Sci-Fi, Adventure, Mystery, Magic, Fant...",24,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,362.2
.hack//Tasogare no Udewa Densetsu,.hack//Tasogare no Udewa Densetsu,2003,Winter,Bee Train,game,https://myanimelist.net/anime/298/hack__Tasoga...,12,"(None, None)","[Adventure, Comedy, Fantasy, Game, Sci-Fi, Sho...",23,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,102.8
.hack//Roots,.hack//Roots,2006,Spring,Bee Train,game,https://myanimelist.net/anime/873/hack__Roots,26,"(None, None)","[Adventure, Drama, Fantasy, Game, Sci-Fi]",24,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,113.0
009-1,009-1,2006,Fall,Ishimori Entertainment,manga,https://myanimelist.net/anime/1583/009-1,12,"(Friday, 01:25)","[Action, Mecha, Sci-Fi, Seinen]",25,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,46.4
07-Ghost,07-Ghost,2009,Spring,Studio Deen,manga,https://myanimelist.net/anime/5525/07-Ghost,25,"(None, None)","[Action, Demons, Fantasy, Josei, Magic, Military]",23,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,197.2


In [267]:
# List the columns of the cleaned dataframe.
anime_sales_df.columns

Index(['title', 'year', 'season_qtr', 'studio', 'source', 'link', 'episodes',
       'broadcast', 'genres', 'duration', 'rating', 'score', 'members',
       'favorites', 'day', 'time', 'hour', 'runtime', 'Game', 'Sci-Fi',
       'Adventure', 'Mystery', 'Magic', 'Fantasy', 'Comedy', 'Shounen',
       'Drama', 'Action', 'Mecha', 'Seinen', 'Demons', 'Josei', 'Military',
       'Ecchi', 'Super Power', 'Supernatural', 'Music', 'Romance', 'School',
       'Slice of Life', 'Parody', 'Shoujo', 'Historical', 'Harem', 'Police',
       'Sports', 'Martial Arts', 'Psychological', 'Shoujo Ai', 'Space',
       'Samurai', 'Horror', 'Thriller', 'Vampire', 'Shounen Ai', 'sales'],
      dtype='object')

In [268]:
# Write the cleaned dataframe to disk.
final_pickle_path = "../data/anime_sales_df_final.pickle"
anime_sales_df.to_pickle(final_pickle_path)