# Setup

Importing modules.

In [None]:
import pandas as pd
import numpy as np
from itables import init_notebook_mode

Loading in data from csv.

In [None]:
# Raw dataset, read from a path relative to the notebook's working directory.
# The cleaning step that follows builds a new frame from DF_ORIGINAL rather than modifying it.
DF_ORIGINAL = pd.read_csv("../data/manga.csv")

Filling all null tag and genre features with zero, as that is their implicit value.

In [None]:
# Numeric columns whose nulls must be left alone: the identifier plus the
# chapter/volume and start/end date columns (judging by their names, a null
# there means "unknown" rather than zero).
NON_ZEROABLE_COLUMN_NAMES = (
    "id", "chapters", "volumes",
    "start_year", "start_month", "start_day",
    "end_year", "end_month", "end_day",
)

# Every remaining numeric column is eligible for zero-filling.
ZEROABLE_NUMERIC_COLUMN_NAMES = (
    DF_ORIGINAL
    .drop(columns=list(NON_ZEROABLE_COLUMN_NAMES))
    .select_dtypes(include=['number'])
    .columns
    .tolist()
)

# One zero per eligible column, paired up into the {column: fill value} map
# that DataFrame.fillna accepts.
DEFAULT_ZEROES = [0 for _ in ZEROABLE_NUMERIC_COLUMN_NAMES]
NULL_MAP = {
    column_name: fill_value
    for column_name, fill_value in zip(ZEROABLE_NUMERIC_COLUMN_NAMES, DEFAULT_ZEROES)
}

# fillna returns a new frame, so DF_ORIGINAL keeps its nulls.
ZEROED_DF = DF_ORIGINAL.fillna(value=NULL_MAP)

Computing target feature from data.

In [None]:
# Demographic tag columns. The order is significant: DataFrame.idxmax returns
# the first column holding the row maximum, so listing them this way resolves
# ties in the order Shounen > Shoujo > Seinen > Josei, which is what comparing
# (value, name) tuples with max() produced.
DEMO_TAG_NAMES = ["Shounen", "Shoujo", "Seinen", "Josei"]

# Individual demographic columns, kept available under their own names.
SHOUNEN_TAG_PCT = ZEROED_DF["Shounen"]
SHOUJO_TAG_PCT = ZEROED_DF["Shoujo"]
SEINEN_TAG_PCT = ZEROED_DF["Seinen"]
JOSEI_TAG_PCT = ZEROED_DF["Josei"]

demo_pct = ZEROED_DF[DEMO_TAG_NAMES]

# A row only receives a label when its four demographic values sum to more
# than zero; otherwise the label is left missing (None).
has_demo_tag = demo_pct.sum(axis=1) > 0

# Vectorised over the whole frame instead of looping row by row with integer
# label lookups, which is slow and only correct on a default RangeIndex.
# np.where with a None fallback yields an object array of str / None, and
# .tolist() keeps demo_col a plain Python list.
demo_col = np.where(has_demo_tag, demo_pct.idxmax(axis=1), None).tolist()

# A list is assigned positionally, so the labels line up with the rows of
# ZEROED_DF regardless of its index. The source tag columns are then dropped
# because demo_label now carries that information.
DEMO_ADDED_DF = (ZEROED_DF
                 .assign(demo_label = demo_col)
                 .drop(DEMO_TAG_NAMES, axis=1))

# Full-dataset summaries

Checking summary statistics on numeric features, excluding the media ID.

In [None]:
# Switch on interactive rendering for every data frame displayed from here on.
init_notebook_mode(all_interactive=True)

# Numeric features only, with the media ID left out of the summary.
numeric_features = (DEMO_ADDED_DF
                    .drop(columns="id")
                    .select_dtypes(include=np.number))

# Standard summary statistics, one column per numeric feature.
summary_df = numeric_features.describe(include=np.number)

# Extra row: share of missing values per feature, formatted as a percentage string.
null_share = numeric_features.isna().mean()
summary_df.loc["pct_null"] = ["{:0.2%}".format(share) for share in null_share.tolist()]

# Transpose so that each feature becomes a row and each statistic a column.
summary_df = summary_df.T
summary_df

Checking summary statistics for categorical features as well, excluding names of media.

In [None]:
def cat_summary_frame(df, colname, naincl = False):

    """
    Tabulate how often each level of a feature occurs in a dataframe.

    Parameters
    ----------
    df : DataFrame
        The data to summarize.
    colname : str
        Name of the feature whose levels are counted.
    naincl : bool, default False
        Forwarded unchanged as the ``dropna`` argument of ``groupby``.
        With False (the default) missing values are kept and counted as
        their own level; with True they are dropped before counting.

    Returns
    -------
    DataFrame
        One row per level with the columns ``colname``, "count", and "pct"
        (the share of the counted rows, rounded to two decimals and stored
        as a string ending in "%"), sorted by descending count with a fresh
        integer index.
    """

    level_counts = df.groupby(colname, dropna = naincl).size().to_frame(name="count")

    # Share of all counted rows, as a percentage rounded to two decimals.
    share = round((level_counts["count"] / level_counts["count"].sum()) * 100, 2)

    # Stored as text so the percent sign can be appended.
    level_counts["pct"] = share.astype("string") + "%"

    ordered = level_counts.sort_values("count", ascending=False)
    return ordered.reset_index()

# naincl is left at its default (False), which the helper forwards as
# groupby(dropna=False): rows with a missing status are counted as their own level.
cat_summary_frame(DEMO_ADDED_DF, "status")

In [None]:
# Default naincl=False is forwarded as dropna=False, so a missing source
# appears as its own level in the counts and percentages.
cat_summary_frame(DEMO_ADDED_DF, "source")

In [None]:
# Default naincl=False is forwarded as dropna=False, so a missing country
# appears as its own level in the counts and percentages.
cat_summary_frame(DEMO_ADDED_DF, "country")

In [None]:
# Default naincl=False is forwarded as dropna=False, so rows that received no
# demographic label are tallied as their own level and count toward the percentages.
cat_summary_frame(DEMO_ADDED_DF, "demo_label")

In [None]:
# Same summary restricted to labelled rows: naincl=True is forwarded as
# groupby's dropna=True, so rows with a missing demo_label are dropped and the
# percentages are relative to the labelled rows only.
cat_summary_frame(DEMO_ADDED_DF, "demo_label", naincl=True)

# Visualizing Data

Visualizing our data will be significantly easier if we use principal component analysis to reduce the number of features. However, PCA runs the risk of diminishing the interpretability of any visualizations we create from the principal components. 

In order to balance these concerns, we will:
1. Restrict our PCA feature reduction to tag features only, as they are both sparse and make up the majority of our features.
2. Perform PCA within groupings of tags, using the groups AniList defines for tags as a guide.

In [None]:
# Tag reference table, read from a path relative to the notebook's working
# directory. Its "category" and "description" columns are used further down.
tag_appendix = pd.read_csv("../data/tag_reference.csv")
# Bare last expression so the notebook displays the frame.
tag_appendix

In [None]:
# Number of tags per category as loaded from the reference file. Default
# naincl=False is forwarded as dropna=False, so a missing category would show up as its own level.
cat_summary_frame(tag_appendix, "category")

In order to reduce the number of categories slightly (and avoid giving individual tags their own category), after looking at the tags contained within each category, we will merge and rename categories for clarity.

In [None]:
# Original category -> merged category. Any category not listed here keeps
# its original name.
tag_category_merges = {
    "Cast-Main Cast": "Cast",
    "Cast-Traits": "Cast",
    "Demographic": "Technical",
    "Setting-Scene": "Setting",
    "Setting-Time": "Setting",
    "Setting-Universe": "Setting",
    "Theme-Arts-Music": "Theme-Arts",
    "Theme-Sci-Fi-Mecha": "Theme-Sci-Fi",
    "Theme-Game-Card & Board Game": "Theme-Game",
    "Theme-Game-Sport": "Theme-Sport",
}

# np.select pairs conditions with choices by position, so both lists are
# derived from the single mapping above to keep them in step.
tag_conditions = [
    tag_appendix["category"] == original_name
    for original_name in tag_category_merges
]
tag_choices = list(tag_category_merges.values())

# Rows matching none of the conditions fall back to their existing category.
merged_category = np.select(
    condlist=tag_conditions,
    choicelist=tag_choices,
    default=tag_appendix["category"],
)

# Replace the category column with the merged one and drop the description column.
tag_appendix_recat = (tag_appendix
                      .assign(category=merged_category)
                      .drop(columns="description"))

In [None]:
# Number of tags per category after merging and renaming. Default naincl=False
# is forwarded as dropna=False, so a missing category would show up as its own level.
cat_summary_frame(tag_appendix_recat, "category")