**Setup**

Importing modules.

In [None]:
import os
import pandas as pd
import numpy as np
import sklearn
import imblearn
import sklearn.inspection
from itables import init_notebook_mode

Loading in data from csv.

In [None]:
# Read the raw manga table from the CSV that sits next to the notebook.
df_original = pd.read_csv(filepath_or_buffer="manga.csv")
# Last-modification time of that CSV, in seconds since the epoch.
mod_date = os.stat("manga.csv").st_mtime

**Data Exploration Phase 1**

Showing the summary statistics for the data frame.

In [None]:
# Switch on itables' interactive rendering for every DataFrame shown below.
init_notebook_mode(all_interactive=True)

# include="all" summarises non-numeric columns too; transposed so each
# original column becomes one row of the summary.
df_original.describe(include="all").T

In [None]:
# Number of missing values in each column of the raw data.
df_original.isna().sum()

**Data Wrangling**

Removing columns that are not used for prediction.

In [None]:
# Identifier and title columns are dropped; the result is a new frame and
# df_original is left untouched.
updated_df_1 = df_original.drop(columns=["id", "eng_title", "rom_title"])

Filling missing values with zero in the numeric columns where a missing value effectively means zero (date parts and chapter/volume counts are excluded).

In [None]:
# Work on an explicit copy: a plain assignment would only alias updated_df_1,
# and the column assignment below would then silently modify it as well.
updated_df_2 = updated_df_1.copy()

# Every numeric column except the date parts and chapter/volume counts is
# zero-filled; the excluded columns keep their NaNs.
zeroed_columns = updated_df_2.select_dtypes(include=['number']).drop(["start_year","start_month","start_day","end_year","end_month","end_day","chapters","volumes"], axis=1).columns
updated_df_2[zeroed_columns] = updated_df_2[zeroed_columns].fillna(value=0)

Checking the missing rows for status manually.

In [None]:
# Rows of the raw data whose status is missing, shown for manual inspection.
df_original[df_original["status"].isna()]

In [None]:
# Count of non-missing rows for each status value.
df_original.groupby(by="status").agg(func={"status": "count"})

The first and third manga are very likely still releasing (they have start dates but no end dates), and the second is very likely finished (it has both start and end dates, and cancelled manga make up only a small proportion of the data), so we fill in those three status values manually.

In [None]:
# Copy first: assigning updated_df_2 directly would alias it, so the .loc
# writes below would also rewrite updated_df_2 (and anything it aliases).
updated_df_3 = updated_df_2.copy()

# Manually chosen status for the three rows (by index label) where it is missing.
updated_df_3.loc[1272,"status"] = "RELEASING"
updated_df_3.loc[2943,"status"] = "FINISHED"
updated_df_3.loc[6862,"status"] = "RELEASING"

Converting status and country to one-hot encoded features in preparation for modelling.

In [None]:
# Expand status and country into indicator columns; no separate NaN indicator
# column is created.
updated_df_4 = pd.get_dummies(
    data=updated_df_3,
    columns=["status", "country"],
    dummy_na=False,
)

Creating a classifier to impute missing values for source.

In [None]:
# Count of non-missing rows for each source value.
df_original.groupby(by="source").agg(func={"source": "count"})

In [None]:
# Feature frame for the source classifier: chapter/volume counts and every
# date part except start_year are dropped.
simple_df_1 = updated_df_4.drop(
    columns=["chapters", "volumes", "start_month", "start_day", "end_year", "end_month", "end_day"]
)
# Only rows with an observed source can be used as labelled examples.
source_training = simple_df_1.dropna(subset=["source"])

# Balanced random forest, chosen because the source classes are highly imbalanced.
source_classifier = imblearn.ensemble.BalancedRandomForestClassifier(
    n_estimators=150,
    sampling_strategy="all",
    replacement=True,
    bootstrap=False,
    random_state=1234,
)

# Seeded, shuffled 5-fold splitter.
lightweight_cross_validator = sklearn.model_selection.KFold(
    n_splits=5,
    shuffle=True,
    random_state=1234,
)

# Out-of-fold class predictions for every labelled row.
source_imputation_cv = sklearn.model_selection.cross_val_predict(
    estimator=source_classifier,
    X=source_training.drop(columns="source"),
    y=np.ravel(source_training["source"]),
    cv=lightweight_cross_validator,
    method="predict",
)

# Confusion matrix normalised over the true labels (each row sums to 1).
sklearn.metrics.ConfusionMatrixDisplay.from_predictions(
    y_true=np.ravel(source_training["source"]),
    y_pred=source_imputation_cv,
    labels=["LIGHT_NOVEL", "MANGA", "ORIGINAL", "OTHER", "VIDEO_GAME", "VISUAL_NOVEL"],
    normalize="true",
)

In [None]:
# Refit the classifier on every labelled row.
source_classifier.fit(
    X=source_training.drop(columns="source"),
    y=np.ravel(source_training["source"]),
)

# Predict a source for every row (labelled or not), using the same feature
# columns the classifier was trained on.
source_preds = source_classifier.predict(
    updated_df_4.drop(
        columns=["chapters", "volumes", "start_month", "start_day", "end_year", "end_month", "end_day", "source"]
    )
)

In [None]:
# Temporarily carry the model predictions alongside the observed source.
updated_df_5 = updated_df_4.assign(imputed_source=source_preds)

# Missing source -> take the prediction; observed source -> keep it.
conditions_1 = [
    updated_df_5["source"].isna(),
    updated_df_5["source"].notna(),
]

choices_1 = [
    updated_df_5["imputed_source"],
    updated_df_5["source"],
]

# Overwrite source with the merged values and discard the helper column.
updated_df_5 = (
    updated_df_5
    .assign(source=np.select(condlist=conditions_1, choicelist=choices_1, default=None))
    .drop(columns="imputed_source")
)

Adding one-hot encoding to the source feature as well.

In [None]:
# Expand the (now fully populated) source column into indicator columns.
updated_df_6 = pd.get_dummies(
    data=updated_df_5,
    columns=["source"],
    dummy_na=False,
)

Using a stripped-down data frame as input for a simple random forest to impute start years. Comparing its cross-validated RMSE with the RMSE of simply predicting the median (the random forest performs better).

In [None]:
# Rebuild the stripped-down feature frame, this time from the frame with the
# one-hot encoded source.
simple_df_1 = updated_df_6.drop(
    columns=["chapters", "volumes", "start_month", "start_day", "end_year", "end_month", "end_day"]
)

# Rows with an observed start year form the training set.
start_year_training = simple_df_1.dropna(subset=["start_year"])

start_year_regressor = sklearn.ensemble.RandomForestRegressor(
    n_estimators=100,
    max_features=.15,
    random_state=1234,
)

# Negated RMSE per fold (scikit-learn scores are "higher is better").
baseline_results = sklearn.model_selection.cross_val_score(
    start_year_regressor,
    X=start_year_training.drop(columns="start_year"),
    y=np.ravel(start_year_training["start_year"]),
    groups=None,
    scoring="neg_root_mean_squared_error",
    cv=lightweight_cross_validator,
)
baseline_results.mean()

In [None]:
# RMSE obtained by predicting the median start year for every training row.
start_year_training["start_year"].sub(start_year_training["start_year"].median()).pow(2).mean() ** .5

In [None]:
# Refit on every row with an observed start year.
start_year_regressor.fit(
    X=start_year_training.drop(columns="start_year"),
    y=np.ravel(start_year_training["start_year"]),
)
# Predict for all rows and round to whole years.
start_year_preds = start_year_regressor.predict(X=simple_df_1.drop(columns="start_year")).round()

In [None]:
# New frame with the rounded model predictions in a helper column;
# updated_df_6 itself is not modified.
updated_df_7 = updated_df_6.copy()
updated_df_7["imputed_start_year"] = start_year_preds

In [None]:
# Pick the final start year for each row:
#   1. observed start year                   -> keep it
#   2. start and end year both missing       -> use the model prediction
#   3. start missing, prediction <= end year -> use the model prediction
#   4. start missing, prediction >  end year -> clamp to the end year
#
# The comparisons must be parenthesised: `&` binds more tightly than `<=`/`>`,
# so `a & b <= c` is evaluated as `(a & b) <= c`. Without the parentheses,
# case 3 matched every row with a known end year and case 4 never applied.
conditions_2 = [
    pd.notna(updated_df_7["start_year"]),
    pd.isna(updated_df_7["start_year"]) & pd.isna(updated_df_7["end_year"]),
    pd.isna(updated_df_7["start_year"]) & (updated_df_7["imputed_start_year"] <= updated_df_7["end_year"]),
    pd.isna(updated_df_7["start_year"]) & (updated_df_7["imputed_start_year"] > updated_df_7["end_year"])
]

choices_2 = [
    updated_df_7["start_year"],
    updated_df_7["imputed_start_year"],
    updated_df_7["imputed_start_year"],
    updated_df_7["end_year"]
]

# np.select returns an object array because of default=None; cast it back to float.
updated_df_8 = updated_df_7.assign(start_year = np.select(condlist=conditions_2, choicelist=choices_2, default=None).astype(np.float64))

Imputing start month using median value for each start year (using July when there are no non-missing values for that year).

In [None]:
# Lookup table keyed by start year: rounded median of the observed start
# months, with 7 (July) for years that have no observed month.
start_month_imputation_key = (
    updated_df_8
    .groupby("start_year")
    .agg({"start_month": "median"})
    .round()
    .fillna(7)
)

# Bring the lookup value in as "start_month_imputed" next to the original column.
updated_df_9 = updated_df_8.join(other=start_month_imputation_key, on="start_year", rsuffix="_imputed")

# Missing month -> lookup value; observed month -> keep it.
conditions_3 = [
    updated_df_9["start_month"].isna(),
    updated_df_9["start_month"].notna(),
]

choices_3 = [
    updated_df_9["start_month_imputed"],
    updated_df_9["start_month"],
]

# NOTE(review): .astype(np.float64) is applied to the whole frame here, not
# just to start_month, so every column is cast to float64. Confirm whether
# that is intended or whether the cast belongs inside assign().
updated_df_9 = (
    updated_df_9
    .assign(start_month=np.select(condlist=conditions_3, choicelist=choices_3, default=None))
    .astype(np.float64)
    .drop(columns="start_month_imputed")
)

Imputing start day using median value for each start year and month combination (using the 14th when there are no non-missing values for that combination).

In [None]:
# Lookup table keyed by (start year, start month): rounded median of the
# observed start days, with 14 for combinations that have no observed day.
start_day_imputation_key = (
    updated_df_9
    .groupby(["start_year", "start_month"])
    .agg({"start_day": "median"})
    .round()
    .fillna(14)
)

# Bring the lookup value in as "start_day_imputed" next to the original column.
updated_df_10 = updated_df_9.join(
    other=start_day_imputation_key,
    on=["start_year", "start_month"],
    rsuffix="_imputed",
)

# Missing day -> lookup value; observed day -> keep it.
conditions_4 = [
    updated_df_10["start_day"].isna(),
    updated_df_10["start_day"].notna(),
]

choices_4 = [
    updated_df_10["start_day_imputed"],
    updated_df_10["start_day"],
]

# NOTE(review): .astype(np.float64) is applied to the whole frame here, not
# just to start_day, so every column is cast to float64. Confirm whether
# that is intended or whether the cast belongs inside assign().
updated_df_10 = (
    updated_df_10
    .assign(start_day=np.select(condlist=conditions_4, choicelist=choices_4, default=None))
    .astype(np.float64)
    .drop(columns="start_day_imputed")
)

#TODO: impute some end years by assigning most recent data-gathering date to those currently running.