**Import dependencies**

In [36]:
import pandas as pd
import re
import numpy as np
import sklearn
import imblearn
import sklearn.inspection
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score

**Read in manga dataframe from CSV**

In [2]:
df_original = pd.read_csv('manga.csv', na_values = ['','nan','None'])

**Data wrangling** 

Replace NaNs with zeroes

In [3]:
updated_df_1 = df_original
updated_df_1["chapters"] = df_original["chapters"].fillna(value=-1)
updated_df_1["volumes"] = updated_df_1["volumes"].fillna(value=-1)


numeric_columns = df_original.select_dtypes(include=['number']).columns

updated_df_1 = df_original
updated_df_1[numeric_columns] = updated_df_1[numeric_columns].fillna(value=0)

Adding single column indicating which demographic the manga belongs to and deleting separate columns specifying this

In [4]:
demographic_labels = ["Shounen","Shoujo","Seinen","Josei"]
demo_only_df = updated_df_1.loc[:,demographic_labels]

# Finding instances with no clear demographic tag
total_pct_per_row = demo_only_df.sum(axis=1).tolist()
indices = [index for index, element in enumerate(total_pct_per_row) if element == 0]

# Finding best fit for each row, aside from no-tag cases
demo_column_1 = demo_only_df.idxmax(axis=1).tolist()
demo_column_2 = demo_column_1

# Replacing best fit demographic with "Unknown" in cases with no clear demographic tag
for index in indices:
    demo_column_2[index] = "Unknown"

# Adding column
updated_df_2 = updated_df_1
updated_df_2["Demographic"] = demo_column_2

# Removing demographic tag columns
updated_df_3 = updated_df_2.drop(demographic_labels, axis=1)

  updated_df_2["Demographic"] = demo_column_2


Generating popularity column by summing all status columns, recalculating status raw numbers to percentage of overall statuses

In [5]:
# Getting current list of columns
columns_list_v3 = updated_df_3.columns.tolist()

# Creating list of status columns only
status_regex = re.compile("status_")
status_raws_cols = list(filter(status_regex.match, columns_list_v3))

# Summing status columns only
popularity_list = updated_df_3[status_raws_cols].sum(axis=1).tolist()

# Adding new column
updated_df_4 = updated_df_3
updated_df_4["Popularity"] = popularity_list

# Recalculating statuses to be a percentage of popularity
for column in status_raws_cols:
    updated_df_4 = updated_df_4.assign(**{column: updated_df_4[column]/updated_df_4["Popularity"]})

# Renaming status columns to reflect their new meaning (this also renames score columns, which will be updated later)
updated_df_4.columns = updated_df_4.columns.str.replace("_count", "_pct", regex=False)

Calculating total number of ratings across each manga, converting the count from each score bin into a percentage of the total ratings, and then transforming the total number of ratings into a percentage of the total popularity of the manga.

In [6]:
# Getting current list of columns
columns_list_v4 = updated_df_4.columns.tolist()

# Creating list of score columns only
score_regex = re.compile("scored_")
score_raws_cols = list(filter(score_regex.match, columns_list_v4))

# Summing score columns only
score_list = updated_df_4[score_raws_cols].sum(axis=1).tolist()

# Adding new column
updated_df_5 = updated_df_4
updated_df_5["Scored_Percentage"] = score_list

# Recalculating statuses to be a percentage of popularity
for column in score_raws_cols:
    updated_df_5 = updated_df_5.assign(**{column: updated_df_5[column]/updated_df_5["Scored_Percentage"]})

updated_df_5 = updated_df_5.assign(Scored_Percentage = updated_df_5["Scored_Percentage"]/updated_df_5["Popularity"])

numeric_columns = updated_df_5.select_dtypes(include=['number']).columns
updated_df_5[numeric_columns] = updated_df_5[numeric_columns].fillna(value=0)

Transforming favorites column into a percentage of popularity as well

In [7]:
updated_df_6 = updated_df_5.assign(favorites = updated_df_5["favorites"]/updated_df_5["Popularity"])

Calculating run length for completed works, filling -1 for ongoing works

In [8]:
updated_df_7 = updated_df_6.assign(run_length = updated_df_6["end_date_days"]-updated_df_6["start_date_days"])
updated_df_7["run_length"] = updated_df_7["run_length"].where(cond=updated_df_7["run_length"]>-1, other=-1)

One-hot encoding for categorical variables; removing unnecessary columns

In [9]:
updated_df_8 = updated_df_7.drop(["id", "eng_title", "rom_title","start_date","end_date"], axis=1)
updated_df_8 = pd.get_dummies(updated_df_8, dummy_na=True, columns=["status","source","country"])

**Building First Classifier**

Splitting into training and testing sets

In [10]:
training_df = updated_df_8.query("Demographic != 'Unknown'")
training_x = training_df.drop(columns=["Demographic"])
training_y = training_df[["Demographic"]]
testing_df = updated_df_8.query("Demographic == 'Unknown'")

Creating and fitting balanced random forest model

In [11]:
rfBaseline = imblearn.ensemble.BalancedRandomForestClassifier(n_estimators=500, oob_score = True, replacement=False, bootstrap=True, random_state=1234)

rfBaseline.fit(training_x, np.ravel(training_y))

  warn(


BalancedRandomForestClassifier(bootstrap=True, n_estimators=500, oob_score=True,
                               random_state=1234, replacement=False)

Getting OOB classification report

In [12]:
classorder = rfBaseline.classes_
OOBdecisionfunc = rfBaseline.oob_decision_function_

OOBPreds = pd.DataFrame(data=OOBdecisionfunc, columns=classorder).idxmax(axis=1).tolist()

sklearn.metrics.classification_report(training_y, OOBPreds, labels=["Shounen","Shoujo", "Seinen", "Josei"], output_dict=True)

{'Shounen': {'precision': 0.5957152729785764,
  'recall': 0.5910181693520741,
  'f1-score': 0.5933574255721906,
  'support': 2917},
 'Shoujo': {'precision': 0.7133182844243793,
  'recall': 0.657511444028298,
  'f1-score': 0.6842789086184495,
  'support': 2403},
 'Seinen': {'precision': 0.6035332785538209,
  'recall': 0.4469120778825677,
  'f1-score': 0.5135465827652508,
  'support': 3287},
 'Josei': {'precision': 0.2745205479452055,
  'recall': 0.6583442838370565,
  'f1-score': 0.3874709976798144,
  'support': 761},
 'accuracy': 0.5629803586678053,
 'macro avg': {'precision': 0.5467718459754956,
  'recall': 0.588446493774999,
  'f1-score': 0.5446634786589264,
  'support': 9368},
 'weighted avg': {'precision': 0.6025330179700044,
  'recall': 0.5629803586678053,
  'f1-score': 0.5719512035213421,
  'support': 9368}}

Our macro average recall is 59%, which is markedly higher than the roughly 25% we would see if the classifier was guessing at random.

**Refining Model**

Let's start by checking pairwise correlation coefficients between each set of our predictors, and removing those which are more than 80% correlated.

In [13]:
temp = training_x.corr().stack().reset_index()
temp_2 = temp.rename(columns={"level_0":"first_var", "level_1":"second_var", 0:"correlation"}, inplace=False)
temp_2["correlation"] = temp_2["correlation"].abs()

temp_3 = temp_2.query("first_var != second_var").sort_values("correlation", ascending=False)

remvar = []
temp_vals = training_x

while temp_3.iloc[0,2] >= .8:
    remvar.append(temp_3.iloc[0,0])
    temp_vals = temp_vals.drop(remvar[-1], axis=1)
    temp_3 = temp_vals.corr().stack().reset_index().rename(columns={"level_0":"first_var", "level_1":"second_var", 0:"correlation"}).query("first_var != second_var")
    temp_3["correlation"] = temp_3["correlation"].abs()
    temp_3 = temp_3.sort_values("correlation", ascending=False)


Next, let's create a randomized feature to use for variable selection so that we can determine which variables to remove in order to improve speed without sacrificing too much information.

In [14]:
""" train_w_random = temp_vals
train_w_random["RANDOM"] = np.random.RandomState(1234).randn(train_w_random.shape[0])

featselectrf = imblearn.ensemble.BalancedRandomForestClassifier(n_estimators=100, oob_score = True, replacement=False, bootstrap=True, random_state=1234)
featselectrf.fit(train_w_random, np.ravel(training_y))

varimp_forselection = pd.DataFrame({"names": train_w_random.columns, "imp": featselectrf.feature_importances_}).sort_values("imp", ascending=True)
for_removal = []

while varimp_forselection[varimp_forselection.imp < varimp_forselection.query("names == 'RANDOM'").iloc[0,1]].shape[0] > 0:
    if varimp_forselection[varimp_forselection.imp < varimp_forselection.query("names == 'RANDOM'").iloc[0,1]].shape[0] >= 25:
        for_removal = varimp_forselection.iloc[0:25,0]
    else:
        num_remove = varimp_forselection[varimp_forselection.imp < varimp_forselection.query("names == 'RANDOM'").iloc[0,1]].shape[0]
        for_removal = varimp_forselection.iloc[:num_remove,0]
    train_w_random = train_w_random.drop(for_removal, axis=1)
    featselectrf.fit(train_w_random, np.ravel(training_y))
    varimp_forselection = pd.DataFrame({"names": train_w_random.columns, "imp": featselectrf.feature_importances_}).sort_values("imp", ascending=True)

print("remaining columns"+train_w_random.columns) """

' train_w_random = temp_vals\ntrain_w_random["RANDOM"] = np.random.RandomState(1234).randn(train_w_random.shape[0])\n\nfeatselectrf = imblearn.ensemble.BalancedRandomForestClassifier(n_estimators=100, oob_score = True, replacement=False, bootstrap=True, random_state=1234)\nfeatselectrf.fit(train_w_random, np.ravel(training_y))\n\nvarimp_forselection = pd.DataFrame({"names": train_w_random.columns, "imp": featselectrf.feature_importances_}).sort_values("imp", ascending=True)\nfor_removal = []\n\nwhile varimp_forselection[varimp_forselection.imp < varimp_forselection.query("names == \'RANDOM\'").iloc[0,1]].shape[0] > 0:\n    if varimp_forselection[varimp_forselection.imp < varimp_forselection.query("names == \'RANDOM\'").iloc[0,1]].shape[0] >= 25:\n        for_removal = varimp_forselection.iloc[0:25,0]\n    else:\n        num_remove = varimp_forselection[varimp_forselection.imp < varimp_forselection.query("names == \'RANDOM\'").iloc[0,1]].shape[0]\n        for_removal = varimp_forselecti

Now let's train and validate a model based on the reamining 12 features.

In [15]:
""" train_w_random_1 = train_w_random.drop("RANDOM", axis=1)

rfReduced = imblearn.ensemble.BalancedRandomForestClassifier(n_estimators=500, oob_score = True, replacement=False, bootstrap=True, random_state=1234)

rfReduced.fit(train_w_random_1, np.ravel(training_y))

classorder = rfReduced.classes_
OOBdecisionfunc = rfReduced.oob_decision_function_

OOBPreds = pd.DataFrame(data=OOBdecisionfunc, columns=classorder).idxmax(axis=1).tolist()

sklearn.metrics.classification_report(training_y, OOBPreds, labels=["Shounen","Shoujo", "Seinen", "Josei"], output_dict=True) """

' train_w_random_1 = train_w_random.drop("RANDOM", axis=1)\n\nrfReduced = imblearn.ensemble.BalancedRandomForestClassifier(n_estimators=500, oob_score = True, replacement=False, bootstrap=True, random_state=1234)\n\nrfReduced.fit(train_w_random_1, np.ravel(training_y))\n\nclassorder = rfReduced.classes_\nOOBdecisionfunc = rfReduced.oob_decision_function_\n\nOOBPreds = pd.DataFrame(data=OOBdecisionfunc, columns=classorder).idxmax(axis=1).tolist()\n\nsklearn.metrics.classification_report(training_y, OOBPreds, labels=["Shounen","Shoujo", "Seinen", "Josei"], output_dict=True) '

Well, that didn't work at all, likely because the random variable had much higher cardinality than many other variables. Let's try permutation-based importance scores instead.

In [16]:
rfPermutation = imblearn.ensemble.BalancedRandomForestClassifier(n_estimators=100, oob_score = True, replacement=False, bootstrap=True, random_state=1234)
rfPermutation.fit(temp_vals, np.ravel(training_y))

permute_results = sklearn.inspection.permutation_importance(estimator=rfPermutation, X = temp_vals, y = training_y, scoring="recall_macro", n_repeats=3)

  warn(


In [17]:
perm_importance = pd.DataFrame()
perm_importance["feature"] = temp_vals.columns
perm_importance["importance"] = permute_results["importances_mean"]

Let's try removing any variable that isn't associated with an average of at least a 1/20th of a percent increase in macro averaged recall. This is somewhat arbitrary, but will hopefully result in a much more efficient random forest with a negligible decrease in recall.

In [18]:
perm_features_tokeep = perm_importance.query("importance >= .0005").feature

refined_training = temp_vals[perm_features_tokeep]

rfReduced2 = imblearn.ensemble.BalancedRandomForestClassifier(n_estimators=500, oob_score = True, replacement=False, bootstrap=True, random_state=1234)

rfReduced2.fit(refined_training, np.ravel(training_y))

classorder = rfReduced2.classes_
OOBdecisionfunc = rfReduced2.oob_decision_function_

OOBPreds = pd.DataFrame(data=OOBdecisionfunc, columns=classorder).idxmax(axis=1).tolist()

sklearn.metrics.classification_report(training_y, OOBPreds, labels=["Shounen","Shoujo", "Seinen", "Josei"], output_dict=True)

  warn(


{'Shounen': {'precision': 0.5914848698099929,
  'recall': 0.5762769969146383,
  'f1-score': 0.5837819065810036,
  'support': 2917},
 'Shoujo': {'precision': 0.7040540540540541,
  'recall': 0.6504369538077404,
  'f1-score': 0.6761842959117457,
  'support': 2403},
 'Seinen': {'precision': 0.5955940204563336,
  'recall': 0.4606023729844843,
  'f1-score': 0.5194716074798421,
  'support': 3287},
 'Josei': {'precision': 0.2687074829931973,
  'recall': 0.6228646517739816,
  'f1-score': 0.37544554455445545,
  'support': 761},
 'accuracy': 0.5584970111016225,
 'macro avg': {'precision': 0.5399601068283946,
  'recall': 0.5775452438702112,
  'f1-score': 0.5387208386317617,
  'support': 9368},
 'weighted avg': {'precision': 0.5955814685018609,
  'recall': 0.5584970111016225,
  'f1-score': 0.5679952943813935,
  'support': 9368}}

Macro average recall decreased by about 1%; next we can tune hyperparameters in order to improve our results.

In [23]:
import sklearn.model_selection


hyperparameter_dict = {"n_estimators": [100, 200, 500, 800, 1300, 1800, 2000, 2500],
                       "criterion": ["gini", "entropy"],
                       "max_depth": [None, 3, 4, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60],
                       "min_samples_split": [2, 3, 4, 5, 7, 10, 15],
                       "min_samples_leaf": [1, 2, 4, 5],
                       "bootstrap": [False],
                       "sampling_strategy": ["majority", "not minority", "not majority", "all"],
                       "replacement": [True],
                       "random_state": [1234],
                       "class_weight": [None, "balanced", "balanced_subsample"]
                       }

RandomSearchrf = imblearn.ensemble.BalancedRandomForestClassifier()

rf_random = sklearn.model_selection.RandomizedSearchCV(estimator = RandomSearchrf, param_distributions = hyperparameter_dict, n_iter = 30, cv = 3, verbose=2, random_state=1234, scoring="recall_macro")

randsearch_output = rf_random.fit(refined_training, np.ravel(training_y))

Fitting 3 folds for each of 30 candidates, totalling 90 fits
[CV] END bootstrap=False, class_weight=balanced_subsample, criterion=gini, max_depth=10, min_samples_leaf=5, min_samples_split=3, n_estimators=1300, random_state=1234, replacement=True, sampling_strategy=all; total time=   7.7s
[CV] END bootstrap=False, class_weight=balanced_subsample, criterion=gini, max_depth=10, min_samples_leaf=5, min_samples_split=3, n_estimators=1300, random_state=1234, replacement=True, sampling_strategy=all; total time=   7.4s
[CV] END bootstrap=False, class_weight=balanced_subsample, criterion=gini, max_depth=10, min_samples_leaf=5, min_samples_split=3, n_estimators=1300, random_state=1234, replacement=True, sampling_strategy=all; total time=   7.4s
[CV] END bootstrap=False, class_weight=balanced, criterion=gini, max_depth=30, min_samples_leaf=1, min_samples_split=3, n_estimators=200, random_state=1234, replacement=True, sampling_strategy=not majority; total time=   2.5s
[CV] END bootstrap=False, cla

Next, let's see what our best classifier looked like!

In [33]:
randsearch_output.best_estimator_

BalancedRandomForestClassifier(bootstrap=False, max_depth=40,
                               min_samples_leaf=4, min_samples_split=3,
                               n_estimators=500, random_state=1234,
                               replacement=True,
                               sampling_strategy='not minority')

Now we can go ahead and split our training data into training and validation, and compare a baseline classifier to our new classifier.

In [42]:


baseline_classifier_2 = imblearn.ensemble.BalancedRandomForestClassifier(n_estimators=100, replacement=True, bootstrap=False, random_state=1234)
cross_validator = sklearn.model_selection.KFold(n_splits=10, shuffle=True, random_state=1234)
baseline_valid_results = sklearn.model_selection.cross_val_score(baseline_classifier_2, X=refined_training, y=np.ravel(training_y), groups=None, scoring="recall_macro",cv=cross_validator)
baseline_valid_results.mean()


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


0.5970270424886259

In [43]:
xvalidated_classifier = randsearch_output.best_estimator_

optimal_model_results = sklearn.model_selection.cross_val_score(xvalidated_classifier, X=refined_training, y=np.ravel(training_y), groups=None, scoring="recall_macro",cv=cross_validator)
optimal_model_results.mean()

0.6007666205803488

With a macro average recall of 60% versus the 60% of the baseline classifier, we have a classifier that performs reasonably well overall.