# Superhero Generator

Using Jonathan Besomi's [Superheroes NLP Dataset](https://www.kaggle.com/datasets/jonathanbesomi/superheroes-nlp-dataset).

In [148]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import rv_discrete
import random

In [98]:
supes_data = pd.read_csv("data/superheroes_nlp_dataset.csv")
supes_data.head()

Unnamed: 0,name,real_name,full_name,overall_score,history_text,powers_text,intelligence_score,strength_score,speed_score,durability_score,...,has_flight,has_accelerated_healing,has_weapons_master,has_intelligence,has_reflexes,has_super_speed,has_durability,has_stamina,has_agility,has_super_strength
0,3-D Man,"Delroy Garrett, Jr.","Delroy Garrett, Jr.",6,"Delroy Garrett, Jr. grew up to become a track ...",,85,30,60,60,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,514A (Gotham),Bruce Wayne,,10,He was one of the many prisoners of Indian Hil...,,100,20,30,50,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
2,A-Bomb,Richard Milhouse Jones,Richard Milhouse Jones,20,"Richard ""Rick"" Jones was orphaned at a young ...","On rare occasions, and through unusual circu...",80,100,80,100,...,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0
3,Aa,Aa,,12,Aa is one of the more passive members of the P...,,80,50,55,45,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Aaron Cash,Aaron Cash,Aaron Cash,5,Aaron Cash is the head of security at Arkham A...,,80,10,25,40,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Goal
Generate a random superhero with related information lining up as much as possible. Some features will be generated randomly, while some will be inferred.

- Randomly generated features: `has_...`
- Inferred features (supervised learning): `intelligence_score`, `strength_score`, `speed_score`, `durability_score`, `power_score`, `combat_score`, `overall_score`, `superpowers` (potentially)
- Inferred features (NLP): `history_text`, `powers_text`
- Dropped features: `name`, `real_name`, `full_name`, `alter_egos`, `aliases`, `place_of_birth`, `first_appearance`, `creator`, `occupation`, `base`, `teams`, `relatives`, `img`, `height`, `weight`, `eye_color`, `hair_color`, `skin_color`, `type_race`, `alignment`, `gender`

In [110]:
inferred_features_sup = ["overall_score", "intelligence_score", "strength_score", "speed_score", "durability_score", "power_score", "combat_score"]
inferred_features_nlp = ["history_text", "powers_text"]
inferred_features_list = ["superpowers"]
inferred_features = inferred_features_sup + inferred_features_nlp + inferred_features_list

random_features = [f for f in supes_data.columns if f.startswith("has_")]

dropped_features = [f for f in supes_data.columns if f not in (inferred_features + random_features)]

supe = pd.DataFrame(columns=(inferred_features + random_features))
supe.loc[0] = None

## Randomly Generated Features

In [230]:
for feat in random_features:
    counts = supes_data[feat].value_counts()
    count_0 = counts[0.0]
    count_1 = counts[1.0]
    total_count = count_0 + count_1
    
    rv = rv_discrete(a=0.0, b=1.0, values=([0, 1], [count_0/total_count, count_1/total_count]))
    supe[feat] = rv.rvs()

In [231]:
supe

Unnamed: 0,overall_score,intelligence_score,strength_score,speed_score,durability_score,power_score,combat_score,history_text,powers_text,superpowers,...,has_flight,has_accelerated_healing,has_weapons_master,has_intelligence,has_reflexes,has_super_speed,has_durability,has_stamina,has_agility,has_super_strength
0,,,,,,,,,,,...,0,0,0,1,0,0,1,1,1,1


## Inferred Features: Supervised Learning

In [209]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score

In [210]:
# remove rows where overall_score = - or infinity
supes_data_os = supes_data[pd.to_numeric(supes_data['overall_score'], errors='coerce').notnull()]

train_df, test_df = train_test_split(supes_data_os[(inferred_features_sup + random_features)])

In [253]:
# need to impute rows where has_... is null
imputer = SimpleImputer(strategy="most_frequent") 

for target in inferred_features_sup:
    X_train = train_df.drop(columns=inferred_features_sup)
    X_test = test_df.drop(columns=inferred_features_sup)
    y_train = train_df[target]
    y_test = test_df[target]

    X_train_imp = imputer.fit_transform(X_train)
    X_test_imp = imputer.transform(X_test)
    
    model = Ridge()
    model.fit(X_train_imp, y_train)
    print(target, "supervised:", model.score(X_test_imp, y_test))
    
    unique, counts = np.unique(y_train, return_counts=True)
    unique = [int(x) for x in unique]
    rv = rv_discrete(a=np.min(unique), b=np.max(unique), values=(unique, counts/np.sum(counts)))
    
    preds = []
    for i in range(len(y_test)):
        preds.append(rv.rvs())
    
    print(target, "probability-based:", r2_score([int(x) for x in y_test], preds))

overall_score supervised: 0.5675662340771537
overall_score probability-based: -1.0430560728301854
intelligence_score supervised: 0.2269467190524267
intelligence_score probability-based: -1.038794040487418
strength_score supervised: 0.5022075891152257
strength_score probability-based: -0.985651908940659
speed_score supervised: 0.49026532173096216
speed_score probability-based: -0.8833496466017661
durability_score supervised: 0.5444866725037811
durability_score probability-based: -0.8691866607150045
power_score supervised: 0.5414415172407406
power_score probability-based: -1.1615845289809292
combat_score supervised: 0.2816989846446305
combat_score probability-based: -1.3478054636346144
