# Advanced Transformation Core
Susan Shin

In [1]:
import pandas as pd
import numpy as np
import os, json

In [2]:
info = pd.read_csv('Data/superhero_info - superhero_info.csv')
info.head()

Unnamed: 0,Hero|Publisher,Gender,Race,Alignment,Hair color,Eye color,Skin color,Measurements
0,A-Bomb|Marvel Comics,Male,Human,good,No Hair,yellow,Unknown,"{'Height': '203.0 cm', 'Weight': '441.0 kg'}"
1,Abe Sapien|Dark Horse Comics,Male,Icthyo Sapien,good,No Hair,blue,blue,"{'Height': '191.0 cm', 'Weight': '65.0 kg'}"
2,Abin Sur|DC Comics,Male,Ungaran,good,No Hair,blue,red,"{'Height': '185.0 cm', 'Weight': '90.0 kg'}"
3,Abomination|Marvel Comics,Male,Human / Radiation,bad,No Hair,green,Unknown,"{'Height': '203.0 cm', 'Weight': '441.0 kg'}"
4,Absorbing Man|Marvel Comics,Male,Human,bad,No Hair,blue,Unknown,"{'Height': '193.0 cm', 'Weight': '122.0 kg'}"


In [3]:
info[['hero_names','publisher']] = info['Hero|Publisher'].str.split('|', expand=True)
info = info.drop(columns = 'Hero|Publisher')
info.head()

Unnamed: 0,Gender,Race,Alignment,Hair color,Eye color,Skin color,Measurements,hero_names,publisher
0,Male,Human,good,No Hair,yellow,Unknown,"{'Height': '203.0 cm', 'Weight': '441.0 kg'}",A-Bomb,Marvel Comics
1,Male,Icthyo Sapien,good,No Hair,blue,blue,"{'Height': '191.0 cm', 'Weight': '65.0 kg'}",Abe Sapien,Dark Horse Comics
2,Male,Ungaran,good,No Hair,blue,red,"{'Height': '185.0 cm', 'Weight': '90.0 kg'}",Abin Sur,DC Comics
3,Male,Human / Radiation,bad,No Hair,green,Unknown,"{'Height': '203.0 cm', 'Weight': '441.0 kg'}",Abomination,Marvel Comics
4,Male,Human,bad,No Hair,blue,Unknown,"{'Height': '193.0 cm', 'Weight': '122.0 kg'}",Absorbing Man,Marvel Comics


In [4]:
ms = info.loc[0,"Measurements"]
print(type(ms))
ms

<class 'str'>


"{'Height': '203.0 cm', 'Weight': '441.0 kg'}"

In [5]:
ms = ms.replace("'",'"')
ms

'{"Height": "203.0 cm", "Weight": "441.0 kg"}'

In [6]:
import json
ms_fixed = json.loads(ms)
print(type(ms_fixed))
ms_fixed

<class 'dict'>


{'Height': '203.0 cm', 'Weight': '441.0 kg'}

In [7]:
## use .str.replace to replace all single quotes
info['Measurements'] = info['Measurements'].str.replace("'",'"')
## Apply the json.loads to the full column
info['Measurements'] = info['Measurements'].apply(json.loads)
info['Measurements'].head()

0    {'Height': '203.0 cm', 'Weight': '441.0 kg'}
1     {'Height': '191.0 cm', 'Weight': '65.0 kg'}
2     {'Height': '185.0 cm', 'Weight': '90.0 kg'}
3    {'Height': '203.0 cm', 'Weight': '441.0 kg'}
4    {'Height': '193.0 cm', 'Weight': '122.0 kg'}
Name: Measurements, dtype: object

In [8]:
height_weight = info['Measurements'].apply(pd.Series)
height_weight

Unnamed: 0,Height,Weight
0,203.0 cm,441.0 kg
1,191.0 cm,65.0 kg
2,185.0 cm,90.0 kg
3,203.0 cm,441.0 kg
4,193.0 cm,122.0 kg
...,...,...
458,183.0 cm,83.0 kg
459,165.0 cm,52.0 kg
460,66.0 cm,17.0 kg
461,170.0 cm,57.0 kg


In [9]:
info = pd.concat((info, height_weight), axis = 1)
info.head(2)

Unnamed: 0,Gender,Race,Alignment,Hair color,Eye color,Skin color,Measurements,hero_names,publisher,Height,Weight
0,Male,Human,good,No Hair,yellow,Unknown,"{'Height': '203.0 cm', 'Weight': '441.0 kg'}",A-Bomb,Marvel Comics,203.0 cm,441.0 kg
1,Male,Icthyo Sapien,good,No Hair,blue,blue,"{'Height': '191.0 cm', 'Weight': '65.0 kg'}",Abe Sapien,Dark Horse Comics,191.0 cm,65.0 kg


In [10]:
info = info.drop(columns=['Measurements'])
info.head()

Unnamed: 0,Gender,Race,Alignment,Hair color,Eye color,Skin color,hero_names,publisher,Height,Weight
0,Male,Human,good,No Hair,yellow,Unknown,A-Bomb,Marvel Comics,203.0 cm,441.0 kg
1,Male,Icthyo Sapien,good,No Hair,blue,blue,Abe Sapien,Dark Horse Comics,191.0 cm,65.0 kg
2,Male,Ungaran,good,No Hair,blue,red,Abin Sur,DC Comics,185.0 cm,90.0 kg
3,Male,Human / Radiation,bad,No Hair,green,Unknown,Abomination,Marvel Comics,203.0 cm,441.0 kg
4,Male,Human,bad,No Hair,blue,Unknown,Absorbing Man,Marvel Comics,193.0 cm,122.0 kg


In [11]:
## save the 2 new columns into the dataframe
info[['Height (in CM)','cm']] = info['Height'].str.split(' ',expand=True)
info[['Weight (in KG)','kg']] = info['Weight'].str.split(' ',expand=True)
info.head(2)

Unnamed: 0,Gender,Race,Alignment,Hair color,Eye color,Skin color,hero_names,publisher,Height,Weight,Height (in CM),cm,Weight (in KG),kg
0,Male,Human,good,No Hair,yellow,Unknown,A-Bomb,Marvel Comics,203.0 cm,441.0 kg,203.0,cm,441.0,kg
1,Male,Icthyo Sapien,good,No Hair,blue,blue,Abe Sapien,Dark Horse Comics,191.0 cm,65.0 kg,191.0,cm,65.0,kg


In [12]:
## drop the original column 
info = info.drop(columns=['Height', 'Weight', 'cm', 'kg'])
info.head(2)

Unnamed: 0,Gender,Race,Alignment,Hair color,Eye color,Skin color,hero_names,publisher,Height (in CM),Weight (in KG)
0,Male,Human,good,No Hair,yellow,Unknown,A-Bomb,Marvel Comics,203.0,441.0
1,Male,Icthyo Sapien,good,No Hair,blue,blue,Abe Sapien,Dark Horse Comics,191.0,65.0


In [13]:
info['Height (in CM)'] = info['Height (in CM)'].astype(float)
info['Weight (in KG)'] = info['Weight (in KG)'].astype(float)
info.head(2)

Unnamed: 0,Gender,Race,Alignment,Hair color,Eye color,Skin color,hero_names,publisher,Height (in CM),Weight (in KG)
0,Male,Human,good,No Hair,yellow,Unknown,A-Bomb,Marvel Comics,203.0,441.0
1,Male,Icthyo Sapien,good,No Hair,blue,blue,Abe Sapien,Dark Horse Comics,191.0,65.0


In [14]:
info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 463 entries, 0 to 462
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Gender          463 non-null    object 
 1   Race            463 non-null    object 
 2   Alignment       463 non-null    object 
 3   Hair color      463 non-null    object 
 4   Eye color       463 non-null    object 
 5   Skin color      463 non-null    object 
 6   hero_names      463 non-null    object 
 7   publisher       463 non-null    object 
 8   Height (in CM)  463 non-null    float64
 9   Weight (in KG)  463 non-null    float64
dtypes: float64(2), object(8)
memory usage: 36.3+ KB


In [15]:
powers = pd.read_csv('Data/superhero_powers - superhero_powers.csv')
powers.head()

Unnamed: 0,hero_names,Powers
0,3-D Man,"Agility,Super Strength,Stamina,Super Speed"
1,A-Bomb,"Accelerated Healing,Durability,Longevity,Super..."
2,Abe Sapien,"Agility,Accelerated Healing,Cold Resistance,Du..."
3,Abin Sur,Lantern Power Ring
4,Abomination,"Accelerated Healing,Intelligence,Super Strengt..."


In [16]:
df = pd.merge(powers, info, how='inner', on = 'hero_names')
df.head()

Unnamed: 0,hero_names,Powers,Gender,Race,Alignment,Hair color,Eye color,Skin color,publisher,Height (in CM),Weight (in KG)
0,A-Bomb,"Accelerated Healing,Durability,Longevity,Super...",Male,Human,good,No Hair,yellow,Unknown,Marvel Comics,203.0,441.0
1,Abe Sapien,"Agility,Accelerated Healing,Cold Resistance,Du...",Male,Icthyo Sapien,good,No Hair,blue,blue,Dark Horse Comics,191.0,65.0
2,Abin Sur,Lantern Power Ring,Male,Ungaran,good,No Hair,blue,red,DC Comics,185.0,90.0
3,Abomination,"Accelerated Healing,Intelligence,Super Strengt...",Male,Human / Radiation,bad,No Hair,green,Unknown,Marvel Comics,203.0,441.0
4,Absorbing Man,"Cold Resistance,Durability,Energy Absorption,S...",Male,Human,bad,No Hair,blue,Unknown,Marvel Comics,193.0,122.0


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 463 entries, 0 to 462
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   hero_names      463 non-null    object 
 1   Powers          463 non-null    object 
 2   Gender          463 non-null    object 
 3   Race            463 non-null    object 
 4   Alignment       463 non-null    object 
 5   Hair color      463 non-null    object 
 6   Eye color       463 non-null    object 
 7   Skin color      463 non-null    object 
 8   publisher       463 non-null    object 
 9   Height (in CM)  463 non-null    float64
 10  Weight (in KG)  463 non-null    float64
dtypes: float64(2), object(9)
memory usage: 43.4+ KB


In [18]:
df["Powers"].head()

0    Accelerated Healing,Durability,Longevity,Super...
1    Agility,Accelerated Healing,Cold Resistance,Du...
2                                   Lantern Power Ring
3    Accelerated Healing,Intelligence,Super Strengt...
4    Cold Resistance,Durability,Energy Absorption,S...
Name: Powers, dtype: object

In [19]:
df_pwrs = df["Powers"].str.split(",")

In [20]:
df_pwrs.head()

0    [Accelerated Healing, Durability, Longevity, S...
1    [Agility, Accelerated Healing, Cold Resistance...
2                                 [Lantern Power Ring]
3    [Accelerated Healing, Intelligence, Super Stre...
4    [Cold Resistance, Durability, Energy Absorptio...
Name: Powers, dtype: object

In [21]:
exploded = df_pwrs.explode("Powers")
exploded.head()

0    Accelerated Healing
1             Durability
2              Longevity
3         Super Strength
4                Stamina
Name: Powers, dtype: object

In [22]:
## saving the unique values from the exploded column
cols_to_make = exploded.dropna().unique()
cols_to_make

array(['Accelerated Healing', 'Durability', 'Longevity', 'Super Strength',
       'Stamina', 'Camouflage', 'Self-Sustenance', 'Agility',
       'Cold Resistance', 'Underwater breathing', 'Marksmanship',
       'Weapons Master', 'Intelligence', 'Telepathy', 'Immortality',
       'Reflexes', 'Enhanced Sight', 'Sub-Mariner', 'Lantern Power Ring',
       'Super Speed', 'Invulnerability', 'Animation', 'Super Breath',
       'Energy Absorption', 'Elemental Transmogrification',
       'Fire Resistance', 'Natural Armor', 'Molecular Manipulation',
       'Heat Resistance', 'Matter Absorption', 'Stealth', 'Flight',
       'Power Suit', 'Energy Blasts', 'Energy Beams', 'Power Cosmic',
       'Heat Generation', 'Danger Sense', 'Teleportation', 'Phasing',
       'Force Fields', 'Hypnokinesis', 'Energy Manipulation',
       'Invisibility', 'Enhanced Senses', 'Jump', 'Substance Secretion',
       'Natural Weapons', 'Wallcrawling', 'Vision - Thermal',
       'Power Augmentation', 'Cryokinesis', 'Dupli

In [23]:
for col in cols_to_make:
    df[col] = df["Powers"].str.contains(col)
df.head()

  df[col] = df["Powers"].str.contains(col)
  df[col] = df["Powers"].str.contains(col)
  df[col] = df["Powers"].str.contains(col)
  df[col] = df["Powers"].str.contains(col)
  df[col] = df["Powers"].str.contains(col)
  df[col] = df["Powers"].str.contains(col)
  df[col] = df["Powers"].str.contains(col)
  df[col] = df["Powers"].str.contains(col)
  df[col] = df["Powers"].str.contains(col)
  df[col] = df["Powers"].str.contains(col)
  df[col] = df["Powers"].str.contains(col)
  df[col] = df["Powers"].str.contains(col)
  df[col] = df["Powers"].str.contains(col)
  df[col] = df["Powers"].str.contains(col)
  df[col] = df["Powers"].str.contains(col)
  df[col] = df["Powers"].str.contains(col)
  df[col] = df["Powers"].str.contains(col)
  df[col] = df["Powers"].str.contains(col)
  df[col] = df["Powers"].str.contains(col)
  df[col] = df["Powers"].str.contains(col)
  df[col] = df["Powers"].str.contains(col)
  df[col] = df["Powers"].str.contains(col)
  df[col] = df["Powers"].str.contains(col)
  df[col] =

Unnamed: 0,hero_names,Powers,Gender,Race,Alignment,Hair color,Eye color,Skin color,publisher,Height (in CM),...,Hair Manipulation,Weather Control,Nova Force,Odin Force,Phoenix Force,Power Sense,Qwardian Power Ring,Melting,Changing Armor,Terrakinesis
0,A-Bomb,"Accelerated Healing,Durability,Longevity,Super...",Male,Human,good,No Hair,yellow,Unknown,Marvel Comics,203.0,...,False,False,False,False,False,False,False,False,False,False
1,Abe Sapien,"Agility,Accelerated Healing,Cold Resistance,Du...",Male,Icthyo Sapien,good,No Hair,blue,blue,Dark Horse Comics,191.0,...,False,False,False,False,False,False,False,False,False,False
2,Abin Sur,Lantern Power Ring,Male,Ungaran,good,No Hair,blue,red,DC Comics,185.0,...,False,False,False,False,False,False,False,False,False,False
3,Abomination,"Accelerated Healing,Intelligence,Super Strengt...",Male,Human / Radiation,bad,No Hair,green,Unknown,Marvel Comics,203.0,...,False,False,False,False,False,False,False,False,False,False
4,Absorbing Man,"Cold Resistance,Durability,Energy Absorption,S...",Male,Human,bad,No Hair,blue,Unknown,Marvel Comics,193.0,...,False,False,False,False,False,False,False,False,False,False


# Questions

## Compare the average weight of super powers who have Super Speed to those who do not.

In [24]:
pd.set_option('display.max_column', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', -1)
pd.set_option('display.max_rows', None)

In [25]:
df_weight = round(df.groupby("Super Speed")["Weight (in KG)"].mean(),1)
df_weight

Super Speed
False    101.8
True     129.4
Name: Weight (in KG), dtype: float64

Heros with super speed weigh 129.4 compared to 101.8 of those without super speed.

## What is the average height of heroes for each publisher?

In [26]:
df_group = round(df.groupby("publisher")["Height (in CM)"].mean(),1)
df_group

publisher
DC Comics            181.9
Dark Horse Comics    176.9
George Lucas         159.6
Image Comics         211.0
Marvel Comics        191.5
Shueisha             171.5
Star Trek            181.5
Team Epic TV         180.8
Unknown              178.0
Name: Height (in CM), dtype: float64

Above are the average heights for heros by publishers.