In [96]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import os, json, time
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [97]:
# Reading csv file of superhero info
hero_info_df = pd.read_csv('Data/superhero_info - superhero_info.csv', low_memory=False)
# Getting the first 5 rows of the dataframe
hero_info_df.head()

Unnamed: 0,Hero|Publisher,Gender,Race,Alignment,Hair color,Eye color,Skin color,Measurements
0,A-Bomb|Marvel Comics,Male,Human,good,No Hair,yellow,Unknown,"{'Height': '203.0 cm', 'Weight': '441.0 kg'}"
1,Abe Sapien|Dark Horse Comics,Male,Icthyo Sapien,good,No Hair,blue,blue,"{'Height': '191.0 cm', 'Weight': '65.0 kg'}"
2,Abin Sur|DC Comics,Male,Ungaran,good,No Hair,blue,red,"{'Height': '185.0 cm', 'Weight': '90.0 kg'}"
3,Abomination|Marvel Comics,Male,Human / Radiation,bad,No Hair,green,Unknown,"{'Height': '203.0 cm', 'Weight': '441.0 kg'}"
4,Absorbing Man|Marvel Comics,Male,Human,bad,No Hair,blue,Unknown,"{'Height': '193.0 cm', 'Weight': '122.0 kg'}"


In [98]:
# Reading csv file of superhero powers
hero_powers_df = pd.read_csv('Data/superhero_powers - superhero_powers.csv', low_memory=False)
# Getting the first 5 rows of the dataframe
hero_powers_df.head()

Unnamed: 0,hero_names,Powers
0,3-D Man,"Agility,Super Strength,Stamina,Super Speed"
1,A-Bomb,"Accelerated Healing,Durability,Longevity,Super..."
2,Abe Sapien,"Agility,Accelerated Healing,Cold Resistance,Du..."
3,Abin Sur,Lantern Power Ring
4,Abomination,"Accelerated Healing,Intelligence,Super Strengt..."


First I will split the `Hero|Publisher` column, into different columns to seperate the hero name from publisher name.

In [99]:
# spliting the 'Hero|Publisher' column into two columns
hero_info_df[['Hero', 'Publisher']] = hero_info_df['Hero|Publisher'].str.split('|', expand=True)
# Dropping the 'Hero|Publisher' column
hero_info_df.drop('Hero|Publisher', axis=1, inplace=True)
# Getting the first 5 rows of the dataframe
hero_info_df.head()

Unnamed: 0,Gender,Race,Alignment,Hair color,Eye color,Skin color,Measurements,Hero,Publisher
0,Male,Human,good,No Hair,yellow,Unknown,"{'Height': '203.0 cm', 'Weight': '441.0 kg'}",A-Bomb,Marvel Comics
1,Male,Icthyo Sapien,good,No Hair,blue,blue,"{'Height': '191.0 cm', 'Weight': '65.0 kg'}",Abe Sapien,Dark Horse Comics
2,Male,Ungaran,good,No Hair,blue,red,"{'Height': '185.0 cm', 'Weight': '90.0 kg'}",Abin Sur,DC Comics
3,Male,Human / Radiation,bad,No Hair,green,Unknown,"{'Height': '203.0 cm', 'Weight': '441.0 kg'}",Abomination,Marvel Comics
4,Male,Human,bad,No Hair,blue,Unknown,"{'Height': '193.0 cm', 'Weight': '122.0 kg'}",Absorbing Man,Marvel Comics


I will continue to converting the dictionary values of column measurements into columns Height and Weight.

In [100]:
# Using .str.replace() to replace the single quotes with double quotes
hero_info_df['Measurements'] = hero_info_df['Measurements'].str.replace("'", '"')
# Using .apply() to convert the strings into dictionaries
hero_info_df['Measurements'] = hero_info_df['Measurements'].apply(json.loads)

In [101]:
# Unpacking the dictionaries into columns using .apply(pd.Series) and .concat() to combine the dataframes together again and dropping the original column
hero_info_df = pd.concat([hero_info_df.drop(['Measurements'], axis=1), hero_info_df['Measurements'].apply(pd.Series)], axis=1)
# Getting the first 5 rows of the dataframe
hero_info_df.head()

Unnamed: 0,Gender,Race,Alignment,Hair color,Eye color,Skin color,Hero,Publisher,Height,Weight
0,Male,Human,good,No Hair,yellow,Unknown,A-Bomb,Marvel Comics,203.0 cm,441.0 kg
1,Male,Icthyo Sapien,good,No Hair,blue,blue,Abe Sapien,Dark Horse Comics,191.0 cm,65.0 kg
2,Male,Ungaran,good,No Hair,blue,red,Abin Sur,DC Comics,185.0 cm,90.0 kg
3,Male,Human / Radiation,bad,No Hair,green,Unknown,Abomination,Marvel Comics,203.0 cm,441.0 kg
4,Male,Human,bad,No Hair,blue,Unknown,Absorbing Man,Marvel Comics,193.0 cm,122.0 kg


In [102]:
# Replacing 'cm' and 'kg' with empty strings using .str.replace()
replacing = [' cm', ' kg']
# Creating loop to loop through the list of strings
for char in replacing:
    hero_info_df['Height'] = hero_info_df['Height'].str.replace(char, '', regex=False)
    hero_info_df['Weight'] = hero_info_df['Weight'].str.replace(char, '', regex=False)

# Converting the 'Height' and 'Weight' columns to floats
hero_info_df['Height'] = hero_info_df['Height'].astype(float)
hero_info_df['Weight'] = hero_info_df['Weight'].astype(float)

# Getting the first 5 rows of the dataframe
hero_info_df.head()

Unnamed: 0,Gender,Race,Alignment,Hair color,Eye color,Skin color,Hero,Publisher,Height,Weight
0,Male,Human,good,No Hair,yellow,Unknown,A-Bomb,Marvel Comics,203.0,441.0
1,Male,Icthyo Sapien,good,No Hair,blue,blue,Abe Sapien,Dark Horse Comics,191.0,65.0
2,Male,Ungaran,good,No Hair,blue,red,Abin Sur,DC Comics,185.0,90.0
3,Male,Human / Radiation,bad,No Hair,green,Unknown,Abomination,Marvel Comics,203.0,441.0
4,Male,Human,bad,No Hair,blue,Unknown,Absorbing Man,Marvel Comics,193.0,122.0


To complete the rest of this data processing, now I will continue to clean the other dataframe import from earlier.
- First I will start with splitting the powers column into seperate columns that will be OneHotEncoded

In [103]:
# Splitting the powers into list
hero_powers_df['Powers_split'] = hero_powers_df['Powers'].str.split(',')
# Getting the first 5 rows of the dataframe
hero_powers_df

Unnamed: 0,hero_names,Powers,Powers_split
0,3-D Man,"Agility,Super Strength,Stamina,Super Speed","[Agility, Super Strength, Stamina, Super Speed]"
1,A-Bomb,"Accelerated Healing,Durability,Longevity,Super...","[Accelerated Healing, Durability, Longevity, S..."
2,Abe Sapien,"Agility,Accelerated Healing,Cold Resistance,Du...","[Agility, Accelerated Healing, Cold Resistance..."
3,Abin Sur,Lantern Power Ring,[Lantern Power Ring]
4,Abomination,"Accelerated Healing,Intelligence,Super Strengt...","[Accelerated Healing, Intelligence, Super Stre..."
...,...,...,...
662,Yellowjacket II,"Flight,Energy Blasts,Size Changing","[Flight, Energy Blasts, Size Changing]"
663,Ymir,"Cold Resistance,Durability,Longevity,Super Str...","[Cold Resistance, Durability, Longevity, Super..."
664,Yoda,"Agility,Stealth,Danger Sense,Marksmanship,Weap...","[Agility, Stealth, Danger Sense, Marksmanship,..."
665,Zatanna,"Cryokinesis,Telepathy,Magic,Fire Control,Proba...","[Cryokinesis, Telepathy, Magic, Fire Control, ..."


In [104]:
print(type(hero_powers_df['Powers_split']))
hero_powers_df['Powers_split']

<class 'pandas.core.series.Series'>


0        [Agility, Super Strength, Stamina, Super Speed]
1      [Accelerated Healing, Durability, Longevity, S...
2      [Agility, Accelerated Healing, Cold Resistance...
3                                   [Lantern Power Ring]
4      [Accelerated Healing, Intelligence, Super Stre...
                             ...                        
662               [Flight, Energy Blasts, Size Changing]
663    [Cold Resistance, Durability, Longevity, Super...
664    [Agility, Stealth, Danger Sense, Marksmanship,...
665    [Cryokinesis, Telepathy, Magic, Fire Control, ...
666    [Super Speed, Intangibility, Time Travel, Time...
Name: Powers_split, Length: 667, dtype: object

In [105]:
hero_powers_df['Powers_split'].value_counts()

[Intelligence]                                                                                                                                                                                                                                                                          8
[Durability, Super Strength]                                                                                                                                                                                                                                                            5
[Agility, Stealth, Marksmanship, Weapons Master, Stamina]                                                                                                                                                                                                                               4
[Marksmanship]                                                                                                                                            

In [106]:
exploded = hero_powers_df.explode('Powers_split')
exploded.head()

Unnamed: 0,hero_names,Powers,Powers_split
0,3-D Man,"Agility,Super Strength,Stamina,Super Speed",Agility
0,3-D Man,"Agility,Super Strength,Stamina,Super Speed",Super Strength
0,3-D Man,"Agility,Super Strength,Stamina,Super Speed",Stamina
0,3-D Man,"Agility,Super Strength,Stamina,Super Speed",Super Speed
1,A-Bomb,"Accelerated Healing,Durability,Longevity,Super...",Accelerated Healing


In [107]:
cols_to_make = exploded['Powers_split'].dropna().unique()
cols_to_make

array(['Agility', 'Super Strength', 'Stamina', 'Super Speed',
       'Accelerated Healing', 'Durability', 'Longevity', 'Camouflage',
       'Self-Sustenance', 'Cold Resistance', 'Underwater breathing',
       'Marksmanship', 'Weapons Master', 'Intelligence', 'Telepathy',
       'Immortality', 'Reflexes', 'Enhanced Sight', 'Sub-Mariner',
       'Lantern Power Ring', 'Invulnerability', 'Animation',
       'Super Breath', 'Dimensional Awareness', 'Flight', 'Size Changing',
       'Teleportation', 'Magic', 'Dimensional Travel',
       'Molecular Manipulation', 'Energy Manipulation', 'Power Cosmic',
       'Energy Absorption', 'Elemental Transmogrification',
       'Fire Resistance', 'Natural Armor', 'Heat Resistance',
       'Matter Absorption', 'Regeneration', 'Stealth', 'Power Suit',
       'Energy Blasts', 'Energy Beams', 'Heat Generation', 'Danger Sense',
       'Phasing', 'Force Fields', 'Hypnokinesis', 'Invisibility',
       'Enhanced Senses', 'Jump', 'Shapeshifting', 'Elasticity',
 

In [108]:
for col in cols_to_make:
    hero_powers_df[col] = hero_powers_df['Powers'].str.contains(col)
hero_powers_df.head()

  hero_powers_df[col] = hero_powers_df['Powers'].str.contains(col)
  hero_powers_df[col] = hero_powers_df['Powers'].str.contains(col)
  hero_powers_df[col] = hero_powers_df['Powers'].str.contains(col)
  hero_powers_df[col] = hero_powers_df['Powers'].str.contains(col)
  hero_powers_df[col] = hero_powers_df['Powers'].str.contains(col)
  hero_powers_df[col] = hero_powers_df['Powers'].str.contains(col)
  hero_powers_df[col] = hero_powers_df['Powers'].str.contains(col)
  hero_powers_df[col] = hero_powers_df['Powers'].str.contains(col)
  hero_powers_df[col] = hero_powers_df['Powers'].str.contains(col)
  hero_powers_df[col] = hero_powers_df['Powers'].str.contains(col)
  hero_powers_df[col] = hero_powers_df['Powers'].str.contains(col)
  hero_powers_df[col] = hero_powers_df['Powers'].str.contains(col)
  hero_powers_df[col] = hero_powers_df['Powers'].str.contains(col)
  hero_powers_df[col] = hero_powers_df['Powers'].str.contains(col)
  hero_powers_df[col] = hero_powers_df['Powers'].str.contains(

Unnamed: 0,hero_names,Powers,Powers_split,Agility,Super Strength,Stamina,Super Speed,Accelerated Healing,Durability,Longevity,...,Weather Control,Omnipresent,Omniscient,Hair Manipulation,Nova Force,Odin Force,Phoenix Force,Intuitive aptitude,Melting,Changing Armor
0,3-D Man,"Agility,Super Strength,Stamina,Super Speed","[Agility, Super Strength, Stamina, Super Speed]",True,True,True,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,A-Bomb,"Accelerated Healing,Durability,Longevity,Super...","[Accelerated Healing, Durability, Longevity, S...",False,True,True,False,True,True,True,...,False,False,False,False,False,False,False,False,False,False
2,Abe Sapien,"Agility,Accelerated Healing,Cold Resistance,Du...","[Agility, Accelerated Healing, Cold Resistance...",True,True,True,False,True,True,True,...,False,False,False,False,False,False,False,False,False,False
3,Abin Sur,Lantern Power Ring,[Lantern Power Ring],False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,Abomination,"Accelerated Healing,Intelligence,Super Strengt...","[Accelerated Healing, Intelligence, Super Stre...",False,True,True,True,True,False,False,...,False,False,False,False,False,False,False,False,False,False


In [109]:
# Dropping the 'Powers' column
hero_powers_df.drop(['Powers', 'Powers_split'], axis=1, inplace=True)
# Getting the first 5 rows of the dataframe
hero_powers_df.head()

Unnamed: 0,hero_names,Agility,Super Strength,Stamina,Super Speed,Accelerated Healing,Durability,Longevity,Camouflage,Self-Sustenance,...,Weather Control,Omnipresent,Omniscient,Hair Manipulation,Nova Force,Odin Force,Phoenix Force,Intuitive aptitude,Melting,Changing Armor
0,3-D Man,True,True,True,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,A-Bomb,False,True,True,False,True,True,True,True,True,...,False,False,False,False,False,False,False,False,False,False
2,Abe Sapien,True,True,True,False,True,True,True,False,False,...,False,False,False,False,False,False,False,False,False,False
3,Abin Sur,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,Abomination,False,True,True,True,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [111]:
## Merging the two dataframes together
hero_df = pd.merge(hero_info_df, hero_powers_df, left_on='Hero', right_on='hero_names')
hero_df.head()

Unnamed: 0,Gender,Race,Alignment,Hair color,Eye color,Skin color,Hero,Publisher,Height,Weight,...,Weather Control,Omnipresent,Omniscient,Hair Manipulation,Nova Force,Odin Force,Phoenix Force,Intuitive aptitude,Melting,Changing Armor
0,Male,Human,good,No Hair,yellow,Unknown,A-Bomb,Marvel Comics,203.0,441.0,...,False,False,False,False,False,False,False,False,False,False
1,Male,Icthyo Sapien,good,No Hair,blue,blue,Abe Sapien,Dark Horse Comics,191.0,65.0,...,False,False,False,False,False,False,False,False,False,False
2,Male,Ungaran,good,No Hair,blue,red,Abin Sur,DC Comics,185.0,90.0,...,False,False,False,False,False,False,False,False,False,False
3,Male,Human / Radiation,bad,No Hair,green,Unknown,Abomination,Marvel Comics,203.0,441.0,...,False,False,False,False,False,False,False,False,False,False
4,Male,Human,bad,No Hair,blue,Unknown,Absorbing Man,Marvel Comics,193.0,122.0,...,False,False,False,False,False,False,False,False,False,False


1. Compare the average weight of super heros who have Super Speed to those who do.

In [115]:
# Getting the average weight of the heroes with power 'Super Speed'
filter = hero_df['Super Speed'] == True
# print the average weight of the heroes with power 'Super Speed'
print(f'The average weight of the heroes with power "Super Speed" is {hero_df[filter]["Weight"].mean()}')
# print the average height of the heroes without power 'Super Speed'
print(f'The average height of the heroes without power "Super Speed" is {hero_df[~filter]["Height"].mean()}')

The average weight of the heroes with power "Super Speed" is 129.40404040404042
The average height of the heroes without power "Super Speed" is 186.37622641509432


2. What is the average height of heroes for each publisher?

In [116]:
# Getting the average height of heroes for each publisher
hero_df.groupby('Publisher')['Height'].mean()

Publisher
DC Comics            181.923913
Dark Horse Comics    176.909091
George Lucas         159.600000
Image Comics         211.000000
Marvel Comics        191.546128
Shueisha             171.500000
Star Trek            181.500000
Team Epic TV         180.750000
Unknown              178.000000
Name: Height, dtype: float64