# Data Adjustments: Comic Characters

In [1]:
import pandas

In [2]:
dc_characters_df = pandas.read_csv("original-data/dc_characters.csv")

dc_characters_df.head()

Unnamed: 0,page_id,name,urlslug,ID,ALIGN,EYE,HAIR,SEX,GSM,ALIVE,APPEARANCES,FIRST APPEARANCE,YEAR
0,1422,Batman (Bruce Wayne),\/wiki\/Batman_(Bruce_Wayne),Secret Identity,Good Characters,Blue Eyes,Black Hair,Male Characters,,Living Characters,3093.0,"1939, May",1939.0
1,23387,Superman (Clark Kent),\/wiki\/Superman_(Clark_Kent),Secret Identity,Good Characters,Blue Eyes,Black Hair,Male Characters,,Living Characters,2496.0,"1986, October",1986.0
2,1458,Green Lantern (Hal Jordan),\/wiki\/Green_Lantern_(Hal_Jordan),Secret Identity,Good Characters,Brown Eyes,Brown Hair,Male Characters,,Living Characters,1565.0,"1959, October",1959.0
3,1659,James Gordon (New Earth),\/wiki\/James_Gordon_(New_Earth),Public Identity,Good Characters,Brown Eyes,White Hair,Male Characters,,Living Characters,1316.0,"1987, February",1987.0
4,1576,Richard Grayson (New Earth),\/wiki\/Richard_Grayson_(New_Earth),Secret Identity,Good Characters,Blue Eyes,Black Hair,Male Characters,,Living Characters,1237.0,"1940, April",1940.0


In [3]:
marvel_characters_df = pandas.read_csv("original-data/marvel_characters.csv")

marvel_characters_df.head()

Unnamed: 0,page_id,name,urlslug,ID,ALIGN,EYE,HAIR,SEX,GSM,ALIVE,APPEARANCES,FIRST APPEARANCE,Year
0,1678,Spider-Man (Peter Parker),\/Spider-Man_(Peter_Parker),Secret Identity,Good Characters,Hazel Eyes,Brown Hair,Male Characters,,Living Characters,4043.0,Aug-62,1962.0
1,7139,Captain America (Steven Rogers),\/Captain_America_(Steven_Rogers),Public Identity,Good Characters,Blue Eyes,White Hair,Male Characters,,Living Characters,3360.0,Mar-41,1941.0
2,64786,"Wolverine (James \""Logan\"" Howlett)",\/Wolverine_(James_%22Logan%22_Howlett),Public Identity,Neutral Characters,Blue Eyes,Black Hair,Male Characters,,Living Characters,3061.0,Oct-74,1974.0
3,1868,"Iron Man (Anthony \""Tony\"" Stark)",\/Iron_Man_(Anthony_%22Tony%22_Stark),Public Identity,Good Characters,Blue Eyes,Black Hair,Male Characters,,Living Characters,2961.0,Mar-63,1963.0
4,2460,Thor (Thor Odinson),\/Thor_(Thor_Odinson),No Dual Identity,Good Characters,Blue Eyes,Blond Hair,Male Characters,,Living Characters,2258.0,Nov-50,1950.0


## Combining the datasets

In [4]:
num_of_dc_characters = dc_characters_df["name"].nunique()
num_of_marvel_characters = marvel_characters_df["name"].nunique()
total_characters = num_of_dc_characters + num_of_marvel_characters

print(f"num_of_dc_characters: {num_of_dc_characters}")
print(f"num_of_marvel_characters: {num_of_marvel_characters}")
print(f"\ntotal_characters: {total_characters}")

num_of_dc_characters: 6896
num_of_marvel_characters: 16376

total_characters: 23272


In [5]:
comic_characters_df = pandas.concat([dc_characters_df, marvel_characters_df])

print(f"num_of_comic_characters (should match total_characters): {comic_characters_df['name'].nunique()}")
comic_characters_df.head()

num_of_comic_characters (should match total_characters): 23272


Unnamed: 0,page_id,name,urlslug,ID,ALIGN,EYE,HAIR,SEX,GSM,ALIVE,APPEARANCES,FIRST APPEARANCE,YEAR,Year
0,1422,Batman (Bruce Wayne),\/wiki\/Batman_(Bruce_Wayne),Secret Identity,Good Characters,Blue Eyes,Black Hair,Male Characters,,Living Characters,3093.0,"1939, May",1939.0,
1,23387,Superman (Clark Kent),\/wiki\/Superman_(Clark_Kent),Secret Identity,Good Characters,Blue Eyes,Black Hair,Male Characters,,Living Characters,2496.0,"1986, October",1986.0,
2,1458,Green Lantern (Hal Jordan),\/wiki\/Green_Lantern_(Hal_Jordan),Secret Identity,Good Characters,Brown Eyes,Brown Hair,Male Characters,,Living Characters,1565.0,"1959, October",1959.0,
3,1659,James Gordon (New Earth),\/wiki\/James_Gordon_(New_Earth),Public Identity,Good Characters,Brown Eyes,White Hair,Male Characters,,Living Characters,1316.0,"1987, February",1987.0,
4,1576,Richard Grayson (New Earth),\/wiki\/Richard_Grayson_(New_Earth),Secret Identity,Good Characters,Blue Eyes,Black Hair,Male Characters,,Living Characters,1237.0,"1940, April",1940.0,


## Adjusting the dataset

### Dropping and renaming columns

In [6]:
comic_characters_df = comic_characters_df.drop(columns=["page_id", "ID", "urlslug", "GSM", "YEAR", "Year"])

comic_characters_df = comic_characters_df.rename(columns={
	"name": "Name",
	"ALIGN": "Type",
	"EYE": "Eye Color",
	"HAIR": "Hair Color",
	"SEX": "Sex",
	"ALIVE": "Alive",
	"APPEARANCES": "Appearances",
	"FIRST APPEARANCE": "First Appearance"
})

comic_characters_df.head()

Unnamed: 0,Name,Type,Eye Color,Hair Color,Sex,Alive,Appearances,First Appearance
0,Batman (Bruce Wayne),Good Characters,Blue Eyes,Black Hair,Male Characters,Living Characters,3093.0,"1939, May"
1,Superman (Clark Kent),Good Characters,Blue Eyes,Black Hair,Male Characters,Living Characters,2496.0,"1986, October"
2,Green Lantern (Hal Jordan),Good Characters,Brown Eyes,Brown Hair,Male Characters,Living Characters,1565.0,"1959, October"
3,James Gordon (New Earth),Good Characters,Brown Eyes,White Hair,Male Characters,Living Characters,1316.0,"1987, February"
4,Richard Grayson (New Earth),Good Characters,Blue Eyes,Black Hair,Male Characters,Living Characters,1237.0,"1940, April"


### Removing null values

In [7]:
comic_characters_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 23272 entries, 0 to 16375
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Name              23272 non-null  object 
 1   Type              19859 non-null  object 
 2   Eye Color         9877 non-null   object 
 3   Hair Color        16734 non-null  object 
 4   Sex               22293 non-null  object 
 5   Alive             23266 non-null  object 
 6   Appearances       21821 non-null  float64
 7   First Appearance  22388 non-null  object 
dtypes: float64(1), object(7)
memory usage: 1.6+ MB


In [8]:
comic_characters_df = comic_characters_df.dropna()

comic_characters_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7281 entries, 0 to 15038
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Name              7281 non-null   object 
 1   Type              7281 non-null   object 
 2   Eye Color         7281 non-null   object 
 3   Hair Color        7281 non-null   object 
 4   Sex               7281 non-null   object 
 5   Alive             7281 non-null   object 
 6   Appearances       7281 non-null   float64
 7   First Appearance  7281 non-null   object 
dtypes: float64(1), object(7)
memory usage: 511.9+ KB


### Editing rows

In [9]:
# make the separator a comma
comic_characters_df = comic_characters_df.rename(columns={"Name": "oldName"})
comic_characters_df["oldName"] = comic_characters_df["oldName"].apply(lambda oldName: oldName.replace(" (", ",").replace(")", ""))

# separate the columns
comic_characters_df[["Name", "Identity"]] = comic_characters_df["oldName"].str.split(",", n=1, expand=True)
comic_characters_df = comic_characters_df.drop(columns=["oldName"])

comic_characters_df.head()

Unnamed: 0,Type,Eye Color,Hair Color,Sex,Alive,Appearances,First Appearance,Name,Identity
0,Good Characters,Blue Eyes,Black Hair,Male Characters,Living Characters,3093.0,"1939, May",Batman,Bruce Wayne
1,Good Characters,Blue Eyes,Black Hair,Male Characters,Living Characters,2496.0,"1986, October",Superman,Clark Kent
2,Good Characters,Brown Eyes,Brown Hair,Male Characters,Living Characters,1565.0,"1959, October",Green Lantern,Hal Jordan
3,Good Characters,Brown Eyes,White Hair,Male Characters,Living Characters,1316.0,"1987, February",James Gordon,New Earth
4,Good Characters,Blue Eyes,Black Hair,Male Characters,Living Characters,1237.0,"1940, April",Richard Grayson,New Earth


In [10]:
comic_characters_df["First Appearance"] = comic_characters_df["First Appearance"].apply(lambda appearance: appearance.split(", ")[0])

comic_characters_df.head()

Unnamed: 0,Type,Eye Color,Hair Color,Sex,Alive,Appearances,First Appearance,Name,Identity
0,Good Characters,Blue Eyes,Black Hair,Male Characters,Living Characters,3093.0,1939,Batman,Bruce Wayne
1,Good Characters,Blue Eyes,Black Hair,Male Characters,Living Characters,2496.0,1986,Superman,Clark Kent
2,Good Characters,Brown Eyes,Brown Hair,Male Characters,Living Characters,1565.0,1959,Green Lantern,Hal Jordan
3,Good Characters,Brown Eyes,White Hair,Male Characters,Living Characters,1316.0,1987,James Gordon,New Earth
4,Good Characters,Blue Eyes,Black Hair,Male Characters,Living Characters,1237.0,1940,Richard Grayson,New Earth


In [11]:
comic_characters_df["Type"] = comic_characters_df["Type"].apply(lambda type: type.replace(" Characters", ""))
comic_characters_df["Type"] = comic_characters_df["Type"].apply(lambda type: type.replace(" Criminals", ""))

comic_characters_df["Type"].unique()

array(['Good', 'Neutral', 'Bad', 'Reformed'], dtype=object)

In [12]:
comic_characters_df["Eye Color"] = comic_characters_df["Eye Color"].apply(lambda eye_color: eye_color.replace(" Eyes", ""))
comic_characters_df["Eye Color"] = comic_characters_df["Eye Color"].apply(lambda eye_color: eye_color.replace(" Eyeballs", ""))

comic_characters_df["Eye Color"].unique()

array(['Blue', 'Brown', 'Green', 'Purple', 'Red', 'Hazel', 'Amber',
       'Yellow', 'Black', 'Grey', 'Photocellular', 'Pink', 'White',
       'Violet', 'Gold', 'Orange', 'Variable', 'Silver', 'One Eye',
       'Magenta', 'Multiple', 'No'], dtype=object)

In [13]:
comic_characters_df["Hair Color"] = comic_characters_df["Hair Color"].apply(lambda hair_color: hair_color.replace(" Hair", ""))

comic_characters_df["Hair Color"].unique()

array(['Black', 'Brown', 'White', 'Blond', 'Red', 'Green',
       'Strawberry Blond', 'Grey', 'Silver', 'Orange', 'Purple', 'Gold',
       'Blue', 'Reddish Brown', 'Pink', 'Violet', 'Platinum Blond', 'No',
       'Bald', 'Auburn', 'Reddish Blond', 'Variable', 'Yellow',
       'Light Brown', 'Magenta'], dtype=object)

In [14]:
comic_characters_df["Sex"] = comic_characters_df["Sex"].apply(lambda sex: sex.replace(" Characters", ""))

comic_characters_df["Sex"].unique()

array(['Male', 'Female', 'Genderless', 'Genderfluid', 'Agender'],
      dtype=object)

In [15]:
comic_characters_df["Alive"] = comic_characters_df["Alive"].apply(lambda alive: True if alive == "Living Characters" else False)

comic_characters_df["Alive"].unique()

array([ True, False])

In [16]:
comic_characters_df["Appearances"] = comic_characters_df["Appearances"].apply(lambda appearances: str(appearances).split(".")[0])

comic_characters_df["Appearances"].head()

0    3093
1    2496
2    1565
3    1316
4    1237
Name: Appearances, dtype: object

In [17]:
comic_characters_df.head()

Unnamed: 0,Type,Eye Color,Hair Color,Sex,Alive,Appearances,First Appearance,Name,Identity
0,Good,Blue,Black,Male,True,3093,1939,Batman,Bruce Wayne
1,Good,Blue,Black,Male,True,2496,1986,Superman,Clark Kent
2,Good,Brown,Brown,Male,True,1565,1959,Green Lantern,Hal Jordan
3,Good,Brown,White,Male,True,1316,1987,James Gordon,New Earth
4,Good,Blue,Black,Male,True,1237,1940,Richard Grayson,New Earth


### Checking types

In [18]:
comic_characters_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7281 entries, 0 to 15038
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Type              7281 non-null   object
 1   Eye Color         7281 non-null   object
 2   Hair Color        7281 non-null   object
 3   Sex               7281 non-null   object
 4   Alive             7281 non-null   bool  
 5   Appearances       7281 non-null   object
 6   First Appearance  7281 non-null   object
 7   Name              7281 non-null   object
 8   Identity          7278 non-null   object
dtypes: bool(1), object(8)
memory usage: 519.1+ KB


In [19]:
comic_characters_df["Appearances"] = comic_characters_df["Appearances"].astype("Int64")

comic_characters_df["Appearances"].info()

<class 'pandas.core.series.Series'>
Index: 7281 entries, 0 to 15038
Series name: Appearances
Non-Null Count  Dtype
--------------  -----
7281 non-null   Int64
dtypes: Int64(1)
memory usage: 120.9 KB


In [20]:
comic_characters_df["First Appearance"] = comic_characters_df["First Appearance"].apply(lambda first_appearance: first_appearance if first_appearance.isdigit() else None)
comic_characters_df["First Appearance"] = comic_characters_df["First Appearance"].astype("Int64")

comic_characters_df["First Appearance"].info()

<class 'pandas.core.series.Series'>
Index: 7281 entries, 0 to 15038
Series name: First Appearance
Non-Null Count  Dtype
--------------  -----
2428 non-null   Int64
dtypes: Int64(1)
memory usage: 120.9 KB


In [21]:
comic_characters_df = comic_characters_df.dropna()

comic_characters_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2425 entries, 0 to 6528
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Type              2425 non-null   object
 1   Eye Color         2425 non-null   object
 2   Hair Color        2425 non-null   object
 3   Sex               2425 non-null   object
 4   Alive             2425 non-null   bool  
 5   Appearances       2425 non-null   Int64 
 6   First Appearance  2425 non-null   Int64 
 7   Name              2425 non-null   object
 8   Identity          2425 non-null   object
dtypes: Int64(2), bool(1), object(6)
memory usage: 177.6+ KB


### Reordering columns

In [22]:
comic_characters_df = comic_characters_df[["Name", "Identity", "Type", "Sex", "Eye Color", "Hair Color", "Alive", "Appearances", "First Appearance"]]

comic_characters_df.head()

Unnamed: 0,Name,Identity,Type,Sex,Eye Color,Hair Color,Alive,Appearances,First Appearance
0,Batman,Bruce Wayne,Good,Male,Blue,Black,True,3093,1939
1,Superman,Clark Kent,Good,Male,Blue,Black,True,2496,1986
2,Green Lantern,Hal Jordan,Good,Male,Brown,Brown,True,1565,1959
3,James Gordon,New Earth,Good,Male,Brown,White,True,1316,1987
4,Richard Grayson,New Earth,Good,Male,Blue,Black,True,1237,1940


## Saving the dataset

In [23]:
comic_characters_df.to_csv("comic_characters.csv", index=False)