# Wikipedia Notable Life Expectancies
# [Notebook  : Data Cleaning Part 9](https://github.com/teresahanak/wikipedia-life-expectancy/blob/main/wp_life_expect_data_clean9_thanak_2022_07_26.ipynb)
### Context

The
### Objective

The
### Data Dictionary
- Feature: Description

### Importing Libraries

In [1]:
# To structure code automatically
%load_ext nb_black

# To import/export sqlite databases
import sqlite3 as sql

# To save/open python objects in pickle file
import pickle

# To help with reading, cleaning, and manipulating data
import pandas as pd
import numpy as np
import re

# To define maximum number of columns to be displayed in a dataframe
pd.set_option("display.max_columns", None)
# To define the maximum number of rows to be displayed in a dataframe
pd.set_option("display.max_rows", 200)

# To supress warnings
# import warnings

# warnings.filterwarnings("ignore")

# To set some visualization attributes
pd.set_option("max_colwidth", 150)

# To play auditory cue when cell has executed, has warning, or has error and set chime theme
import chime

chime.theme("zelda")

<IPython.core.display.Javascript object>

## Data Overview

### Reading, Sampling, and Checking Data Shape

In [2]:
# Reading the dataset
conn = sql.connect("wp_life_expect_clean8.db")
data = pd.read_sql("SELECT * FROM wp_life_expect_clean8", conn)

# Making a working copy
df = data.copy()

# Checking the shape
print(f"There are {df.shape[0]} rows and {df.shape[1]} columns.")

# Checking first 2 rows of the data
df.head(2)

There are 98059 rows and 38 columns.


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
0,1,William Chappell,", 86, British dancer, ballet designer and director.",https://en.wikipedia.org/wiki/William_Chappell_(dancer),21,1994,January,,,dancer,ballet designer and director,,,,,,,,,86.0,,United Kingdom of Great Britain and Northern Ireland,,,3.091042,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,Raymond Crotty,", 68, Irish economist, writer, and academic.",https://en.wikipedia.org/wiki/Raymond_Crotty,12,1994,January,,,,writer,and academic,,,,,,,,68.0,,Ireland,,,2.564949,0,0,0,0,0,0,0,0,1,0,0,0,1


<IPython.core.display.Javascript object>

In [3]:
# Checking last 2 rows of the data
df.tail(2)

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
98057,9,Aamir Liaquat Hussain,", 50, Pakistani journalist and politician, MNA .",https://en.wikipedia.org/wiki/Aamir_Liaquat_Hussain,99,2022,June,", since",,,MNA,,,,,,,,,50.0,,Pakistan,,"2002 2007, since 2018",4.60517,0,0,0,0,0,1,0,0,1,0,0,0,2
98058,9,Zou Jing,", 86, Chinese engineer, member of the Chinese Academy of Engineering.",https://en.wikipedia.org/wiki/Zou_Jing_(engineer),3,2022,June,,,,member of the Academy of Engineering,,,,,,,,,86.0,,"China, People's Republic of",,,1.386294,1,0,0,0,0,0,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

In [4]:
# Checking a sample of the data
df.sample(5)

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
40757,22,Peter Bennett,", 77, American music promoter, heart attack.",https://en.wikipedia.org/wiki/Peter_Bennett_(music_promoter),9,2012,November,,,music promoter,heart attack,,,,,,,,,77.0,,United States of America,,,2.302585,0,0,0,0,0,0,0,0,0,0,0,0,0
40696,17,Bonnie Lynn Fields,", 68, American actress and , throat cancer.",https://en.wikipedia.org/wiki/Bonnie_Lynn_Fields,3,2012,November,"Mouseketeer, ,",,,throat cancer,,,,,,,,,68.0,,United States of America,,"Mouseketeer, ,",1.386294,0,0,0,0,0,1,0,0,0,0,0,0,1
27138,17,Sammy Baugh,", 94, American football player and member of the Pro Football Hall of Fame.",https://en.wikipedia.org/wiki/Sammy_Baugh,31,2008,December,Washington Redskins,,,,,,,,,,,,94.0,,United States of America,,Washington Redskins,3.465736,0,0,0,0,0,0,1,0,0,0,0,0,1
38584,20,Eugene Polley,", 96, American engineer, inventor of the wireless TV remote control.",https://en.wikipedia.org/wiki/Eugene_Polley,7,2012,May,,,,inventor of the wireless TV remote control,,,,,,,,,96.0,,United States of America,,,2.079442,1,0,0,0,0,0,0,0,0,0,0,0,1
57678,31,Georges Cottier,", 93, Swiss Roman Catholic cardinal.",https://en.wikipedia.org/wiki/Georges_Cottier,10,2016,March,,,Catholic cardinal,,,,,,,,,,93.0,,Switzerland,Italy,,2.397895,0,0,0,0,0,0,0,0,0,0,0,0,0


<IPython.core.display.Javascript object>

### Checking Data Types, Duplicates, and Null Values

In [5]:
# Checking data types and null values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98059 entries, 0 to 98058
Data columns (total 38 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   day                        98059 non-null  object 
 1   name                       98059 non-null  object 
 2   info                       98059 non-null  object 
 3   link                       98059 non-null  object 
 4   num_references             98059 non-null  int64  
 5   year                       98059 non-null  int64  
 6   month                      98059 non-null  object 
 7   info_parenth               36661 non-null  object 
 8   info_1                     22 non-null     object 
 9   info_2                     98027 non-null  object 
 10  info_3                     48896 non-null  object 
 11  info_4                     10264 non-null  object 
 12  info_5                     1265 non-null   object 
 13  info_6                     181 non-null    obj

<IPython.core.display.Javascript object>

#### Observations:
- With our dataset loaded, we can pick up where we left off with extracting known_for values by rebuilding `known_for_dict`.

### Extracting `known_for` Continued

#### Finding `known_for` Roles in `info_2`

In [69]:
# # Obtaining values for column and their counts
# roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [68]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [67]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [index for index in df.index if "cartoonist" in df.loc[index, "info"]],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [66]:
# # Code to check each specific value
# specific_roles_list.pop()

<IPython.core.display.Javascript object>

In [70]:
# # Example code to quick-screen values that may overlap categories
# df.loc[[index for index in df.index if "and fundamentalist" in df.loc[index, "info"]]]

<IPython.core.display.Javascript object>

In [71]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "World War II cartoonist"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [72]:
# Creating lists for each category
politics_govt_law = [
    "satirical and political",
]

arts = [
    'cartoonist Known for the biplane flying overhead trailing a banner that read "mild',
    "editorial cartoonist and the creator of the comic strip",
    "comic book and comic strip cartoonist and illustrator",
    "editorial cartoonist and prop and set designer",
    "underground cartoonist and fetish art pioneer",
    "children book illustrator and cartoonist",
    "cartoonist and television personality",
    "cartoonist who created the comic book",
    "television producer and cartoonist",
    "cartoonist and newspaper columnist",
    "Pulitzer Prize winning cartoonist",
    "cartoonist and fundamentalist",
    "cartoonist for The New Yorker",
    "cartoonist and puzzle creator",
    "underground comix cartoonist",
    "cartoonist and caricaturist",
    "illustrator and cartoonist",
    "cartoonist and illustrator",
    "cartoonist and publisher",
    "animator and cartoonist",
    "cartoonist and animator",
    "cartoonist and designer",
    "World War II cartoonist",
    "underground cartoonist",
    "comic strip cartoonist",
    "comic  and cartoonist",
    "editorial cartoonist",
    "animation cartoonist",
    "newspaper cartoonist",
    "manga cartoonist",
    "cartoonist and",
    "and cartoonist",
    "cartoonist",
]
sports = []
sciences = []

business_farming = []
academia_humanities = []
law_enf_military_operator = []
spiritual = []
social = []
crime = []
event_record_other = []
other_species = []

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [73]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [74]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['arts'] ==1].sample(2)

CPU times: total: 17.5 s
Wall time: 17.6 s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
76492,31,Michael Lindsay,", 56, American voice actor .",https://en.wikipedia.org/wiki/Michael_Lindsay,3,2019,August,", ,",,,,,,,,,,,,56.0,,United States of America,,", ,",1.386294,0,0,0,0,0,1,0,0,0,0,0,0,1
42975,20,Deanna Durbin,", 91, Canadian singer and actress .",https://en.wikipedia.org/wiki/Deanna_Durbin,35,2013,April,,,,,,,,,,,,,91.0,,Canada,,,3.583519,0,0,0,0,0,1,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [75]:
# Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

There are 32388 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [144]:
# # Obtaining values for column and their counts
# roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [143]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [142]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [index for index in df.index if "film producer" in df.loc[index, "info"]],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [141]:
# # Code to check each specific value
# specific_roles_list.pop()

<IPython.core.display.Javascript object>

In [146]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "film producer and distributor"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [147]:
# Creating lists for each category
politics_govt_law = []

arts = [
    "Broadway and film producer of musicals",
    "film producer and assistant director",
    "film producer and record executive",
    "film producer and studio executive",
    "cinematographer and film producer",
    "film producer and mother of Sarah",
    "talent manager and film producer",
    "film producer and talent manager",
    "broadcaster and film producer",
    "film producer and distributor",
    "film producer and founder of",
    "television and film producer",
    "film producer and presenter",
    "film producer and executive",
    "pornographic film producer",
    "Broadway and film producer",
    "film producer and director",
    "theater and film producer",
    "theatre and film producer",
    "independent film producer",
    "documentary film producer",
    "stage and film producer",
    "Bollywood film producer",
    "animated film producer",
    "horror film producer",
    "TV and film producer",
    "LGBT film producer",
    "and film producer",
    "film producer and",
    "film producer",
]
sports = []
sciences = []

business_farming = []
academia_humanities = []
law_enf_military_operator = []
spiritual = []
social = []
crime = []
event_record_other = []
other_species = []

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [148]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [149]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['arts'] ==1].sample(2)

CPU times: total: 15.6 s
Wall time: 15.6 s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
78974,27,Jason Polan,", 37, American artist and illustrator, cancer.",https://en.wikipedia.org/wiki/Jason_Polan,11,2020,January,,,,cancer,,,,,,,,,37.0,,United States of America,,,2.484907,0,0,0,0,0,1,0,0,0,0,0,0,1
88075,1,Jahmil French,", 29, Canadian actor .",https://en.wikipedia.org/wiki/Jahmil_French,5,2021,March,", ,",,,,,,,,,,,,29.0,,Canada,,", ,",1.791759,0,0,0,0,0,1,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [150]:
# Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

There are 32146 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [152]:
# # Obtaining values for column and their counts
# roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [223]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [209]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [index for index in df.index if "football coach" in df.loc[index, "info"]],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [262]:
# # Code to check each specific value
# specific_roles_list.pop()

<IPython.core.display.Javascript object>

In [253]:
# # Example code to quick-screen values that may overlap categories
# df.loc[
#     [
#         index
#         for index in df.index
#         if "football coach and executive" in df.loc[index, "info"]
#     ]
# ]

<IPython.core.display.Javascript object>

In [None]:
# # Example code to quick-screen values that may overlap categories
# df.loc[
#     [
#         index
#         for index in df.index
#         if "outlaw country music singer songwriter" in df.loc[index, "info"]
#     ]
# ]

In [258]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "college football coach and administrator"]

<IPython.core.display.Javascript object>

In [213]:
# Dropping entry for relation of individual whose page link points to
index = df[df["link"] == "https://en.wikipedia.org/wiki/Lola_Wasserstein"].index
df.drop(index, inplace=True)
df.reset_index(inplace=True, drop=True)

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [260]:
# Creating lists for each category
politics_govt_law = []

arts = [
    'playwright and lyricist of "Arrivederci Roma" and other songs',
    "film and theater director and playwright",
    "Pulitzer Prize winning playwright",
    "theatre producer and playwright",
    "theatre director and playwright",
    "playwright and theatre director",
    "theatre critic and playwright",
    "film lyricist and playwright",
    "playwright and broadcaster",
    "playwright and TV producer",
    "dramatist and playwright",
    "filmmaker and playwright",
    "playwright and dramatist",
    "playwright and memoirist",
    "playwright and humorist",
    "lyricist and playwright",
    "playwright and lyricist",
    "comedian and playwright",
    "playwright and director",
    "Assiniboine playwright",
    "playwright and theatre",
    "playwright and critic",
    "playwright and",
    "and playwright",
    "playwright",
]
sports = [
    "three time Super Bowl winning football coach of the San Francisco ers and member of the Pro Football Hall of Fame",
    "football coach and member of the Pro Football Hall of Fame",
    "Hall of Fame football coach and administrator",
    "college football coach and athletic director",
    "football coach at Amherst College for years",
    "football coach and athletic administrator",
    "college football coach and administrator",
    "National Football League football coach",
    "high school and college football coach",
    "college and high school football coach",
    "Northwestern University football coach",
    "head football coach at Ithaca College",
    "football coach for Indiana University",
    "college football coach and NFL player",
    "football coach and athletic director",
    "Hall of Fame college football coach",
    "football coach and administrator",
    "former national football coach",
    "football coach and team owner",
    "football coach and executive",
    "Hall of Fame football coach",
    "NCAA and CFL football coach",
    "high school football coach",
    "football coach and manager",
    "association football coach",
    "football coach and player",
    "football coach of West y",
    "gridiron football coach",
    "college football coach",
    "East football coach",
    "football coach and",
    "and football coach",
    "football coach",
]
sciences = []

business_farming = []
academia_humanities = []
law_enf_military_operator = []
spiritual = []
social = []
crime = []
event_record_other = []
other_species = []

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [263]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [264]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['sports'] ==1].sample(2)

CPU times: total: 29.8 s
Wall time: 29.8 s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
72701,16,Unto Wiitala,", 93, Finnish Hall of Fame ice hockey player and official.",https://en.wikipedia.org/wiki/Unto_Wiitala,4,2019,January,,,,,,,,,,,,,93.0,,Finland,,,1.609438,0,0,0,0,0,0,1,0,0,0,0,0,1
79765,5,Shirley Cowles,", 80, New Zealand cricketer .",https://en.wikipedia.org/wiki/Shirley_Cowles,5,2020,March,national team,,,,,,,,,,,,80.0,,New Zealand,,national team,1.791759,0,0,0,0,0,0,1,0,0,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [265]:
# Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

There are 31674 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [386]:
# # Obtaining values for column and their counts
# roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [385]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [384]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [index for index in df.index if "archaeologist" in df.loc[index, "info"]],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [383]:
# # Code to check each specific value
# specific_roles_list.pop()

<IPython.core.display.Javascript object>

In [381]:
# # Example code to quick-screen values that may overlap categories
# df.loc[[index for index in df.index if "osteologist" in df.loc[index, "info"]]]

<IPython.core.display.Javascript object>

In [380]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "nuclear archaeologist"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [378]:
# Creating lists for each category
politics_govt_law = [
    "civil servant and United Nations disarmament official",
    "colonial civil servant and governor Ruanda Urundi",
    "Coast Salish tribal leader and civil servant",
    "civil servant and mental health campaigner",
    "civil servant and countryside campaigner",
    "civil servant and presidential advisor",
    "civil servant and policy advisor",
    "and international civil servant",
    "civil servant and whistleblower",
    "Channel Islander civil servant",
    "civil servant from Providence",
    "civil servant and magistrate",
    "prosecutor and civil servant",
    "civil servant and bureaucrat",
    "international civil servant",
    "barrister and civil servant",
    "life peer and civil servant",
    "civil servant and planner",
    "civil servant and public",
    "civil servant and peer",
    "and civil servant",
    "civil servant and",
    "civil servant",
]

arts = []
sports = []
sciences = [
    "osteologist",
]

business_farming = []
academia_humanities = [
    "museum curator and archaeologist who was director of the Museum of",
    "archaeologist specializing in Etruscan civilization and art",
    "archaeologist at the University of Sheffield",
    "Middle Eastern archaeologist and professor",
    "archaeologist and art preservationist",
    "classical scholar and archaeologist",
    "archaeologist and classical scholar",
    "archaeologist at the University of",
    "archaeologist of sites in Anatolia",
    "art conservator and archaeologist",
    "Coptic scholar and archaeologist",
    "anthropologist and archaeologist",
    "archaeologist and anthropologist",
    "archaeologist and Byzantinist",
    "archaeologist and numismatist",
    "archaeologist and philologist",
    "archaeologist and classicist",
    "archaeologist and sinologist",
    "archaeologist and professor",
    "classical archaeologist and",
    "archaeologist and curator",
    "archaeologist and scholar",
    "historical archaeologist",
    "underwater archaeologist",
    "classical archaeologist",
    "landscape archaeologist",
    "nuclear archaeologist",
    "Meso archaeologist",
    "zooarchaeologist",
    "archaeologist and",
    "and archaeologist",
    "archaeologist",
]
law_enf_military_operator = [
    "recruiter of spies",
]
spiritual = []
social = []
crime = []
event_record_other = []
other_species = []

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [379]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [387]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['academia_humanities'] ==1].sample(2)

CPU times: total: 29.5 s
Wall time: 29.5 s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
57788,7,Cyril Edel Leonoff,", 91, Canadian civil engineer and historian.",https://en.wikipedia.org/wiki/Cyril_Edel_Leonoff,9,2016,April,,,,,,,,,,,,,91.0,,Canada,,,2.302585,1,0,0,1,0,0,0,0,0,0,0,0,2
12863,2,Zygmunt Milczewski,", 95, Polish historian and resistance fighter during World War II.",https://en.wikipedia.org/wiki/Zygmunt_Milczewski,3,2001,June,,,resistance fighter during World War II,,,,,,,,,,95.0,,Poland,,,1.386294,0,0,0,1,0,0,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [388]:
# Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

There are 31260 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [513]:
# # Obtaining values for column and their counts
# roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [511]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [510]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [index for index in df.index if "racing driver" in df.loc[index, "info"]],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [509]:
# # Code to check each specific value
# specific_roles_list.pop()

<IPython.core.display.Javascript object>

In [514]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "racing driver and executive coach"]

<IPython.core.display.Javascript object>

In [469]:
# Hard-coding cause_of_death for entry with value in info_2
index = df[df["link"] == "https://en.wikipedia.org/wiki/Roy_Barraclough"].index
df.loc[index, "cause_of_death"] = "short illness"

# Hard-coding engineer for entry for correct categorization
index = df[df["link"] == "https://en.wikipedia.org/wiki/Archie_Butterworth"].index
df.loc[index, "info_3"] = "engineer"

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [515]:
# Creating lists for each category
politics_govt_law = []

arts = [
    'comedian dubbed "Uncle Miltie" and "Mr Television"',
    "Grammy winning comedian and JFK impersonator",
    "comedian and reality television contestant",
    "comedian and classical music satirist",
    "comedian and television personality",
    "comedian and television presenter",
    "music hall performer and comedian",
    "television producer and comedian",
    "theater director and comedian",
    "comedian respiratory problems",
    "radio presenter and comedian",
    "television host and comedian",
    "comedian and television host",
    "Oneida Mohawk Cree comedian",
    "comedian and opera director",
    "comedian and game show host",
    "comedian and impressionist",
    "ventriloquist and comedian",
    "TV celebrity and comedian",
    "comedian and entertainer",
    "entertainer and comedian",
    "broadcaster and comedian",
    "improvisational comedian",
    "Wiere Brothers comedian",
    "comedian and radio host",
    "comedian and satirist",
    "Borscht Belt comedian",
    "comedian and pianist",
    "ice skating comedian",
    "comedian and dancer",
    "xiangsheng comedian",
    "East comedian",
    "film comedian",
    "and comedian",
    "comedian and",
    "comedian",
]
sports = [
    "racing driver and last living participant in the first F World Championship race",
    "Grand Prix motorcycle road racer and racing driver",
    "former Grand Prix racing driver and constructor",
    "racing driver and Formula One team principal",
    "racing driver and motorsport administrator",
    "Hall of Fame racing driver and instructor",
    "record setting harness racing driver",
    "motor racing driver and team leader",
    "racing driver and racing team owner",
    "Hall of Fame harness racing driver",
    "racing driver and car constructor",
    "racing driver and team manager",
    "racing driver and rally driver",
    "racing driver and instructor",
    "racing driver and team owner",
    "Hall of Fame racing driver",
    "racing driver and designer",
    "Indianapolis racing driver",
    "professional racing driver",
    "and amateur racing driver",
    "Formula One racing driver",
    "Grand Prix racing driver",
    "open wheel racing driver",
    "sprint car racing driver",
    "stock car racing driver",
    "off road racing driver",
    "racing driver champion",
    "amateur racing driver",
    "motor racing driver",
    "auto racing driver",
    "drag racing driver",
    "VLN racing driver",
    "racing driver and",
    "and racing driver",
    "racing driver",
]
sciences = []

business_farming = []
academia_humanities = []
law_enf_military_operator = []
spiritual = []
social = []
crime = []
event_record_other = []
other_species = []

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [516]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [517]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['sports'] ==1].sample(2)

CPU times: total: 36.7 s
Wall time: 36.7 s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
48945,13,Josh Liavaa,", 65, Tongan-born New Zealand rugby league player , shot.",https://en.wikipedia.org/wiki/Josh_Liavaa,8,2014,July,national team,,,shot,,,,,,,,,65.0,,Tonga,New Zealand,national team,2.197225,0,0,0,0,0,0,1,0,0,0,0,0,1
29763,16,Antonio de Nigris,", 31, Mexican football player, heart failure.",https://en.wikipedia.org/wiki/Antonio_de_Nigris,5,2009,November,,,,heart failure,,,,,,,,,31.0,,Mexico,,,1.791759,0,0,0,0,0,0,1,0,0,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [518]:
# Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

There are 30808 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [940]:
# # Obtaining values for column and their counts
# roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [939]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [937]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_2"].notna()].index
#             if "art" in df.loc[index, "info_2"]
#         ],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [936]:
# # Code to check each specific value
# specific_roles_list.pop()

<IPython.core.display.Javascript object>

In [920]:
# # Example code to quick-screen values that may overlap categories
# df.loc[[index for index in df.index if "martial" in df.loc[index, "info"]]]

<IPython.core.display.Javascript object>

In [941]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "art instructor"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [942]:
# Creating lists for each category
politics_govt_law = [
    "first female major party candidate to run for Governor of North Carolina",
    "Principal Speaker of Green Party of and and Oxford City Councillor",
    "who trained Martin Luther King Jr and other civil rights leaders",
    "chairman of the Communist Party of and International Brigades",
    "presidential candidate for the Alliance for Democracy party",
    "threatened by McCarthyism and championed by Edward R Murrow",
    "Democratic Party member of the House of Representatives",
    "labor leader and parliamentarian for the Peronist party",  # before arts
    "convert and partner of nationalist Subhas Chandra Bose",
    "assistant secretary of the Department of Commerce and",
    "trade unionist and member of the Communist Party of",
    "presidential candidate for the Libertarian Party",
    "leader of the Islamic Renaissance Party of istan",
    "labor leader and chairman of the Communist Party",
    "first female chair of Michigan Republican Party",
    "peer and proponent of the Hollow Earth concept",
    "Secretary of Treasury from under Jimmy Carter",
    "Party student leader and later a neo figure",
    "political leader of the Communist Party of",
    "Press Secretary for President Jimmy Carter",
    "and romantic partner to Princess Margaret",
    "leader of the Progressive Unionist Party",
    "Green Party member of the House of Lords",
    "leader of the Black Panther Party",
    "leader of Beijing Communist Party",
    "member of the Bonaparte dynasty",
    "and anti apartheid campaigner",
    "State Department official and",
    "Conservative Party chairman",
    "Black Panther Party member",
    "Republican party official",
    "Communist party secretary",
    "anti apartheid campaigner",
    "anti communist partisan",
    "revolutionary thinker",
    "Yukon Party Leader",
    "Labour Party MP",
    "Communist Party",
    "party official",
    "partisan and",
    "partisan",
]

arts = [
    "last living member of the Carter Family country music group",
    "former bass player for Tom Petty and the Heartbreakers",
    "violinist and founder of the Amadeus Quartet",
    "animator for Betty Boop and Superman cartoons",
    "daredevil and flat Earth conspiracy theorist",
    "art collector and friend of Pablo Picasso",
    "film art director and production designer",
    "film production designer and art director",
    "arts administrator and opera director",
    "television producer and art collector",
    "art dealer and wife of Marcel Duchamp",
    "abstract expressionist art collector",
    "production designer and art director",
    "art director and production designer",
    "music critic and arts administrator",
    "Academy Awards winning art director",
    "bassist for the Modern Jazz Quartet",
    "arts administrator and music critic",
    "fashion designer and art collector",
    "filmmaker and pioneer of video art",
    "film set designer and art director",
    "art director and costume designer",
    "graphic designer and art director",
    "art publisher and magazine editor",
    "costume designer and art director",
    "horticulturalist and arts patron",
    "art critic and opera librettist",
    "art collector and gallery owner",
    "art dealer and gallery director",
    "Yup'ik artisan and craftswoman",
    "reality television participant",
    "art director ; designer of the",
    "essayist and visual art critic",
    "who specialized in Coptic art",
    "art director and set designer",
    "set designer and art director",
    "Māori performing arts leader",
    "performing arts producer and",
    "psychiatric patient and art",
    "advertiser and art director",
    "art critic and radio host",
    "art director and designer",
    "media executive and arts",
    "art collector and patron",
    "stylist and art director",
    "film fight choreographer",
    "international art dealer",
    "theatre director and art",
    "art  Picasso biographer",
    "art and antiques dealer",
    "advocate of Native arts",
    "art theatre personality",
    "literary editor and art",
    "of art and art history",
    "and patron of the arts",
    "heiress and art patron",
    "watercolorist and art",
    "art  gallery director",
    "artisanal cheesemaker",
    "supporter of the arts",
    "modern art collector",
    "fine arts collector",
    "born art dealer and",
    "Republic art critic",
    "film and art critic",
    "biographer and art",
    "art critic and art",
    "patron of the arts",
    "arts administrator",
    "art gallery owner",
    "art collector and",
    "and art collector",
    "and arts patron",
    "art  art critic",
    "arts campaigner",
    "and art critic",
    "indigenous art",
    "art critic and",
    "art patron and",
    "art dealer and",
    "arts executive",
    "art  collector",
    "arts advocate",
    "art gallerist",
    "art director",
    "art designer",
    "art heiress",
    "arts leader",
    "art dealer",
    "arts patron",
    "art critic",
    "art patron",
    "art expert",
    "bartender",
    "and arts",
    "of arts",
    "artisan",
    "arts",
    "and art",
    "art and",
    "art",
]
sports = [
    "baseball memorabilia collector and limited partner for the Yankees",  # before arts
    "quarterback for UCLA Bruins football and the Baltimore Colts",
    "mixed martial arts fighter and UFC middleweight champion",
    "NASCAR stock car driver and partial team owner",
    "Hall of Fame Quarter Horse equestrian",
    "Football League quarterback and coach",
    "wrestler; patriarch of Hart wrestling",
    "football quarterback and halfback",
    "part owner of the Atlanta Braves",
    "mixed martial arts competitor",
    "partially quadriplegic sailor",
    "mixed martial arts fighter",
    "mixed martial arts trainer",
    "Hall of Fame darts player",
    "mixed martial art fighter",
    "baseball team part owner",
    "football quarterback and",
    "martial arts grandmaster",
    "martial arts master and",
    "martial arts instructor",
    "sports team part owner",
    "and Rangers part owner",
    "football quarterback",
    "martial arts teacher",
    "martial arts master",
    "martial arts expert",
    "darts administrator",
    "woman darts player",
    "darts player",
    "martial arts",
    "martial",
    "darts",
]
sciences = [
    "cardiothoracic surgeon who refined John Gibbon heart lung bypass machine",
    "software designer and a pioneer in artificial intelligence and robotics",
    "geomorphologist and expert on the geology of Earth and Mars",
    "doctor and inventor of the Dodrill GMR heart machine",
    "independent researcher in artificial intelligence",
    "professor of geology and earthquake expert",
    "surgeon and open heart surgery pioneer",
    "researcher in artificial intelligence",
    "astronomer at Swarthmore College",
    "geographer and cartographer",
    "cartographer and geographer",
    "heart surgeon and inventor",
    "oceanographic cartographer",  # before arts
    "pioneering heart surgeon",
    "paediatric heart surgeon",
    "and cartographer",
    "cartographer and",
    "heart surgeon",
    "cartographer",
]

business_farming = [
    "businesswoman; chairperson of the Clerys department store",
    "billionaire department store and hotel owner",
    "and son of Wal Mart founder Sam Walton",
    "widow of Wal Mart founder Sam Walton",
    "transportation and parts executive",
    "industrial parts executive",
    "owner of Dart Drugs Chain",  # before arts
    "and chartered accountant",
    "auto parts executive",
    "chartered accountant",
    "CEO of Earthlink",
]
academia_humanities = [
    "professor at George Mason University",
    "and professor of Asian art history",  # before arts
    "art conservator",
]
law_enf_military_operator = [
    "World War I soldier and the last surviving soldier to have taken part in the Christmas truce of",
    "Deputy Director of the Department of State Office of Security",
    "police commissioner of the Philadelphia Police Department",
    "Marine Corps sergeant and recipient of nine purple hearts",  # before arts
    "World War II Jewish partisan fighter and anti avenger",
    "leader of the Bielski partisans during World War II",
    "Secretary of the Navy in the Carter administration",
    "Wehrmacht artillery and General Staff officer",
    "Navy Chief Quartermaster and aquanaut",
    "and World War II partisan fighter",
    "partisan and liaison officer",
    "partisan during World War II",
    "Sparta Battalion commander",
    "World War II partisan and",
    "wartime firefighter",
    "wartime codebreaker",
    "and quartermaster",
    "wartime commander",
    "Jewish partisan",
    "wartime spy",
    "Partisan",
    "partisan",
]
spiritual = [
    "priest who participated in the exorcism on which was based",
    "Anglican priest and Dean of Hobart from to",
    "young earth creationist leader",  # before arts
    "young Earth creationist",
    "Young Earth creationist",
    "Carthusian monk",
    "flat earther",
]
social = []
crime = [
    "suspected drug lord and high ranking leader of the Sinaloa Cartel",
    "convict and assassin of Martin Luther King Jr",
    "drug trafficker for the Sinaloa Cartel",
    "who assassinated official Wilhelm Kube",
    "narco and member of the Cali Cartel",
    "drug lord and Sinaloa Cartel leader",
    "reputed organized crime figure and",
    "member of the Detroit Partnership",  # before arts
    "part of Watergate scandal",
    "Neturei Karta rabbi and",
    "convicted art thief",
    "drug cartel leader",
    "cartel leader",
]
event_record_other = [
    "heart and lung patient whose wrong transplant made headlines",
    "recipient of the first artificial heart transplant",  # before arts
]
other_species = []

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [943]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "sports": sports,
    "arts": arts,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [944]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['arts'] ==1].sample(2)

CPU times: total: 2min 12s
Wall time: 2min 12s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
57026,18,Yūko Tsushima,", 68, Japanese author.",https://en.wikipedia.org/wiki/Y%C5%ABko_Tsushima,23,2016,February,,,,,,,,,,,,,68.0,,Japan,,,3.178054,0,0,0,0,0,1,0,0,0,0,0,0,1
3662,8,Mercer Ellington,", 76, American musician, composer, and arranger, heart attack.",https://en.wikipedia.org/wiki/Mercer_Ellington,12,1996,February,,,,composer,and arranger,heart attack,,,,,,,76.0,,United States of America,,,2.564949,0,0,0,0,0,1,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [945]:
# Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

There are 30451 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [1053]:
# # Obtaining values for column and their counts
# roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [1052]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [1051]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_2"].notna()].index
#             if "entrepreneur" in df.loc[index, "info_2"]
#         ],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [1049]:
# # Code to check each specific value
# specific_roles_list.pop()

<IPython.core.display.Javascript object>

In [975]:
# # Example code to quick-screen values that may overlap categories
# df.loc[[index for index in df.index if "Internet" in df.loc[index, "info"]]]

<IPython.core.display.Javascript object>

In [1050]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "entrepreneur and ceremonial officer"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [1054]:
# Creating lists for each category
politics_govt_law = []

arts = []
sports = []
sciences = []

business_farming = [
    "entrepreneur and one of the founders of commercial TV broadcasting in the",
    "entrepreneur and the founder of Peet Coffee & Tea",
    "entrepreneur and joint founder of Ryanair",
    "entrepreneur and founder of The Body Shop",
    "entrepreneur and pioneer of microlending",
    "entrepreneur and real estate developer",
    "entrepreneur and founder of Herbalife",
    "industrial and financial entrepreneur",
    "entrepreneur and ceremonial officer",
    "real estate broker and entrepreneur",
    "entrepreneur and president of Fiat",
    "entrepreneur and business magnate",
    "fashion retailer and entrepreneur",
    "entrepreneur and founder of Naza",
    "angel investor and entrepreneur",
    "billionaire mining entrepreneur",
    "entrepreneur and industrialist",
    "businesswoman and entrepreneur",
    "entrepreneur and co founder of",
    "entrepreneur and billionaire",
    "opinion polling entrepreneur",
    "oil and mining entrepreneur",
    "entrepreneur and winemaker",
    "entrepreneur and executive",
    "cocktail bar entrepreneur",
    "computer entrepreneur and",
    "real estate entrepreneur",
    "marketing entrepreneur",
    "financial entrepreneur",
    "mini golf entrepreneur",
    "entrepreneur in Darwin",
    "property entrepreneur",
    "aviation entrepreneur",
    "tourism entrepreneur",
    "Arabian entrepreneur",
    "travel entrepreneur",
    "timber entrepreneur",
    "serial entrepreneur",
    "wine entrepreneur",
    "ATM entrepreneur",
    "and entrepreneur",
    "entrepreneur and",
    "entrepreneur",
]
academia_humanities = []
law_enf_military_operator = []
spiritual = []
social = []
crime = []
event_record_other = [
    "wrongfully convicted of murder",
]
other_species = []

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [1055]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [1056]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['business_farming'] ==1].sample(2)

CPU times: total: 22.3 s
Wall time: 22.3 s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
56891,10,Phil Gartside,", 63, English businessman and football chairman , cancer.",https://en.wikipedia.org/wiki/Phil_Gartside,8,2016,February,Bolton Wanderers,,football chairman,cancer,,,,,,,,,63.0,,United Kingdom of Great Britain and Northern Ireland,,Bolton Wanderers,2.197225,0,0,0,0,1,0,0,0,0,0,0,0,1
95101,6,Bob Falkenburg,", 95, American Hall of Fame tennis player and businessman, founder of Bob's.",https://en.wikipedia.org/wiki/Bob_Falkenburg,24,2022,January,,,Hall of Fame tennis player,founder of Bob,,,,,,,,,95.0,,United States of America,,,3.218876,0,0,0,0,1,0,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [1057]:
# Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

There are 30254 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [1142]:
# Obtaining values for column and their counts
roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [1141]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [1140]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_2"].notna()].index
#             if "soldier" in df.loc[index, "info_2"]
#         ],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [1139]:
# # Code to check each specific value
# specific_roles_list.pop()

<IPython.core.display.Javascript object>

In [1143]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "child soldier"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [1144]:
# Creating lists for each category
politics_govt_law = [
    "Her Majesty Representative at Ascot",
]

arts = []
sports = []
sciences = []

business_farming = []
academia_humanities = []
law_enf_military_operator = [
    "soldier during World War II and recipient of the Victoria Cross",
    "soldier and member of a Special Forces Airborne Reserve Unit",
    "Army soldier awarded the Medal of Honor during World War II",
    "Army World War II soldier who received the Medal of Honor",
    "soldier and Medal of Honor recipient for actions in War",
    "World War II soldier portrayed by Donnie Wahlberg in",
    "World War II soldier and Legion of honour recipient",
    "Army soldier who was killed by the Taliban in istan",
    "Army soldier and a recipient of the Medal of Honor",
    "Army soldier and recipient of the Medal of Honor",
    "World War II Waffen SS soldier and Panzer ace",
    "Army soldier and recipient the Medal of Honor",
    "soldier and recipient of the Medal of Honor",
    "soldier and recipient of the Victoria Cross",
    "SAS soldier involved in the Embassy Siege",
    "Army soldier and Medal of Honor recipient",
    "World War II soldier and prisoner of war",
    "soldier and Medal of Honor recipient",
    "soldier and Victoria Cross recipient",
    "Army soldier during World War II",
    "soldier and World War I veteran",
    "soldier and airborne mechanic",
    "soldier serving in the War",
    "soldier and member of the",
    "Nagorno Karabakh soldier",
    "soldier and game warden",
    "soldier during the War",
    "Army infantry soldier",
    "soldier and military",
    "World War II soldier",
    "soldier in the Army",
    "Danzig soldier and",
    "Navy SEAL soldier",
    "volunteer soldier",
    "decorated soldier",
    "Army and soldier",
    "Army soldier and",
    "combat soldier",
    "Gurkha soldier",
    "rebel soldier",
    "child soldier",
    "WWII soldier",
    "army soldier",
    "Army soldier",
    "soldier and",
    "and soldier",
    "soldier",
]
spiritual = []
social = []
crime = []
event_record_other = []
other_species = []

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [1145]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [None]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['arts'] ==1].sample(2)

#### Checking the Number of Rows without a First Category

In [None]:
# Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

In [1058]:
print("dunzo!")

# Sound notification when cell executes
chime.success()

dunzo!


<IPython.core.display.Javascript object>

#### Finding `known_for` Roles in `info_2`

In [None]:
# Obtaining values for column and their counts
roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

In [None]:
# Code to check each value
roles_list.pop()

In [None]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_2"].notna()].index
#             if "art" in df.loc[index, "info_2"]
#         ],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

In [None]:
# # Code to check each specific value
# specific_roles_list.pop()

In [None]:
# # Example code to quick-screen values that may overlap categories
# df.loc[[index for index in df.index if "and science writer" in df.loc[index, "info"]]]

In [None]:
# # Example code to quick-screen values that may overlap categories
# df.loc[
#     [
#         index
#         for index in df.index
#         if "outlaw country music singer songwriter" in df.loc[index, "info"]
#     ]
# ]

In [None]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "outlaw country music singer songwriter"]

#### Creating Lists for Each `known_for` Category

In [None]:
# Creating lists for each category
politics_govt_law = []

arts = []
sports = [


]
sciences = []

business_farming = []
academia_humanities = []
law_enf_military_operator = []
spiritual = []
social = []
crime = []
event_record_other = []
other_species = []

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [None]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

#### Extracting Category from `info_2`

In [None]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['arts'] ==1].sample(2)

#### Checking the Number of Rows without a First Category

In [None]:
# Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Observations:
- It is time to export our dataframe and start a new notebook.

### Exporting Dataset to SQLite Database [wp_life_expect_clean9.db]()

In [None]:
# # Exporting dataframe

# # Saving dataset in a SQLite database
# conn = sql.connect("wp_life_expect_clean9.db")
# df.to_sql("wp_life_expect_clean9", conn, index=False)

# # Chime notification when cell executes
# chime.success()

# [Proceed to Data Cleaning Part 10]()