# Wikipedia Notable Life Expectancies
# [Notebook  : Data Cleaning Part 9](https://github.com/teresahanak/wikipedia-life-expectancy/blob/main/wp_life_expect_data_clean9_thanak_2022_07_26.ipynb)
### Context

The
### Objective

The
### Data Dictionary
- Feature: Description

### Importing Libraries

In [1]:
# To structure code automatically
%load_ext nb_black

# To import/export sqlite databases
import sqlite3 as sql

# To save/open python objects in pickle file
import pickle

# To help with reading, cleaning, and manipulating data
import pandas as pd
import numpy as np
import re

# To define maximum number of columns to be displayed in a dataframe
pd.set_option("display.max_columns", None)
# To define the maximum number of rows to be displayed in a dataframe
pd.set_option("display.max_rows", 200)

# To suppress warnings
# import warnings

# warnings.filterwarnings("ignore")

# To set some visualization attributes
pd.set_option("max_colwidth", 150)

# To play auditory cue when cell has executed, has warning, or has error and set chime theme
import chime

chime.theme("zelda")

<IPython.core.display.Javascript object>

## Data Overview

### Reading, Sampling, and Checking Data Shape

In [2]:
# Connecting to the SQLite database file
conn = sql.connect("wp_life_expect_clean8.db")

# Loading the whole table into a dataframe
data = pd.read_sql(sql="SELECT * FROM wp_life_expect_clean8", con=conn)

# Working on a copy so the dataframe as loaded stays untouched
df = data.copy(deep=True)

# Reporting the shape
n_rows, n_cols = df.shape
print(f"There are {n_rows} rows and {n_cols} columns.")

# Displaying the first 2 rows of the data
df.head(n=2)

There are 98059 rows and 38 columns.


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
0,1,William Chappell,", 86, British dancer, ballet designer and director.",https://en.wikipedia.org/wiki/William_Chappell_(dancer),21,1994,January,,,dancer,ballet designer and director,,,,,,,,,86.0,,United Kingdom of Great Britain and Northern Ireland,,,3.091042,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,Raymond Crotty,", 68, Irish economist, writer, and academic.",https://en.wikipedia.org/wiki/Raymond_Crotty,12,1994,January,,,,writer,and academic,,,,,,,,68.0,,Ireland,,,2.564949,0,0,0,0,0,0,0,0,1,0,0,0,1


<IPython.core.display.Javascript object>

In [3]:
# Displaying the final 2 rows of the data
df.tail(n=2)

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
98057,9,Aamir Liaquat Hussain,", 50, Pakistani journalist and politician, MNA .",https://en.wikipedia.org/wiki/Aamir_Liaquat_Hussain,99,2022,June,", since",,,MNA,,,,,,,,,50.0,,Pakistan,,"2002 2007, since 2018",4.60517,0,0,0,0,0,1,0,0,1,0,0,0,2
98058,9,Zou Jing,", 86, Chinese engineer, member of the Chinese Academy of Engineering.",https://en.wikipedia.org/wiki/Zou_Jing_(engineer),3,2022,June,,,,member of the Academy of Engineering,,,,,,,,,86.0,,"China, People's Republic of",,,1.386294,1,0,0,0,0,0,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

In [4]:
# Displaying 5 randomly drawn rows (no seed is set, so the rows differ between runs)
df.sample(n=5)

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
26326,29,Geoffrey Perkins,", 55, British comedy producer, writer and performer, head of comedy for BBC, road accident.",https://en.wikipedia.org/wiki/Geoffrey_Perkins,13,2008,August,,,comedy producer,writer and performer,head of comedy for BBC,road accident,,,,,,,55.0,,United Kingdom of Great Britain and Northern Ireland,,,2.639057,0,0,0,0,0,0,0,0,0,0,0,0,0
49707,7,Don Keefer,", 98, American actor .",https://en.wikipedia.org/wiki/Don_Keefer,8,2014,September,", ,",,,,,,,,,,,,98.0,,United States of America,,", ,",2.197225,0,0,0,0,0,1,0,0,0,0,0,0,1
43067,26,Tui Uru,", 87, New Zealand opera singer and broadcaster.",https://en.wikipedia.org/wiki/Tui_Uru,15,2013,April,,,,,,,,,,,,,87.0,,New Zealand,,,2.772589,0,0,0,0,0,1,0,0,0,0,0,0,1
19584,16,Marla Ruzicka,", 28, American activist and aid worker, car bombing in Iraq.",https://en.wikipedia.org/wiki/Marla_Ruzicka,11,2005,April,,,aid worker,car bombing in,,,,,,,,,28.0,,United States of America,,,2.484907,0,0,0,0,0,0,0,0,1,0,0,0,1
45857,9,Alberto Foguelman,", 90, Argentine chess master.",https://en.wikipedia.org/wiki/Alberto_Foguelman,12,2013,December,,,chess master,,,,,,,,,,90.0,,Argentina,,,2.564949,0,0,0,0,0,0,0,0,0,0,0,0,0


<IPython.core.display.Javascript object>

### Checking Data Types, Duplicates, and Null Values

In [5]:
# Checking data types and null values
# (prints one line per column with its non-null count and dtype)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98059 entries, 0 to 98058
Data columns (total 38 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   day                        98059 non-null  object 
 1   name                       98059 non-null  object 
 2   info                       98059 non-null  object 
 3   link                       98059 non-null  object 
 4   num_references             98059 non-null  int64  
 5   year                       98059 non-null  int64  
 6   month                      98059 non-null  object 
 7   info_parenth               36661 non-null  object 
 8   info_1                     22 non-null     object 
 9   info_2                     98027 non-null  object 
 10  info_3                     48896 non-null  object 
 11  info_4                     10264 non-null  object 
 12  info_5                     1265 non-null   object 
 13  info_6                     181 non-null    obj

<IPython.core.display.Javascript object>

#### Observations:
- With our dataset loaded, we can pick up where we left off with extracting known_for values by rebuilding `known_for_dict`.

### Extracting `known_for` Continued

#### Finding `known_for` Roles in `info_2`

In [6]:
# # Obtaining values for column and their counts
# roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [7]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [8]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [index for index in df.index if "cartoonist" in df.loc[index, "info"]],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [9]:
# # Code to check each specific value
# specific_roles_list.pop()

<IPython.core.display.Javascript object>

In [10]:
# # Example code to quick-screen values that may overlap categories
# df.loc[[index for index in df.index if "and fundamentalist" in df.loc[index, "info"]]]

<IPython.core.display.Javascript object>

In [11]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "World War II cartoonist"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [12]:
# Creating lists for each category
# Each list holds the role phrases that are searched for in `info_2` for that category;
# categories with nothing to extract in this iteration are left as empty lists.
# NOTE(review): phrases appear ordered longest-first, presumably so that a specific phrase
# is matched and removed before the shorter phrases it contains -- confirm before reordering.
politics_govt_law = [
    "satirical and political",
]

arts = [
    'cartoonist Known for the biplane flying overhead trailing a banner that read "mild',
    "editorial cartoonist and the creator of the comic strip",
    "comic book and comic strip cartoonist and illustrator",
    "editorial cartoonist and prop and set designer",
    "underground cartoonist and fetish art pioneer",
    "children book illustrator and cartoonist",
    "cartoonist and television personality",
    "cartoonist who created the comic book",
    "television producer and cartoonist",
    "cartoonist and newspaper columnist",
    "Pulitzer Prize winning cartoonist",
    "cartoonist and fundamentalist",
    "cartoonist for The New Yorker",
    "cartoonist and puzzle creator",
    "underground comix cartoonist",
    "cartoonist and caricaturist",
    "illustrator and cartoonist",
    "cartoonist and illustrator",
    "cartoonist and publisher",
    "animator and cartoonist",
    "cartoonist and animator",
    "cartoonist and designer",
    "World War II cartoonist",
    "underground cartoonist",
    "comic strip cartoonist",
    "comic  and cartoonist",
    "editorial cartoonist",
    "animation cartoonist",
    "newspaper cartoonist",
    "manga cartoonist",
    "cartoonist and",
    "and cartoonist",
    "cartoonist",
]
sports = []
sciences = []

business_farming = []
academia_humanities = []
law_enf_military_operator = []
spiritual = []
social = []
crime = []
event_record_other = []
other_species = []

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [13]:
# Gathering the per-category role lists under their category names
known_for_dict = dict(
    social=social,
    spiritual=spiritual,
    academia_humanities=academia_humanities,
    business_farming=business_farming,
    sciences=sciences,
    politics_govt_law=politics_govt_law,
    law_enf_military_operator=law_enf_military_operator,
    crime=crime,
    event_record_other=event_record_other,
    other_species=other_species,
    arts=arts,
    sports=sports,
)

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [14]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['arts'] ==1].sample(2)

CPU times: total: 27.4 s
Wall time: 27.4 s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
88619,22,Pierick Houdy,", 92, French composer and musician.",https://en.wikipedia.org/wiki/Pierick_Houdy,6,2021,March,,,,,,,,,,,,,92.0,,France,,,1.94591,0,0,0,0,0,1,0,0,0,0,0,0,1
76851,22,Masroor Jahan,", 81, Indian Urdu author.",https://en.wikipedia.org/wiki/Masroor_Jahan,11,2019,September,,,,,,,,,,,,,81.0,,India,,,2.484907,0,0,0,0,0,1,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [15]:
# Counting the rows that have not been assigned any category yet
n_uncategorized = int(df["num_categories"].eq(0).sum())
print(f"There are {n_uncategorized} entries without any known_for category.")

There are 32388 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [16]:
# # Obtaining values for column and their counts
# roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [17]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [18]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [index for index in df.index if "film producer" in df.loc[index, "info"]],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [19]:
# # Code to check each specific value
# specific_roles_list.pop()

<IPython.core.display.Javascript object>

In [20]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "film producer and distributor"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [21]:
# Creating lists for each category
# Each list holds the role phrases that are searched for in `info_2` for that category;
# categories with nothing to extract in this iteration are left as empty lists.
# NOTE(review): phrases appear ordered longest-first, presumably so that a specific phrase
# is matched and removed before the shorter phrases it contains -- confirm before reordering.
politics_govt_law = []

arts = [
    "Broadway and film producer of musicals",
    "film producer and assistant director",
    "film producer and record executive",
    "film producer and studio executive",
    "cinematographer and film producer",
    "film producer and mother of Sarah",
    "talent manager and film producer",
    "film producer and talent manager",
    "broadcaster and film producer",
    "film producer and distributor",
    "film producer and founder of",
    "television and film producer",
    "film producer and presenter",
    "film producer and executive",
    "pornographic film producer",
    "Broadway and film producer",
    "film producer and director",
    "theater and film producer",
    "theatre and film producer",
    "independent film producer",
    "documentary film producer",
    "stage and film producer",
    "Bollywood film producer",
    "animated film producer",
    "horror film producer",
    "TV and film producer",
    "LGBT film producer",
    "and film producer",
    "film producer and",
    "film producer",
]
sports = []
sciences = []

business_farming = []
academia_humanities = []
law_enf_military_operator = []
spiritual = []
social = []
crime = []
event_record_other = []
other_species = []

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [22]:
# Gathering the per-category role lists under their category names
known_for_dict = dict(
    social=social,
    spiritual=spiritual,
    academia_humanities=academia_humanities,
    business_farming=business_farming,
    sciences=sciences,
    politics_govt_law=politics_govt_law,
    law_enf_military_operator=law_enf_military_operator,
    crime=crime,
    event_record_other=event_record_other,
    other_species=other_species,
    arts=arts,
    sports=sports,
)

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [23]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['arts'] ==1].sample(2)

CPU times: total: 20 s
Wall time: 20 s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
66946,8,Agustín Bernal,", 59, Mexican actor.",https://en.wikipedia.org/wiki/Agust%C3%ADn_Bernal,4,2018,January,,,,,,,,,,,,,59.0,,Mexico,,,1.609438,0,0,0,0,0,1,0,0,0,0,0,0,1
19522,5,Neil Welliver,", 75, American landscape painter, mainly in his native Maine.",https://en.wikipedia.org/wiki/Neil_Welliver,10,2005,April,,,,mainly in his native Maine,,,,,,,,,75.0,,United States of America,,,2.397895,0,0,0,0,0,1,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [24]:
# Counting the rows that have not been assigned any category yet
n_uncategorized = int(df["num_categories"].eq(0).sum())
print(f"There are {n_uncategorized} entries without any known_for category.")

There are 32146 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [25]:
# # Obtaining values for column and their counts
# roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [26]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [27]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [index for index in df.index if "football coach" in df.loc[index, "info"]],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [28]:
# # Code to check each specific value
# specific_roles_list.pop()

<IPython.core.display.Javascript object>

In [29]:
# # Example code to quick-screen values that may overlap categories
# df.loc[
#     [
#         index
#         for index in df.index
#         if "football coach and executive" in df.loc[index, "info"]
#     ]
# ]

<IPython.core.display.Javascript object>

In [30]:
# # Example code to quick-screen values that may overlap categories
# df.loc[
#     [
#         index
#         for index in df.index
#         if "outlaw country music singer songwriter" in df.loc[index, "info"]
#     ]
# ]

<IPython.core.display.Javascript object>

In [31]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "college football coach and administrator"]

<IPython.core.display.Javascript object>

In [32]:
# Dropping the entry for a relative of the individual whose page the link points to,
# then renumbering the rows from 0
link_to_drop = "https://en.wikipedia.org/wiki/Lola_Wasserstein"
index = df.index[df["link"].eq(link_to_drop)]
df = df.drop(index=index).reset_index(drop=True)

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [33]:
# Creating lists for each category
# Each list holds the role phrases that are searched for in `info_2` for that category;
# categories with nothing to extract in this iteration are left as empty lists.
# NOTE(review): phrases appear ordered longest-first, presumably so that a specific phrase
# is matched and removed before the shorter phrases it contains -- confirm before reordering.
politics_govt_law = []

arts = [
    'playwright and lyricist of "Arrivederci Roma" and other songs',
    "film and theater director and playwright",
    "Pulitzer Prize winning playwright",
    "theatre producer and playwright",
    "theatre director and playwright",
    "playwright and theatre director",
    "theatre critic and playwright",
    "film lyricist and playwright",
    "playwright and broadcaster",
    "playwright and TV producer",
    "dramatist and playwright",
    "filmmaker and playwright",
    "playwright and dramatist",
    "playwright and memoirist",
    "playwright and humorist",
    "lyricist and playwright",
    "playwright and lyricist",
    "comedian and playwright",
    "playwright and director",
    "Assiniboine playwright",
    "playwright and theatre",
    "playwright and critic",
    "playwright and",
    "and playwright",
    "playwright",
]
sports = [
    "three time Super Bowl winning football coach of the San Francisco ers and member of the Pro Football Hall of Fame",
    "football coach and member of the Pro Football Hall of Fame",
    "Hall of Fame football coach and administrator",
    "college football coach and athletic director",
    "football coach at Amherst College for years",
    "football coach and athletic administrator",
    "college football coach and administrator",
    "National Football League football coach",
    "high school and college football coach",
    "college and high school football coach",
    "Northwestern University football coach",
    "head football coach at Ithaca College",
    "football coach for Indiana University",
    "college football coach and NFL player",
    "football coach and athletic director",
    "Hall of Fame college football coach",
    "football coach and administrator",
    "former national football coach",
    "football coach and team owner",
    "football coach and executive",
    "Hall of Fame football coach",
    "NCAA and CFL football coach",
    "high school football coach",
    "football coach and manager",
    "association football coach",
    "football coach and player",
    "football coach of West y",
    "gridiron football coach",
    "college football coach",
    "East football coach",
    "football coach and",
    "and football coach",
    "football coach",
]
sciences = []

business_farming = []
academia_humanities = []
law_enf_military_operator = []
spiritual = []
social = []
crime = []
event_record_other = []
other_species = []

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [34]:
# Gathering the per-category role lists under their category names
known_for_dict = dict(
    social=social,
    spiritual=spiritual,
    academia_humanities=academia_humanities,
    business_farming=business_farming,
    sciences=sciences,
    politics_govt_law=politics_govt_law,
    law_enf_military_operator=law_enf_military_operator,
    crime=crime,
    event_record_other=event_record_other,
    other_species=other_species,
    arts=arts,
    sports=sports,
)

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [35]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['sports'] ==1].sample(2)

CPU times: total: 31.4 s
Wall time: 31.4 s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
19880,19,Dave Carr,", 48, English footballer.","https://en.wikipedia.org/wiki/Dave_Carr_(footballer,_born_1957)",3,2005,June,,,,,,,,,,,,,48.0,,United Kingdom of Great Britain and Northern Ireland,,,1.386294,0,0,0,0,0,0,1,0,0,0,0,0,1
10980,13,Jumbo Tsuruta,", 49, Japanese professional wrestler known as liver transplant, complications from .",https://en.wikipedia.org/wiki/Jumbo_Tsuruta,22,2000,May,,,,complications from,,,,,,,,,49.0,complications from liver transplant,Japan,,,3.135494,0,0,0,0,0,0,1,0,0,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [36]:
# Checking the number of rows without a first category
n_uncategorized = int(df["num_categories"].eq(0).sum())
print(f"There are {n_uncategorized} entries without any known_for category.")

There are 31674 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [37]:
# # Obtaining values for column and their counts
# roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [38]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [39]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [index for index in df.index if "archaeologist" in df.loc[index, "info"]],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [40]:
# # Code to check each specific value
# specific_roles_list.pop()

<IPython.core.display.Javascript object>

In [41]:
# # Example code to quick-screen values that may overlap categories
# df.loc[[index for index in df.index if "osteologist" in df.loc[index, "info"]]]

<IPython.core.display.Javascript object>

In [42]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "nuclear archaeologist"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [43]:
# Creating lists for each category
# Each list holds the role phrases that are searched for in `info_2` for that category;
# categories with nothing to extract in this iteration are left as empty lists.
# NOTE(review): phrases appear ordered longest-first, presumably so that a specific phrase
# is matched and removed before the shorter phrases it contains -- confirm before reordering.
politics_govt_law = [
    "civil servant and United Nations disarmament official",
    "colonial civil servant and governor Ruanda Urundi",
    "Coast Salish tribal leader and civil servant",
    "civil servant and mental health campaigner",
    "civil servant and countryside campaigner",
    "civil servant and presidential advisor",
    "civil servant and policy advisor",
    "and international civil servant",
    "civil servant and whistleblower",
    "Channel Islander civil servant",
    "civil servant from Providence",
    "civil servant and magistrate",
    "prosecutor and civil servant",
    "civil servant and bureaucrat",
    "international civil servant",
    "barrister and civil servant",
    "life peer and civil servant",
    "civil servant and planner",
    "civil servant and public",
    "civil servant and peer",
    "and civil servant",
    "civil servant and",
    "civil servant",
]

arts = []
sports = []
sciences = [
    "osteologist",
]

business_farming = []
academia_humanities = [
    "museum curator and archaeologist who was director of the Museum of",
    "archaeologist specializing in Etruscan civilization and art",
    "archaeologist at the University of Sheffield",
    "Middle Eastern archaeologist and professor",
    "archaeologist and art preservationist",
    "classical scholar and archaeologist",
    "archaeologist and classical scholar",
    "archaeologist at the University of",
    "archaeologist of sites in Anatolia",
    "art conservator and archaeologist",
    "Coptic scholar and archaeologist",
    "anthropologist and archaeologist",
    "archaeologist and anthropologist",
    "archaeologist and Byzantinist",
    "archaeologist and numismatist",
    "archaeologist and philologist",
    "archaeologist and classicist",
    "archaeologist and sinologist",
    "archaeologist and professor",
    "classical archaeologist and",
    "archaeologist and curator",
    "archaeologist and scholar",
    "historical archaeologist",
    "underwater archaeologist",
    "classical archaeologist",
    "landscape archaeologist",
    "nuclear archaeologist",
    "Meso archaeologist",
    "zooarchaeologist",
    "archaeologist and",
    "and archaeologist",
    "archaeologist",
]
law_enf_military_operator = [
    "recruiter of spies",
]
spiritual = []
social = []
crime = []
event_record_other = []
other_species = []

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [44]:
# Gathering the per-category role lists under their category names
known_for_dict = dict(
    social=social,
    spiritual=spiritual,
    academia_humanities=academia_humanities,
    business_farming=business_farming,
    sciences=sciences,
    politics_govt_law=politics_govt_law,
    law_enf_military_operator=law_enf_military_operator,
    crime=crime,
    event_record_other=event_record_other,
    other_species=other_species,
    arts=arts,
    sports=sports,
)

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [45]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['academia_humanities'] ==1].sample(2)

CPU times: total: 33.4 s
Wall time: 33.4 s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
1319,12,J. I. M. Stewart,", 88, Scottish novelist and academic.",https://en.wikipedia.org/wiki/J._I._M._Stewart,7,1994,November,,,,,,,,,,,,,88.0,,Scotland,,,2.079442,0,0,0,1,0,1,0,0,0,0,0,0,2
12403,9,Mitsuo Kagawa,", 78, Japanese archaeologist.",https://en.wikipedia.org/wiki/Mitsuo_Kagawa,3,2001,March,,,,,,,,,,,,,78.0,,Japan,,,1.386294,0,0,0,1,0,0,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [46]:
# Checking the number of rows without a first category
n_uncategorized = int(df["num_categories"].eq(0).sum())
print(f"There are {n_uncategorized} entries without any known_for category.")

There are 31260 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [47]:
# # Obtaining values for column and their counts
# roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [48]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [49]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [index for index in df.index if "racing driver" in df.loc[index, "info"]],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [50]:
# # Code to check each specific value
# specific_roles_list.pop()

<IPython.core.display.Javascript object>

In [51]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "racing driver and executive coach"]

<IPython.core.display.Javascript object>

In [52]:
# Setting cause_of_death by hand for the entry whose value sits in info_2
is_barraclough = df["link"].eq("https://en.wikipedia.org/wiki/Roy_Barraclough")
index = df.index[is_barraclough]
df.loc[index, "cause_of_death"] = "short illness"

# Setting info_3 to engineer by hand so the entry is categorized correctly
is_butterworth = df["link"].eq("https://en.wikipedia.org/wiki/Archie_Butterworth")
index = df.index[is_butterworth]
df.loc[index, "info_3"] = "engineer"

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [53]:
# Creating lists for each category
# Each list holds role phrases that the extraction cell searches for in
# `info_2` as plain substrings, in list order; a matched phrase is removed from
# the text, so longer phrases are listed ahead of the shorter phrases they
# contain (e.g. "comedian and" ahead of "comedian").
politics_govt_law = []

# Every entry in this iteration's arts list contains "comedian".
arts = [
    'comedian dubbed "Uncle Miltie" and "Mr Television"',
    "Grammy winning comedian and JFK impersonator",
    "comedian and reality television contestant",
    "comedian and classical music satirist",
    "comedian and television personality",
    "comedian and television presenter",
    "music hall performer and comedian",
    "television producer and comedian",
    "theater director and comedian",
    "comedian respiratory problems",
    "radio presenter and comedian",
    "television host and comedian",
    "comedian and television host",
    "Oneida Mohawk Cree comedian",
    "comedian and opera director",
    "comedian and game show host",
    "comedian and impressionist",
    "ventriloquist and comedian",
    "TV celebrity and comedian",
    "comedian and entertainer",
    "entertainer and comedian",
    "broadcaster and comedian",
    "improvisational comedian",
    "Wiere Brothers comedian",
    "comedian and radio host",
    "comedian and satirist",
    "Borscht Belt comedian",
    "comedian and pianist",
    "ice skating comedian",
    "comedian and dancer",
    "xiangsheng comedian",
    "East comedian",
    "film comedian",
    "and comedian",
    "comedian and",
    "comedian",
]
# Every entry in this iteration's sports list contains "racing driver"; the
# bare "racing driver" comes last so the longer phrases are tried first.
sports = [
    "racing driver and last living participant in the first F World Championship race",
    "Grand Prix motorcycle road racer and racing driver",
    "former Grand Prix racing driver and constructor",
    "racing driver and Formula One team principal",
    "racing driver and motorsport administrator",
    "Hall of Fame racing driver and instructor",
    "record setting harness racing driver",
    "motor racing driver and team leader",
    "racing driver and racing team owner",
    "Hall of Fame harness racing driver",
    "racing driver and car constructor",
    "racing driver and team manager",
    "racing driver and rally driver",
    "racing driver and instructor",
    "racing driver and team owner",
    "Hall of Fame racing driver",
    "racing driver and designer",
    "Indianapolis racing driver",
    "professional racing driver",
    "and amateur racing driver",
    "Formula One racing driver",
    "Grand Prix racing driver",
    "open wheel racing driver",
    "sprint car racing driver",
    "stock car racing driver",
    "off road racing driver",
    "racing driver champion",
    "amateur racing driver",
    "motor racing driver",
    "auto racing driver",
    "drag racing driver",
    "VLN racing driver",
    "racing driver and",
    "and racing driver",
    "racing driver",
]
# The remaining categories have no roles in this iteration; they are still
# assigned so that every name used to build known_for_dict is rebound.
sciences = []

business_farming = []
academia_humanities = []
law_enf_military_operator = []
spiritual = []
social = []
crime = []
event_record_other = []
other_species = []

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [54]:
# Mapping each known_for category column to its list of role phrases.
# The key order is the order in which the extraction loop visits the categories.
known_for_dict = dict(
    social=social,
    spiritual=spiritual,
    academia_humanities=academia_humanities,
    business_farming=business_farming,
    sciences=sciences,
    politics_govt_law=politics_govt_law,
    law_enf_military_operator=law_enf_military_operator,
    crime=crime,
    event_record_other=event_record_other,
    other_species=other_species,
    arts=arts,
    sports=sports,
)

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [55]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['sports'] ==1].sample(2)

CPU times: total: 47.8 s
Wall time: 47.9 s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
12767,17,Murray Murdoch,", 96, Canadian ice hockey player and coach.",https://en.wikipedia.org/wiki/Murray_Murdoch,3,2001,May,,,,,,,,,,,,,96.0,,Canada,,,1.386294,0,0,0,0,0,0,1,0,0,0,0,0,1
15463,9,Eric Martin,", 33, American racing driver, racing accident.",https://en.wikipedia.org/wiki/Eric_Martin_(racing_driver),3,2002,October,,,,racing accident,,,,,,,,,33.0,,United States of America,,,1.386294,0,0,0,0,0,0,1,0,0,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [56]:
# Counting the rows whose num_categories is still zero
n_uncategorized = (df["num_categories"] == 0).sum()
print(f"There are {n_uncategorized} entries without any known_for category.")

There are 30808 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [57]:
# # Obtaining values for column and their counts
# roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [58]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [59]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_2"].notna()].index
#             if "art" in df.loc[index, "info_2"]
#         ],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [60]:
# # Code to check each specific value
# specific_roles_list.pop()

<IPython.core.display.Javascript object>

In [61]:
# # Example code to quick-screen values that may overlap categories
# df.loc[[index for index in df.index if "martial" in df.loc[index, "info"]]]

<IPython.core.display.Javascript object>

In [62]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "art instructor"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [63]:
# Creating lists for each category
# Each list holds role phrases that the extraction cell searches for in
# `info_2` as plain substrings, in list order; a matched phrase is removed from
# the text before the next phrase is tried.
# NOTE(review): the inline "# before arts" markers in this cell presumably flag
# categories whose phrases contain "art" and so must be extracted before the
# generic `arts` entries run -- confirm.
politics_govt_law = [
    "first female major party candidate to run for Governor of North Carolina",
    "Principal Speaker of Green Party of and and Oxford City Councillor",
    "who trained Martin Luther King Jr and other civil rights leaders",
    "chairman of the Communist Party of and International Brigades",
    "presidential candidate for the Alliance for Democracy party",
    "threatened by McCarthyism and championed by Edward R Murrow",
    "Democratic Party member of the House of Representatives",
    "labor leader and parliamentarian for the Peronist party",  # before arts
    "convert and partner of nationalist Subhas Chandra Bose",
    "assistant secretary of the Department of Commerce and",
    "trade unionist and member of the Communist Party of",
    "presidential candidate for the Libertarian Party",
    "leader of the Islamic Renaissance Party of istan",
    "labor leader and chairman of the Communist Party",
    "first female chair of Michigan Republican Party",
    "peer and proponent of the Hollow Earth concept",
    "Secretary of Treasury from under Jimmy Carter",
    "Party student leader and later a neo figure",
    "political leader of the Communist Party of",
    "Press Secretary for President Jimmy Carter",
    "and romantic partner to Princess Margaret",
    "leader of the Progressive Unionist Party",
    "Green Party member of the House of Lords",
    "leader of the Black Panther Party",
    "leader of Beijing Communist Party",
    "member of the Bonaparte dynasty",
    "and anti apartheid campaigner",
    "State Department official and",
    "Conservative Party chairman",
    "Black Panther Party member",
    "Republican party official",
    "Communist party secretary",
    "anti apartheid campaigner",
    "anti communist partisan",
    "revolutionary thinker",
    "Yukon Party Leader",
    "Labour Party MP",
    "Communist Party",
    "party official",
    "partisan and",
    "partisan",
]

# Role phrases for `arts`. The list ends with the bare substrings "arts",
# "and art", "art and" and "art", which match inside any longer word or phrase
# still present in `info_2` when this category is processed.
arts = [
    "last living member of the Carter Family country music group",
    "former bass player for Tom Petty and the Heartbreakers",
    "violinist and founder of the Amadeus Quartet",
    "animator for Betty Boop and Superman cartoons",
    "daredevil and flat Earth conspiracy theorist",
    "art collector and friend of Pablo Picasso",
    "film art director and production designer",
    "film production designer and art director",
    "arts administrator and opera director",
    "television producer and art collector",
    "art dealer and wife of Marcel Duchamp",
    "abstract expressionist art collector",
    "production designer and art director",
    "art director and production designer",
    "music critic and arts administrator",
    "Academy Awards winning art director",
    "bassist for the Modern Jazz Quartet",
    "arts administrator and music critic",
    "fashion designer and art collector",
    "filmmaker and pioneer of video art",
    "film set designer and art director",
    "art director and costume designer",
    "graphic designer and art director",
    "art publisher and magazine editor",
    "costume designer and art director",
    "horticulturalist and arts patron",
    "art critic and opera librettist",
    "art collector and gallery owner",
    "art dealer and gallery director",
    "Yup'ik artisan and craftswoman",
    "reality television participant",
    "art director ; designer of the",
    "essayist and visual art critic",
    "who specialized in Coptic art",
    "art director and set designer",
    "set designer and art director",
    "Māori performing arts leader",
    "performing arts producer and",
    "psychiatric patient and art",
    "advertiser and art director",
    "art critic and radio host",
    "art director and designer",
    "media executive and arts",
    "art collector and patron",
    "stylist and art director",
    "film fight choreographer",
    "international art dealer",
    "theatre director and art",
    "art  Picasso biographer",
    "art and antiques dealer",
    "advocate of Native arts",
    "art theatre personality",
    "literary editor and art",
    "of art and art history",
    "and patron of the arts",
    "heiress and art patron",
    "watercolorist and art",
    "art  gallery director",
    "artisanal cheesemaker",
    "supporter of the arts",
    "modern art collector",
    "fine arts collector",
    "born art dealer and",
    "Republic art critic",
    "film and art critic",
    "biographer and art",
    "art critic and art",
    "patron of the arts",
    "arts administrator",
    "art gallery owner",
    "art collector and",
    "and art collector",
    "and arts patron",
    "art  art critic",
    "arts campaigner",
    "and art critic",
    "indigenous art",
    "art critic and",
    "art patron and",
    "art dealer and",
    "arts executive",
    "art  collector",
    "arts advocate",
    "art gallerist",
    "art director",
    "art designer",
    "art heiress",
    "arts leader",
    "art dealer",
    "arts patron",
    "art critic",
    "art patron",
    "art expert",
    "bartender",
    "and arts",
    "of arts",
    "artisan",
    "arts",
    "and art",
    "art and",
    "art",
]
# Role phrases for `sports`, including the "martial arts" and "darts" phrases,
# which contain the substring "art" that the arts list also searches for.
sports = [
    "baseball memorabilia collector and limited partner for the Yankees",  # before arts
    "quarterback for UCLA Bruins football and the Baltimore Colts",
    "mixed martial arts fighter and UFC middleweight champion",
    "NASCAR stock car driver and partial team owner",
    "Hall of Fame Quarter Horse equestrian",
    "Football League quarterback and coach",
    "wrestler; patriarch of Hart wrestling",
    "football quarterback and halfback",
    "part owner of the Atlanta Braves",
    "mixed martial arts competitor",
    "partially quadriplegic sailor",
    "mixed martial arts fighter",
    "mixed martial arts trainer",
    "Hall of Fame darts player",
    "mixed martial art fighter",
    "baseball team part owner",
    "football quarterback and",
    "martial arts grandmaster",
    "martial arts master and",
    "martial arts instructor",
    "sports team part owner",
    "and Rangers part owner",
    "football quarterback",
    "martial arts teacher",
    "martial arts master",
    "martial arts expert",
    "darts administrator",
    "woman darts player",
    "darts player",
    "martial arts",
    "martial",
    "darts",
]
# Role phrases for `sciences`; the bare "heart surgeon" and "cartographer"
# follow the longer phrases that contain them.
sciences = [
    "cardiothoracic surgeon who refined John Gibbon heart lung bypass machine",
    "software designer and a pioneer in artificial intelligence and robotics",
    "geomorphologist and expert on the geology of Earth and Mars",
    "doctor and inventor of the Dodrill GMR heart machine",
    "independent researcher in artificial intelligence",
    "professor of geology and earthquake expert",
    "surgeon and open heart surgery pioneer",
    "researcher in artificial intelligence",
    "astronomer at Swarthmore College",
    "geographer and cartographer",
    "cartographer and geographer",
    "heart surgeon and inventor",
    "oceanographic cartographer",  # before arts
    "pioneering heart surgeon",
    "paediatric heart surgeon",
    "and cartographer",
    "cartographer and",
    "heart surgeon",
    "cartographer",
]

# Role phrases for `business_farming`
business_farming = [
    "businesswoman; chairperson of the Clerys department store",
    "billionaire department store and hotel owner",
    "and son of Wal Mart founder Sam Walton",
    "widow of Wal Mart founder Sam Walton",
    "transportation and parts executive",
    "industrial parts executive",
    "owner of Dart Drugs Chain",  # before arts
    "and chartered accountant",
    "auto parts executive",
    "chartered accountant",
    "CEO of Earthlink",
]
# Role phrases for `academia_humanities`
academia_humanities = [
    "professor at George Mason University",
    "and professor of Asian art history",  # before arts
    "art conservator",
]
# Role phrases for `law_enf_military_operator`
# NOTE(review): politics_govt_law in this cell also lists "partisan and" and
# "partisan", and known_for_dict places politics_govt_law ahead of this
# category. Those shorter matches therefore remove lowercase "partisan" from
# info_2 first, so the longer lowercase "partisan" phrases below may never
# match -- confirm which category these entries should receive.
law_enf_military_operator = [
    "World War I soldier and the last surviving soldier to have taken part in the Christmas truce of",
    "Deputy Director of the Department of State Office of Security",
    "police commissioner of the Philadelphia Police Department",
    "Marine Corps sergeant and recipient of nine purple hearts",  # before arts
    "World War II Jewish partisan fighter and anti avenger",
    "leader of the Bielski partisans during World War II",
    "Secretary of the Navy in the Carter administration",
    "Wehrmacht artillery and General Staff officer",
    "Navy Chief Quartermaster and aquanaut",
    "and World War II partisan fighter",
    "partisan and liaison officer",
    "partisan during World War II",
    "Sparta Battalion commander",
    "World War II partisan and",
    "wartime firefighter",
    "wartime codebreaker",
    "and quartermaster",
    "wartime commander",
    "Jewish partisan",
    "wartime spy",
    "Partisan",
    "partisan",
]
# Role phrases for `spiritual`; matching is case-sensitive, hence the separate
# "young Earth" / "Young Earth" / "young earth" spellings.
spiritual = [
    "priest who participated in the exorcism on which was based",
    "Anglican priest and Dean of Hobart from to",
    "young earth creationist leader",  # before arts
    "young Earth creationist",
    "Young Earth creationist",
    "Carthusian monk",
    "flat earther",
]
# No `social` roles in this iteration
social = []
# Role phrases for `crime`
crime = [
    "suspected drug lord and high ranking leader of the Sinaloa Cartel",
    "convict and assassin of Martin Luther King Jr",
    "drug trafficker for the Sinaloa Cartel",
    "who assassinated official Wilhelm Kube",
    "narco and member of the Cali Cartel",
    "drug lord and Sinaloa Cartel leader",
    "reputed organized crime figure and",
    "member of the Detroit Partnership",  # before arts
    "part of Watergate scandal",
    "Neturei Karta rabbi and",
    "convicted art thief",
    "drug cartel leader",
    "cartel leader",
]
# Role phrases for `event_record_other`
event_record_other = [
    "heart and lung patient whose wrong transplant made headlines",
    "recipient of the first artificial heart transplant",  # before arts
]
# No `other_species` roles in this iteration
other_species = []

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [64]:
# Mapping each known_for category column to its list of role phrases.
# The key order is the order in which the extraction loop visits the categories;
# here `sports` precedes `arts` and `arts` comes last, so phrases such as
# "martial arts" are extracted before the bare "arts"/"art" entries are tried.
known_for_dict = dict(
    social=social,
    spiritual=spiritual,
    academia_humanities=academia_humanities,
    business_farming=business_farming,
    sciences=sciences,
    politics_govt_law=politics_govt_law,
    law_enf_military_operator=law_enf_military_operator,
    crime=crime,
    event_record_other=event_record_other,
    other_species=other_species,
    sports=sports,
    arts=arts,
)

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [65]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['arts'] ==1].sample(2)

CPU times: total: 2min 20s
Wall time: 2min 20s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
32512,17,Sergio Di Stefano,", 71, Italian actor and voice actor, heart attack.",https://en.wikipedia.org/wiki/Sergio_Di_Stefano,3,2010,September,,,,heart attack,,,,,,,,,71.0,,Italy,,,1.386294,0,0,0,0,0,1,0,0,0,0,0,0,1
30456,2,Rosa Lobato de Faria,", 77, Portuguese writer and actress, complications from anemia.",https://en.wikipedia.org/wiki/Rosa_Lobato_de_Faria,6,2010,February,,,,complications from anemia,,,,,,,,,77.0,,Portugal,,,1.94591,0,0,0,0,0,1,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [66]:
# Counting the rows whose num_categories is still zero
n_uncategorized = (df["num_categories"] == 0).sum()
print(f"There are {n_uncategorized} entries without any known_for category.")

There are 30451 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [67]:
# # Obtaining values for column and their counts
# roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [68]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [69]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_2"].notna()].index
#             if "entrepreneur" in df.loc[index, "info_2"]
#         ],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [70]:
# # Code to check each specific value
# specific_roles_list.pop()

<IPython.core.display.Javascript object>

In [71]:
# # Example code to quick-screen values that may overlap categories
# df.loc[[index for index in df.index if "Internet" in df.loc[index, "info"]]]

<IPython.core.display.Javascript object>

In [72]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "entrepreneur and ceremonial officer"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [73]:
# Creating lists for each category
# Each list holds role phrases that the extraction cell searches for in
# `info_2` as plain substrings, in list order; a matched phrase is removed from
# the text before the next phrase is tried.
politics_govt_law = []

arts = []
sports = []
sciences = []

# Every business_farming entry in this iteration contains "entrepreneur"; the
# bare "entrepreneur" comes last so the longer phrases are tried first.
business_farming = [
    "entrepreneur and one of the founders of commercial TV broadcasting in the",
    "entrepreneur and the founder of Peet Coffee & Tea",
    "entrepreneur and joint founder of Ryanair",
    "entrepreneur and founder of The Body Shop",
    "entrepreneur and pioneer of microlending",
    "entrepreneur and real estate developer",
    "entrepreneur and founder of Herbalife",
    "industrial and financial entrepreneur",
    "entrepreneur and ceremonial officer",
    "real estate broker and entrepreneur",
    "entrepreneur and president of Fiat",
    "entrepreneur and business magnate",
    "fashion retailer and entrepreneur",
    "entrepreneur and founder of Naza",
    "angel investor and entrepreneur",
    "billionaire mining entrepreneur",
    "entrepreneur and industrialist",
    "businesswoman and entrepreneur",
    "entrepreneur and co founder of",
    "entrepreneur and billionaire",
    "opinion polling entrepreneur",
    "oil and mining entrepreneur",
    "entrepreneur and winemaker",
    "entrepreneur and executive",
    "cocktail bar entrepreneur",
    "computer entrepreneur and",
    "real estate entrepreneur",
    "marketing entrepreneur",
    "financial entrepreneur",
    "mini golf entrepreneur",
    "entrepreneur in Darwin",
    "property entrepreneur",
    "aviation entrepreneur",
    "tourism entrepreneur",
    "Arabian entrepreneur",
    "travel entrepreneur",
    "timber entrepreneur",
    "serial entrepreneur",
    "wine entrepreneur",
    "ATM entrepreneur",
    "and entrepreneur",
    "entrepreneur and",
    "entrepreneur",
]
academia_humanities = []
law_enf_military_operator = []
spiritual = []
social = []
crime = []
# Single non-entrepreneur phrase handled in the same iteration
event_record_other = [
    "wrongfully convicted of murder",
]
other_species = []

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [74]:
# Mapping each known_for category column to its list of role phrases.
# The key order is the order in which the extraction loop visits the categories.
known_for_dict = dict(
    social=social,
    spiritual=spiritual,
    academia_humanities=academia_humanities,
    business_farming=business_farming,
    sciences=sciences,
    politics_govt_law=politics_govt_law,
    law_enf_military_operator=law_enf_military_operator,
    crime=crime,
    event_record_other=event_record_other,
    other_species=other_species,
    arts=arts,
    sports=sports,
)

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [75]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['business_farming'] ==1].sample(2)

CPU times: total: 23.4 s
Wall time: 23.4 s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
34406,21,"Jess Stonestreet Jackson, Jr.",", 81, American wine entrepreneur, founder of Kendall-Jackson, cancer.","https://en.wikipedia.org/wiki/Jess_Stonestreet_Jackson,_Jr.",18,2011,April,,,,founder of Kendall Jackson,cancer,,,,,,,,81.0,,United States of America,,,2.944439,0,0,0,0,1,0,0,0,0,0,0,0,1
76745,16,Sir Donald Gosling,", 90, British vice admiral and businessman, Chairman of National Car Parks .",https://en.wikipedia.org/wiki/Donald_Gosling,15,2019,September,,,vice admiral,Chairman of National Car Parks,,,,,,,,,90.0,,United Kingdom of Great Britain and Northern Ireland,,1959 1998,2.772589,0,0,0,0,1,0,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [76]:
# Counting the rows whose num_categories is still zero
n_uncategorized = (df["num_categories"] == 0).sum()
print(f"There are {n_uncategorized} entries without any known_for category.")

There are 30254 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [77]:
# Listing the distinct info_2 values ordered by ascending count, so the most
# frequent remaining value sits at the end of the list
info_2_counts = df["info_2"].value_counts(ascending=True)
roles_list = info_2_counts.index.tolist()

<IPython.core.display.Javascript object>

In [78]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [79]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_2"].notna()].index
#             if "soldier" in df.loc[index, "info_2"]
#         ],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [80]:
# # Code to check each specific value
# specific_roles_list.pop()

<IPython.core.display.Javascript object>

In [81]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "child soldier"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [82]:
# Creating lists for each category
# Each list holds role phrases that the extraction cell searches for in
# `info_2` as plain substrings, in list order; a matched phrase is removed from
# the text before the next phrase is tried.
politics_govt_law = [
    "Her Majesty Representative at Ascot",
]

arts = []
sports = []
sciences = []

business_farming = []
academia_humanities = []
# Every law_enf_military_operator entry in this iteration contains "soldier";
# the bare "soldier" comes last so the longer phrases are tried first.
law_enf_military_operator = [
    "soldier during World War II and recipient of the Victoria Cross",
    "soldier and member of a Special Forces Airborne Reserve Unit",
    "Army soldier awarded the Medal of Honor during World War II",
    "Army World War II soldier who received the Medal of Honor",
    "soldier and Medal of Honor recipient for actions in War",
    "World War II soldier portrayed by Donnie Wahlberg in",
    "World War II soldier and Legion of honour recipient",
    "Army soldier who was killed by the Taliban in istan",
    "Army soldier and a recipient of the Medal of Honor",
    "Army soldier and recipient of the Medal of Honor",
    "World War II Waffen SS soldier and Panzer ace",
    "Army soldier and recipient the Medal of Honor",
    "soldier and recipient of the Medal of Honor",
    "soldier and recipient of the Victoria Cross",
    "SAS soldier involved in the Embassy Siege",
    "Army soldier and Medal of Honor recipient",
    "World War II soldier and prisoner of war",
    "soldier and Medal of Honor recipient",
    "soldier and Victoria Cross recipient",
    "Army soldier during World War II",
    "soldier and World War I veteran",
    "soldier and airborne mechanic",
    "soldier serving in the War",
    "soldier and member of the",
    "Nagorno Karabakh soldier",
    "soldier and game warden",
    "soldier during the War",
    "Army infantry soldier",
    "soldier and military",
    "World War II soldier",
    "soldier in the Army",
    "Danzig soldier and",
    "Navy SEAL soldier",
    "volunteer soldier",
    "decorated soldier",
    "Army and soldier",
    "Army soldier and",
    "combat soldier",
    "Gurkha soldier",
    "rebel soldier",
    "child soldier",
    "WWII soldier",
    "army soldier",
    "Army soldier",
    "soldier and",
    "and soldier",
    "soldier",
]
spiritual = []
social = []
crime = []
event_record_other = []
other_species = []

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [83]:
# Mapping each known_for category column to its list of role phrases.
# The key order is the order in which the extraction loop visits the categories.
known_for_dict = dict(
    social=social,
    spiritual=spiritual,
    academia_humanities=academia_humanities,
    business_farming=business_farming,
    sciences=sciences,
    politics_govt_law=politics_govt_law,
    law_enf_military_operator=law_enf_military_operator,
    crime=crime,
    event_record_other=event_record_other,
    other_species=other_species,
    arts=arts,
    sports=sports,
)

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [84]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['arts'] ==1].sample(2)

CPU times: total: 26.2 s
Wall time: 26.2 s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
58351,16,Lino Toffolo,", 81, Italian actor and singer.",https://en.wikipedia.org/wiki/Lino_Toffolo,4,2016,May,",",,,,,,,,,,,,81.0,,Italy,,",",1.609438,0,0,0,0,0,1,0,0,0,0,0,0,1
23209,17,Leyly Matine-Daftary,", 70, Iranian artist.",https://en.wikipedia.org/wiki/Leyly_Matine-Daftary,9,2007,April,,,,,,,,,,,,,70.0,,Iran,,,2.302585,0,0,0,0,0,1,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [85]:
# Counting the rows whose num_categories is still zero
n_uncategorized = (df["num_categories"] == 0).sum()
print(f"There are {n_uncategorized} entries without any known_for category.")

There are 30044 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [86]:
# # Obtaining values for column and their counts
# roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [87]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [88]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_2"].notna()].index
#             if "conductor" in df.loc[index, "info_2"]
#         ],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [89]:
# # Code to check each specific value
# specific_roles_list.pop()

<IPython.core.display.Javascript object>

In [90]:
# # Example code to quick-screen values that may overlap categories
# df.loc[[index for index in df.index if "and science writer" in df.loc[index, "info"]]]

<IPython.core.display.Javascript object>

In [91]:
# # Example code to quick-screen values that may overlap categories
# df.loc[
#     [
#         index
#         for index in df.index
#         if "outlaw country music singer songwriter" in df.loc[index, "info"]
#     ]
# ]

<IPython.core.display.Javascript object>

In [92]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "conductor and music administrator"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [93]:
# Creating lists for each category
# Each list holds the literal role strings that are searched for in `info_2`
# for this iteration (roles containing "conductor").  The extraction loop
# removes a matched role from the text, so within a list the entries are
# generally ordered from longest to shortest: a specific phrase must be
# consumed before the shorter phrase it contains.
politics_govt_law = []

arts = [
    "music director and conductor of the Nashville Symphony Orchestra",
    "conductor and founder of the Philharmonia Virtuosi",
    "choral conductor and Baroque music specialist",
    "Grammy Award winning conductor and director",
    "conductor and director of the City Opera",
    "conductor and founder of the Boys Choir",
    "music director and choral conductor",
    "choir conductor and choral editor",
    "conductor and music administrator",
    "orchestral and opera conductor",
    "conductor and musical director",
    "choral conductor and teacher",
    "harpsichordist and conductor",
    "operatic tenor and conductor",
    "conductor of classical music",
    "symphony orchestra conductor",
    "music producer and conductor",
    "conductor and music director",
    "chorus master and conductor",
    "jazz conductor and arranger",
    "choral conductor and music",
    "conductor and viola player",
    "conductor of popular music",
    "conductor and accordionist",
    "conductor and broadcaster",
    "horn player and conductor",
    "clarinetist and conductor",
    "violinist and conductor",
    "conductor and violinist",
    "timpanist and conductor",
    "flautist and conductor",
    "organist and conductor",
    "baritone and conductor",
    "flutist and conductor",
    "conductor and violist",
    "violist and conductor",
    "cellist and conductor",
    "pianist and conductor",
    "conductor and cellist",
    "conductor and pianist",
    "orchestral conductor",
    "orchestra conductor",
    "and choir conductor",
    "classical conductor",
    "conductor  musicals",
    "conductor and music",
    "symphony conductor",
    "musical conductor",
    "choral conductor",
    "opera conductor",
    "music conductor",
    "and conductor",
    "conductor and",
    "conductor",
]
sports = []
# "semiconductor" contains "conductor"; `sciences` precedes `arts` in
# known_for_dict, so this phrase is extracted before the bare arts role is tried
sciences = [
    "pioneer in the semiconductor industry",
]

business_farming = []
academia_humanities = []
law_enf_military_operator = []
spiritual = []
social = []
crime = []
event_record_other = []
other_species = []

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [94]:
# Mapping each known_for category column name to its list of role strings.
# Insertion order is preserved and is the order in which the extraction loop
# walks the categories.
known_for_dict = dict(
    social=social,
    spiritual=spiritual,
    academia_humanities=academia_humanities,
    business_farming=business_farming,
    sciences=sciences,
    politics_govt_law=politics_govt_law,
    law_enf_military_operator=law_enf_military_operator,
    crime=crime,
    event_record_other=event_record_other,
    other_species=other_species,
    arts=arts,
    sports=sports,
)

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [95]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['arts'] ==1].sample(2)

CPU times: total: 38 s
Wall time: 38 s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
37472,20,S. N. Lakshmi,", 85, Indian actress, cardiac arrest.",https://en.wikipedia.org/wiki/S._N._Lakshmi,7,2012,February,,,,cardiac arrest,,,,,,,,,85.0,,India,,,2.079442,0,0,0,0,0,1,0,0,0,0,0,0,1
85151,8,Marc Metdepenningen,", 62, Belgian journalist.",https://en.wikipedia.org/wiki/Marc_Metdepenningen,7,2020,November,,,,,,,,,,,,,62.0,,Belgium,,,2.079442,0,0,0,0,0,1,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [96]:
# Counting the rows whose num_categories is still 0 (no known_for category yet)
uncategorized_total = int(df["num_categories"].eq(0).sum())
print(f"There are {uncategorized_total} entries without any known_for category.")

There are 29791 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [97]:
# # Obtaining values for column and their counts
# roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [98]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [99]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_2"].notna()].index
#             if "pianist" in df.loc[index, "info_2"]
#         ],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [100]:
# # Code to check each specific value
# specific_roles_list.pop()

<IPython.core.display.Javascript object>

In [101]:
# # Example code to quick-screen values that may overlap categories
# df.loc[
#     [
#         index
#         for index in df.index
#         if "director of music academies" in df.loc[index, "info"]
#     ]
# ]

<IPython.core.display.Javascript object>

In [102]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "pianist and director of music academies"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [103]:
# Creating lists for each category
# Each list holds the literal role strings that are searched for in `info_2`
# for this iteration (roles containing "pianist").  The extraction loop removes
# a matched role from the text, so within a list the entries are generally
# ordered from longest to shortest: a specific phrase must be consumed before
# the shorter phrase it contains.
politics_govt_law = []

arts = [
    "music theorist and pianist specializing in Bach",
    "jazz pianist who played with Benny Goodman",
    "jazz pianist and record company executive",
    "Jewish pianist portrayed in the film",
    "concert pianist and record producer",
    "pianist and musical stage performer",
    "pianist and Grammy winning arranger",
    "festival promoter and jazz pianist",
    "free jazz pianist and vibraphonist",
    "jazz and R&B pianist and organist",
    "jazz pianist and radio presenter",
    "East Coast blues and R&B pianist",
    "harpsichordist and fortepianist",
    "jazz pianist and music arranger",
    "jazz pianist and accordionist",
    "cafe pianist and entertainer",
    "Hall of Fame country pianist",
    "jazz pianist and bandleader",
    "jazz bandleader and pianist",
    "pianist and opera director",
    "Moravian classical pianist",
    "blues pianist and organist",
    "pianist and harpsichordist",
    "jazz pianist and arranger",
    "jazz pianist and organist",
    "pianist for Frank Sinatra",
    "jazz pianist and vocalist",
    "pianist and vocal coach",
    "blues and swing pianist",
    "pianist and accompanist",
    "bandleader and pianist",
    "pianist and bandleader",
    "swing era jazz pianist",
    "blues and jazz pianist",
    "puppeteer and pianist",
    "post bop jazz pianist",
    "boogie woogie pianist",
    "popular music pianist",
    "pianist and organist",
    "classical pianist and",
    "pianist and arranger",
    "pianist and vocalist",
    "and concert pianist",
    "pianist and critic",
    "bebop jazz pianist",
    "free jazz pianist",
    "pianist and piano",
    "jazz funk pianist",
    "pianist and music",
    "classical pianist",
    "jazz pianist and",
    "concert pianist",
    "music  pianist",
    "tango pianist",
    "music pianist",
    "blues pianist",
    "rock pianist",
    "jazz pianist",
    "R&B pianist",
    "pianist and",
    "and pianist",
    "pianist",
]
sports = [
    "Olympic champion at the Summer Olympics",
]
sciences = []

business_farming = []
# `academia_humanities` precedes `arts` in known_for_dict, so this phrase is
# removed from the text before the arts roles above are searched
academia_humanities = [
    "director of music academies",
]
law_enf_military_operator = []
spiritual = []
social = []
crime = []
event_record_other = []
other_species = []

<IPython.core.display.Javascript object>

In [104]:
# Removing the entry whose link points to the page for the entry's husband,
# then renumbering the remaining rows from 0
index = df.index[
    df["link"].eq("https://en.wikipedia.org/wiki/Sylvia_Straus").to_numpy()
]
df.drop(index=index, inplace=True)
df.reset_index(drop=True, inplace=True)

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [105]:
# Mapping each known_for category column name to its list of role strings.
# Insertion order is preserved and is the order in which the extraction loop
# walks the categories.
known_for_dict = dict(
    social=social,
    spiritual=spiritual,
    academia_humanities=academia_humanities,
    business_farming=business_farming,
    sciences=sciences,
    politics_govt_law=politics_govt_law,
    law_enf_military_operator=law_enf_military_operator,
    crime=crime,
    event_record_other=event_record_other,
    other_species=other_species,
    arts=arts,
    sports=sports,
)

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [106]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['arts'] ==1].sample(2)

CPU times: total: 42 s
Wall time: 42.1 s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
10027,26,Hoyt Axton,", 61, American folk music singer-songwriter and actor, heart attack.",https://en.wikipedia.org/wiki/Hoyt_Axton,16,1999,October,,,,heart attack,,,,,,,,,61.0,,United States of America,,,2.833213,0,0,0,0,0,1,0,0,0,0,0,0,1
59495,8,Ali Baba,", 76, Pakistani writer.",https://en.wikipedia.org/wiki/Ali_Baba_(Sindhi_writer),5,2016,August,,,,,,,,,,,,,76.0,,Pakistan,,,1.791759,0,0,0,0,0,1,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [107]:
# Counting the rows whose num_categories is still 0 (no known_for category yet)
uncategorized_total = int(df["num_categories"].eq(0).sum())
print(f"There are {uncategorized_total} entries without any known_for category.")

There are 29417 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [108]:
# # Obtaining values for column and their counts
# roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [109]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [110]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_2"].notna()].index
#             if "tennis player" in df.loc[index, "info_2"]
#         ],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [111]:
# # Code to check each specific value
# specific_roles_list.pop()

<IPython.core.display.Javascript object>

In [112]:
# # Example code to quick-screen values that may overlap categories
# df.loc[[index for index in df.index if "and science writer" in df.loc[index, "info"]]]

<IPython.core.display.Javascript object>

In [113]:
# # Example code to quick-screen values that may overlap categories
# df.loc[
#     [
#         index
#         for index in df.index
#         if "outlaw country music singer songwriter" in df.loc[index, "info"]
#     ]
# ]

<IPython.core.display.Javascript object>

In [114]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "tennis player and administrator"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [115]:
# Creating lists for each category
# Each list holds the literal role strings that are searched for in `info_2`
# for this iteration (roles containing "tennis player").  The extraction loop
# removes a matched role from the text, so the entries are generally ordered
# from longest to shortest: a specific phrase must be consumed before the
# shorter phrase it contains.
politics_govt_law = []

arts = []
sports = [
    "wheelchair table tennis player and Paralympian gold medalist",
    "Paralympic wheelchair tennis player and powerlifter",
    "table tennis player and assistant racehorse trainer",
    "wheelchair tennis player and Paralympic athlete",
    "tennis player and sports administrator",
    "Male international table tennis player",
    "handball and tennis player and coach",
    "tennis player and administrator",
    "table tennis and tennis player",
    "table tennis player and coach",
    "tennis player and instructor",
    "tennis player in the s and s",
    "badminton and tennis player",
    "Olympic table tennis player",
    "Hall of Fame tennis player",
    "Female tennis player from",
    "hockey and tennis player",
    "wheelchair tennis player",
    "squash and tennis player",
    "tennis player and sports",
    "tennis player and coach",
    "chess and tennis player",
    "female tennis player",
    "table tennis player",
    "tennis player and",
    "and tennis player",
    "tennis player",
]
sciences = []

business_farming = []
academia_humanities = []
law_enf_military_operator = []
spiritual = []
social = []
crime = []
event_record_other = []
other_species = []

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [116]:
# Mapping each known_for category column name to its list of role strings.
# Insertion order is preserved and is the order in which the extraction loop
# walks the categories.
known_for_dict = dict(
    social=social,
    spiritual=spiritual,
    academia_humanities=academia_humanities,
    business_farming=business_farming,
    sciences=sciences,
    politics_govt_law=politics_govt_law,
    law_enf_military_operator=law_enf_military_operator,
    crime=crime,
    event_record_other=event_record_other,
    other_species=other_species,
    arts=arts,
    sports=sports,
)

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [117]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['sports'] ==1].sample(2)

CPU times: total: 15.8 s
Wall time: 15.8 s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
28163,19,Doc Blanchard,", 84, American college football player , pneumonia.",https://en.wikipedia.org/wiki/Doc_Blanchard,19,2009,April,"Army, Heisman Trophy winner",,,pneumonia,,,,,,,,,84.0,,United States of America,,"Army, Heisman Trophy winner 1945",2.995732,0,0,0,0,0,0,1,0,0,0,0,0,1
8867,7,Al Phillips,", 79, English featherweight/lightweight boxer.",https://en.wikipedia.org/wiki/Al_Phillips,10,1999,February,,,,,,,,,,,,,79.0,,United Kingdom of Great Britain and Northern Ireland,,,2.397895,0,0,0,0,0,0,1,0,0,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [118]:
# Counting the rows whose num_categories is still 0 (no known_for category yet)
uncategorized_total = int(df["num_categories"].eq(0).sum())
print(f"There are {uncategorized_total} entries without any known_for category.")

There are 29163 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [119]:
# # Obtaining values for column and their counts
# roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [120]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [121]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_2"].notna()].index
#             if "convicted murderer" in df.loc[index, "info_2"]
#         ],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [122]:
# # Code to check each specific value
# specific_roles_list.pop()

<IPython.core.display.Javascript object>

In [123]:
# # Example code to quick-screen values that may overlap categories
# df.loc[
#     [
#         index
#         for index in df.index
#         if "Marine veteran of the War" in df.loc[index, "info"]
#     ]
# ]

<IPython.core.display.Javascript object>

In [124]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "convicted murderer and botched execution survivor"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [125]:
# Creating lists for each category
# Each list holds the literal role strings that are searched for in `info_2`
# for this iteration (roles containing "convicted murderer").  The extraction
# loop removes a matched role from the text, so the longer phrases are listed
# before the shorter phrases they contain.
politics_govt_law = [
    "Corsican nationalist",
]

arts = []
sports = []
sciences = []

business_farming = []
academia_humanities = []
law_enf_military_operator = [
    "Marine veteran of the War",
]
spiritual = []
social = []
crime = [
    "and convicted murderer executed by hanging at dawn in Changi Prison for manipulating and hiring a minor who",
    "convicted murderer and suspected serial killer",
    "convicted murderer and prison escapee",
    "terrorist and convicted murderer",
    "gangster and convicted murderer",
    "convicted murderer and fugitive",
    "and convicted murderer",
    "convicted murderer and",
    "convicted murderer",
]
# `crime` precedes `event_record_other` in known_for_dict, so the crime roles
# above are removed from the text before this phrase is searched
event_record_other = [
    "botched execution survivor",
]
other_species = []

<IPython.core.display.Javascript object>

In [126]:
# Setting cause_of_death by hand for the entry whose cause is embedded in info_2
index = df.index[
    df["link"].eq("https://en.wikipedia.org/wiki/Anthony_Ler").to_numpy()
]
df.loc[index, "cause_of_death"] = "executed"

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [127]:
# Mapping each known_for category column name to its list of role strings.
# Insertion order is preserved and is the order in which the extraction loop
# walks the categories.
known_for_dict = dict(
    social=social,
    spiritual=spiritual,
    academia_humanities=academia_humanities,
    business_farming=business_farming,
    sciences=sciences,
    politics_govt_law=politics_govt_law,
    law_enf_military_operator=law_enf_military_operator,
    crime=crime,
    event_record_other=event_record_other,
    other_species=other_species,
    arts=arts,
    sports=sports,
)

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [128]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['crime'] ==1].sample(2)

CPU times: total: 7.8 s
Wall time: 7.81 s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
20518,3,Melvin White,", 55, American convicted murderer, executed in Texas.",https://en.wikipedia.org/wiki/Melvin_White_(murderer),3,2005,November,,,,executed in,,,,,,,,,55.0,,United States of America,,,1.386294,0,0,0,0,0,0,0,0,0,1,0,0,1
71446,30,Whitey Bulger,", 89, American gangster and convicted murderer, beaten.",https://en.wikipedia.org/wiki/Whitey_Bulger,135,2018,October,Winter Hill Gang,,,beaten,,,,,,,,,89.0,,United States of America,,Winter Hill Gang,4.912655,0,0,0,0,0,0,0,0,0,1,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [129]:
# Counting the rows whose num_categories is still 0 (no known_for category yet)
uncategorized_total = int(df["num_categories"].eq(0).sum())
print(f"There are {uncategorized_total} entries without any known_for category.")

There are 29000 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [130]:
# # Obtaining values for column and their counts
# roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [131]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [132]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_2"].notna()].index
#             if "general" in df.loc[index, "info_2"]
#         ],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [133]:
# # Code to check each specific value
# specific_roles_list.pop()

<IPython.core.display.Javascript object>

In [134]:
# # # Example code to quick-screen values that may overlap categories
# df.loc[[index for index in df.index if "director general of" in df.loc[index, "info"]]]

<IPython.core.display.Javascript object>

In [135]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "volleyball player and general"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [136]:
# Creating lists for each category
# Each list holds the literal role strings that are searched for in `info_2`
# for this iteration (roles containing "general").  The extraction loop removes
# a matched role from the text, so within a list the entries are generally
# ordered from longest to shortest.  Entries tagged "before
# law_enf_military_operator" are non-military uses of "general" that must be
# consumed before the catch-all "general" role in law_enf_military_operator.
politics_govt_law = [
    "general secretary of the National Union of",
    "involved in the Algiers putsch of",
    "senator and attorney general",
    "Governor general of the",
    "attorney general",  # before law_enf_military_operator
    "prefect and",
]

arts = [
    "founder and general manager of Montreux Jazz Festival",
    "general manager of the Metropolitan Opera",
    "director general of",  # before law_enf_military_operator
]
sports = [
    "executive and general manager of the National Basketball Association Sacramento Kings",
    "president and general manager of the Dallas Cowboys professional football team",  # before law_enf_military_operator
    "basketball coach and secretary general of the Basketball Association of the",
    "general manager of the Pittsburgh Pirates and Baltimore Orioles",
    "former Baseball general manager with the Baltimore Orioles",
    "former general manager of Montreal Canadiens",
    "general manager for the Galaxy soccer team",
    "baseball executive and general manager",
    "baseball Hall of Fame general manager",
    "ice hockey coach and general manager",
    "ice hockey general manager",
    "general secretary of FIFA",
    "baseball general manager",
]
sciences = [
    "general topologist",  # before law_enf_military_operator
    "general practitioner",
]

business_farming = []
academia_humanities = [
    "Teachers since",
]
# Military/law-enforcement roles; ends with the bare "general" catch-all, so
# this category is expected to be searched after all the others
law_enf_military_operator = [
    "Air Force four star general and commander of the World War II Tuskegee Airmen",
    "army lieutenant general and presidential military advisor",
    "lieutenant general in the Air Force and WW II flying ace",
    "Air Force general and commander of Tactical Air Command",
    "Air Force general and recipient of the Medal of Honor",
    "Air Force major general and double Hero of the Union",
    "general and commander of the nd Airborne Division",
    "bomber and Wehrmacht general during World War II",
    "lieutenant general in the People Liberation Army",
    "Air Force general and World War II flying ace",
    "army general Adjutant General to the Forces",
    "Army Air Forces general during World War II",
    "flying ace and general during World War II",
    "general in the National Revolutionary Army",
    "Air Force brigadier general and test pilot",
    "flying ace during World War II and general",
    "nazi Wehrmacht general during World War II",
    "brigadier general and RAF radio operator",
    "Air Force officer and brigadier general",
    "former head of general security in Gaza",
    "lieutenant general in the Marine Corps",
    "general and commander in the Viet Cong",
    "general and commander of the air force",
    "general in the People Liberation Army",
    "brigadier general in the Marine Corps",
    "Waffen SS general during World War II",
    "Army general and son of George Patton",
    "Air Force general and aviator pioneer",
    "Wehrmacht general during World War II",
    "army general and intelligence chief",
    "army general Commander of the Army",
    "brigadier general in the Air Force",
    "general and military junta leader",
    "major general in the Marine Corps",
    "general and commander of the Army",
    "Army general and Defense Minister",
    "general for South during the War",
    "Air National Guard major general",
    "Marine Corps lieutenant general",
    "People Liberation Army general",
    "major general in the Air Force",
    "SS general during World War II",
    "resistance fighter and general",
    "Republican Guard major general",
    "Marine Corps four star general",
    "Marine Corps brigadier general",
    "general in the People Army of",
    "Nationalist military general",
    "general during the dirty war",
    "air force lieutenant general",
    "Air Force lieutenant general",
    "Army general during the War",
    "military lieutenant general",
    "Air Force four star general",
    "general during World War II",
    "major general and astronaut",
    "air force brigadier general",
    "Marine Corps major general",
    "Illinois inspector general",
    "lieutenant general and spy",
    "general and navy commander",
    "pilot and airforce general",
    "general in the y Wehrmacht",
    "and a general of the Army",
    "Air Force general officer",
    "marines brigadier general",
    "army general and military",
    "major general in the Army",
    "general in the Air Force",
    "Air Force major general",
    "army lieutenant general",
    "Army lieutenant general",
    "USMC brigadier general",
    "army brigadier general",
    "Army brigadier general",
    "lieutenant general and",
    "airforce major general",
    "and a brigade general",
    "the Air Force general",
    "army colonel general",
    "general and military",
    "retired army general",
    "Marine Corps general",
    "army major general",
    "Army major general",
    "Bundeswehr general",
    "lieutenant general",
    "air force general",
    "Air Force general",
    "brigadier general",
    "major general and",
    "inspector general",
    # NOTE(review): no leading "l" -- presumably meant to catch a capitalised
    # "Lieutenant general"; the leading "L" would then remain in info_2 -- confirm
    "ieutenant general",
    "military general",
    "Army general and",
    "army general and",
    "general officer",
    "Colonel general",
    "colonel general",
    "police general",
    "general in the",
    "major general",
    "South general",
    "Stasi general",
    "ARVN general",
    "Serb general",
    "USMC general",
    "Army general",
    "army general",
    "general and",
    "and general",
    "general",
]
spiritual = [
    "Church of the Nazarene minister and general superintendent",  # before law_enf_military_operator
]
social = []
crime = []
event_record_other = []
other_species = []

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [137]:
# Mapping each known_for category column name to its list of role strings.
# Insertion order is preserved and is the order in which the extraction loop
# walks the categories; law_enf_military_operator is deliberately placed last
# for this iteration so that its roles are searched after every other category.
known_for_dict = dict(
    social=social,
    spiritual=spiritual,
    academia_humanities=academia_humanities,
    business_farming=business_farming,
    sciences=sciences,
    politics_govt_law=politics_govt_law,
    crime=crime,
    event_record_other=event_record_other,
    other_species=other_species,
    arts=arts,
    sports=sports,
    law_enf_military_operator=law_enf_military_operator,
)

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [138]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['law_enf_military_operator'] ==1].sample(2)

CPU times: total: 1min 18s
Wall time: 1min 18s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
28728,4,Béla Király,", 97, Hungarian general and historian, natural causes.",https://en.wikipedia.org/wiki/B%C3%A9la_Kir%C3%A1ly,15,2009,July,,,,natural causes,,,,,,,,,97.0,,Hungary,,,2.772589,0,0,0,1,0,0,0,1,0,0,0,0,2
3626,28,San Yu,", 77, Burmese general, politician, and president of Myanmar.",https://en.wikipedia.org/wiki/San_Yu,9,1996,January,,,,politician,and president of,,,,,,,,77.0,,Burma,,,2.302585,0,0,0,0,0,0,0,1,0,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [139]:
# Counting the rows whose num_categories is still 0 (no known_for category yet)
uncategorized_total = int(df["num_categories"].eq(0).sum())
print(f"There are {uncategorized_total} entries without any known_for category.")

There are 28469 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- It is time to export our dataframe and start a new notebook.

### Exporting Dataset to SQLite Database [wp_life_expect_clean9.db]()

In [140]:
# Exporting dataframe

# Saving dataset in a SQLite database
# if_exists="replace" lets the cell be re-run (e.g. Restart & Run All) without
# raising ValueError because the table already exists from a previous run;
# the connection is closed afterwards so the database file is not left open.
conn = sql.connect("wp_life_expect_clean9.db")
try:
    df.to_sql("wp_life_expect_clean9", conn, index=False, if_exists="replace")
finally:
    conn.close()

# Chime notification when cell executes
chime.success()

<IPython.core.display.Javascript object>

# [Proceed to Data Cleaning Part 10]()