# Wikipedia Notable Life Expectancies
# [Notebook  9: Data Cleaning Part 8](https://github.com/teresahanak/wikipedia-life-expectancy/blob/main/wp_life_expect_data_clean8_thanak_2022_07_26.ipynb)
### Context

The
### Objective

The
### Data Dictionary
- Feature: Description

### Importing Libraries

In [1]:
# To structure code automatically
%load_ext nb_black

# To import/export sqlite databases
import sqlite3 as sql

# To save/open python objects in pickle file
import pickle

# To help with reading, cleaning, and manipulating data
import pandas as pd
import numpy as np
import re

# To define maximum number of columns to be displayed in a dataframe
pd.set_option("display.max_columns", None)
# To define the maximum number of rows to be displayed in a dataframe
pd.set_option("display.max_rows", 200)

# To supress warnings
# import warnings

# warnings.filterwarnings("ignore")

# To set some visualization attributes
pd.set_option("max_colwidth", 150)

# To play auditory cue when cell has executed, has warning, or has error and set chime theme
import chime

chime.theme("zelda")

<IPython.core.display.Javascript object>

## Data Overview

### Reading, Sampling, and Checking Data Shape

In [2]:
# Reading the dataset
conn = sql.connect("wp_life_expect_clean7.db")
data = pd.read_sql("SELECT * FROM wp_life_expect_clean7", conn)

# Making a working copy
df = data.copy()

# Checking the shape
print(f"There are {df.shape[0]} rows and {df.shape[1]} columns.")

# Checking first 2 rows of the data
df.head(2)

There are 98059 rows and 38 columns.


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
0,1,William Chappell,", 86, British dancer, ballet designer and director.",https://en.wikipedia.org/wiki/William_Chappell_(dancer),21,1994,January,,,dancer,ballet designer and director,,,,,,,,,86.0,,United Kingdom of Great Britain and Northern Ireland,,,3.091042,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,Raymond Crotty,", 68, Irish economist, writer, and academic.",https://en.wikipedia.org/wiki/Raymond_Crotty,12,1994,January,,,,writer,and academic,,,,,,,,68.0,,Ireland,,,2.564949,0,0,0,0,0,0,0,0,1,0,0,0,1


<IPython.core.display.Javascript object>

In [3]:
# Checking last 2 rows of the data
df.tail(2)

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
98057,9,Aamir Liaquat Hussain,", 50, Pakistani journalist and politician, MNA .",https://en.wikipedia.org/wiki/Aamir_Liaquat_Hussain,99,2022,June,", since",,,MNA,,,,,,,,,50.0,,Pakistan,,"2002 2007, since 2018",4.60517,0,0,0,0,0,1,0,0,1,0,0,0,2
98058,9,Zou Jing,", 86, Chinese engineer, member of the Chinese Academy of Engineering.",https://en.wikipedia.org/wiki/Zou_Jing_(engineer),3,2022,June,,,engineer,member of the Academy of Engineering,,,,,,,,,86.0,,"China, People's Republic of",,,1.386294,0,0,0,0,0,0,0,0,0,0,0,0,0


<IPython.core.display.Javascript object>

In [4]:
# Checking a sample of the data
df.sample(5)

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
67232,23,Robert Dowdell,", 85, American actor .",https://en.wikipedia.org/wiki/Robert_Dowdell,4,2018,January,",",,,,,,,,,,,,85.0,,United States of America,,",",1.609438,0,0,0,0,0,1,0,0,0,0,0,0,1
26515,24,Vice Vukov,", 72, Croatian singer and politician.",https://en.wikipedia.org/wiki/Vice_Vukov,6,2008,September,,,,,,,,,,,,,72.0,,Croatia,,,1.94591,0,0,0,0,0,1,0,0,1,0,0,0,2
54356,5,George Cole,", 90, English actor .",https://en.wikipedia.org/wiki/George_Cole_(actor),13,2015,August,", ,",,,,,,,,,,,,90.0,,United Kingdom of Great Britain and Northern Ireland,,", ,",2.639057,0,0,0,0,0,1,0,0,0,0,0,0,1
27535,28,Gyula Pálóczi,", 46, Hungarian athlete, heart disease.",https://en.wikipedia.org/wiki/Gyula_P%C3%A1l%C3%B3czi,11,2009,January,,,athlete,heart disease,,,,,,,,,46.0,,Hungary,,,2.484907,0,0,0,0,0,0,0,0,0,0,0,0,0
68481,8,Tate Adams,", 96, Northern Irish-born Australian printmaker.",https://en.wikipedia.org/wiki/Tate_Adams,8,2018,April,,,printmaker,,,,,,,,,,96.0,,United Kingdom of Great Britain and Northern Ireland,Australia,,2.197225,0,0,0,0,0,0,0,0,0,0,0,0,0


<IPython.core.display.Javascript object>

### Checking Data Types, Duplicates, and Null Values

In [5]:
# Checking data types and null values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98059 entries, 0 to 98058
Data columns (total 38 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   day                        98059 non-null  object 
 1   name                       98059 non-null  object 
 2   info                       98059 non-null  object 
 3   link                       98059 non-null  object 
 4   num_references             98059 non-null  int64  
 5   year                       98059 non-null  int64  
 6   month                      98059 non-null  object 
 7   info_parenth               36661 non-null  object 
 8   info_1                     22 non-null     object 
 9   info_2                     98027 non-null  object 
 10  info_3                     48895 non-null  object 
 11  info_4                     10264 non-null  object 
 12  info_5                     1265 non-null   object 
 13  info_6                     181 non-null    obj

<IPython.core.display.Javascript object>

#### Observations:
- With our dataset loaded, we can pick up where we left off with extracting known_for values by rebuilding `known_for_dict`.

### Extracting `known_for` Continued

#### Finding `known_for` Roles in `info_2`

In [6]:
# Obtaining values for column and their counts
roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [7]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [8]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [index for index in df.index if "educator" in df.loc[index, "info"]], "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [9]:
# # Code to check each specific value
# specific_roles_list.pop()

<IPython.core.display.Javascript object>

In [10]:
# # Example code to quick-screen values that may overlap categories
# df.loc[[index for index in df.index if "sex educator" in df.loc[index, "info"]]]

<IPython.core.display.Javascript object>

In [11]:
# # Example code to quick-screen values that may overlap categories
# df.loc[
#     [
#         index
#         for index in df.index
#         if "outlaw country music singer songwriter" in df.loc[index, "info"]
#     ]
# ]

<IPython.core.display.Javascript object>

In [12]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "specialist in studies and educator"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [13]:
# Creating lists for each category
politics_govt_law = [
    "struggle veteran",
]

arts = []
sports = []
sciences = [
    "and wonen health",
]

business_farming = []
academia_humanities = [
    "educator and scholar",
    "educator and Māori language proponent",
    "and early childhood educator",
    "educator and professor",
    "scholar and educator",
    "educator and public schools superintendent",
    "AIDS educator and",
    "AIDS educator",
    "educator and director of the Advanced Placement Program",
    "educator and librarian",
    "adult educator",
    "university educator",
    "linguist and educator",
    "librarian and educator",
    "specialist in studies and educator",
    "Native educator",
    "educator and debate coach",
    "language linguist and educator",
    "Sanskrit scholar and educator",
    "educator and musicologist",
    "educator and anthropologist",
    "teacher and educator",
    "educator and college administrator",
    "Inuit educator",
    "educator and university administrator",
    "translator and educator",
    "and educator",
    "educator and",
    "educator",
]
law_enf_military_operator = []
spiritual = []
social = []
crime = []
event_record_other = []
other_species = []

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [14]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [15]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['academia_humanities'] ==1].sample(2)

CPU times: total: 16 s
Wall time: 16 s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
91899,8,Lila R. Gleitman,", 91, American academic .",https://en.wikipedia.org/wiki/Lila_R._Gleitman,16,2021,August,University of,,,,,,,,,,,,91.0,,United States of America,,University of Pennsylvania,2.833213,0,0,0,1,0,0,0,0,0,0,0,0,1
43854,28,F.D. Reeve,", 84, American academic and author.",https://en.wikipedia.org/wiki/F.D._Reeve,5,2013,June,,,,,,,,,,,,,84.0,,United States of America,,,1.791759,0,0,0,1,0,1,0,0,0,0,0,0,2


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [16]:
# Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

There are 37846 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [17]:
# Obtaining values for column and their counts
roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [18]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [19]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [index for index in df.index if "chemist" in df.loc[index, "info"]], "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [20]:
# # Code to check each specific value
# specific_roles_list.pop()

<IPython.core.display.Javascript object>

In [21]:
# # Example code to quick-screen values that may overlap categories
# df.loc[[index for index in df.index if "houngan" in df.loc[index, "info"]]]

<IPython.core.display.Javascript object>

In [22]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "chemist and astronaut candidate"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [23]:
# Creating lists for each category
politics_govt_law = []

arts = []
sports = []
sciences = [
    "theoretical chemist and Nobel Prize winner",
    "biochemist and molecular genetics pioneer",
    "native chemist",
    "biochemist and reproductive endocrinologist",
    "chemist and a winner of the Nobel Prize in Physics in",
    "chemist and Nobel Prize Laureate",
    "Nobel Prize winner in chemistry",
    "chemist and biologist",
    "biochemist and microbiologist",
    "nutritionist and biochemist",
    "neuroscience biochemist",
    "quantum chemist",
    "research chemist",
    "theoretical chemist and computer scientist",
    "research chemist and inventor",
    "chemist known for his work on the Manhattan Project",
    "biochemist and enzymologist",
    "scientist in the field of electrochemistry",
    "physical chemist at AT&T Bell Laboratories",
    "biochemist and Nobel Prize for Chemistry laureate",
    "industrial chemist best known for his work on polymers",
    "neurochemist and glycobiologist",
    "biochemist and virologist",
    "biochemist and recipient of the Nobel Prize in Physiology or Medicine",
    "physical and theoretical chemist",
    "chemist and mineralogist",
    "biochemist and cancer researcher",
    "biochemist and pharmacologist and recipient of the Nobel Prize in Physiology or Medicine",
    "nuclear chemist and recipient of the Nobel Prize in Chemistry",
    "chemist and co winner of Nobel Prize in Chemistry in",
    "botanist and chemist",
    "biochemist and protein crystallographer",
    "South wine chemist",
    "chemist and nuclear scientist",
    "oncologist and chemist",
    "chemist and science",
    "chemist and mass spectrometrist",
    "marine and freshwater chemist",
    "biochemist and molecular biologist",
    "protein chemist",
    "chemistry doctor",
    "Southern biochemist",
    "biochemist and developmental biologist",
    "biochemist and pharmacologist",
    "chemist and materials scientist",
    "biological chemist",
    "synthetic organic chemist",
    "atmospheric chemist",
    "chemist and researcher",
    "environmental chemist",
    "geochemist and planetary scientist",
    "physiologist and biochemist",
    "chemist and crystallographer",
    "scientist and agrochemist",
    "biochemist and physiologist",
    "biochemist and medical researcher",
    "textile chemist",
    "geologist and geochemist",
    "Congress chemist",
    "polymer chemist and inventor",
    "polymer chemist",
    "metallurgist and physical chemist",
    "chemist and astronaut candidate",
    "chemist and statistician",
    "cytologist and biochemist",
    "pharmacologist and biochemist",
    "natural product chemist",
    "chemist and Nobel laureate",
    "biochemist and Nobel Prize laureate",
    "biochemist and geneticist",
    "soil chemist and",
    "soil chemist",
    "biochemist and nutritionist",
    "pharmaceutical chemist",
    "Nobel Prize winning biochemist",
    "pharmacologist and chemist",
    "chemist and Nobel Prize laureate",
    "chemist and inventor",
    "biophysical chemist",
    "analytical chemist",
    "nuclear chemist",
    "electrochemist",
    "theoretical chemist",
    "inorganic chemist",
    "Nobel Prize winning chemist",
    "geochemist and",
    "geochemist",
    "physical chemist",
    "of organic chemistry and a",
    "of organic chemistry",
    "organic chemist and",
    "organic chemist",
    "of biochemistry",
    "and biochemist",
    "biochemist and",
    "biochemist",
    "of chemistry",
    "chemistry",
    "food chemist and",
    "chemist and",
    "and chemist",
    "chemist",
]

business_farming = []
academia_humanities = [
    "the first Rector Magnificus of the University of Twente",
]
law_enf_military_operator = []
spiritual = [
    "houngan",
]
social = []
crime = [
    "underground LSD chemist and",  # before sciences
]
event_record_other = []
other_species = []

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [24]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "sciences": sciences,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [25]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['sciences'] ==1].sample(2)

CPU times: total: 53.2 s
Wall time: 53.3 s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
91034,30,Arthur M. Poskanzer,", 90, American experimental physicist.",https://en.wikipedia.org/wiki/Arthur_M._Poskanzer,7,2021,June,,,,,,,,,,,,,90.0,,United States of America,,,2.079442,1,0,0,0,0,0,0,0,0,0,0,0,1
70772,12,Shen Chun-shan,", 86, Taiwanese physicist and academic, President of National Tsing Hua University , ruptured intestine.",https://en.wikipedia.org/wiki/Shen_Chun-shan,25,2018,September,,,,President of National Tsing Hua University,ruptured intestine,,,,,,,,86.0,,Taiwan,,1994 1997,3.258097,1,0,0,1,0,0,0,0,0,0,0,0,2


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [26]:
# Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

There are 37258 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [27]:
# Obtaining values for column and their counts
roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [28]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [29]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [index for index in df.index if "rugby union player" in df.loc[index, "info"]],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [30]:
# # Code to check each specific value
# specific_roles_list.pop()

<IPython.core.display.Javascript object>

In [31]:
# # Example code to quick-screen values that may overlap categories
# df.loc[
#     [
#         index
#         for index in df.index
#         if "rugby union player and administrator" in df.loc[index, "info"]
#     ]
# ]

<IPython.core.display.Javascript object>

In [32]:
# # Example code to quick-screen values that may overlap categories
# df.loc[
#     [
#         index
#         for index in df.index
#         if "outlaw country music singer songwriter" in df.loc[index, "info"]
#     ]
# ]

<IPython.core.display.Javascript object>

In [33]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "rugby union player and administrator"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [34]:
# Creating lists for each category
politics_govt_law = []

arts = []
sports = [
    "rugby union player and World War II fighter",
    "rugby union player and TV",
    "former rugby union player for and Neath RFC",
    "All Blacks rugby union player",
    "rugby league and rugby union player",
    "rugby union player and president of the Rugby Union",
    "rugby union player and national team captain",
    "boxer and rugby union player",
    "Hall of Fame rugby union player and coach",
    "rugby union player and selector",
    "rugby union player and manager",
    "rugby union player and administrator",
    "rugby union player and coach",
    "international rugby union player",
    "rugby union player for",
    "rugby union player and",
    "and rugby union player",
    "rugby union player",
]
sciences = []

business_farming = []
academia_humanities = []
law_enf_military_operator = []
spiritual = []
social = []
crime = []
event_record_other = []
other_species = []

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [35]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [36]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['sports'] ==1].sample(2)

CPU times: total: 9.64 s
Wall time: 9.65 s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
46444,20,Pete Titanic,", 93, Canadian football player .",https://en.wikipedia.org/wiki/Pete_Titanic,3,2014,January,Toronto Argonauts,,,,,,,,,,,,93.0,,Canada,,Toronto Argonauts,1.386294,0,0,0,0,0,0,1,0,0,0,0,0,1
72336,30,Joan Kaufman,", 83, American baseball player .",https://en.wikipedia.org/wiki/Joan_Kaufman,4,2018,December,All Girls Professional Baseball League,,,,,,,,,,,,83.0,,United States of America,,All American Girls Professional Baseball League,1.609438,0,0,0,0,0,0,1,0,0,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [37]:
# Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

There are 36986 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [38]:
# Obtaining values for column and their counts
roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [39]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [40]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [index for index in df.index if "rugby league player" in df.loc[index, "info"]],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [41]:
# # Code to check each specific value
# specific_roles_list.pop()

<IPython.core.display.Javascript object>

In [42]:
# # Example code to quick-screen values that may overlap categories
# df.loc[
#     [
#         index
#         for index in df.index
#         if "rugby league player and administrator" in df.loc[index, "info"]
#     ]
# ]

<IPython.core.display.Javascript object>

In [43]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "rugby league player involved in match fixing scandal"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [44]:
# Creating lists for each category
politics_govt_law = []

arts = []
sports = [
    "Hall of Fame rugby league player",
    "Papua New rugby league player",
    "rugby league player and captain",
    "rugby league player and international coach",
    "Hall of Fame rugby league player and national team captain",
    "football and rugby league player",
    "rugby league player for Wigan and Great",
    "rugby league player for Great and Hull KR",
    "rugby league player for Great",
    "rugby league player and referee",
    "rugby league player and administrator",
    "rugby union and rugby league player",
    "rugby league player and coach",
    "and rugby league player",
    "rugby league player and",
    "rugby league player",
]
sciences = []

business_farming = []
academia_humanities = []
law_enf_military_operator = []
spiritual = []
social = []
crime = ["involved in match fixing scandal"]
event_record_other = []
other_species = []

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [45]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [46]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['sports'] ==1].sample(2)

CPU times: total: 9.53 s
Wall time: 9.52 s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
36732,27,Mykola Koltsov,", 75, Russian-born Ukrainian footballer and youth trainer.",https://en.wikipedia.org/wiki/Mykola_Koltsov,3,2011,December,,,,,,,,,,,,,75.0,,Russia,Ukraine,,1.386294,0,0,0,0,0,0,1,0,0,0,0,0,1
9913,26,Malky MacDonald,", 85, Scottish football player and manager.",https://en.wikipedia.org/wiki/Malky_MacDonald,28,1999,September,,,,,,,,,,,,,85.0,,Scotland,,,3.367296,0,0,0,0,0,0,1,0,0,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [47]:
# Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

There are 36706 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [48]:
# Obtaining values for column and their counts
roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [49]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [50]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [index for index in df.index if "sociologist" in df.loc[index, "info"]],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [51]:
# # Code to check each specific value
# specific_roles_list.pop()

<IPython.core.display.Javascript object>

In [52]:
# # Example code to quick-screen values that may overlap categories
# df.loc[
#     [index for index in df.index if "historical sociologist" in df.loc[index, "info"]]
# ]

<IPython.core.display.Javascript object>

In [53]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "sociologist and critic of systems analysis"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [54]:
# Creating lists for each category
politics_govt_law = []

arts = []
sports = [
    "caver",
]
sciences = [
    "sociologist and communication scientist",
    "sociologist and social justice",
    "sociologist and social",
    "sociologist and psychiatrist",
    "sociologist and critic of systems analysis",
    "sociologist of science",
    "criminologist and sociologist",
    "gerontologist and sociologist",
    "urban sociologist",
    "sociologist and patients' rights advocate",
    "sociologist and statistician",
    "environmental sociologist",
    "sociologist and demographer",
    "sociologist and criminologist",
    "sociologist from",
    "sociologist and",
    "sociologist",
]

business_farming = []
academia_humanities = []
law_enf_military_operator = []
spiritual = []
social = []
crime = []
event_record_other = []
other_species = []

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [55]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [56]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['sciences'] ==1].sample(2)

CPU times: total: 10.7 s
Wall time: 10.7 s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
12290,14,Maurice Levitas,", 84, Irish-born British sociologist.",https://en.wikipedia.org/wiki/Maurice_Levitas,5,2001,February,,,,,,,,,,,,,84.0,,Ireland,United Kingdom of Great Britain and Northern Ireland,,1.791759,1,0,0,0,0,0,0,0,0,0,0,0,1
2493,25,Ernest Walton,", 91, Irish physicist.",https://en.wikipedia.org/wiki/Ernest_Walton,30,1995,June,,,,,,,,,,,,,91.0,,Ireland,,,3.433987,1,0,0,0,0,0,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [57]:
# Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

There are 36474 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [58]:
# Obtaining values for column and their counts
roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [59]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [60]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [index for index in df.index if "psychologist" in df.loc[index, "info"]],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [61]:
# # Code to check each specific value
# specific_roles_list.pop()

<IPython.core.display.Javascript object>

In [62]:
# # Example code to quick-screen values that may overlap categories
# df.loc[
#     [
#         index
#         for index in df.index
#         if "and pastoral psychologist" in df.loc[index, "info"]
#     ]
# ]

<IPython.core.display.Javascript object>

In [63]:
# # Example code to quick-screen values that may overlap categories
# df.loc[
#     [
#         index
#         for index in df.index
#         if "outlaw country music singer songwriter" in df.loc[index, "info"]
#     ]
# ]

<IPython.core.display.Javascript object>

In [64]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "educational psychologist"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [65]:
# Creating lists for each category
politics_govt_law = ["children ombudsman"]

arts = []
sports = []
sciences = [
    "clinical psychologist and researcher",
    "psychologist renowned for his critical studies of hypnosis",
    "psychologist and expert in the work of Alfred Adler",
    "psychologist and sex researcher at Johns Hopkins University",
    "psychologist and methodologist",
    "psychologist known for his pioneering work in autism treatment",
    "psychologist and women reproductive health",
    "traffic psychologist",
    "psychologist and researcher",
    "psychologist and former head of the Psychological Association",
    "correctional psychologist and criminologist",
    "behavioral geneticist and psychologist",
    "psychologist and psychoanalyst",
    "psychologist at University",
    "existential psychologist",
    "psychologist who specialized in developmental psychology",
    "psychologist and a pioneer in experimental psychology",
    "child development psychologist",
    "development psychologist",
    "psychologist and statistician",
    "pediatric psychologist",
    "psychologist from the Antilles",
    "social psychologist and sexologist",
    "psychologist and sleep researcher",
    "psychologist and computer science researcher",
    "social psychologist and criminologist",
    "parapsychologist and ufologist",
    "mathematical psychologist and",
    "mathematical psychologist",
    "engineering psychologist",
    "psychiatrist and parapsychologist",
    "psychologist and anti divorce",
    "evolutionary psychologist",
    "sexologist and psychologist",
    "systems psychologist",
    "environmental psychologist",
    "psychologist and self help",
    "and pastoral psychologist",
    "organizational psychologist",
    "neuroscientist and psychologist",
    "child psychologist and",
    "child psychologist",
    "psychologist and parapsychologist",
    "experimental psychologist",
    "psychologist and neuroscientist",
    "parapsychologist and",
    "and parapsychologist",
    "parapsychologist",
    "neuropsychologist",
    "cognitive psychologist",
    "developmental psychologist",
    "clinical psychologist",
    "social psychologist",
    "behavioral psychologist",
    "Jungian psychologist and",
    "and psychologist",
    "psychologist and",
    "psychologist",
]

business_farming = []
academia_humanities = []
law_enf_military_operator = []
spiritual = []
social = []
crime = []
event_record_other = []
other_species = []

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [66]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [67]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['sciences'] ==1].sample(2)

CPU times: total: 33.2 s
Wall time: 33.2 s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
83617,16,Nina McClelland,", 90, American chemist.",https://en.wikipedia.org/wiki/Nina_McClelland,26,2020,August,,,,,,,,,,,,,90.0,,United States of America,,,3.295837,1,0,0,0,0,0,0,0,0,0,0,0,1
65315,16,Sven Oluf Sørensen,", 96, Norwegian physicist.",https://en.wikipedia.org/wiki/Sven_Oluf_S%C3%B8rensen,3,2017,September,,,,,,,,,,,,,96.0,,Norway,,,1.386294,1,0,0,0,0,0,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [68]:
# Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

There are 36147 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [69]:
# Obtaining values for column and their counts
roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [70]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [71]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [index for index in df.index if "engineer" in df.loc[index, "info"]], "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [72]:
# # Code to check each specific value
# specific_roles_list.pop()

<IPython.core.display.Javascript object>

In [73]:
# # Example code to quick-screen values that may overlap categories
# df.loc[[index for index in df.index if "sound engineer" in df.loc[index, "info"]]]

<IPython.core.display.Javascript object>

In [74]:
# # Example code to quick-screen values that may overlap categories
# df.loc[
#     [
#         index
#         for index in df.index
#         if "outlaw country music singer songwriter" in df.loc[index, "info"]
#     ]
# ]

<IPython.core.display.Javascript object>

In [75]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "recording engineer"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [76]:
# Creating lists for each category
politics_govt_law = []

arts = [
    "Grammy Award winning sound engineer and music producer",
    "sound engineer and record producer",  # before sciences
    "car design engineer",
    "recording engineer and producer",
    "recording engineer and record producer",
    "Academy Award winning sound engineer",
    "sound engineer and founder of the BBC Radiophonic Workshop",
    "recording engineer and studio owner",
    "audio engineer and producer",
    "music producer and engineer",
    "record producer and audio engineer",
    "audio engineer and video game developer",
    "audio mastering engineer",
    "recording engineer and record label owner",
    "film recording engineer",
    "Grammy award winning music engineer",
    "record producer and recording engineer",
    "music engineer and producer",
    "and recording engineer",
    "film sound engineer",
    "audio engineer and record producer",
    "sound engineer",
    "recording engineer",
]
sports = []
sciences = [
    "electrical and radio engineer",
    "engineer and statistician",
    "electrical engineer and co founder",
    "videogame console engineer",
    "engineer and glider pilot",
    'engineer nicknamed the "Father of the Corvette "',
    "mechanical engineer and inventor",
    "integrated circuit engineer",
    "chemical engineer in unleaded gasoline",
    "computer hardware engineer",
    "electronics engineer for Sony",
    "diving engineer",
    "executive engineer",
    "electrical engineer and university",
    "electrical engineer and scientist",
    "and rocket engineer",
    "nuclear scientist and chemical engineer",
    "welding engineer",
    "scientist and aerospace engineer",
    "ceramic engineer",
    "chemical engineer and safety consultant",
    "audio engineer and electronics engineer",
    "chemical engineer and pharmaceutical executive",
    "mining engineer and geologist",
    "consulting engineer",
    "engineer and geotechnician",
    "engineering seismologist",
    "engineer in charge during the Chernobyl disaster",
    "pioneering computer engineer",
    "spacecraft engineer",
    "engineer and bullet train pioneer",
    "engineer and optician",
    "aircraft automotive engineer",
    "helicopter and aerospace engineer",
    "engineer and electronic computing pioneer",
    "rocket engineer who worked at the Jet Propulsion Laboratory",
    "aerospace engineer and a pioneer in helicopter design",
    "ice road engineer",
    "rail engineer",
    "engineer and aerospace executive",
    "engineer and co designer of NASA Apollo Lunar Module",
    "aerospace engineer and member of the",
    "communication satellites engineer",
    "aeronautics and astronautic engineer",
    "biomedical engineering pioneer",
    "traffic engineer and inventor of the mini roundabout",
    "chemical engineer for the Procter & Gamble company",
    "aeronautical engineer who invented the tandem rotor placement in helicopter design",
    "traffic engineer",
    "Toyota engineer",
    "scientist and electrical engineer",
    "inventor and chemical engineer",
    "engineer and cryptographer",
    "computer engineer and scientist",
    "aeronautical engineer and space scientist",
    "engineer and R&D executive",
    "civil engineer and industrial",
    "thermal engineer",
    "automotive executive and engineer",
    "geologist and earthquake engineer",
    "metallurgist and chemical engineer",
    "engineer and food scientist",
    "naval electronics engineer",
    "control systems engineer",
    "automotive engineer and executive",
    "ship engineer",
    "mechanical engineer and product",
    "explosives engineer and inventor",
    "aerospace engineer and fluid dynamicist",
    "nuclear material engineer",
    "plant breeding engineer",
    "marine engineer and executive",
    "industrial engineer and cycling",
    "video game engineer",
    "materials scientist and engineer",
    "engineering manager",
    "agronomist and engineer",
    "engineer and control theorist",
    "naval engineer and",
    "construction executive and civil engineer",
    "agricultural engineer and",
    "refrigeration engineer",
    "chemical engineer and scientist",
    "engineer and scientist",
    "ice drilling engineer",
    "ornithologist and engineer",
    "motorsport engineer",
    "computer engineer and NASA official",
    "mechanical engineer and anti nuclear power",
    "bridge structural engineer",
    "aircraft engineer",
    "soil mechanics engineer",
    "microwave electronics and communications engineer",
    "camera engineer",
    "engineer and rocket scientist",
    "NASA engineer and",
    "engineer and the second Director of Lockheed Skunk Works from to",
    "aeronautics engineer",
    "earthquake engineer",
    "computational engineer",
    "engineer and chief executive",
    "computer engineer and systems scientist",
    "engineer and inventor of the hovercraft",
    "railway signal engineer",
    "and railway engineer",
    "railway engineer",
    "optoelectronic engineer",
    "materials engineer",
    "auto engineer",
    "Hall of Fame computer engineer",
    "flight engineer",
    "airplane designer and engineer",
    "and race chassis engineer",
    "consulting nuclear engineer",
    "engineering construction executive",
    "engineering scientist",
    "systems engineer",
    "aeroelasticity engineer and aircraft designer",
    "engineer and technology executive",
    "biochemical engineer",
    "particle accelerator specialist and engineer",
    "hydrogeologist and environmental engineer",
    "port and harbor engineer",
    "rocket engineer",
    "aerospace engineer and NASA manager",
    "engineer and aircraft designer",
    "chemical engineer and inventor",
    "electrical engineer and inventor",
    "metallurgical engineer",
    "of aerospace engineering",
    "and aerospace engineer",
    "space engineer",
    "electronic engineer",
    "design engineer",
    "NASA engineer",
    "naval engineer",
    "petroleum engineer",
    "motorcycle engineer",
    "satellite engineer",
    "engineer and racing car designer",
    "biomedical engineer",
    "environmental engineer",
    "agricultural engineer",
    "irrigation engineer",
    "engineer and computer scientist",
    "industrial engineer",
    "bioengineer",
    "scientist and engineer",
    "acoustical engineer",
    "mining engineer and",
    "mining engineer",
    "aviation engineer",
    "hydraulic engineer",
    "software engineer",
    "audio engineer and inventor",
    "audio engineer and",
    "audio engineer",
    "telecommunications engineer",
    "electronics engineer and",
    "electronics engineer",
    "nuclear engineer",
    "computer engineer",
    "and automotive engineer",
    "automotive engineer",
    "structural engineer and",
    "structural engineer",
    "mechanical engineer and",
    "mechanical engineering",
    "mechanical engineer",
    "and aeronautical engineer",
    "aeronautical engineer",
    "aerospace engineer",
    "and chemical engineer",
    "chemical engineer and",
    "chemical engineer",
    "civil engineering",
    "and civil engineer",
    "civil engineer and",
    "civil engineer",
    "of electrical engineering",
    "electrical engineer and",
    "electrical engineer",
    "engineer and inventor",
    "and aircraft engineer",
    "engineer and space",
    "and test engineer",
    "and engineer for NASA",
    "transportation engineer and",
    "optical engineer",
    "inventor and engineer",
    "and cycling engineer",
    "broadcast engineer",
    "and aeroplane engineer",
    "Electrical engineering",
    "paper engineer and",
    "of engineering",
    "engineering",
    "and engineer",
    "engineer and",
    "engineer",
]

business_farming = ["co founder of Matchbox Toys"]
academia_humanities = []
law_enf_military_operator = ["Deputy Secretary of Defense"]
spiritual = []
social = []
crime = []
event_record_other = ["taken hostage in"]
other_species = []

<IPython.core.display.Javascript object>

In [77]:
# Hard-coding values to "sound engineer" which will correctly categorize entries in arts
index = df[
    df["link"] == "https://en.wikipedia.org/wiki/Bruce_Jackson_(audio_engineer)"
].index
df.loc[index, "info_2"] = "sound engineer"

index = df[df["link"] == "https://en.wikipedia.org/wiki/Seth_Firkins"].index
df.loc[index, "info_2"] = "sound engineer"

<IPython.core.display.Javascript object>

In [78]:
# Hard-coding values to "audio engineer" which will correctly categorize entries in sciences
index = df[df["link"] == "https://en.wikipedia.org/wiki/Alan_R._Pearlman"].index
df.loc[index, "info_2"] = "audio engineer"

index = df[df["link"] == "https://en.wikipedia.org/wiki/Bruno_Vanryb"].index
df.loc[index, "info_2"] = "audio engineer"

index = df[df["link"] == "https://en.wikipedia.org/wiki/Dave_Smith_(engineer)"].index
df.loc[index, "info_2"] = "audio engineer"

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [79]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "arts": arts,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "sports": sports,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [80]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['sciences'] ==1].sample(2)

CPU times: total: 1min 56s
Wall time: 1min 56s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
78494,2,Edward Spiegel,", 88, American physicist.",https://en.wikipedia.org/wiki/Edward_Spiegel,3,2020,January,,,,,,,,,,,,,88.0,,United States of America,,,1.386294,1,0,0,0,0,0,0,0,0,0,0,0,1
42699,1,Karen Muir,", 60, South African swimmer and physician, breast cancer.",https://en.wikipedia.org/wiki/Karen_Muir,10,2013,April,youngest sporting world record holder,,swimmer,breast cancer,,,,,,,,,60.0,,South Africa,,youngest sporting world record holder,2.397895,1,0,0,0,0,0,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [81]:
# Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

There are 35348 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [83]:
# Obtaining values for column and their counts
roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [444]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [443]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [index for index in df.index if "philanthropist" in df.loc[index, "info"]],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [442]:
# # Code to check each specific value
# specific_roles_list.pop()

<IPython.core.display.Javascript object>

In [445]:
# # Example code to quick-screen values that may overlap categories
# df.loc[
#     [
#         index
#         for index in df.index
#         if "heiress and philanthropist" in df.loc[index, "info"]
#     ]
# ]

<IPython.core.display.Javascript object>

In [None]:
# # Example code to quick-screen values that may overlap categories
# df.loc[
#     [
#         index
#         for index in df.index
#         if "outlaw country music singer songwriter" in df.loc[index, "info"]
#     ]
# ]

In [446]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "philanthropist and matriarch of the Bronfman family"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [447]:
# Creating lists for each category
politics_govt_law = []

arts = []
sports = []
sciences = ["otorhinolaryngologist"]

business_farming = []
academia_humanities = [
    "shoe museum curator",
]
law_enf_military_operator = [
    "ATS volunteer",
]
spiritual = []
social = [
    "and philanthropist in West New Province",
    "Building Society founder and philanthropist",
    "philanthropist and widow of Tektronix founder Howard Vollum",
    "philanthropist and member of the Rothschild family",
    "philanthropist and matriarch of the Bronfman family",
    "social change philanthropist",
    'philanthropist known in Kansas City as "Secret Santa"',
    "philanthropist and wife of Charles Bronfman",
    "Ford Motor Company heiress and prominent philanthropist",
    "philanthropist and heiress to Mellon family fortune"
    "philanthropist and wife of industrialist Charles W Engelhard Jr",
    "philanthropist; widow of McDonald founder Ray Kroc",
    "philanthropist and socialite",
    "welfare worker and philanthropist",
    "riveter and philanthropist",
    "philanthropist and child welfare advocate",
    "philanthropist and social worker",
    "social worker and philanthropist",
    "community advocate and philanthropist",
    "heiress and philanthropist",
    "socialite and philanthropist",
    "and philanthropist",
    "philanthropist and",
    "philanthropist",
]
crime = []
event_record_other = []
other_species = []

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [448]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [449]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['social'] ==1].sample(2)

CPU times: total: 13.5 s
Wall time: 13.6 s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
35156,11,Andy Barker,", 87, American philanthropist.",https://en.wikipedia.org/wiki/Andy_Barker_(philanthropist),6,2011,July,,,,,,,,,,,,,87.0,,United States of America,,,1.94591,0,1,0,0,0,0,0,0,0,0,0,0,1
94822,24,Shirley Bottolfsen,", 87, Irish philanthropist.",https://en.wikipedia.org/wiki/Shirley_Bottolfsen,16,2021,December,,,,,,,,,,,,,87.0,,Ireland,,,2.833213,0,1,0,0,0,0,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [450]:
# Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

There are 35058 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [554]:
# # Obtaining values for column and their counts
# roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [552]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [551]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[[index for index in df.index if "boxer" in df.loc[index, "info"]], "info_2",]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [550]:
# # Code to check each specific value
# specific_roles_list.pop()

<IPython.core.display.Javascript object>

In [553]:
# # Example code to quick-screen values that may overlap categories
# df.loc[
#     [
#         index
#         for index in df.index
#         if "bodyguard for boxer Kostya Tszyu" in df.loc[index, "info"]
#     ]
# ]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [547]:
# Creating lists for each category
politics_govt_law = []

arts = [
    "beatboxer",  # before sports
]
sports = [
    "undefeated former WBA super featherweight and WBC lightweight champion boxer",
    "former world welterweight and super welterweight champion boxer",
    "Olympic boxer and former world light heavyweight champion",
    "boxer and WBA featherweight champion from March through May",
    "former WBC and WBA World lightweight champion boxer",
    "former WBC world light welterweight champion boxer",
    "boxer who won the Empire super featherweight title",
    "WBO flyweight and light flyweight champion boxer",
    "former world super featherweight champion boxer",
    "born former Empire heavyweight champion boxer",
    "former NBA world bantamweight champion boxer",
    "former WBC light welterweight champion boxer",
    "former world light welterweight champion boxer",
    "WBA and WBC super lightweight champion boxer",
    "boxer and World Light Heavyweight Champion",
    "champion professional boxer in the s and s",
    "professional and Olympic lightweight boxer",
    "world light middleweight champion boxer",
    "Olympic silver medalist lightweight boxer",
    "former NABF lightweight champion boxer",
    "former WBC heavyweight champion boxer",
    "former WBA heavyweight champion boxer",
    "NABF super lightweight champion boxer",
    "Hall of Fame light heavyweight boxer",
    "former triple world champion boxer",
    "former heavyweight boxing champion",
    "Olympic silver medal winning boxer",
    "Olympic bronze medal winning boxer",
    "former lightweight champion boxer",
    "Olympic gold medal winning boxer",
    "Hall of Fame featherweight boxer",
    "featherweight lightweight boxer",
    "Olympic light heavyweight boxer",
    "boxer and heavyweight champion",
    "fly bantam featherweight boxer",
    "Olympic silver medalist boxer",
    "WBA flyweight champion boxer",
    "bantamweight champion boxer",
    "heavyweight champion boxer",
    "Olympic heavyweight boxer",
    "boxer and Olympic wrestler",
    "Olympic bantamweight boxer",
    "Olympic middleweight boxer",
    "light heavyweight boxer and",
    "dual world champion boxer",
    "light middleweight boxer",
    "boxer and boxing trainer",
    "light welterweight boxer",
    "light heavyweight boxer",
    "Olympic flyweight boxer",
    "Hall of Fame boxer and",
    "boxer and rugby player",
    "Olympic medalist boxer",
    "and boxer and Olympian",
    "heavyweight kickboxer",
    "world champion boxer",
    "featherweight boxer",
    "Virgin Island boxer",
    "Golden Gloves boxer",
    "cruiserweight boxer",
    "boxer and kickboxer",
    "professional boxer",
    "boxer and Olympian",
    "middleweight boxer",
    "bantamweight boxer",
    "welterweight boxer",
    "Hall of Fame boxer",
    "heavyweight boxer",
    "boxer and trainer",
    "lightweight boxer",
    "boxer and manager",
    "unlicensed boxer",
    "Olympic boxer []",
    "flyweight boxer",
    "Muay kickboxer",
    "Olympic boxer",
    "and kickboxer",
    "kickboxer and",
    "amateur boxer",
    "Serb boxer",
    "kickboxer",
    "champion boxer",
    "era boxer",
    "boxer and",
    "and boxer",
    "boxer",
]
sciences = []

business_farming = []
academia_humanities = []
law_enf_military_operator = [
    "bodyguard for boxer Kostya Tszyu",  # before sports
]
spiritual = []
social = []
crime = []
event_record_other = []
other_species = []

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [548]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [555]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['sports'] ==1].sample(2)

CPU times: total: 46.4 s
Wall time: 46.4 s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
40229,11,Seamus Bonner,", 63, Irish Gaelic footballer , short illness.",https://en.wikipedia.org/wiki/Seamus_Bonner,8,2012,October,Donegal,,,short illness,,,,,,,,,63.0,,Ireland,Europe,Donegal,2.197225,0,0,0,0,0,0,1,0,0,0,0,0,1
44800,12,Frank Tripucka,", 85, American football player , heart failure.",https://en.wikipedia.org/wiki/Frank_Tripucka,15,2013,September,"Denver Broncos, Detroit Lions",,,heart failure,,,,,,,,,85.0,,United States of America,,"Denver Broncos, Detroit Lions",2.772589,0,0,0,0,0,0,1,0,0,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [556]:
# Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

There are 34663 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [813]:
# # Obtaining values for column and their counts
# roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [812]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [811]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [index for index in df.index if "scientist" in df.loc[index, "info"]], "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [810]:
# # Code to check each specific value
# specific_roles_list.pop()

<IPython.core.display.Javascript object>

In [809]:
# # Example code to quick-screen values that may overlap categories
# df.loc[
#     [
#         index
#         for index in df.index
#         if "scientist and administrator" in df.loc[index, "info"]
#     ]
# ]

<IPython.core.display.Javascript object>

In [808]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "farmer scientist"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [806]:
# Creating lists for each category
politics_govt_law = [
    "political scientist and nonviolence advocate",
    "political scientist and watcher",
    "political scientist and legal",
    "political scientist and",
    "and political scientist",
    "political scientist",  # before sciences
    "Ostforschung",
]

arts = []
sports = []
sciences = [
    "scientist whose pioneering measurements showed a carbon dioxide buildup in the earth atmosphere",
    "Manhattan Project scientist and former director of Oak Ridge National Laboratory",
    "computer scientist and chief designer of the Ada programming language",
    "scientist and recipient of the Nobel Prize in Physiology or Medicine",
    "computer scientist who led the IBM team that developed Fortran",
    "cognitive scientist and pioneer in artificial intelligence",
    "pioneering scientist in the field of human consciousness",
    "scientist whose work helped develop prion theory",
    "nuclear scientist and father of nuclear program",
    "research scientist and influenza vaccine expert",
    "scientist credited with inventing the wetsuit",
    "marine biologist and environmental scientist",
    "scientist and X ray crystallography pioneer",
    "food technologist and nutritional scientist",
    "scientist and organ transplantation pioneer",
    'and social scientist who coined the term ""',
    "computer and information research scientist",
    "computer scientist and Turing Award winner",
    "scientist who co founded the Club of Rome",
    "neuroscientist and evolutionary biologist",
    "scientist and former Bell Labs president",
    "marine biologist and fisheries scientist",
    "agricultural scientist and plant breeder",
    "atmospheric scientist and meteorologist",
    "computer scientist and internet pioneer",
    "scientist and Down syndrome researcher",
    "ichthyologist and fisheries scientist",
    "computer scientist at Rice University",
    "psychological and cognitive scientist",
    "paleontologist and forensic scientist",
    "neuroscientist and autism researcher",
    "agricultural scientist and ecologist",
    "computer scientist and microcomputer",
    "ecologist and conservation scientist",
    "biologist and agricultural scientist",
    "scientist and co founder of the JPL",
    "computer scientist and statistician",
    "rocket scientist and NASA executive",
    "statistician and computer scientist",
    "scientist and Nobel Prize laureate",
    "food scientist and microbiologist",
    "scientist and diabetes researcher",
    "scientist and computer programmer",
    "computer scientist and astronomer",
    "geologist and planetary scientist",
    "atmospheric and climate scientist",
    "computer scientist and programmer",
    "Hall of Fame computer scientist",
    "scientist and earthquake expert",
    "inventor and forensic scientist",
    "neuroscientist and physiologist",
    "psychiatrist and neuroscientist",
    "computer scientist and inventor",
    "computer scientist and pioneer",
    "environmental health scientist",
    "interdisciplanarily scientist",
    "scientist and endocrinologist",
    "doctor and medical scientist",
    "computer scientist from UCSD",
    "computational neuroscientist",
    "geologist and soil scientist",
    "and climate change scientist",
    "scientist and ISRO chairman",
    "scientist and administrator",
    "medical research scientist",
    "missile control scientist",
    "earth scientist and polar",
    "cancer research scientist",
    "behavioral neuroscientist",
    "pulp and paper scientist",
    "cognitive neuroscientist",
    "pharmaceutical scientist",
    "horticultural scientist",
    "communication scientist",
    "public health scientist",
    "environmental scientist",
    "scientist and inventor",
    "conservation scientist",
    "high voltage scientist",
    "research scientist and",
    "Tatar rocket scientist",
    "computer scientist and",
    "and computer scientist",
    "IBM computer scientist",
    "biomaterials scientist",
    "and forensic scientist",
    "inventor and scientist",
    "aeronautical scientist",
    "agricultural scientist",
    "surgeon and scientist",
    "and nuclear scientist",
    "atmospheric scientist",
    "information scientist",
    "social scientist and",
    "and social scientist",
    "doctor and scientist",
    "biomedical scientist",
    "and rocket scientist",
    "veterinary scientist",
    "nutrition scientist",
    "stem cell scientist",
    "fuel cell scientist",
    "polar scientist and",
    "plant scientist and",
    "fisheries scientist",
    "molecular scientist",
    "cognitive scientist",
    "planetary scientist",
    "materials scientist",
    "computer scientist",
    "wildlife scientist",
    "neuroscientist and",
    "research scientist",
    "and neuroscientist",
    "medical  scientist",
    "cultural scientist",
    "material scientist",
    "medical scientist",
    "climate scientist",
    "nuclear scientist",
    "rocket scientist",
    "geoscientist and",
    "forest scientist",
    "animal scientist",
    "marine scientist",
    "social scientist",
    "polar scientist",
    "radio scientist",
    "neuro scientist",
    "space scientist",
    "rice scientist",
    "soil scientist",
    "food scientist",
    "neuroscientist",
    "geoscientist",
    "scientist and",
    "and scientist",
    "scientist",
]

business_farming = []
academia_humanities = [
    "literary scientist",  # before sciences
    "media scientist",
]
law_enf_military_operator = ["weapons expert"]
spiritual = []
social = []
crime = []
event_record_other = ["Unabomber target"]
other_species = []

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [807]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
    "sciences": sciences,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [814]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['sciences'] ==1].sample(2)

CPU times: total: 1min 18s
Wall time: 1min 18s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
46673,4,Alfred S. Yue,", 95, American engineer and professor emeritus.",https://en.wikipedia.org/wiki/Alfred_S._Yue,8,2014,February,,,professor emeritus,,,,,,,,,,95.0,,United States of America,,,2.197225,1,0,0,0,0,0,0,0,0,0,0,0,1
22109,27,William Horwitz,", 88, American chemist.",https://en.wikipedia.org/wiki/William_Horwitz,3,2006,September,,,,,,,,,,,,,88.0,,United States of America,,,1.386294,1,0,0,0,0,0,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [815]:
# Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

There are 33798 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [817]:
# Obtaining values for column and their counts
roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [819]:
# Code to check each value
roles_list.pop()

'activist'

<IPython.core.display.Javascript object>

In [961]:
# Create specific_roles_list for above popped value
specific_roles_list = (
    df.loc[
        [index for index in df.index if "activist" in df.loc[index, "info"]], "info_2",
    ]
    .value_counts()
    .index.tolist()
)

<IPython.core.display.Javascript object>

In [962]:
specific_roles_list.index("programmer and internet activist")

585

<IPython.core.display.Javascript object>

In [963]:
specific_roles_list = specific_roles_list[:589]

<IPython.core.display.Javascript object>

In [1509]:
# Code to check each specific value
specific_roles_list.pop()

'community activist and Holocaust survivor'

<IPython.core.display.Javascript object>

In [1510]:
# Example code to quick-screen values that may overlap categories
df.loc[
    [index for index in df.index if "community activist and" in df.loc[index, "info"]]
]

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
42888,15,Sal Castro,", 79, American community activist and teacher, thyroid cancer.",https://en.wikipedia.org/wiki/Sal_Castro,4,2013,April,,,community activist and teacher,thyroid cancer,,,,,,,,,79.0,,United States of America,,,1.609438,0,0,0,0,0,0,0,0,0,0,0,0,0
65672,13,Betty Campbell,", 82, Welsh community activist and head teacher.",https://en.wikipedia.org/wiki/Betty_Campbell,21,2017,October,,,community activist and head teacher,,,,,,,,,,82.0,,Wales,,,3.091042,0,0,0,0,0,0,0,0,0,0,0,0,0
80712,9,Dame Jocelyn Barrow,", 90, British educator, community activist and politician.",https://en.wikipedia.org/wiki/Jocelyn_Barrow,23,2020,April,,,,community activist and politician,,,,,,,,,90.0,,United Kingdom of Great Britain and Northern Ireland,,,3.178054,0,0,0,1,0,0,0,0,0,0,0,0,1
91633,27,Rudi Leavor,", 95, German-born British community activist and Holocaust survivor.",https://en.wikipedia.org/wiki/Rudi_Leavor,9,2021,July,,,community activist and Holocaust survivor,,,,,,,,,,95.0,,Germany,United Kingdom of Great Britain and Northern Ireland,,2.302585,0,0,0,0,0,0,0,0,0,0,0,0,0


<IPython.core.display.Javascript object>

In [None]:
# # Example code to quick-screen values that may overlap categories
# df.loc[
#     [
#         index
#         for index in df.index
#         if "outlaw country music singer songwriter" in df.loc[index, "info"]
#     ]
# ]

In [1475]:
# Example code to quick-check a specific entry
df[df["info_2"] == "Islamic revolutionist and activist"]

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
61062,5,Geydar Dzhemal,", 69, Russian Islamic revolutionist and activist.",https://en.wikipedia.org/wiki/Geydar_Dzhemal,10,2016,December,,,Islamic revolutionist and activist,,,,,,,,,,69.0,,Russia,,,2.397895,0,0,0,0,0,0,0,0,0,0,0,0,0


<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [None]:
# Creating lists for each category
politics_govt_law = [
    "civil rights activist and member of the Citizens' Commission to Investigate the FBI"
    'World War II conscientious objector and peace activist with War Resisters League',
    'of Birmingham and environmental activist with Friends of the Earth',
    'UNDP human rights activist and Amnesty International board member',
    'civil rights activist and Socialist candidate for President in',
    'labor union activist and President of the United Auto Workers',
    'former Ku Klux Klan member turned civil rights activist',
    'political activist and opposition leader in Ingushetia',
    'esthesioneuroblastoma sufferer and euthanasia activist',
    'political activist and Philadelphia City Councilwoman',
    'and co founder of the Crips turned anti gang activist',
    'elephant conservationist and anti poaching activist',
    'exonerated prisoner and anti death penalty activist',
    'Western Shoshone leader and environmental activist',
    'activist and founder of the Gray Panthers movement',
    'gay rights activist and Mattachine Society founder',
    'activist and advocate for people with disabilities',
    'internment camp detainee and civil rights activist',
    'activist and leader of the Communist Party of the',
    'activist and first wife of president Václav Havel',
    'activist and Berkeley Free Speech Movement member',
    'human rights activist in the fields of gay women',
    'human rights activist in Chechnya and Ingushetia',
    'political activist; former wife of Charles Vidor',
    'political activist and daughter of Ronald Reagan',
    'Western Shoshone indigenous rights activist and',
    'and jailed Tiananmen Square democracy activist',
    'environmentalist and anti consumerism activist',
    'revolutionary figure and women rights activist',
    'anti apartheid activist and political prisoner',
    'human rights activist and political dissident',
    'pro democracy activist and political prisoner',
    'political activist and trades union official',
    'Prime Minister of and human rights activist',
    'pro choice activist and co founder of NARAL',
    'radical feminist  anti pornography activist',
    'trade unionist and Māori language activist',
    'human rights activist and environmentalist',
    'ex Black Panthers and anti police activist',
    'atomic bomb survivor and anti war activist',
    'International Solidarity Movement activist',
    'trade unionist and environmental activist',
    'Cree elder and indigenous rights activist',
    'gay rights activist and founder of J Flag',
    'civil rights and civil liberties activist',
    'anti apartheid activist and civil servant',
    'union organizer and women rights activist',
    'white civil rights activist and lobbyist',
    'human rights activist and hunger striker',
    'equality and alternative energy activist',
    'Nisqually tribal fishing rights activist',
    'gay liberation and transgender activist',
    'attorney and disability rights activist',
    'aboriginal activist and Wiradjuri elder',
    'environmentalist and anti war activist',
    'anti apartheid and gay rights activist',
    'conservationist and political activist',
    'loyalist activist and victims advocate',
    'political lobbyist and Jewish activist',
    'women rights and LGBT rights activist',
    'civic and voter registration activist',
    'gay rights pioneer and peace activist',
    'political activist and public servant',
    'social activist and community leader',
    'neo Pétainist and political activist',
    'anarchist and Romani rights activist',
    'Congresswoman; women rights activist',
    'legislative aide and Jewish activist',
    'women rights activist and sex worker',
    'extreme right activist and Wehrmacht',
    'anti apartheid and intersex activist',
    'Cahuilla tribal leader and activist',
    'President of Fish & Game Commission',
    'Kootenai tribal leader and activist',
    'dissident and human rights activist',
    'worker and consumer rights activist',
    'community and human rights activist',
    'barrister and human rights activist',
    'anti caste discrimination activist',
    'New activist for West independence',
    'human rights activist and attorney',
    'anti apartheid activist and member',
    'civil rights activist and attorney',
    'civil rights activist and feminist',
    'Māori activist and public servant',
    'republican political activist and',
    'healthcare activist and political',
    'activist and wife of Oliver Tambo',
    'anti communist political activist',
    'civil rights and freedom activist',
    'civil rights pioneer and activist',
    'Trotskyist activist and Civil War',
    'political activist and campaigner',
    'attorney and LGBT rights activist',
    'and nuclear disarmament activist',
    'barrister and political activist',
    'Indigenous women rights activist',
    'Native and women rights activist',
    'anti role playing games activist',
    'medical cannabis rights activist',
    'activist and gay rights advocate',
    'communist and political activist',
    'political activist and life peer',
    'childhood immunizations activist',
    'Trotskyist and anti war activist',
    'peace and anti nuclear activist',
    'activist and ATSIC commissioner',
    'religious and cultural activist',
    'Equal Rights Amendment activist',
    'communist and activist of birth',
    'The anti apartheid and activist',
    'secularist and atheist activist',
    'and transgender rights activist',
    'peace and human rights activist',
    'activist and political prisoner',
    'activist in the peace movement',
    'and disability rights activist',
    'indigenous rights activist and',
    'AIDS and tuberculosis activist',
    'Communist Party labor activist',
    'independence movement activist',
    'nationalist political activist',
    'independence activist for West',
    'activist for Jewish and causes',
    'environmentalist and activist',
    'comfort women rights activist',
    'student and anti war activist',
    'activist and environmentalist',
    'social and political activist',
    'and early gay rights activist',
    'activist for human rights and',
    'and Jewish community activist',
    'and anti pornography activist',
    'activist for assisted suicide',
    'social activist and feminist',
    'reproductive health activist',
    'left wing political activist',
    'counter culture activist and',
    'and family planning activist',
    'socialist and peace activist',
    'and ex gay movement activist',
    'children television activist',
    'anti apartheid activist and',
    'and anti apartheid activist',
    'anti nuclear power activist',
    'anti communist activist and',
    'feminist and peace activist',
    'community activist and head',
    'and pro euthanasia activist',
    'and sustainability activist',
    'housing rights activist and',
    'natural childbirth activist',
    'and anti communism activist',
    'and tribal rights activist',
    'pro establishment activist',
    'cannabis and LGBT activist',
    'Anishinaabe water activist',
    'and anti abortion activist',
    'environmental activist and',
    'sex worker rights activist',
    'anti White Terror activist',
    'language movement activist',
    'and prison reform activist',
    'Takelma elder and activist',
    'Indigenous social activist',
    'and environmental activist',
    'revolutionist and activist',
    'and Romani people activist',
    "Kwakwaka'wakw activist and",
    'activist and film subject',
    'and civil rights activist',
    'Civil Rights activist and',
    'activist for independence',
    'and independence activist',
    'gay rights legal activist',
    'and women rights activist',
    'and anti nuclear activist',
    'independence activist and',
    'conservative activist and',
    'and human rights activist',
    'and trans rights activist',
    'exiled political activist',
    'and transvestite activist',
    'turned political activist',
    'football fan and activist',
    'anti immigration activist',
    'human rights activist and',
    'anti pornography activist',
    'assisted suicide activist',
    'civil rights activist and',
    'and LGBT rights activist',
    'anti government activist',
    'LGBT rights activist and',
    'palliative care activist',
    'civil liberties activist',
    'anti psychiatry activist',
    'disabled rights activist',
    'bisexual rights activist',
    'and trade union activist',
    'children rights activist',
    "workers' rights activist",
    'transgender activist and',
    'and gun rights activist',
    'white supremacy activist,
    'life extension activist',
    'anti communist activist',
    'sexual freedom activist',
    'welfare rights activist',
    'and gay rights activist',
    'anti abortion activist;',
    'East political activist',
    'anti polygraph activist',
    'and men rights activist',
    'animal welfare activist',
    'anti drug activist and',
    'victim rights activist',
    'and HIV AIDS activists',
    'Navajo Nation activist',
    'community activist and',
    'labour rights activist',
    'birth control activist',
    'political activist and',
    'and political activist',
    'Neo political activist',
    'native people activist',
    'dissident and activist',
    'activist and communist',
    'Communist activist and',
    'voting rights activist',
    'and communist activist',
    'Burkinabè activist and',
    'desegregation activist',
    'and paralysis activist',
    'native rights activist',
    'attorney and activist',
    'and maritime activist',
    'and anti war activist',
    'civil rights activist',
    'Raja freedom activist',
    'women rights activist',
    'political activist in',
    'Ogala Lakota activist',
    'Māori rights activist',
    'conservation activist',
    'prostitution activist',
    'and internet activist',
    'women health activist',
    'open housing activist',
    'anti tobacco activist',
    'anti logging activist',
    'and pro life activist',
    'and anti gay activist',
    'civil right activist',
    'and antiwar activist',
    'land rights activist',
    'drug policy activist',
    'Sikh rights activist',
    'nationalist activist',
    'trade union activist',
    'social activist and',
    'and social activist',
    'Trotskyist activist',
    'republican activist',
    'euthanasia activist',
    'Indigenous activist',
    'anti junta activist',
    'neofascist activist',
    'ti Kremlin activist',
    'aboriginal activist',
    'vegetarian activist',
    'healthcare activist',
    'rap music activist',
    'Winnebago activist',
    'Sioux activist and',
    'and civic activist',
    'women activist and',
    'and peace activist',
    'religious activist',
    'far right activist',
    "'AIDS activist and",
    'political activist',
    'and labor activist',
    'and Roma activist',
    'and BDSM activist',
    'anti gun activist',
    'and AIDS activist',
    'Native s activist',
    'cannabis activist',
    'Zainichi activist',
    'peace activist in',
    'anti War activist',
    'anti gay activist',
    'loyalist activist',
    'equality activist',
    'Chicano activist',
    'and gay activist',
    'lesbian activist',
    'Native  activist',
    'POW MIA activist',
    'JDL activist and',
    'zionist activist',
    'and HIV activist',
    'cycling activist',
    'Zionist activist',
    'Navajo activist',
    'albino activist',
    'Maoist activist',
    'Romani activist',
    'Native activist',
    'media activist',
    'NAACP activist',
    'urban activist',
    'Dalit activist',
    'peace activist',
    'union activist',
    'Tatar activist',
    'anti activist',
    'food activist',
    'BDSM activist',
    'Pan activist',
    'activist and',
    'and activist',
    'PEN activist',
]

arts = [
    'feminist activist and cultural', # before politics_govt_law
    'Internet vlogger',
    'vlogger',
]
sports = [


]
sciences = []

business_farming = [ 
]
academia_humanities = []
law_enf_military_operator = [
    'PIRA volunteer',
    'who fought in the Civil War',
    'member of The Weatherman'
]
spiritual = [
    'anti Catholic religious activist', # before politics_govt_law
    'and church activist',
    'evangelical Christian activist'
]
social = [
    'Director of Action Aid', # before politics_govt_law
    'youth activist and Order of recipient',
    'National Treasure',
    'cancer research activist',
    'AIDS and nutrition activist',
    'amyotrophic lateral sclerosis activist',
    'leader and social activist',
    'and community activist',
    'literacy activist',
    'and anti suicide activist'

]
crime = [
    'aircraft hijacker',
    'convicted bomber',
    'convicted bank robber',
]
event_record_other = [
    'kidnapping victim',
    'neo victim',
]
other_species = []

In [None]:
# Hard-coding cause_of_death and info_3 to clarify values found in info_2 for entry
index = df[df['link'] == 'https://en.wikipedia.org/wiki/Paul_Jennings_Hill'].index
df.loc[index, 'cause_of_death'] = 'executed'
df.loc[index, 'info_3']= 'murderer'

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [None]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

#### Extracting Category from `info_2`

In [None]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['arts'] ==1].sample(2)

#### Checking the Number of Rows without a First Category

In [None]:
# Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

In [816]:
print("dunzo!")

# Sound notification when cell executes
chime.success()

dunzo!


<IPython.core.display.Javascript object>

#### Finding `known_for` Roles in `info_2`

In [None]:
# Obtaining values for column and their counts
roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

In [None]:
# Code to check each value
roles_list.pop()

In [None]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [index for index in df.index if "writer" in df.loc[index, "info"]], "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

In [None]:
# # Code to check each specific value
# specific_roles_list.pop()

In [None]:
# # Example code to quick-screen values that may overlap categories
# df.loc[[index for index in df.index if "and science writer" in df.loc[index, "info"]]]

In [None]:
# # Example code to quick-screen values that may overlap categories
# df.loc[
#     [
#         index
#         for index in df.index
#         if "outlaw country music singer songwriter" in df.loc[index, "info"]
#     ]
# ]

In [None]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "outlaw country music singer songwriter"]

#### Creating Lists for Each `known_for` Category

In [None]:
# Creating lists for each category
politics_govt_law = []

arts = []
sports = [


]
sciences = []

business_farming = []
academia_humanities = []
law_enf_military_operator = []
spiritual = []
social = []
crime = []
event_record_other = []
other_species = []

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [None]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

#### Extracting Category from `info_2`

In [None]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['arts'] ==1].sample(2)

#### Checking the Number of Rows without a First Category

In [None]:
# Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Observations:
- It is time to export our dataframe and start a new notebook.

### Exporting Dataset to SQLite Database [wp_life_expect_clean8.db]()

In [None]:
# # Exporting dataframe

# # Saving dataset in a SQLite database
# conn = sql.connect("wp_life_expect_clean8.db")
# df.to_sql("wp_life_expect_clean8", conn, index=False)

# # Chime notification when cell executes
# chime.success()

# [Proceed to Data Cleaning Part 9 ]()