# Wikipedia Notable Life Expectancies
# [Notebook 12: Data Cleaning Part 11](https://github.com/teresahanak/wikipedia-life-expectancy/blob/main/wp_life_expect_data_clean11_thanak_2022_07_26.ipynb)
### Context

The
### Objective

The
### Data Dictionary
- Feature: Description

### Importing Libraries

In [1]:
# To structure code automatically
%load_ext nb_black

# To import/export sqlite databases
import sqlite3 as sql

# To save/open python objects in pickle file
import pickle

# To help with reading, cleaning, and manipulating data
import pandas as pd
import numpy as np
import re

# To define maximum number of columns to be displayed in a dataframe
pd.set_option("display.max_columns", None)
# To define the maximum number of rows to be displayed in a dataframe
pd.set_option("display.max_rows", 200)

# To supress warnings
# import warnings

# warnings.filterwarnings("ignore")

# To set some visualization attributes
pd.set_option("max_colwidth", 150)

# To play auditory cue when cell has executed, has warning, or has error and set chime theme
import chime

chime.theme("zelda")

<IPython.core.display.Javascript object>

## Data Overview

### Reading, Sampling, and Checking Data Shape

In [2]:
# Reading the dataset
conn = sql.connect("wp_life_expect_clean10.db")
data = pd.read_sql("SELECT * FROM wp_life_expect_clean10", conn)

# Making a working copy
df = data.copy()

# Checking the shape
print(f"There are {df.shape[0]} rows and {df.shape[1]} columns.")

# Checking first 2 rows of the data
df.head(2)

There are 98056 rows and 38 columns.


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
0,1,William Chappell,", 86, British dancer, ballet designer and director.",https://en.wikipedia.org/wiki/William_Chappell_(dancer),21,1994,January,,,dancer,ballet designer and director,,,,,,,,,86.0,,United Kingdom of Great Britain and Northern Ireland,,,3.091042,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,Raymond Crotty,", 68, Irish economist, writer, and academic.",https://en.wikipedia.org/wiki/Raymond_Crotty,12,1994,January,,,,writer,and academic,,,,,,,,68.0,,Ireland,,,2.564949,0,0,0,0,0,0,0,0,1,0,0,0,1


<IPython.core.display.Javascript object>

In [3]:
# Checking last 2 rows of the data
df.tail(2)

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
98054,9,Aamir Liaquat Hussain,", 50, Pakistani journalist and politician, MNA .",https://en.wikipedia.org/wiki/Aamir_Liaquat_Hussain,99,2022,June,", since",,,MNA,,,,,,,,,50.0,,Pakistan,,"2002 2007, since 2018",4.60517,0,0,0,0,0,1,0,0,1,0,0,0,2
98055,9,Zou Jing,", 86, Chinese engineer, member of the Chinese Academy of Engineering.",https://en.wikipedia.org/wiki/Zou_Jing_(engineer),3,2022,June,,,,member of the Academy of Engineering,,,,,,,,,86.0,,"China, People's Republic of",,,1.386294,1,0,0,0,0,0,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

In [4]:
# Checking a sample of the data
df.sample(5)

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
30733,4,Etta Cameron,", 70, Bahamian-born Danish gospel singer, cancer.",https://en.wikipedia.org/wiki/Etta_Cameron,3,2010,March,,,,cancer,,,,,,,,,70.0,,The Bahamas,Denmark,,1.386294,0,0,0,0,0,1,0,0,0,0,0,0,1
42105,18,"Alger Chapman, Jr.",", 81, American finance executive, CEO and Chairman of the Chicago Board Options Exchange , heart failure.","https://en.wikipedia.org/wiki/Alger_Chapman,_Jr.",4,2013,February,,,finance executive,CEO and Chairman of the Chicago Board Options Exchange,heart failure,,,,,,,,81.0,,United States of America,,1986 1997,1.609438,0,0,0,0,0,0,0,0,0,0,0,0,0
35254,21,Ashleigh Connor,", 21, Australian soccer player, car accident.",https://en.wikipedia.org/wiki/Ashleigh_Connor,5,2011,July,,,soccer player,car accident,,,,,,,,,21.0,,Australia,,,1.791759,0,0,0,0,0,0,0,0,0,0,0,0,0
88811,30,Josefina Cuesta,", 74, Spanish historian and academic.",https://en.wikipedia.org/wiki/Josefina_Cuesta,8,2021,March,,,,,,,,,,,,,74.0,,Spain,,,2.197225,0,0,0,1,0,0,0,0,0,0,0,0,1
52962,19,Michael J. D. Powell,", 78, British mathematician.",https://en.wikipedia.org/wiki/Michael_J._D._Powell,11,2015,April,,,,,,,,,,,,,78.0,,United Kingdom of Great Britain and Northern Ireland,,,2.484907,1,0,0,0,0,0,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

### Checking Data Types, Duplicates, and Null Values

In [5]:
# Checking data types and null values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98056 entries, 0 to 98055
Data columns (total 38 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   day                        98056 non-null  object 
 1   name                       98056 non-null  object 
 2   info                       98056 non-null  object 
 3   link                       98056 non-null  object 
 4   num_references             98056 non-null  int64  
 5   year                       98056 non-null  int64  
 6   month                      98056 non-null  object 
 7   info_parenth               36661 non-null  object 
 8   info_1                     22 non-null     object 
 9   info_2                     98024 non-null  object 
 10  info_3                     48896 non-null  object 
 11  info_4                     10264 non-null  object 
 12  info_5                     1265 non-null   object 
 13  info_6                     181 non-null    obj

<IPython.core.display.Javascript object>

#### Observations:
- With our dataset loaded, we can pick up where we left off with extracting known_for values by rebuilding `known_for_dict`.

### Extracting `known_for` Continued

#### Finding `known_for` Roles in `info_2`

In [6]:
# Obtaining values for column and their counts
roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [76]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [75]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_2"].notna()].index
#             if "publisher" in df.loc[index, "info_2"]
#         ],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [74]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_list, key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [73]:
# # Example code to quick-screen values that may overlap categories
# df.loc[[index for index in df.index if "prima ballerina" in df.loc[index, "info"]]]

<IPython.core.display.Javascript object>

In [72]:
# # Example code to quick-screen values that may overlap categories
# df.loc[[index for index in df.index if "Holocaust denial" in df.loc[index, "info"]]]

<IPython.core.display.Javascript object>

In [71]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "Bible publisher"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [82]:
# Creating lists for each category
politics_govt_law = [
    "who fought two First Amendment battles during the s",
    "Amazon environmentalist and conservationist",
    "conservationist and environmentalist",
    "environmentalist and conservationist",
    "wildlife conservationist and",
    "wildlife conservationist",
    "animal conservationist",
    "nature conservationist",
    "marine conservationist",
    "canal conservationist",
    "and conservationist",
    "conservationist",
    "university student",  # before academia_humanities
    "royal of the Pahlavi dynasty and elder sister of the last Shah of",
    "royal and sister of Queen Elizabeth II",
    "royal prince and Communist leader",
    "father of royal divorcée Sarah",
    "member of the royal family",
    "claimant of royalty",  # also ambassador of Navy League of US
    "traditional royal",
    "Māori royal elder",
    "Holocaust denial",
    "Manchurian royal",
    "royal courtier",
    "royal consort",
    "royal steward",
    "royal servant",
    "former royal",
    "and royal",
    "royal and",
    "royalist",
    "royalty",
    "royal",
]

arts = [
    "guitarist for the heavy metal band Great White; victim of the Station nightclub fire",
    "guitarist who played with Frank Sinatra and on The Tonight Show orchestra",
    "blues guitarist who influenced Otis Redding and Jimi Hendrix",
    "guitarist and a founder of rock band Molly Hatchet",
    "bass guitarist for the punk band The Dead Milkmen",
    "guitarist for Madrugada and My Midnight Creeps",
    "rock guitarist who founded The Replacements",
    "jazz and blues rock guitarist and vocalist",
    "rockabilly guitarist and record producer",
    "post punk guitarist and record producer",
    "guitarist and founding member of Foghat",
    "guitarist with The Marshall Tucker Band",
    "jazz double bassist and bass guitarist",
    "free improvising avant garde guitarist",
    "guitarist from rock outfit Rose Tattoo",
    "hardcore punk drummer and guitarist",
    "heavy metal bassist and guitarist",
    "bluegrass guitarist and banjoist",
    "guitarist and surf music pioneer",
    "Hall of Fame bluegrass guitarist",
    "guitarist and founding member of",
    "guitarist for the Cosmic Psychos",
    "rock and country music guitarist",
    "guitarist and ethnomusicologist",
    "blues guitarist and keyboardist",
    "rhythm guitarist for Body Count",
    "rock keyboardist and guitarist",
    "rockabilly and swing guitarist",
    "guitarist and record producer",
    "Hall of Fame blues guitarist",
    "bass guitarist and vocalist",
    "lead guitarist and vocalist",
    "guitarist of rock band Ratt",
    "rhythm and blues guitarist",
    "guitarist and chess master",
    "keyboardist and guitarist",
    "electric blues guitarist",
    "Piedmont blues guitarist",
    "blues and rock guitarist",
    "guitarist and bookseller",
    "punk and metal guitarist",
    "piedmont blues guitarist",
    "rock and roll guitarist",
    "jazz and soul guitarist",
    "country music guitarist",
    "Hall of Fame guitarist",
    "guitarist and vocalist",
    "jazz and R&B guitarist",
    "guitarist and arranger",
    "inventor and guitarist",
    "bass guitarist for EMF",
    "guitarist and inventor",
    "heavy metal guitarist",
    "pedal steel guitarist",
    "bassist and guitarist",
    "Head Charge guitarist",
    "death metal guitarist",
    "blues rock guitarist",
    "rockabilly guitarist",
    "bluegrass guitarist",
    "rock bass guitarist",
    "primitive guitarist",
    "punk rock guitarist",
    "Lap steel guitarist",
    "slack key guitarist",
    "flamenco guitarist",
    "country guitarist",
    "session guitarist",
    "rhythm guitarist",
    "studio guitarist",
    "reggae guitarist",
    "blues guitarist",
    "steel guitarist",
    "metal guitarist",
    "roots guitarist",
    "bass guitarist",
    "rock guitarist",
    "folk guitarist",
    "soul guitarist",
    "guitarist",
    "drummer for the bands Trouble and Zoetrope",
    "Hall of Fame record producer and drummer",
    "drummer and co founder of the band Space",
    "drummer for rock band Jefferson Airplane",
    "drummer for rock band Doobie Brothers",
    "drummer for rock band Bleed the Dream",
    "drummer for the glam rock band Sweet",
    "drummer with s soft rock band Bread",
    "Hall of Fame drummer and lyricist",
    "hard rock and heavy metal drummer",
    "drummer for glam rock band Mud",
    "jazz drummer and bandleader on",
    "jazz drummer and vibraphonist",
    "Cheyenne drummer and vocalist",
    "one time drummer for The Who",
    "drummer and record producer",
    "drummer with The Searchers",
    "Hall of Fame rock drummer",
    "drummer and percussionist",
    "rhythm and blues drummer",
    "session drummer for ABBA",
    "avant garde jazz drummer",
    "former Shadows drummer",
    "funk and jazz drummer",
    "hardcore punk drummer",
    "Hall of Fame drummer",
    "jazz and R&B drummer",
    "drummer and vocalist",
    "swing jazz drummer",
    "free jazz drummer",
    "drummer; recorded",
    "jazz pop drummer",
    "New Wave drummer",
    "session drummer",
    "reggae drummer",
    "studio drummer",
    "former drummer",
    "blues drummer",
    "metal drummer",
    "Vodou drummer",
    "swing drummer",
    "jazz drummer",
    "rock drummer",
    "drummer",
    "animator for the Walt Disney Company",
    "animator and television producer",
    "animator and animation director",
    "animation director and producer",
    "Academy Award winning animator",
    "animator and a director",
    "and animation director",
    "animator and director",
    "animator and producer",
    "stop motion animator",
    "background animator",
    "animation director",
    "Disney animator",
    "animator and",
    "and animator",
    "animator",
    "prima ballerina and",
    "and prima ballerina",
    "prima ballerina",
    "comic book publisher and comic book character creator",
    "publisher of auto industry and enthusiast magazines",
    "newspaper executive and publisher of from to",
    "former publisher of the Lewy body disease",
    "theater producer and newspaper publisher",
    "fashion magazine publisher and editor",
    "businesswoman and newspaper publisher",
    "science fiction editor and publisher",
    "record producer and music publisher",
    "pornographic magazine publisher and",
    "newspaper executive and publisher",
    "comic book producer and publisher",
    "science fiction fanzine publisher",
    "editor and publisher of magazine",
    "comic book editor and publisher",
    "newspaper editor and publisher",
    "newspaper publisher and editor",
    "cookbook editor and publisher",
    "magazine publisher and editor",
    "publisher and chairman of the",
    "music publisher and executive",
    "game designer and publisher",
    "music manager and publisher",
    "publisher and pornographer",
    "book editor and publisher",
    "science fiction publisher",
    "publisher and pamphleteer",
    "and numismatic publisher",
    "rock magazine publisher",
    "newspaper publisher and",
    "publisher and producer",
    "independent publisher",
    "publisher and editor",
    "editor and publisher",
    "comic book publisher",
    "newspaper publisher",
    "magazine publisher",
    "literary publisher",
    "cookbook publisher",
    "website publisher",
    "music publisher",
    "media publisher",
    "manga publisher",
    "Bible publisher",
    "book publisher",
    "publisher and",
    "and publisher",
    "publisher of",
    "publisher",
]
sports = [
    "canoe maker",
    "international rugby player",
    "rugby player and executive",
    "wrestler and rugby player",
    "cricket and rugby player",
    "wheelchair rugby player",
    "rugby player and coach",
    "rugby player for and",
    "rugby player and",
    "and rugby player",
    "rugby player",
    "Olympic bronze medal winning track cyclist",
    "Paralympic bronze medal winning cyclist",
    "Olympic silver medal winning cyclist",
    "bronze medal winning Olympic cyclist",
    "Olympic cyclist and silver medalist",
    "Olympic cyclist and cycling coach",
    "Olympic silver medallist cyclist",
    "Olympic cyclist and rally driver",
    "Olympic and professional cyclist",
    "Olympic silver medalist cyclist",
    "Olympic cyclist gold medalist",
    "cyclist and Olympic medallist",
    "speed skater and road cyclist",
    "Olympic silver medal cyclist",
    "long distance motorcyclist",
    "track cyclist and Olympian",
    "track and road cyclist",
    "Olympic track cyclist",
    "cross country cyclist",
    "long distance cyclist",
    "professional cyclist",
    "cyclist and Olympian",
    "swimmer and cyclist",
    "rally motorcyclist",
    "cyclist and coach",
    "endurance cyclist",
    "rower and cyclist",
    "Olympic cyclist",
    "track cyclist",
    "motorcyclist",
    "road cyclist",
    "BMX cyclist",
    "and cyclist",
    "cyclist",
]

sciences = [
    "biologist who made important contributions to fisheries science",
    "biologist and a pioneer in the field of vitro fertilization",
    "molecular biologist at the National Cancer Institute",
    "behavioral ecologist and evolutionary biologist",
    "and director of the Fish and Wildlife Service",
    "marine biologist and comparative physiologist",
    "microbiologist suspected of anthrax attacks",
    "Nobel Prize winning molecular biologist",
    "geneticist and developmental biologist",
    "ornithologist and molecular biologist",
    "geneticist and evolutionary biologist",
    "immunologist and molecular biologist",
    "cell biologist and cancer researcher",
    "molecular biologist and virologist",
    "ichthyologist and marine biologist",
    "geneticist and molecular biologist",
    "palaeobiologist and astrobiologist",
    "microbiologist and epidemiologist",
    "microbiologist and Nobel laureate",
    "biologist and plant physiologist",
    "cell and developmental biologist",
    "biologist and evolution theorist",
    "oneirologist and neurobiologist",
    "immunologist and microbiologist",
    "microbiologist and immunologist",
    "nutritionist and microbiologist",
    "biologist and environmentalist",
    "biologist and cryptozoologist",
    "biologist and skull collector",
    "microbiologist and geneticist",
    "microbiologist and virologist",
    "virologist and microbiologist",
    "molecular and cell biologist",
    "biologist and epistemologist",
    "botanist and field biologist",
    "botanist and microbiologist",
    "biologist and oceanographer",
    "ornithologist and biologist",
    "microbiologist and botanist",
    "biologist and physiologist",
    "microbiologist at Stanford",
    "who founded Monkey World",
    "biologist and geneticist",
    "biologist and researcher",
    "developmental biologist",
    "biologist and zoologist",
    "computational biologist",
    "biologist and ecologist",
    "World War II  biologist",
    "ecologist and biologist",
    "and molecular biologist",
    "wildlife biologist and",
    "evolutionary biologist",
    "medical microbiologist",
    "biologist and botanist",
    "reproductive biologist",
    "theoretical biologist",
    "agriculture biologist",
    "and marine biologist",
    "structural biologist",
    "veterinary biologist",
    "molecular biologist",
    "fisheries biologist",
    "radiation biologist",
    "microbiologist and",
    "cell biologist and",
    "wildlife biologist",
    "cellular biologist",
    "geomicrobiologist",
    "genetic biologist",
    "crocodile expert",
    "marine biologist",
    "chronobiologist",
    "plant biologist",
    "giraffe expert",
    "microbiologist",
    "cell biologist",
    "neurobiologist",
    "astrobiologist",
    "radiobiologist",
    "paleobiologist",
    "oncobiologist",
    "biologist and",
    "and biologist",
    "forester and",
    "biologist",
    "forester",
    "botanist and pioneer of plant ecology and environmental studies",
    "botanist specialising in South flora and fauna",
    "botanist and plant pathologist",
    "botanist and plant taxonomist",
    "botanist and plant ecologist",
    "botanist and plant collector",
    "botanist and lepidopterist",
    "Jesuit priest and botanist",
    "botanist and lichenologist",
    "botanist and pteridologist",
    "botanist and entomologist",
    "botanist and a mycologist",
    "botanist and taxonomist",
    "botanist and mycologist",
    "botanist and geneticist",
    "naturalist and botanist",
    "botanist and bryologist",
    "pharmacist and botanist",
    "botanist and ecologist",
    "biologist and botanist",
    "taxonomic botanist",
    "archaeobotanist",
    "palaeobotanist",
    "ethnobotanist",
    "paleobotanist",
    "botanist and",
    "and botanist",
    "botanist",
]

business_farming = ["sherry trader", "property magnate"]
academia_humanities = [
    "educationalist and university administrator",
    "and university administrator",
    "university vice chancellor",
    "and university president",
    "university administrator",
    "and university lecturer",
    "university president",
    "university executive",
    "university director",
    "university official",
    "university",
]
law_enf_military_operator = []
spiritual = [
    "Capuchin friar",
]
social = []
crime = []
event_record_other = []
other_species = [
    "royal cocker spaniel",  # before politics_govt_law
]

<IPython.core.display.Javascript object>

In [83]:
# Hard-coding cause_of_death for entries with value in info_2
index = df[df["link"] == "https://en.wikipedia.org/wiki/Ty_Longley"].index
df.loc[index, "cause_of_death"] = "nightclub fire"

index = df[df["link"] == "https://en.wikipedia.org/wiki/Otis_Chandler"].index
df.loc[index, "cause_of_death"] = "Lewy body disease"

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [84]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "business_farming": business_farming,
    "sciences": sciences,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
    "politics_govt_law": politics_govt_law,
    "academia_humanities": academia_humanities,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [85]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['num_categories']!=0].sample(2)

CPU times: total: 6min 11s
Wall time: 6min 11s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
96075,13,Fabio Restrepo,", 62, Colombian actor , COVID-19.",https://en.wikipedia.org/wiki/Fabio_Restrepo_(actor),4,2022,February,",",,,COVID,,,,,,,,,62.0,,Colombia,,",",1.609438,0,0,0,0,0,1,0,0,0,0,0,0,1
5942,27,Samuel L. Devine,", 81, American politician, cancer.",https://en.wikipedia.org/wiki/Samuel_L._Devine,5,1997,June,,,,cancer,,,,,,,,,81.0,,United States of America,,,1.791759,0,0,0,0,0,0,0,0,1,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [86]:
# Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

There are 22092 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [88]:
# Obtaining values for column and their counts
roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [163]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [142]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_2"].notna()].index
#             if "convicted" in df.loc[index, "info_2"]
#         ],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [162]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_list, key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [161]:
# # Example code to quick-screen values that may overlap categories
# df.loc[[index for index in df.index if "latinist" in df.loc[index, "info"]]]

<IPython.core.display.Javascript object>

In [160]:
# # Example code to quick-screen values that may overlap categories
# df.loc[[index for index in df.index if "injustice claimant" in df.loc[index, "info"]]]

<IPython.core.display.Javascript object>

In [103]:
# # Example code to quick-screen values that may overlap categories
# df.loc[
#     [
#         index
#         for index in df.index
#         if "Special Operations Executive agent" in df.loc[index, "info"]
#     ]
# ]

<IPython.core.display.Javascript object>

In [159]:
# # Example code to quick-check a specific entry
# df[
#     df["info_2"]
#     == 'convicted and later pardoned of being World War II propagandist "Tokyo Rose"'
# ]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [157]:
# Creating lists for each category
politics_govt_law = [
    "solicitor wrongly convicted of killing two of her sons",  # before crime
    "convicted and later pardoned of being World War II",
    "man wrongfully convicted of rape and abduction",
    "civil rights leader and a black nationalist",
    "World War II Vichy government official",
    "janitor wrongly convicted of murder",
    "attorney and civil rights leader",
    "labor and civil rights leader",
    "Romani civil rights leader",
    "mass media advocate",
    "civil rights leader",
    "injustice claimant",
    "anti fascist",
]

arts = [
    "cinematographer and two time Academy Award winner",
    "second unit director and cinematographer",
    "cinematographer and second unit director",
    "television producer and cinematographer",
    "cinematographer and camera operator",
    "music business executive and agent",  # before business_farming
    "cinematographer for Orson Welles",
    "cinematographer for the movie",
    "cinematographer and director",
    "director and cinematographer",
    "cinematographer and producer",
    "music business executive",
    "cinematographer Oscar",
    "and cinematographer",
    "cinematographer and",
    "cinematographer",
    "early video game designer",
    "video game designer and",
    "video game designer",
    "rapper better known as 'Proof' of D",
    "hardcore rapper and record producer",
    "rapper and harmonica player",
    "rapper and record producer",
    'propagandist "Tokyo Rose"',
    "rapper and Onyx affiliate",
    "rapper and TV presenter",
    "rapper and disc jockey",
    "rapper and producer",
    "Freestyle rapper",
    "rapper producer",
    "DJ and rapper",
    "rapper and DJ",
    "rapper and",
    "rapper",
]
sports = [
    "co owner of the New Nets",
    "trainer and owner of Thoroughbred racehorses",  # before other_species
    "Hall of Fame Thoroughbred racehorse trainer",
    "Hall of Fame racetrack and racehorse owner",
    "Thoroughbred racehorses trainer and owner",
    "Thoroughbred racehorse owner and breeder",
    "property developer and racehorse owner",
    "football club and racehorse owner",
    "thoroughbred racehorses trainer",
    "thoroughbred racehorse trainer",
    "Thoroughbred racehorse breeder",
    "hall of fame racehorse trainer",
    "Thoroughbred racehorse trainer",
    "jockey and racehorse trainer",
    "thoroughbred racehorse owner",
    "racehorse jockey and trainer",
    "racehorse trainer and owner",
    "racehorse owner and breeder",
    "racehorse trainer",
    "racehorse owner",
    "freeskier",
]
sciences = [
    "creator of the Game Boy",
]

business_farming = [
    "Hall of Fame business executive",
    "business executive and",
    "and business executive",
    "business executive",
]
academia_humanities = [
    "latinist",
]
law_enf_military_operator = [
    "Special Operations Executive agent in World War II",
    "World War II messenger pigeon trainer",
    "Resistance fighter during World War II",
    "Special Operations Executive agent",
    "jihadist and ISIL commander",
    "Resistance fighter",
    "jihadist militant",
    "jihadist",
]
spiritual = [
    "Hasidic rebbe",
    "Anglican priest and chaplain",
    "nun and Anglican priest",
    "Anglican priest and",
    "Anglican priest",
    "believed to be oldest living Catholic priest",
    "Catholic priest and marriage counselor",
    "excommunicated Hare Krishna leader",
    "Catholic priest and Sedevacantist",
    "Traditionalist Catholic priest",
    "Catholic priest and archbishop",
    "Catholic priest and missionary",
    "Catholic priest and exorcist",
    "Catholic priest and Jesuit",
    "Chaldean Catholic priest",
    "Jesuit Catholic priest",
    "Catholic priest and",
    "Catholic priest",
    "interfaith",
]
social = [
    "humanitarian aid worker and",
    "Wiradjuri humanitarian",
    "turned humanitarian",
]
crime = [
    "Klansman and convicted accomplice in the Freedom Summer murders",
    "gangster and suspected murderer of rapper Tupac Shakur",  # before arts
    "convicted of deporting Jews to death camps",
    "convicted war criminalconvicted fraudster",
    "convicted spammer and prison escapee",
    "convicted kidnapper and carjacker",
    "and convicted attempted murderer",
    "convicted of spying for the USSR",
    "convicted of drug trafficking in",
    "convicted of Lillehammer murder",
    "mobster and convicted racketeer",
    "convicted in Ipperwash shooting",
    "convicted child sex abuser and",
    "convicted in Watergate scandal",
    "convicted kidnapper and rapist",
    "and convicted state terrorist",
    "and convicted triple murderer",
    "and convicted drug trafficker",
    "convicted robber and murderer",
    "convicted of drug trafficking",
    "and convicted manslaughterer",
    "convicted child sex offender",
    "and convicted child abuser",
    "and convicted sex offender",
    "and convicted war criminal",
    "convicted double murderer",
    "convicted people smuggler",
    "convicted malice murderer",
    "convicted  and kidnapper",
    "and convicted fraudster",
    "and convicted terrorist",
    "and convicted fraudster",
    "man convicted of murder",
    "convicted serial rapist",
    "convicted mass murderer",
    "and convicted criminal",
    "convicted child rapist",
    "convicted war criminal",
    "convicted spree killer",
    "convicted child rapist",
    "convicted sex offender",
    "convicted extortionist",
    "convicted drug dealer",
    "convicted conspirator",
    "convicted of treason",
    "and convicted killer",
    "and convicted felon",
    "convicted fraudster",
    "convicted drug lord",
    "convicted terrorist",
    "convicted insurgent",
    "convicted kidnapper",
    "convicted criminal",
    "convicted procurer",
    "convicted con man",
    "convicted plotter",
    "convicted killer",
    "convicted felon",
    "convicted spy",
    "and convicted",
]
event_record_other = [
    "Holocaust survivor following escape from Sobibór",
    "Jewish Holocaust survivor",
    "Holocaust survivor and",
    "Holocaust survivor",
    "kidnap victim",
    "ISIS hostage",
]
other_species = [
    "thoroughbred racehorse and Hall of Fame inductee",
    "Thoroughbred racehorse and champion sire",
    "trained thoroughbred racehorse and sire",
    "Thoroughbred racehorse and active sire",
    "Thoroughbred racehorse and broodmare",
    "Hall of Fame Thoroughbred racehorse",
    "Hall of Fame thoroughbred racehorse",
    "thoroughbred racehorse and sire",
    "Thoroughbred racehorse and sire",
    "Champion Thoroughbred racehorse",
    "Hall of Fame racehorse and sire",
    "trained Thoroughbred racehorse",
    "trained thoroughbred racehorse",
    "champion racehorse and sire",
    "racehorse and sire old age",
    "racehorse and broodmare",
    "National Hunt racehorse",
    "Thoroughbred racehorse",
    "thoroughbred racehorse",
    "Standardbred racehorse",
    "Hall of Fame racehorse",
    "standardbred racehorse",
    "racehorse and sire",
    "champion racehorse",
    "trained racehorse",
    "harness racehorse",
    "racehorse",
]

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [158]:
# Combining separate lists into one dictionary
known_for_dict = {
    "politics_govt_law": politics_govt_law,
    "crime": crime,
    "arts": arts,
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "law_enf_military_operator": law_enf_military_operator,
    "event_record_other": event_record_other,
    "sports": sports,
    "other_species": other_species,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [164]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['num_categories']!=0].sample(2)

CPU times: total: 1min 59s
Wall time: 2min


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
19239,10,Humbert Balsan,", 50, French film producer.",https://en.wikipedia.org/wiki/Humbert_Balsan,5,2005,February,,,,,,,,,,,,,50.0,,France,,,1.791759,0,0,0,0,0,1,0,0,0,0,0,0,1
26563,2,George Anselevicius,", 85, Lithuanian-born American architect.",https://en.wikipedia.org/wiki/George_Anselevicius,8,2008,October,,,,,,,,,,,,,85.0,,Lithuania,United States of America,,2.197225,0,0,0,0,0,1,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [165]:
# Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

There are 21093 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [222]:
# # Obtaining values for column and their counts
# roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [221]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [220]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_2"].notna()].index
#             if "peer" in df.loc[index, "info_2"]
#         ],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [219]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_list, key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [218]:
# # Example code to quick-screen values that may overlap categories
# df.loc[[index for index in df.index if "executive and peer" in df.loc[index, "info"]]]

<IPython.core.display.Javascript object>

In [217]:
# # Example code to quick-screen values that may overlap categories
# df.loc[
#     [
#         index
#         for index in df.index
#         if "Royal Naval Volunteer Reserve officer" in df.loc[index, "info"]
#     ]
# ]

<IPython.core.display.Javascript object>

In [216]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "chief executive and peer"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [215]:
# Creating lists for each category
politics_govt_law = [
    "Daughter of fascist dictator Benito Mussolini",
    "former dictator of",
    "dictator and",
    "dictator",
    "courtier of Queen Elizabeth II",
    "Minister of the Interior",
    "anti Zionist campaigner",
    "aristocrat and courtier",
    "aristocrat and Senator",
    "hereditary peer",
    "aristocrat and",
    "aristocrat",
    "hereditary peer of the Peerage of",
    "peer and former Northern Senator",
    "peer and Governor of the Bank of",
    "peer and son of  John Buchan",
    "and Conservative life peer",
    "and Liberal Democrat peer",
    "Labour MP and life peer",
    "barrister and life peer",
    "and hereditary peer",
    "peer and landowner",
    "landowner and peer",
    "peer and solicitor",
    "hereditary peer",
    "and a life peer",
    "life peer and",
    "and life peer",
    "life peer",
    "peeress",
    "peer and",
    "and peer",
    "peer",
]

arts = [
    "talent scout and agent known for her impact on",  # before social
    "modeling agent and scout",
    "model scout",
]
sports = [
    "track and field athlete who won four gold medals at the Summer Olympics",
    "track and field athlete and hammer throw world record holder",
    "athlete at the Summer Olympics and oldest surviving Olympian",
    "sport shooter and Olympic silver medal winning pentathlete",
    "Olympic gold medal winning modern pentathlete",
    "track and field athlete and Olympic champion",
    "track and field athlete and olympic champion",
    "athlete and Olympic gold medallist in discus",
    "athlete and the first Olympic gold medalist",
    "Olympic bronze medallist shot put athlete",
    "athlete and winner of the first Olympic m",
    "olympic athlete and long distance runner",
    "modern pentathlete and Olympic champion",
    "track athlete and Olympic gold medalist",
    "Olympic gold medal winning decathlete",
    "swimming coach and modern pentathlete",
    "Olympic modern pentathlete and fencer",
    "racing manager to Queen Elizabeth II",
    "Olympic bronze medal winning athlete",
    "track and field athlete and Olympian",
    "Olympic gold medal winning biathlete",
    "Olympic champion modern pentathlete",
    "triathlete and long distance runner",
    "centenarian track and field athlete",
    "Olympic gold medal winning athlete",
    "modern pentathlete and horse rider",
    "international athlete and Olympian",
    "athlete and Olympic hockey player",
    "Olympic athlete and administrator",
    "ski jumping athlete and official",
    "Olympic track and field athlete",
    "Olympic middle distance athlete",
    "masters track and field athlete",
    "Olympic bronze medalist athlete",
    "athlete and field hockey coach",
    "mountain biker and triathlete",
    "athlete and Olympic champion",
    "athlete and graphic designer",
    "high jumper and pentathlete",
    "Olympic modern pentathlete",
    "biathlon athlete and coach",
    "long jumper and decathlete",
    "Olympic athlete and coach",
    "Olympic long jump athlete",
    "Olympic decathlon athlete",
    "Olympic champion athlete",
    "athlete and book subject",
    "track and field athlete",
    "Senior Olympics athlete",
    "nonagenarian triathlete",
    "middle distance athlete",
    "and multi sport athlete",
    "modern pentathlete and",
    "Olympic track athlete",
    "Hall of Fame athlete",
    "athlete and Olympian",
    "steeplechase athlete",
    "college athlete and",
    "Olympic pentathlete",
    "racewalking athlete",
    "champion decathlete",
    "modern pentathlete",
    "Olympic decathlete",
    "Paralympic athlete",
    "paralympic athlete",
    "paraplegic athlete",
    "Olympic triathlete",
    "polo administrator",
    "athlete and coach",
    "Olympic biathlete",
    "versatile athlete",
    "Olympic athlete",
    "masters athlete",
    "Masters athlete",
    "college athlete",
    "sprint athlete",
    "track athlete",
    "field athlete",
    "heptathlete",
    "decathlete",
    "triathlete",
    "biathlete",
    "athlete and",
    "athlete",
    "Hall of Fame cricket player and coach",
    "cricket player and administrator",
    "cricket player and manager",
    "cricket player and umpire",
    "cricket player and coach",
    "test cricket player",
    "cricket player and",
    "cricket player",
    "master archer",
    "baseball scout and executive",  # before social
    "ice hockey coach and scout",
    "football manager and scout",
    "football scout",
    "Baseball scout",
    "baseball scout",
]
sciences = [
    "microbiology",
]

business_farming = [
    "former chairman of Rolls Royce",
    "proprietor of the",
]
academia_humanities = [
    "President of Bar Ilan University",
    "Bible researcher",
    "rosh yeshiva",
    "and lecturer",
    "tour guide",
    "lecturer",
]
law_enf_military_operator = [
    "chief military",
    "Army officer during World War II and Medal of Honor recipient",
    "Army officer and recipient of the Param Vir Chakra",
    "Army officer and recipient of the Maha Vir Chakra",
    "Army officer and recipient of the Medal of Honor",
    "Army officer associated with the Mỹ Lai massacre",
    "Army officer and Chief of the General Staff",
    "Royal Naval Volunteer Reserve officer",
    "Army officer and Commander in Chief",
    "th Army officer and Chief of Staff",
    "Army officer and OSS CIA operative",
    "Army officer during World War II",
    "Army officer and Chief of Staff",
    "People Liberation Army officer",
    "World War II Army officer",
    "Army officer and military",
    "and Army officer",
    "Army officer and",
    "Army officer",
    "cipher clerk",
]
spiritual = [
    "Orthodox rabbi;",
    "renowned Sephardic Orthodox rabbi and kabbalist",
    "Chasidic rabbi of the Boston Hasidic dynasty",
    "rabbi and founder of Masorti movement",
    "Orthodox Religious Zionist rabbi and",
    "rabbi of the Western Wall for years",
    "Breslover Hasid and rabbi",
    "chief rabbi of Cincinnati",
    "Orthodox rabbi and posek",
    "settler and chief rabbi",
    "zionist orthodox rabbi",
    "Salvation Army officer",
    "Orthodox Jewish rabbi",
    "rabbi and sect leader",
    "Chabad Chasidic rabbi",
    "chief rabbi of Haifa",
    "Conservative rabbi",
    "rabbi and chaplain",
    "and Orthodox rabbi",
    "Orthodox rabbi and",
    "rabbi and settler",
    "Hasidic rabbi and",
    "Haredi rabbi and",
    "Sephardic rabbi",
    "Orthodox rabbi",
    "orthodox rabbi",
    "Hasidic rabbi",
    "Haredi rabbi",
    "Reform rabbi",
    "Chabad rabbi",
    "reform rabbi",
    "Jewish rabbi",
    "rabbi and",
    "and rabbi",
    "rabbi",
]
social = [
    "Chief Scout Executive of the Boy Scouts of",
    "deputy chief scout of Scouts",
    "scouting leader",
    "scout leader",
    "Chief Scout",  # before politics_govt_law
    "scouter",
    "scout",
]
crime = []
event_record_other = [
    "holocaust survivor",
]
other_species = [
    "rabbit and book subject",  # before spiritual
]

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [214]:
# Combining separate lists into one dictionary
known_for_dict = {
    "arts": arts,
    "sports": sports,
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [223]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['num_categories']!=0].sample(2)

CPU times: total: 3min 6s
Wall time: 3min 6s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
90276,24,Habibullah Siraji,", 72, Bangladeshi poet, director general of the Bangla Academy .",https://en.wikipedia.org/wiki/Habibullah_Siraji,11,2021,May,since,,,director general of the Bangla Academy,,,,,,,,,72.0,,Bangladesh,,since 2018,2.484907,0,0,0,0,0,1,0,0,0,0,0,0,1
85780,7,Dawn Lindberg,", 75, South African folk singer, actress and theatre producer, COVID-19.",https://en.wikipedia.org/wiki/Dawn_Lindberg,9,2020,December,,,,actress and theatre producer,COVID,,,,,,,,75.0,,South Africa,,,2.302585,0,0,0,0,0,1,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [224]:
# Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

There are 20231 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [333]:
# # Obtaining values for column and their counts
# roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [332]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [331]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_2"].notna()].index
#             if "bursar" in df.loc[index, "info_2"]
#         ],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [330]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_list, key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [329]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "admiral and Black Rod"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [335]:
# Creating lists for each category
politics_govt_law = [
    "governor of the North West Frontier Province",
    "marijuana advocate",
    "language proponent",
    "FEMA director",
    "political figure and a dissident",
    "and political figure",
    "political figure and ",
    "political figure",
    "and public health whistleblower",
    "congressional aide and oil industry whistleblower",
    "nuclear espionage whistleblower",
    "government whistleblower",
    "and whistleblower",
    "whistleblower",
    "clan chief",
    "Royal courtier",
    "courtier and",
    "and courtier",
    "courtier",
    "Master of the Household to the Sovereign",
    "Marshal of the Diplomatic Corps",
    "colonial administrator and Governor of Northern",
    "and a colonial administrator",
    "and colonial administrator",
    "colonial administrator",
    "Governor of Western",
    "and Black Rod",
    "Black Rod",
]

arts = [
    "jazz tenor saxophonist and musical arranger",
    "jazz tenor saxophonist and jazz club owner",
    "hard bop and post bop tenor saxophonist",
    "jazz alto saxophonist and clarinetist",
    "jazz tenor saxophonist and flautist",
    "jazz and big band tenor saxophonist",
    "jazz saxophonist and musicologist",
    "saxophonist and free jazz pioneer",
    "jazz saxophonist and clarinetist",
    "jazz saxophonist and band leader",
    "Grammy Award winning saxophonist",
    "alto saxophonist and clarinetist",
    "hard bop jazz tenor saxophonist",
    "record producer and saxophonist",
    "jazz saxophonist and bandleader",
    "jazz tenor and alto saxophonist",
    "jazz saxophonist and flautist",
    "jazz saxophonist and flutist",
    "saxophonist and clarinetist",
    "jazz and blues saxophonist",
    "jazz baritone saxophonist",
    "Hall of Fame saxophonist",
    "saxophonist and flautist",
    "jazz and R&B saxophonist",
    "saxophonist and arranger",
    "funk and R&B saxophonist",
    "saxophonist and vocalist",
    "flautist and saxophonist",
    "jazz tenor saxophonist",
    "jazz alto saxophonist",
    "broadcasting adviser",
    "street saxophonist",
    "tenor saxophonist",
    "blues saxophonist",
    "jazz saxophonist",
    "alto saxophonist",
    "rock saxophonist",
    "R&B saxophonist",
    "saxophonist and",
    "saxophonist",
    "clown and children television personality",
    "clown and son of Emmett Kelly",
    "clown and television host",
    "clown and circus director",
    "baseball clown",
    "rodeo clown",
    "and clown",
    "clown",
    "television producer and media executive",
    "Emmy Award winning television producer",
    "motion picture and television producer",
    "television producer and music director",
    "game show host and television producer",
    "television producer and script editor",
    "television producer and talent agent",
    "entertainer and television producer",
    "television producer and executive",
    "puppeteer and television producer",
    "television producer and director",
    "caster and television producer",
    "radio and television producer",
    "television producer and host",
    "and television producer",
    "television producer and",
    "television producer",
]
sports = [
    "basketball coach of the University of Tennessee Volunteers",
    "Hall of Fame college basketball coach and administrator",
    "college basketball coach and athletic administrator",
    "college basketball coach for Wake Forest University",
    "college basketball coach and athletic director",
    "basketball coach for the Harlem Globetrotters",
    "NBA player and high school basketball coach",
    "professional basketball coach and executive",
    "basketball coach and athletic administrator",
    "women basketball coach at Military Academy",
    "Hall of Fame high school basketball coach",
    "basketball coach and athletics director",
    "Hall of Fame college basketball coach",
    "basketball coach and baseball coach",
    "college basketball coach and player",
    "basketball coach and executive",
    "Hall of Fame basketball coach",
    "football and basketball coach",
    "baseball and basketball coach",
    "high school basketball coach",
    "basketball coach and referee",
    "college basketball coach and",
    "tennis and basketball coach",
    "basketball coach and player",
    "college basketball coach",
    "women basketball coach",
    "NBA basketball coach",
    "and basketball coach",
    "basketball coach",
    "Olympic water polo player and swimmer",
    "Olympic water polo player and",
    "Olympic water polo player",
    "wilderness guide",
    "fly fisherman",
    "Olympic bronze medal winning equestrian",
    "equestrian and Olympic champion",
    "javelin thrower and equestrian",
    "equestrian at the Asian Games",
    "Olympic champion equestrian",
    "national equestrian coach",
    "Olympic equestrian rider",
    "Hall of Fame equestrian",
    "Olympic equestrian and",
    "paralympic equestrian",
    "Paralympic equestrian",
    "equestrian competitor",
    "dressage equestrian",
    "Olympic equestrian",
    "equestrian eventer",
    "equestrian and",
    "equestrian",
    "hurler and hurling manager",
    "and hurler",
    "hurler ·",
    "hurler",
]
sciences = [
    "psychiatrist specializing in psychic phenomena",
    "psychiatrist and developer of reality therapy",
    "psychiatrist and reincarnation researcher",
    "psychiatrist and pioneer LSD experimenter",
    "psychiatrist and Tourette syndrome expert",
    "child psychiatrist and Jungian analyst",
    "psychiatrist and addiction specialist",
    "psychiatrist and science communicator",
    "psychiatrist and student of Carl Jung",
    "psychiatrist and research director",
    "psychiatrist and dream researcher",
    "psychiatrist and sleep researcher",
    "psychiatrist and psychotherapist",
    "psychiatrist and psychoanalyst",
    "neurologist and psychiatrist",
    "psychiatrist and neurologist",
    "sexologist and psychiatrist",
    "orthomolecular psychiatrist",
    "psychiatrist and researcher",
    "internist and psychiatrist",
    "developmental psychiatrist",
    "experimental psychiatrist",
    "psychiatrist known as the",
    "psychiatrist and surgeon",
    "forensic psychiatrist",
    "child psychiatrist",
    "neuropsychiatrist",
    "psychiatrist and",
    "and psychiatrist",
    "psychiatrist",
    "ornithologist and naturalist",
    "naturalist and",
    "and naturalist",
    "naturalist",
    "nursing researcher",
]

business_farming = []
academia_humanities = ["and philatelist", "philatelist", "and bursar", "bursar"]
law_enf_military_operator = [
    "army colonel and",
    "and army colonel",
    "army colonel",
    "senior army officer and Commissioner of the Federal Police",
    "World War II army officer and Military Cross recipient",
    "army officer and World War II prisoner of war escapee",
    "WWII army officer and Military Cross recipient",
    "army officer and intelligence analyst",
    "army officer and World War II veteran",
    "army officer and Chief of Staff",
    "World War II army officer",
    "military  army officer",
    "WWII army officer",
    "and army officer",
    "army officer and",
    "army officer",
    "retired Navy vice admiral and Pentagon official",
    "Navy vice admiral and Medal of Honor recipient",
    "four star admiral and World War II aviator",
    "admiral who was Commander in Chief in the",
    "World War II veteran and rear admiral",
    "vice admiral and naval secretary",
    "admiral and Chief of Naval Staff",
    "admiral and resistance fighter",
    "four star admiral in the Navy",
    "admiral and naval aviator",
    "naval rear admiral",
    "Navy vice admiral",
    "Navy rear admiral",
    "navy vice admiral",
    "four star admiral",
    "navy rear admiral",
    "vice admiral and",
    "vice admiral",
    "Navy admiral",
    "rear admiral",
    "navy admiral",
    "admiral and",
    "and admiral",
    "admiral",
]
spiritual = [
    "Buddhist Sangharaja and Nobel Peace Prize nominee",
    "meditation master and Buddhist monk",
    "Buddhist spiritual leader",
    "Theravada Buddhist monk",
    "Sinhalese Buddhist monk",
    "Mahayana Buddhist monk",
    "Tuvan Buddhist lama",
    "Buddhist Zen master",
    "Zen Buddhist priest",
    "Buddhist missionary",
    "Zen Buddhist monk",
    "Buddhist monk and",
    "Buddhist prelate",
    "Buddhist clergy",
    "Buddhist abbess",
    "Buddhist leader",
    "Buddhist priest",
    "Buddhist monk",
    "Buddhist  nun",
    "Shin Buddhist",
    "Buddhist Lama",
    "Buddhist nun",
    "and Buddhist",
    "Buddhist",
    "member of the Church of Scientology",
    "and critic of Scientology",
    "Scientology",
]
social = [
    "humanitarian and",
    "and humanitarian",
    "humanitarian",
    "aid worker credited who saved over Jewish children during World War II",
    "World Health Organization aid worker",
    "aid worker and",
    "and aid worker",
    "aid worker",
]
crime = []
event_record_other = [
    "killing spree victim",
]
other_species = []

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [336]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [337]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['num_categories']!=0].sample(2)

CPU times: total: 4min 7s
Wall time: 4min 8s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
37744,14,Ray Barlow,", 85, English footballer .",https://en.wikipedia.org/wiki/Ray_Barlow,9,2012,March,West Bromwich Albion,,,,,,,,,,,,85.0,,United Kingdom of Great Britain and Northern Ireland,,West Bromwich Albion,2.302585,0,0,0,0,0,0,1,0,0,0,0,0,1
44008,10,Walter McCaffrey,", 64, American activist and politician , complications from a traffic collision.",https://en.wikipedia.org/wiki/Walter_McCaffrey,3,2013,July,"City Council,",,,complications from a traffic collision,,,,,,,,,64.0,,United States of America,United States of America,"City Council, 1985 2001",1.386294,0,0,0,0,0,0,0,0,1,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [338]:
# Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

There are 19018 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [412]:
# # Obtaining values for column and their counts
# roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [411]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [410]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_2"].notna()].index
#             if "turned informant" in df.loc[index, "info_2"]
#         ],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [409]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_list, key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [408]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "criminal law"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [413]:
# Creating lists for each category
politics_govt_law = [
    "criminal defendant diagnosed with multiple personality disorder",
    "World War II conscientious objector  with War Resisters League",
    "fascist leader of the Independent State of in World War II",
    "private secretary of Adolf Hitler during World War II",
    "campaigner for safety glass",
    "criminal defense attorney",  # before crime
    "Health Minister in",
    "figure and civic leader",
    "anti war campaigner",
    "civic leader and ",
    "and civic leader",
    "heir and member",
    "criminal law",
    "civic leader",
]

arts = [
    "radio and TV presenter",
    "TV presenter of MTV",
    "and TV presenter",
    "TV presenter",
]
sports = [
    "disability sport administrator",
    "Olympic gold medal winning marathon runner",
    "marathon runner and athletics coach",
    "and marathon runner",
    "Olympic marathon runner",
    "ultramarathon runner",
    "marathon runner",
    "owner of Coolmore Stud",
]
sciences = [
    "orthopaedic surgeon and pioneer in combatting polio throughusing penicillin",
    "veterinary surgeon who was the inspiration for",
    "surgeon and ulcerative colitis surgery pioneer",
    "surgeon and inventor of parenteral nutrition",
    "cardiovascular surgeon and medical pioneer",
    "surgeon and vascular surgery pioneer",
    "medical practitioner and surgeon",
    "podiatric surgeon and ufologist",
    "and renowned plastic surgeon",
    "pediatric orthopedic surgeon",
    "medical doctor and surgeon",
    "consultant cardiac surgeon",
    "oral and plastic surgeon",
    "paediatric neurosurgeon",
    "cardio thoracic surgeon",
    "cardiovascular surgeon",
    "surgeon and oncologist",
    "hepatobiliary surgeon",
    "World War II surgeon",
    "college  and surgeon",
    "plastic surgeon and",
    "cardiac surgeon and",
    "orthopaedic surgeon",
    "orthopedic surgeon",
    "transplant surgeon",
    "doctor and surgeon",
    "veterinary surgeon",
    "military surgeon",
    "vascular surgeon",
    "thoracic surgeon",
    "neurosurgeon and",
    "cardiac surgeon",
    "plastic surgeon",
    "pioneer surgeon",
    "trauma surgeon",
    "cancer surgeon",
    "dental surgeon",
    "naval surgeon",
    "neurosurgeon",
    "hand surgeon",
    "Army surgeon",
    "eye surgeon",
    "war surgeon",
    "surgeon and",
    "and surgeon",
    "surgeon",
    "alleged doctor and",
    "alleged doctor",
]

business_farming = []
academia_humanities = [
    "founder of St Antony College",
]
law_enf_military_operator = [
    "Army Air Forces airman and recipient of the Medal of Honor for his actions in World War II",
    "fighter ace during World War II and recipient of the Knight Cross of the Iron Cross",
    "fighter ace and recipient of the Knight Cross of the Iron Cross during World War II",
    "flying ace and recipient of the Knight Cross of the Iron Cross during World War II",
    "flying ace during World War II and recipient of the Knight Cross of the Iron Cross",
    "World War II veteran and hero of the battle for the Hurtgen Forest on November",
    "nazi collaborator during World War II and founding member of Front National",
    "military aviator and member of the Tuskegee Airmen during World War II",
    "SS officer and Auschwitz concentration camp doctor during World War II",
    "Navy submarine commander and Medal of Honor recipient in World War II",
    "submarine commander awarded the Medal of Honor during World War II",
    "World War II dissident who led Jews over the Pyrenees to freedom",
    "who save hundreds of Jews from the Holocaust during World War II",
    "pilot during World War II and later an officer in the Air Force",
    "pilot and navigator during World War II and Hero of the Union",
    "flying ace during World War II and record setting test pilot",
    "Air Force officer and squadron commander during World War II",
    "intelligence officer and SOE operative during World War II",
    "air marshal and an ace nightfighter pilot in World War II",
    "Medal of Honor recipient for actions during World War II",
    "Navy officer and destroyer commander during World War II",
    "naval officer and U boat commander during World War II",
    "communist leader of the Resistance during World War II",
    "officer and Colditz Castle escapee during World War II",
    "World War II veteran and recipient of the Silver Star",
    "Air Force officer and flying ace during World War II",
    "fighter pilot who served in the during World War II",
    "Air Defence Forces officer and World War II veteran",
    "rifleman with the Marine Corps during World War II",
    "U boat commander of the sunken during World War II",
    "Air Force pilot and flying ace during World War II",
    "World War II veteran and Medal of Honor recipient",
    "Navy officer and World War II submarine commander",
    "Waffen SS member and official during World War II",
    "ace during World War II and Iron Cross recipient",
    "fighter pilot and flying ace during World War II",
    "sniper during World War II and Hero of the Union",
    "flying ace during the Civil War and World War II",
    "y commander in the Waffen SS during World War II",
    "fighter ace of the Air Force during World War II",
    "Special Operations Executive during World War II",
    "World War II commanding officer of Easy Company",
    "planner of the Great Escape during World War II",
    "Imperial Army World War II intelligence officer",
    "SS officer and war criminal during World War II",
    "commander of the Waffen SS during World War II",
    "RAF officer and flying ace during World War II",
    "Air marshal and flying ace during World War II",
    "Army Air Forces flying ace during World War II",
    "World War II Jewish  fighter and anti avenger",
    "World War II ambulance driver and interpreter",
    "fighter pilot during World War II and the War",
    "Air Force officer and World War II flying ace",
    "member of the Resistance during World War II",
    "World War II veteran acquitted of war crimes",
    "Navy submarine commander during World War II",
    "bomb and mine specialist during World War II",
    "commander and flying ace during World War II",
    "Marine Corps infantryman during World War II",
    "Vice Admiral in the Navy during World War II",
    "officer in the Wehrmacht during World War II",
    "Navajo code talker and World War II veteran",
    "U boat commander in the during World War II",
    "military volunteer and World War II veteran",
    "and pilot and navigator during World War II",
    "leader of the Bielski s during World War II",
    "fighter ace of the RAAF during World War II",
    "World War II Tuskegee Airman fighter pilot",
    "Airforce Service pilot during World War II",
    "officer and flying ace during World War II",
    "Navajo prisoner of war during World War II",
    "Navy dive bomber pilot during World War II",
    "Navy officer during and after World War II",
    "RAF fighter pilot during World War II and",
    "and Chetnik commander during World War II",
    "air marshal and World War II bomber pilot",
    "Air Forces flying ace during World War II",
    "World War II fighter pilot and flying ace",
    "Oberstleutnant in the during World War II",
    "decorated bomber ace during World War II",
    "Seminole Code Talker during World War II",
    "Air Force flying ace during World War II",
    "and anti resisister during World War II",
    "and prisoner of war during World War II",
    "night fighter pilot during World War II",
    "codebreaker at Park during World War II",
    "submarine commander during World War II",
    "resistance fighter during World War II",
    "World War II bomber pilot and war hero",
    "military commander during World War II",
    "Wehrmacht officer during World War II",
    "World War II Special Operations agent",
    "Royal Marine Commando in World War II",
    "Resistance member during World War II",
    "flight lieutenant during World War II",
    "World War II Medal of Honor recipient",
    "World War II non commissioned officer",
    "war correspondent during World War II",
    "paratroop officer during World War II",
    "night fighter ace during World War II",
    "resistance leader during World War II",
    "U boat commander during World War II",
    "World War II anti Hitler conspirator",
    "military frogman during World War II",
    "flight navigator during World War II",
    "Resistance agent during World War II",
    "of the Waffen SS during World War II",
    "medical orderly during World War II",
    "World War II fighter and test pilot",
    "nazi camp guard during World War II",
    "freedom fighter during World War II",
    "communist and World War II  fighter",
    "test pilot and World War II veteran",
    "World War II veteran and fundraiser",
    "and SS captain during World War II",
    "highly decorated World War II hero",
    "Navajo code talker in World War II",
    "World War II and War fighter pilot",
    "military  during World War II and",
    "fighter pilot during World War II",
    "naval officer during World War II",
    "World War II Secret Service agent",
    "y fighter ace during World War II",
    "World War II Spitfire fighter ace",
    "pilot and World War II flying ace",
    "naval aviator during World War II",
    "Commando during World War II and",
    "collaborator during World War II",
    "SS commander during World War II",
    "distinguished World War II pilot",
    "pilot in the during World War II",
    "U boat commander in World War II",
    "bomber pilot during World War II",
    "submariner and World War II hero",
    "fighter ace during World War II",
    "World War II resistance fighter",
    "codebreaker during World War II",
    "field medic during World War II",
    "World War II nightfighter pilot",
    "Navajo World War II code talker",
    "World War II fighter pilot and",
    "and World War II fighter pilot",
    "flying ace during World War II",
    "World War II Resistance member",
    "SS officer during World War II",
    "World War II resistance leader",
    "World War II resistance worker",
    "WASP pilot during World War II",
    "World War II Waffen SS officer",
    "pilot and World War II veteran",
    "World War II air force officer",
    "World War II spy for the Union",
    "World War II Navy fighter ace",
    "SOE agent during World War II",
    "officer and World War II hero",
    "SEO agent during World War II",
    "World War II RAF airman and ",
    "World War II Air Force pilot",
    "World War II veteran and war",
    "and World War II flying ace",
    "fighter ace in World War II",
    "World War II military pilot",
    "World War II tank commander",
    "naval  World War II veteran",
    "World War II Panzer captain",
    "y pilot during World War II",
    "officer during World War II",
    "World War II fighter pilot",
    "World War II naval officer",
    "World War II cryptographer",
    "pilot during World War II",
    "World War II glider pilot",
    "Army  during World War II",
    "World War II veteran and",
    "and World War II veteran",
    "and World War II veteran",
    "World War II fighter ace",
    "World War II codebreaker",
    "World War II RAF officer",
    "World War II paratrooper",
    "World War II flying ace",
    "spy during World War II",
    "World War II air gunner",
    "officer in World War II",
    "World War II RAF airman",
    "World War II submariner",
    "World War II Flying ace",
    "World War II navigator",
    "World War II pilot ace",
    "World War II combatant",
    "World War II hero and",
    "and World War II hero",
    "World War II commando",
    "World War II spy and",
    "nun and World War II",
    "World War II veteran",
    "World War II General",
    "World War II officer",
    "World War II aviator",
    "World War II air ace",
    "World War II airman",
    "World War II marine",
    "World War II during",
    "during World War II",
    "World War II pilot",
    "World War II Army",
    "World War II ace",
    "turned informant",
    "World War II",
    "criminalist",  # before crime
]


spiritual = [
    "evangelical preacher and missionary",
    "Catholic bishop and missionary",
    "Catholic missionary and bishop",
    "Jesuit priest and missionary",
    "Mormon leader and missionary",
    "pastor and missionary",
    "priest and missionary",
    "Christian missionary",
    "Catholic missionary",
    "Salesian missionary",
    "catholic missionary",
    "Jesuit missionary",
    "Mormon missionary",
    "missionary and",
    "and missionary",
    "missionary in",
    "missionary",
]
social = []
crime = [
    "criminal and inmate of Alcatraz Penitentiary",
    "member of the Camorra criminal organisation",
    "criminal and twin brother of Reggie Kray",
    "SS officer and war criminal",
    "suspected war criminal",
    "criminal and fugitive",
    "criminal and smuggler",
    "criminal and gangster",
    "alleged war criminal",
    "computer criminal",
    "pardoned criminal",
    "Serb war criminal",
    "serial criminal",
    "cyber criminal",
    "war criminal",
    "and criminal",
    "criminal and",
    "criminalist",
    "criminal",
]
event_record_other = [
    "Siberian gulag survivor",
]
other_species = []

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [415]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
    "crime": crime,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [416]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['num_categories']!=0].sample(2)

CPU times: total: 4min 42s
Wall time: 4min 42s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
94426,4,Mohammed Inuwa Wushishi,", 81, Nigerian general, chief of Army staff .",https://en.wikipedia.org/wiki/Mohammed_Inuwa_Wushishi,8,2021,December,,,,chief of Army staff,,,,,,,,,81.0,,Nigeria,,1981 1983,2.197225,0,0,0,0,0,0,0,1,0,0,0,0,1
4778,29,Robert Levin,", 84, Norwegian classical pianist and composer.",https://en.wikipedia.org/wiki/Robert_Levin_(Norwegian_pianist),13,1996,October,,,,,,,,,,,,,84.0,,Norway,,,2.639057,0,0,0,0,0,1,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [417]:
# Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

There are 18400 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [527]:
# # Obtaining values for column and their counts
# roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [526]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [525]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_2"].notna()].index
#             if "and later a nationalist" in df.loc[index, "info_2"]
#         ],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [523]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_list, key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [522]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "social worker and later a nationalist"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [528]:
# Creating lists for each category
politics_govt_law = [
    "attorney and one of the first female District Judges",
    "attorney and Associate Justice of the Supreme Court",
    "disability rights advocate and attorney",
    "attorney and political campaign manager",
    "attorney who represented James Earl Ray",
    "civil rights attorney and Circuit Judge",
    "attorney and mentor to Erin Brockovich",
    "entertainment attorney and litigator",
    "attorney and death penalty advocate",
    "attorney and civil rights advocate",
    "attorney and intelligence expert",
    "attorney and bonsai enthusiast",
    "attorney and tax policy expert",
    "state legislator and attorney",
    "attorney and defense expert",
    "disability rights attorney",
    "real estate attorney and",
    "attorney and billionaire",
    "First Amendment attorney",
    "civil rights attorney",
    "prosecuting attorney",
    "defense attorney",
    "labor attorney",
    "and attorney",
    "attorney and",
    "attorney",
    "neoconservative advocate and",
    "traditionalist conservative",
    "conservative campaigner",
    "conservative",
    "spokesman in parliament",
    "Politician",
    "State propagandist",
    "Baloch nationalist leader",
    "black nationalist leader",
    "and later a nationalist",
    "and Islamist",
    "Islamist",
    "coupist",
    "far left",
    "neo nazi and",
    "neo nazi",
    "nationalist",
]

arts = [
    "fashion model and beauty queen",
    "beauty queen and st Miss",
    "model and beauty queen",
    "child beauty queen",
    "beauty queen and",
    "and beauty queen",
    "beauty queen",
    "newspaper columnist for the San Francisco Chronicle",
    "newspaper columnist and radio television critic",
    "television personality and magazine columnist",
    "crossword compiler and advice columnist",
    "newspaper correspondent and columnist",
    "columnist and television personality",
    "advice columnist and media celebrity",
    "syndicated gossip columnist for the",
    "newspaper columnist and humorist",
    "Pulitzer Prize winning columnist",
    "columnist and newspaper director",
    "syndicated newspaper columnist",
    "blogger and magazine columnist",
    "editor and humor columnist",
    "and newspaper columnist",
    "entertainment columnist",
    "humorist and columnist",
    "columnist and blogger",
    "newspaper columnist",
    "magazine columnist",
    "tabloid columnist",
    "gossip columnist",
    "advice columnist",
    "humor columnist",
    "and columnist",
    "columnist",
    "record producer and member of the Country Music Hall of Fame",
    "record producer who discovered Sinéad O'Connor and The Pale",
    "record producer and production company executive",
    "Hall of Fame music executive and record producer",
    "record producer and founder of Barclay Records",
    "two time Grammy Award winning record producer",
    "record producer and member of Dschinghis Khan",
    "record producer and record company executive",
    "record company executive and record producer",
    "multi instrumentalist and record producer",
    "jazz music impresario and record producer",
    "radio station owner and record producer",
    "jazz double bassist and record producer",
    "record producer and record label owner",
    "record producer and company executive",
    "Grammy Award winning record producer",
    "record producer and musical producer",
    "jazz trombonist and record producer",
    "defence analyst and record producer",
    "record producer and music executive",
    "record producer and band manager",
    "disc jockey and record producer",
    "bass player and record producer",
    "hip hop record producer and DJ",
    "hip hop record producer and MC",
    "record producer and executive",
    "Hall of Fame record producer",
    "record producer and promoter",
    "record producer and arranger",
    "record producer and lyricist",
    "jazz and R&B record producer",
    "bassist and record producer",
    "record producer and manager",
    "manager and record producer",
    "punk rock record producer",
    "hip hop record producer",
    "DJ and record producer",
    "record producer and DJ",
    "jazz record producer",
    "R&B record producer",
    "and record producer",
    "record producer and",
    "record producer",
    "harmonium and tabla player",
    "and lutenist",
    "lutenist",
]
sports = [
    "long distance swimmer and first woman to swim the Channel in both directions",
    "Olympic silver and bronze medal winning swimmer",
    "competition swimmer and Olympic silver medalist",
    "Olympic gold and silver medal winning swimmer",
    "swimmer and first Olympic swimming medallist",
    "swimmer and first woman to swim the Channel",
    "backstroke swimmer and world record holder",
    "triple gold medal winning Olympic swimmer",
    "Paralympian swimmer and wheelchair racer",
    "backstroke swimmer and Olympic champion",
    "Olympic swimmer and water polo player",
    "water polo player and Olympic swimmer",
    "Olympic silver medal winning swimmer",
    "swimmer and Olympic bronze medalist",
    "Olympic gold medal winning swimmer",
    "long distance swimmer and Olympian",
    "Olympic swimmer and swimming coach",
    "Olympic champion freestyle swimmer",
    "Olympic swimmer and national coach",
    "swimmer and Olympic gold medalist",
    "Olympic silver medalist swimmer",
    "swimmer ad world record holder",
    "Olympic long distance swimmer",
    "swimmer and Olympic champion",
    "swimmer and Olympic medalist",
    "Olympic breaststroke swimmer",
    "Olympic synchronised swimmer",
    "swimmer and swimming coach",
    "Olympic backstroke swimmer",
    "Olympic champion swimmer",
    "long distance swimmer",
    "Hall of Fame swimmer",
    "swimmer and Olympian",
    "Olympic gold swimmer",
    "breaststroke swimmer",
    "competition swimmer",
    "competitive swimmer",
    "paralympian swimmer",
    "Paralympic swimmer",
    "backstroke swimmer",
    "freestyle swimmer",
    "swimmer and coach",
    "butterfly swimmer",
    "swimmer and diver",
    "marathon swimmer",
    "Olympic swimmer",
    "Masters swimmer",
    "era swimmer",
    "swimmer",
]
sciences = [
    "medical practitioner",
]

business_farming = [
    "gambling tycoon",
]
academia_humanities = [
    "grammarian",
    "musicologist and expert on Erik Satie",
    "ethnomusicologist and musicologist",
    "folklorist and enthomusicologist",
    "musicologist and ethnographer",
    "hymnologist and musicologist",
    "musicologist and folklorist",
    "folklorist and musicologist",
    "librarian and musicologist",
    "theorist and musicologist",
    "and ethnomusicologist",
    "ethnomusicologist and",
    "ethnomusicologist",
    "and musicologist",
    "musicologist and",
    "musicologist",
    "Islamicist",  # before spiritual
    "Scholer and",
]
law_enf_military_operator = [
    "militant leader of the Vilayat Dagestan",
    "revolutionary and  militant",
    "militant in al Qaeda wing",
    "independentist militant",
    "militant and bodyguard",
    "independence militant",
    "nationalist militant",
    "Montoneros militant",
    "militant separatist",
    "extremist militant",
    "militant commander",
    "Lehi militant and",
    "Taliban militant",
    "militant leader",
    "ISIS militant",
    "pro militant",
    "and militant",
    "militant",
    "and leader of the Boricua Popular Army",
    "and leader of the Grey Wolves",
]
spiritual = [
    "tarot card reader",
    "Catholic cardinal and former archbishop of Manila",
    "Catholic cardinal and archbishop",
    "Coptic Catholic cardinal",
    "Catholic cardinal",
    "Islamic  religious leader",
    "Islamic spiritual leader",
    "Arabian imam and Islamic",
    "Islamic religious leader",
    "Arabian Islamic cleric",
    "and Islamic preacher",
    "Shi'a Islamic leader",
    "Tatar Islamic cleric",
    "Islamic Sufi leader",
    "Islamic leader and",
    "Islamic preacher",
    "Arabian Islamic",
    "Islamic  cleric",
    "Islamic science",
    "Islamic cleric",
    "Islamic leader",
    "Islamic legal",
    "Islamic",
    "Sunni Muslim  mufti",
    "Sunni Arab cleric",
    "Sunni Muslim",
    "Sunni Islam",
    "Sunni",
]
social = []
crime = [
    "terrorist and bomb maker",
    "stock market fraudster",
    "fraudster and kidnapper",
    "suspected fraudster",
    "game show fraudster",
    "fraudster",
]
event_record_other = [
    "civilian kidnapped and murdered by militants in the West Bank city of Ramallah",  # before law_enfor_military_operator
    "Islamic jihad hostage",  # before spiritual
]
other_species = []

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [529]:
# Combining separate lists into one dictionary
known_for_dict = {
    "event_record_other": event_record_other,
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [530]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['num_categories']!=0].sample(2)

CPU times: total: 2min 26s
Wall time: 2min 27s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
97806,21,Tota Singh,", 81, Indian politician, Punjab MLA , complications from pneumonia.",https://en.wikipedia.org/wiki/Tota_Singh,7,2022,May,",",,,Punjab MLA,complications from pneumonia,,,,,,,,81.0,,India,,"1997 2007, 2012 2017",2.079442,0,0,0,0,0,0,0,0,1,0,0,0,1
20520,4,Earl Krugel,", 62, American JDL activist and convicted criminal, prison assault.",https://en.wikipedia.org/wiki/Earl_Krugel,9,2005,November,,,,prison assault,,,,,,,,,62.0,,United States of America,,,2.302585,0,0,0,0,0,0,0,0,1,1,0,0,2


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [531]:
# Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

There are 17524 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [675]:
# # Obtaining values for column and their counts
# roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [676]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [677]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_2"].notna()].index
#             if "fan" in df.loc[index, "info_2"]
#         ],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [678]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_list, key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [679]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "football fan"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [673]:
# Creating lists for each category
politics_govt_law = [
    "television anchorman and political",
    "political strategist and",
    "political strategist",
]

arts = [
    "involved in Madoff investment scandal",
    "YouTube personality",
    "television  analyst",
    "oenophile",
    "television and radio caster and game show host",
    "television and radio caster",
    "radio caster",
    "music agent",
    "BBC announcer and television personality",
    "announcer for the Memphis Grizzlies",
    "announcer & television  announcer",
    "television announcer and producer",
    "television and radio announcer",
    "radio and television announcer",
    "radio announcer and newsreader",
    "television host and announcer",
    "announcer and commentator",
    "public address announcer",
    "public radio announcer",
    "television announcer",
    "continuity announcer",
    "radio announcer",
    "and announcer",
    "PA announcer",
    "announcer",
    "country disc jockey",
    "announcer and commentator",
    "commentator and television personality",
    "and television commentator",
    "news commentator and game show panelist",
    "television  commentator and presenter",
    "commentator and television presenter",
    "television presenter  commentator",
    "news reporter and commentator",
    "commentator with ITV",
    "commentator in New Orleans",
    "television commentator",
    "media commentator",
    "and commentator",
    "commentator and",
    "TV commentator",
    "commentator",
]
sports = [
    "professional football and member of the Pro Football Hall of Fame",
    "football manager with West Ham United F C and Ipswich Town F C",
    "former association football and rugby union international",
    "former association football and rugby union international",
    "youth team player with football team Queens Park Rangers",
    "football linebacker in the National Football League",
    "rugby league and rugby union international player",
    "Olympic bronze medal winning football goalkeeper",
    "football placekicker for University of Southern",
    "World Cup winning football manager and player",
    "president of the national football federation",
    "football running back for the Denver Broncos",
    "former first baseman for the Minnesota Twins",
    "University of Michigan football head coach",
    "Hall of Fame boxing trainer and HBO boxing",
    "Hall of Fame boxing trainer and HBO boxing",
    "former junior featherwight boxing champion",
    "football manager of Stockport County and",
    "Hall of Fame football referee and player",
    "Hall of Fame professional wrestling ring",
    "football and hurling manager and player",
    "chairman of Bradford City football club",
    "world boxing champion and hall of famer",
    "Olympic and World Cup football referee",
    "national football team assistant coach",
    "former coach of national football team",
    "snooker player and billiard hall owner",
    "professional in cricket and football",
    "football halfback and defensive back",
    "for the Lakers basketball team since",
    "former world champion snooker player",
    "college football and baseball coach",
    "football offensive lineman in the s",
    "football administrator and manager",
    "football manager and administrator",
    "Hall of Fame women tennis promoter",
    "former boxing heavyweight champion",
    "professional football placekicker",
    "baseball team owner and executive",
    "world heavyweight boxing champion",
    "World Lightweight boxing champion",
    "Lightweight world boxing champion",
    "world lightweight boxing champion",
    "Hall of Fame football team owner",
    "professional football goalkeeper",
    "Hall of Fame football team owner",
    "football goalkeeper and manager",
    "footwear and football executive",
    "basketball and football referee",
    "and professional wrestling ring",
    "college soccer and tennis coach",
    "international football manager",
    "football referee and executive",
    "Hall of Fame football referee",
    "football midfielder and coach",
    "rugby union and league player",
    "rugby union captain and coach",
    "Hall of Fame tennis executive",
    "boxing promoter and bookmaker",
    "football and wrestling coach",
    "football executive and coach",
    "football forward and manager",
    "football manager of Barnsley",
    "snooker and billiards player",
    "bantamweight boxing champion",
    "middleweight boxing champion",
    "football manager and player",
    "football and baseball coach",
    "college football head coach",
    "openly gay football referee",
    "professional wrestling ring",
    "table tennis world champion",
    "boxing manager and promoter",
    "Hall of Fame boxing trainer",
    "football club chairman and",
    "and football administrator",
    "football offensive lineman",
    "indoor football team owner",
    "indoor football team owner",
    "and rower and rowing coach",
    "boxing trainer and manager",
    "boxing manager and trainer",
    "football and bandy player",
    "football offensive tackle",
    "rugby union administrator",
    "arena football executive",
    "football assistant coach",
    "land speed record holder",
    "snooker player and coach",
    "tennis and hockey player",
    "boxing troupe impresario",
    "football club owner and",
    "football team owner and",
    "football and basketball",
    "gridiron footballplayer",
    "baseball team owner and",
    "football team owner and",
    "for the Minnesota Twins",
    "motorsport professional",
    "collegiate tennis coach",
    "Olympic boxing champion",
    "and football executive",
    "and football executive",
    "football administrator",
    "football club chairman",
    "AFL football executive",
    "football executive and",
    "football international",
    "former cricket captain",
    "executive of football",
    "rules football legend",
    "football running back",
    "football place kicker",
    "rugby union executive",
    "motorsport team owner",
    "world boxing champion",
    "Olympic rowing coach",
    "motorsport executive",
    "boxing administrator",
    "football goalkeeper",
    "football linebacker",
    "football team owner",
    "football club owner",
    "football head coach",
    "football midfielder",
    "baseball team owner",
    "football team owner",
    "rugby union referee",
    "motorsport promoter",
    "football executive",
    "player of football",
    "rugby union winger",
    "football official",
    "football chairman",
    "football director",
    "rugby union coach",
    "and tennis umpire",
    "college football",
    "football fan and",
    "football manager",
    "football referee",
    "football striker",
    "football forward",
    "rowing coach and",
    "rugby union lock",
    "boxing executive",
    "boxing cornerman",
    "boxing executive",
    "football mascot",
    "football umpire",
    "football safety",
    "cricket captain",
    "snooker referee",
    "rugby union and",
    "tennis champion",
    "boxing promoter",
    "boxing champion",
    "boxing official",
    "football agent",
    "footballplayer",
    "snooker player",
    "boxing trainer",
    "boxing referee",
    "boxing manager",
    "tennis umpire",
    "football and",
    "football fan",
    "play by play",
    "rowing coach",
    "tennis coach",
    "table tennis",
    "tennis agent",
    "boxing coach",
    "rugby union",
    "race track",
    "motorsport",
    "football",
    "MLB Mets",
    "snooker",
    "tennis",
    "boxing",
    "ESPN",
]
sciences = []

business_farming = [
    "public utility executive",
    "founder of Avis Rent a Car System and real estate developer",
    "real estate developer and father of Donald Trump",
    "gaming executive and real estate developer",
    "real estate developer regarded as the",
    "financier and real estate developer",
    "industrial real estate developer",
    "real estate developer and",
    "real estate developer",
    "oil tycoon",
    "management accountant",
    "bathroom furnishings"
    "investor and industrialist merged Studebaker and Worthington Corporation into Studebaker Worthington",
    "investor and water boiler manufacturer",
    "real estate investor and developer",
    "real estate executive and investor",
    "investor and hedge fund manager",
    "investor and stock trader",
    "leveraged buyout investor",
    "venture capital investor",
    "investor and executive",
    "retailer and investor",
    "real estate investor",
    "billionaire investor",
    "hedge fund investor",
    "property investor",
    "Arabian investor",
    "holding investor",
    "stock investor",
    "value investor",
    "investor and",
    "investor",
]
academia_humanities = [
    "stamp collector",
]
law_enf_military_operator = []
spiritual = [
    "bible",
]
social = []
crime = []
event_record_other = [
    "football pools winner",  # before sports
]
other_species = []

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [674]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [680]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['num_categories']!=0].sample(2)

CPU times: total: 2min 31s
Wall time: 2min 31s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
4551,14,Rose Ouellette,", 93, Quebec actress, comedian, and theatre manager.",https://en.wikipedia.org/wiki/Rose_Ouellette,9,1996,September,,,,comedian,and theatre manager,,,,,,,,93.0,,Canada,,,2.302585,0,0,0,0,0,1,0,0,0,0,0,0,1
38198,19,Richard T. Drinnon,", 87, American historian.",https://en.wikipedia.org/wiki/Richard_T._Drinnon,3,2012,April,,,,,,,,,,,,,87.0,,United States of America,,,1.386294,0,0,0,1,0,0,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [681]:
# Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

There are 16902 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [796]:
# # Obtaining values for column and their counts
# roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [795]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [794]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_2"].notna()].index
#             if "Senior Olympian" in df.loc[index, "info_2"]
#         ],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [793]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_list, key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [792]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "toy and board game inventor"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [791]:
# Creating lists for each category
politics_govt_law = [
    "Princess of and Grand Duchess of",  # before sports
    "climate change denier",
    "Grand Duchess",
    "legal  Minister of Justice",
    "legal  SEC commissioner",
    "barrister and legal",
    "legal reformer",
    "legal adviser",
    "legal expert",
    "and legal",
    "legal",
    "intellectual freedom advocate",
    "member of the Vichy government",
    "Labour MP for Holborn and St Pancras South and opposition spokesman in the House of Lords",
    "Privy Council member and former minister and MP",
    "MP for Southend West and government minister",
    "MP and Speaker of the House of Commons",
    "Ulster Unionist MP for North Belfast",
    "Labour MP for Crewe and Nantwich",
    "Conservative MP for Petersfield",
    "MP and government minister",
    "Labour MP for Ealing",
    "National Alliance MP",
    "MP for Ottawa East",
    "MP for Langbaurgh",
    "MP for Broadgreen",
    "Conservative MP",
    "MP and minister",
    "independent MP",
    "Bloc MP",
    "and MP",
    "MP",
]

arts = [
    "classical violin prodigy and theremin performer",
    "violin virtuoso",
    "violin luthier",
    "violin maker",
    "violinist",
    "violin",
    "biographer of Laurel and Hardy",
    "magazine editor and biographer",
    "music producer and biographer",
    "music industry biographer",
    "humourist and biographer",
    "critic and biographer",
    "literary biographer",
    "and biographer",
    "biographer and",
    "biographer",
    "horticulturalist and Disney landscape designer",
    "horticulturalist and television host",
    "horticulturalist",
    "and first woman weathercaster in the",
    "caster for the Philadelphia Flyers",
    "language radio show host caster",
    "television and radio newscaster",
    "pioneering language newscaster",
    "radio and television caster",
    "caster and disc jockey",
    "and surf forecaster",
    "weather forecaster",
    "television caster",
    "caster and golf",
    "newscaster",
    "caster",
    "at WTSP Tampa Bay",
    "television weatherman",
    "weatherman",
    "co inventor of Trivial Pursuit",  # before sciences
    "toy and board game inventor",
    "inventor of the Barbie doll",
    "licensing agent and inventor of the action figure",
    "inventor of the smiley",
    "board game inventor",
    "boardgame inventor",
]
sports = [
    "chess international master and correspondence grandmaster",
    "correspondence chess grandmaster and chess theoretician",
    "chess Woman Grandmaster and International Arbiter",
    "International Grandmaster of chess",
    "correspondence chess grandmaster",
    "chess master and chess champion",
    "Hall of Fame chess grandmaster",
    "and chess Woman Grandmaster",
    "chess player and grandmaster",
    "chess grandmaster living in",
    "chess International Master",
    "Grandmaster chess player",
    "chess master and trainer",
    "chess player and trainer",
    "chess player and coach",
    "chess variant inventor",  # before sciences
    "era chess grandmaster",
    "grandmaster of chess",
    "chess grandmaster",
    "chess Grandmaster",
    "chess FIDE Master",
    "chess problemist",
    "chess player and",
    "and chess player",
    "chess organizer",
    "chess champion",
    "chess player",
    "chess master",
    "chessplayer",
    "mountaineer and explorer",
    "explorer and mountaineer",
    "and marine explorer",
    "underwater explorer",
    "Antarctic explorer",
    "and polar explorer",
    "Arctic explorer",
    "polar explorer",
    "explorer and",
    "and explorer",
    "explorer",
    "professional Go player",
    "dan Go player",
    "Go player",
    "tornado chaser",
    "Senior Olympian",
]
sciences = [
    "geologist at Columbia University and expert on climate change",
    "geologist and planetary science pioneer",
    "geologist and science administrator",
    "geologist and palaeontologist",
    "palaeontologist and geologist",
    "geologist and geomorphologist",
    "geologist and polar explorer",
    "fossil hunter and geologist",
    "volcanologist and geologist",
    "geologist and seismologist",
    "marine geologist and",
    "structural geologist",
    "planetary geologist",
    "geologist and paleo",
    "marine geologist",
    "hydrogeologist",
    "astrogeologist",
    "and geologist",
    "geologist and",
    "geologist",
    "statistician and demographer",
    "demographer and",
    "demographer",
    "logician in the analytic tradition",
    "mathematical logician",
    "logician",
    "metallurgist and crystallographer",
    "metallurgist and",
    "metallurgist",
    "meteorologist and weather",
    "meteorologist and inventor",
    "meteorologist and",
    "and meteorologist",
    "meteorologist",
    "inventor of the disposable hypodermic syringe and the tranquilizer gun",
    "inventor and a pioneer in mobile wireless communication",
    "inventor of disposable plastic endotracheal tube",
    "inventor of the acoustic suspension loudspeaker",
    "inventor of the battery powered smoke detector",
    "inventor of intermittent windshield wipers",
    "inventor of the Nautilus exercise machines",
    "medical device inventor and billionaire",
    "co inventor of the catalytic converter",
    "inventor of the modern hot air balloon",
    "electronic music inventor and pioneer",
    "inventor of the flight data recorder",
    "co inventor of the TV remote control",
    "inventor who invented the lava lamp",
    "inventor and technology executive",
    "co inventor of the nicotine patch",
    "aviation safety device inventor",
    "inventor of disposable nappies",
    "inventor of crash test dummies",
    "inventor and computer pioneer",
    "otolaryngologist and inventor",
    "aviation pioneer and inventor",
    "inventor of narrative therapy",
    "inventor of phototypesetting",
    "visual effects inventor and",
    "meteorologist and inventor",
    "inventor and cardiologist",
    "radiologist and inventor",
    "inventor of gaffer tape",
    "inventor of bubble gum",
    "director and inventor",
    "aeronautical inventor",
    "firearm inventor and",
    "doctor and inventor",
    "sailboard inventor",
    "pacemaker inventor",
    "and inventor",
    "inventor and",
    "inventor",
]

business_farming = [
    "mineral dealer",
    "oil executive and property developer",
    "Arabian oil executive",
    "oil executive",
    "industrialist merged Studebaker and Worthington Corporation into Studebaker Worthington",
    "electronics industrialist and co founder of Sony",
    "wife of industrialist Charles W Engelhard Jr",
    "industrialist and chief executive of Leyland",
    "industrialist and creator of Pernod Ricard",
    "industrialist and co founder of Bic",
    "financier and industrialist",
    "shipowner and industrialist",
    "billionaire industrialist",
    "electronics industrialist",
    "car industrialist",
    "and industrialist",
    "industrialist and",
    "industrialist",
    "inventor of Nissin instant ramen noodles including the Cup Noodle",  # before sciences
    "inventor of the Chipwich ice cream sandwich",
    "inventor of Rice a Roni",
    "inventor of SpaghettiOs",
]
academia_humanities = [
    "lexicographer and philologist",
    "etymologist and philologist",
    "philologist of language",
    "classical philologist",
    "philologist",
    "cultural geographer",
    "geographer and",
    "geographer",
    "ethnologist and museum curator",
    "ethnologist and epigrapher",
    "ethnologist and",
    "and ethnologist",
    "ethnologist",
    "map librarian at the Public Library and the Library of Congress",
    "librarian and library science pioneer",
    "cataloging theorist and librarian",
    "librarian and book collector",
    "librarian and archivist",
    "Papua New librarian and",
    "librarian and curator",
    "children librarian",
    "picture librarian",
    "medical librarian",
    "music librarian",
    "librarian and",
    "librarian",
]
law_enf_military_operator = [
    "nazi cryptographer",
    "test pilot noted for his work with Spitfire and Lancaster aircraft",  # before arts
]
spiritual = []
social = []
crime = [
    "illegal lottery operator",  # before politics_govt_law
]
event_record_other = []
other_species = []

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [797]:
# Combining separate lists into one dictionary
known_for_dict = {
    "crime": crime,
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
    "sciences": sciences,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [798]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['num_categories']!=0].sample(2)

CPU times: total: 2min 27s
Wall time: 2min 28s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
7901,17,Sir James Lighthill,", 74, British mathematician.",https://en.wikipedia.org/wiki/James_Lighthill,17,1998,July,,,,,,,,,,,,,74.0,,United Kingdom of Great Britain and Northern Ireland,,,2.890372,1,0,0,0,0,0,0,0,0,0,0,0,1
10043,28,Ralph Crosthwaite,", 63, American basketball player.",https://en.wikipedia.org/wiki/Ralph_Crosthwaite,6,1999,October,,,,,,,,,,,,,63.0,,United States of America,,,1.94591,0,0,0,0,0,0,1,0,0,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [799]:
# Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

There are 16027 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [862]:
# # Obtaining values for column and their counts
# roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [860]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [861]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_2"].notna()].index
#             if "political" in df.loc[index, "info_2"]
#         ],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [859]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_list, key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [858]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "political critic"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [855]:
# Creating lists for each category
politics_govt_law = [
    "political campaign director for Barack Obama in Nevada",
    "widow of religious political leader Ruhollah Khomeini",
    "labor  of political science at the University of",
    "environmentalist and political aide",
    "political adviser to Lyndon Johnson",
    "dissident and political prisoner",
    "Pulitzer Prize winning political",
    "political analyst and strategist",
    "political and trade union leader",
    "economic and political analyst",
    "political adviser and lobbyist",
    "social and political theorist",
    "political and constitutional",
    "serial political candidate",
    "local government executive",
    "communist political leader",
    "liberal feminist political",
    "political campaign staffer",
    "Communist political leader",
    "and political fundraiser",
    "and political campaigner",
    "political secretary and",
    "communist and political",
    "and political reformer",
    "post Marxist political",
    "and political prisoner",
    "neo fascist political",
    "political advisor and",
    "and political advisor",
    "political  strategist",
    "of political science",
    "political leader and",
    "political fundraiser",
    "marja' and political",
    "political campaigner",
    "and political leader",
    "political researcher",
    "political matriarch",
    "political financier",
    "political commissar",
    "political executive",
    "political operative",
    "political candidate",
    "political organiser",
    "moral and political",
    "political celebrity",
    "political dissident",
    "political prankster",
    "political organizer",
    "political  feminist",
    "political lobbyist",
    "political prisoner",
    "Comanche political",
    "political theorist",
    "political adviser",
    "political refugee",
    "political analyst",
    "political science",
    "Marxist political",
    "political advisor",
    "political critic",
    "political leader",
    "political exile",
    "political aide",
    "and political",
    "political and",
    "geopolitical",
    "political",
]

arts = [
    "television presenter and media relations executive",
    "ventriloquist and children television presenter",
    "television presenter and internet personality",
    "television presenter and internet celebrity",
    "disc jockey and radio television presenter",
    "radio disc jockey and television presenter",
    "pioneer gardening television presenter",
    "disc jockey and television presenter",
    "television presenter and entertainer",
    "steeplejack and television presenter",
    "television presenter and radio host",
    "anchorman and television presenter",
    "television presenter and director",
    "television presenter and producer",
    "radio DJ and television presenter",
    "and children television presenter",
    "radio and television presenter",
    "children television presenter",
    "and television presenter",
    "television presenter and",
    "television presenter",
    "rose grower",  # before sports
]
sports = [
    "mountaineer and the first person to reach summit of Mount Everest",
    "mountaineer and Seven Summits record holder",
    "ski executive and mountaineer",
    "rock climber and mountaineer",
    "mountaineer and alpinist",
    "climber and mountaineer",
    "mountaineer and guide",
    "sherpa mountaineer",
    "mountaineer legend",
    "ski mountaineer",
    "and mountaineer",
    "mountaineer and",
    "mountaineer",
    "rower and second oldest national Olympic competitor",
    "Olympic javelin thrower and world record holder",
    "Olympic bronze medal winning discus thrower",
    "Olympic discus thrower and handball player",
    "javelin thrower and Olympic gold medalist",
    "rower who competed in the Summer Olympics",
    "Olympic silver medal winning rower",
    "hammer thrower and weight thrower",
    "Olympic hammer and discus thrower",
    "Olympic gold medal winning rower",
    "Olympic champion javelin thrower",
    "four time Olympic hammer thrower",
    "shot putter and javelin thrower",
    "shot putter and discus thrower",
    "Olympic silver medallist rower",
    "Hall of Fame javelin thrower",
    "javelin thrower and olympian",
    "hammer thrower and Olympian",
    "discus thrower and Olympian",
    "ocean rower and adventurer",
    "rower and Olympic champion",
    "Olympic javelin thrower",
    "rower and fold medalist",
    "Olympic discus thrower",
    "Olympic hammer thrower",
    "Olympic champion rower",
    "rower and Olympian",
    "Olympic rower and",
    "competition rower",
    "Olympic rower and",
    "sailor and rower",
    "javelin thrower",
    "hammer thrower",
    "discus thrower",
    "tobacco grower",
    "Olympic rower",
    "rower and",
    "rower",
    "sprinter and Olympic bronze medalist",
    "sprinter and middle distance runner",
    "Olympic sprinter and long jumper",
    "Olympic medal winning sprinter",
    "Paralympic champion sprinter",
    "Olympic sprinter and hurdler",
    "Olympic champion sprinter",
    "sprinter and long jumper",
    "sprinter and Olympian",
    "Hall of Fame sprinter",
    "sprinter and hurdler",
    "Olympic sprinter",
    "sprinter and",
    "sprinter",
]
sciences = [
    'ophthalmologist known as "the father of retinal surgery" and',
    "surgical ophthalmologist",
    "ophthalmologist and",
    "and ophthalmologist",
    "ophthalmologist",
]

business_farming = [
    "restaurateur credited with inventing sisig",
    "restaurateur and resort executive",
    "barbecue restaurateur",
    "restaurateur and",
    "restaurateur",
]
academia_humanities = []
law_enf_military_operator = []
spiritual = [
    "self styled spiritual medium and",
    "spiritual medium",
    "psychic medium",
    "medium",
]
social = []
crime = [
    "Hundreds accused him of sexual abuse the year after his death",
]
event_record_other = []
other_species = []

<IPython.core.display.Javascript object>

In [856]:
# # Example code to quickly sort list in correct descending length search order to copy to dictionary
# temp = sorted(list(set(politics_govt_law)), key=lambda x: len(x), reverse=True)
# temp

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [857]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [863]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['num_categories']!=0].sample(2)

CPU times: total: 1min 39s
Wall time: 1min 39s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
52265,3,M. Stanton Evans,", 80, American journalist, author and educator, pancreatic cancer.",https://en.wikipedia.org/wiki/M._Stanton_Evans,45,2015,March,,,,author and educator,pancreatic cancer,,,,,,,,80.0,,United States of America,,,3.828641,0,0,0,0,0,1,0,0,0,0,0,0,1
87166,28,Paul J. Crutzen,", 87, Dutch atmospheric chemist, Nobel laureate .",https://en.wikipedia.org/wiki/Paul_J._Crutzen,43,2021,January,,,,Nobel laureate,,,,,,,,,87.0,,Netherlands,,1995.0,3.78419,1,0,0,0,0,0,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [864]:
# Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

There are 15338 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [1008]:
# # Obtaining values for column and their counts
# roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [1007]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [1006]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_2"].notna()].index
#             if "Unabomber victim" in df.loc[index, "info_2"]
#         ],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [1005]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_list, key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [1004]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "geneticist and theistic evolutionist"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [1002]:
# Creating lists for each category
politics_govt_law = [
    "peace protester",
    "peacemaker",
    "dissident and prisoner of conscience",
    "FARC dissident and",
    "dissident republican",
    "confined dissident",
    "student dissident",
    "and dissident",
    "dissident and",
    "dissident",
    "tourism official",
    "public policy adviser",
    "public policy",
    "economics expert",
    "feminist and human rights campaigner",
    "feminist and civil rights advocate",
    "government researcher and feminist",
    "labor leader and feminist",
    "communist and feminist",
    "lesbian feminist",
    "Chicana feminist",
    "feminist and sex",
    "radical feminist",
    "pioneer feminist",
    "anarcho feminist",
    "ecofeminist",
    "ant feminist",
    "feminist and",
    "and feminist",
    "feminist",
    "aide de camp to Queen Elizabeth II",
    "nuclear power advocate",
    "Chief of Protocol of the",
]

arts = [
    "principal dancer with the Alvin Ailey Dance Theater and on Broadway",
    "dancer and reality television contestant",
    "acrobatic dancer and producer impresario",
    "dancer and widow of Yehudi Menuhin",
    "Emmy Award winning jazz dancer",
    "dancer and Tony Award winning",
    "contact improvisation dancer",
    "dancer and television host",
    "dancer and talent manager",
    "expressionist dancer and",
    "modern dance  and dancer",
    "dancer and entertainer",
    "Polynesian dancer and",
    "vaudeville tap dancer",
    "television dancer and",
    "Bharatanatyam dancer",
    "flamenco dancer and",
    "ballroom dancer and",
    "trainer and dancer",
    "transgender dancer",
    "Cotton Club dancer",
    "modern dancer and",
    "classical dancer",
    "Kathakali dancer",
    "burlesque dancer",
    "lindy hop dancer",
    "tango dancer and",
    "Kuchipudi dancer",
    "principal dancer",
    "ballroom dancer",
    "flamenco dancer",
    "flatfoot dancer",
    "Balinese dancer",
    "oriental dancer",
    "tap dancer and",
    "topless dancer",
    "ballet  dancer",
    "Kathak dancer",
    "modern dancer",
    "exotic dancer",
    "Odissi dancer",
    "butoh dancer",
    "belly dancer",
    "hula dancer",
    "folk dancer",
    "dancer and",
    "and dancer",
    "tap dancer",
    "dancer and",
    "dancer",
    "and public relations executive",
    "public relations executive",
    "show business publicist and alleged",  # before business
    "cosmetologist",
    "former head of the Horticultural Society",
    "road manager for Outkast",
    "calligrapher",
]
sports = [
    "ice dancer",  # before arts
    "Olympic Greco wrestler and candidate for National Olympic Committee president",
    "World Championship Wrestling wrestler and",
    "former World Wrestling Entertainment wrestler",
    "wrestler who competed in the Summer Olympics",
    "EstonianGreco wrestler and Olympic medalist",
    "Olympic silver medal winning wrestler",
    "Olympic gold medal winning wrestler",
    "Olympic medalist freestyle wrestler",
    "pro wrestler and wrestling manager",
    "wrestler and Olympic gold medalist",
    "freestyle wrestler of Avar descent",
    "Olympic silver medallist wrestler",
    "Olympic silver medalist wrestler",
    "light heavyweight Greco wrestler",
    "sumo wrestler and record holder",
    "heavyweight freestyle wrestler",
    "featherweight Greco wrestler",
    "wrestler and wrestling coach",
    "four time Olympic wrestler",
    "Olympic champion wrestler",
    "sumo wrestler and coach",
    "NFL player and wrestler",
    "Olympic Greco wrestler",
    "wrestler and wrestling",
    "wrestler and Olympian",
    "Hall of Fame wrestler",
    "professional\xa0wrestler",
    "wrestler and trainer",
    "traditional wrestler",
    "heavyweight wrestler",
    "lucha libre wrestler",
    "wrestler and judoka",
    "freestyle wrestler",
    "wrestler known as",
    "Olympic wrestler",
    "midget wrestler",
    "female wrestler",
    "Greco wrestler",
    "sambo wrestler",
    "sumo wrestler",
    "pro wrestler",
    "WWF wrestler",
    "wrestler and",
    "wrestler",
    "yachtsman and Olympic gold medal winner",
    "yachtsman and Olympian",
    "Olympic yachtsman",
    "and yachtsman",
    "yachtsman and",
    "yachtsman",
    "soccer coach and first head coach of the women national team",
    "soccer player with the Minnesota Kicks and Kaizer Chiefs",
    "assistant soccer coach of the men national team",
    "Hall of Fame soccer player and manager",
    "CEO of soccer club Shelbourne F C",
    "soccer coach and World Cup winner",
    "Hall of Fame soccer executive",
    "Hall of Fame soccer manager",
    "Hall of Fame soccer player",
    "soccer and lacrosse player",
    "soccer player and manager",
    "soccer player and coach",
    "soccer coach and player",
    "Olympic soccer player",
    "soccer administrator",
    "college soccer coach",
    "soccer goalkeeper",
    "soccer executive",
    "soccer trainer",
    "soccer player",
    "soccer coach",
    "soccer",
]
sciences = [
    "statistician and medical researcher",
    "statistician and econometrician",
    "statistician and geneticist",
    "mathematical statistician",
    "social statistician",
    "biostatistician",
    "statistician and",
    "statistician",
    "mathematical ecologist and population geneticist",
    "ornithologist and behavioral geneticist",
    "geneticist and DNA research pioneer",
    "medical researcher and geneticist",
    "agronomist and plant geneticist",
    "geneticist and plant breeder",
    "geneticist and agriculturist",
    "grape breeder and geneticist",
    "geneticist and wheat breeder",
    "hematologist and geneticist",
    "immunologist and geneticist",
    "pediatrician and geneticist",
    "geneticist and pathologist",
    "geneticist and virologist",
    "geneticist and agronomist",
    "evolutionary geneticist",
    "population geneticist",
    "behaviour geneticist",
    "behavior geneticist",
    "plant geneticist",
    "immunogeneticist",
    "cytogeneticist",
    "geneticist and",
    "geneticist",
]

business_farming = [
    "marketing executive at Pepsi who shunned racial stereotypes in advertising",
    "business consultancy and public relations executive",
    "catering and business support executive",
    "advertising executive and retailer",
    "business consultant and management",
    "Hall of Fame advertising executive",
    "business manager and accountant",
    "businesswoman and richest woman",
    "small businesswoman and farmer",
    "brewing advertising executive",
    "businesswoman and stockbroker",
    "businesswoman and billionaire",
    "glovemaker and businesswoman",
    "founder heli skiing business",
    "heiress and businesswoman",
    "realtor and businesswoman",
    "advertising executive and",
    "advertising professional",
    "and advertising creative",
    "and business consultant",
    "cosmetics businesswoman",
    "railroad businesswoman",
    "Jewish businessperson",
    "advertising executive",
    "business oligarch and",
    "and business tycoon",
    "advertising pioneer",
    "business consultant",
    "and business leader",
    "advertising tycoon",
    "and businessperson",
    "business agent and",
    "businessperson and",
    "businesswoman and",
    "and businesswoman",
    "business oligarch",
    "business magnate",
    "business analyst",
    "business pioneer",
    "business tycoon",
    "advertising and",
    "businessperson",
    "business owner",
    "businesswoman",
    "business man",
    "business and",
    "advertising",
    "business",
]


academia_humanities = [
    "crossword compiler",
    "Tatar folklorist",
    "and folklorist",
    "folklorist",
    "social ethicist and",
    "social ethicist",
    "and bioethicist",
    "bioethicist",
    "ethicist",
    "sectologist",
    "theorist and researcher",
]
law_enf_military_operator = [
    "naval intelligence officer and commando",
    "naval seaman and",
    "and naval officer",
    "naval officer and",
    "naval commander",
    "naval machinist",
    "naval officer",
    "naval aviator",
    "naval pioneer",
    "and naval",
    "naval",
]
spiritual = [
    "priest of the Church of and later of the Antiochian Orthodox Church",
    "Episcopal priest and founder of Interfaith Center of",
    "Jesuit priest in the Society of Jesus",
    "Episcopalian priest and church",
    "priest of the Catholic Church",
    "Jesuit priest and liturgist",
    "Lutheran priest and healer",
    "priest and Catholic Bishop",
    "American]] Jesuit priest",
    "Catholic Jesuit priest",
    "and Church of priest",
    "Episcopalian priest",
    "Catholic CMI priest",
    "transgender priest",
    "Evangelical priest",
    "Franciscan priest",
    "Jesuit priest and",
    "priest and rector",
    "Episcopal priest",
    "Santerían priest",
    "Church of priest",
    "Orthodox priest",
    "Lutheran priest",
    "catholic priest",
    "priest and monk",
    "Sōtō Zen priest",
    "Jesuit priest",
    "Wiccan priest",
    "Taoist priest",
    "Marist priest",
    "priest and",
    "and priest",
    "archpriest",
    "priest",
    "thealogian",
    "charismatic religious leader",
    "Protestant religious leader",
    "New Age religious leader",
    "Hindu religious leader",
    "Ibadi religious leader",
    "Dalit religious leader",
    "Jain religious leader",
    "Shia religious leader",
    "religious leader and",
    "religious leader",
    "theistic evolutionist",
]
social = []
crime = [
    "pedophile",
    "and mob associate",
    "mob associate",
    "murderer and drug trafficker",
    "drug trafficker and smuggler",
    "drug trafficker",
]
event_record_other = [
    "concentration camp survivor",
    "murder victim who was shot to death in by former acquaintance and gangster Tan Chor Jin",
    "and murder victim whose killing was documented in the movie: Dear Zachary",
    "teenager and murder victim",
    "girl and murder victim",
    "rape and murder victim",
    "teenage murder victim",
    "torture murder victim",
    "child murder victim",
    "and murder victim",
    "murder victim",
    "supercentenarian and the oldest person ever documented in history",
    "supercentenarian who was oldest person",
    "claimant supercentenarian",
    "and supercentenarian",
    "supercentenarian and",
    "supercentenarian",
    "Unabomber victim",
]
other_species = []

<IPython.core.display.Javascript object>

In [961]:
# # Example code to quickly sort list in correct descending length search order to copy to dictionary
# temp = sorted(list(set(business_farming)), key=lambda x: len(x), reverse=True)
# temp

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [1003]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "sports": sports,
    "arts": arts,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [1009]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['num_categories']!=0].sample(2)

CPU times: total: 3min 12s
Wall time: 3min 13s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
24548,15,Frank Morgan,", 73, American saxophonist.",https://en.wikipedia.org/wiki/Frank_Morgan_(musician),25,2007,December,,,,,,,,,,,,,73.0,,United States of America,,,3.258097,0,0,0,0,0,1,0,0,0,0,0,0,1
85163,9,Bruno Barbey,", 79, Moroccan-born French photographer.",https://en.wikipedia.org/wiki/Bruno_Barbey,5,2020,November,,,,,,,,,,,,,79.0,,Morocco,France,,1.791759,0,0,0,0,0,1,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [1010]:
# Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

There are 14210 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [1115]:
# # Obtaining values for column and their counts
# roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [1116]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [1113]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_2"].notna()].index
#             if "energy and basketball" in df.loc[index, "info_2"]
#         ],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [1112]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_list, key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [1108]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "executive with"]

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
23610,29,Raymond E. Douglas,", 58, American executive with , who helped add color to its pages, pulmonary embolism.",https://en.wikipedia.org/wiki/Raymond_E._Douglas,3,2007,June,,,executive with,who helped add color to its pages,pulmonary embolism,,,,,,,,58.0,,United States of America,,,1.386294,0,0,0,0,0,0,0,0,0,0,0,0,0


<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [1127]:
# Creating lists for each category
politics_govt_law = [
    "former executive secretary of the Student Nonviolent Coordinating Committee",  # before business_farming
    "member of the Legislative Council and the PLO executive committee",
    "chief executive of Docklands Development Corporation",
    "Blackfeet tribal executive and social advocate",
    "and medical research executive",
    "involved in Profumo affair",
    "and women health advocate",
    "Chippewa tribal executive",
    "Puyallup tribal executive",
    "labor union executive",
    "women health advocate",
    "government executive",
    "and cabinet minister",
    "law firm executive",
    "union executive",
    "turned opponent",
    "refugee",
]


arts = [
    "road manager for The Beatles and executive of Apple Corps",
    "model and sister of supermodel Angela Lindvall",
    "cable network executive and talent agent",
    "record company executive and producer",
    "broadcasting and live event executive",
    "radio and cable television executive",
    "music executive and television host",
    "glamour model and interior designer",
    "magazine editor and media executive",
    "model and beauty pageant contestant",
    "music executive and talent manager",
    "model and reality television star",
    "talent agent and theatre producer",
    "television executive and producer",
    "model and social media celebrity",
    "model and television personality",
    "entertainment industry executive",
    "animation producer and executive",
    "disc jockey and record executive",
    "music critic and radio executive",
    "first woman television executive",
    "model and beauty pageant winner",
    "music executive and club owner",
    "television and radio executive",
    "newspaper editor and executive",  # before business_farming
    "adult entertainment executive",
    "radio executive and presenter",
    "public broadcasting executive",
    "television executive producer",
    "newspaper executive and press",
    "music promoter and executive",
    "network television executive",
    "performance rights executive",
    "model and Playboy centerfold",
    "radio broadcasting executive",
    "recording industry executive",
    "bandleader and talent agent",
    "impresario and talent agent",
    "and broadcasting executive",
    "cable television executive",
    "photographic fashion model",
    "radio production executive",
    "model and Playboy Playmate",
    "book publishing executive",
    "television news executive",
    "magazine executive editor",
    "movie marketing executive",
    "talent agent and producer",
    "record industry executive",
    "record company executive",
    "graphic design executive",
    "Motion picture executive",
    "talent agent and manager",
    "music industry executive",
    "fashion editor and model",
    "model and pageant winner",
    "newspaper executive and",
    "entertainment executive",
    "country music executive",
    "T V and radio executive",
    "movie theater executive",
    "movie studio executive",
    "model agency executive",
    "record label executive",
    "model for Calvin Klein",
    "broadcasting executive",
    "music label executive",
    "television executive",
    "publishing executive",
    "newspaper executive",
    "music executive and",
    "DC Comics executive",
    "broadcast executive",
    "cultural executive",
    "male fashion model",
    "magazine executive",
    "model and showgirl",
    "executive producer",
    "theater executive",
    "model and blogger",
    "railway modeller",
    "studio executive",
    "Motown executive",
    "executive at MGM",
    "record executive",
    "music  executive",
    "radio executive",
    "media executive",
    "music executive",
    "press executive",
    "movie executive",
    "executive with",
    "news executive",
    "anorexic model",
    "model of the s",
    "TV  and model",
    "smoking model",
    "glamour model",
    "fashion model",
    "TV executive",
    "talent agent",
    "male model",
    "supermodel",
    "model and",
    "and model",
    "modeler",
    "model",
]


sports = [
    "executive for the Football League BC Lions",  # before business_farming
    "National Basketball Association executive",
    "deep sea diver and marine treasure hunter",
    "baseball executive and spokesperson",
    "Olympic bronze medal winning diver",
    "sport executive and administrator",
    "wrestling executive and promoter",
    "Olympic gold medal winning diver",
    "fitness trainer and bodybuilder",
    "minor league baseball executive",
    "bodybuilder and fitness pioneer",
    "skydiver and skysurfing pioneer",
    "world record setting free diver",
    "ice hockey coach and executive",
    "ice hockey executive and coach",
    "collegiate athletic executive",
    "Olympic silver medalist diver",
    "Olympic archer and executive",
    "bodybuilder and weightlifter",
    "diver and Olympic medallist",
    "automobile racing executive",
    "powerlifter and bodybuilder",
    "polo player and executive",
    "professional skateboarder",
    "professional bodybuilder",
    "skydiver and base jumper",
    "recreational scuba diver",
    "CFL coach and executive",
    "motor racing executive",
    "energy and basketball",
    "basketball executive",
    "ice hockey executive",
    "and underwater diver",
    "racetrack operator",
    "sporting executive",
    "football executive",
    "diver and Olympian",
    "Hall of Fame diver",
    "horse stable owner",
    "baseball executive",
    "Baseball executive",
    "hockey team owner",
    "and fighting bull",
    "Olympic diver and",
    "gymnast and diver",
    "sports executive",
    "deep water diver",
    "underwater diver",
    "skateboarder and",
    "pro skateboarder",
    "rugby executive",
    "and scuba diver",
    "sport executive",
    "Olympic diver[]",
    "bodybuilder and",
    "fighting bull",
    "Olympic diver",
    "skateboarder",
    "scuba diver",
    "bullfighter",
    "bodybuilder",
    "free diver",
    "cave diver",
    "skydiver",
    "diver",
]


sciences = [
    "founding executive director and president of the Aquarium",  # before business_farming  # before academia_humanities
    "rocketry pioneer and NASA executive",
    "electrician and model rocket expert",  # before arts
    "executive at SRI International",
    "nurse and healthcare executive",
    "midwife and nurse",
    "nurse and midwife",
    "midwife and",
    "midwife",
]

business_farming = [
    "financial executive ; first woman member of the Stock Exchange",
    "Royal Bank of executive questioned over Enron collapse",
    "marketing executive credited with developing Doritos",
    "communications executive and property developer",
    "billionaire real estate and finance executive",
    "automotive retail and electronics executive",
    "real estate and financial sector executive",
    "Arabian conglomerate and banking executive",
    "transportation and agricultural executive",
    "billionaire aerospace defense executive",
    "pharmaceutical and technology executive",
    "retail furniture home design executive",
    "securities and healthcare executive",
    "automobile manufacturing executive",
    "billionaire construction executive",
    "typographer and software executive",
    "automotive manufacturing executive",
    "financier and investment executive",
    "billionaire health care executive",
    "information technology executive",
    "food manufacturing executive and",
    "investment management executive",
    "managerial consulting executive",
    "lumber manufacturing executive",
    "consumer electronics executive",
    "liquor executive and distiller",
    "software and banking executive",
    "automobile industry executive",
    "marketing and sales executive",
    "billionaire grocery executive",
    "restaurant industry executive",
    "container transport executive",
    "timeshare exchange executive",
    "telecommunications executive",
    "transportation executive and",
    "railway worker and executive",
    "building materials executive",
    "medical technology executive",
    "food and beverage executive",
    "beverage and food executive",
    "building material executive",
    "steel and airline executive",
    "teaching hospital executive",
    "college athletics executive",
    "consumer products executive",
    "commodity trading executive",
    "insurance company executive",
    "and broadcasting executive",
    "natural resource executive",
    "investment chief executive",
    "aircraft leasing executive",
    "home furnishing executive",
    "oil and transit executive",
    "bank and mining executive",
    "food processing executive",
    "diamond and gem executive",
    "real estate executive and",
    "and mail order executive",
    "civil aviation executive",
    "transportation executive",
    "communications executive",
    "pharmaceutical executive",
    "farm equipment executive",
    "accountant and executive",
    "office supply executive",
    "financier and executive",
    "manufacturing executive",
    "biotechnology executive",
    "chief executive officer",
    "airline chief executive",
    "and insurance executive",
    "travel agency executive",
    "hair products executive",
    "food industry executive",
    "pest control executive",
    "conglomerate executive",
    "construction executive",
    "web services executive",
    "metallurgic executive",
    "real estate executive",
    "oil company executive",
    "aeronautics executive",
    "electronics executive",
    "tobacco executive and",
    "electronic executive",
    "investment executive",
    "technology executive",
    "industrial executive",
    "automotive executive",
    "e commerce executive",
    "automobile executive",
    "video game executive",
    "theme park executive",
    "petroleum executive",
    "corporate executive",
    "aerospace executive",
    "fast food executive",
    "insurance executive",
    "transport executive",
    "cosmetics executive",
    "furniture executive",
    "brokerage executive",
    "financial executive",
    "marketing executive",
    "utilities executive",
    "gambling executive",
    "railroad executive",
    "shipping executive",
    "trucking executive",
    "aircraft executive",
    "catering executive",
    "aviation executive",
    "computer executive",
    "beverage executive",
    "finance executive",
    "airline executive",
    "telecom executive",
    "brewing executive",
    "Arabian executive",
    "company executive",
    "textile executive",
    "fashion executive",
    "banking executive",
    "energy executive",
    "lumber executive",
    "retail executive",
    "coffee executive",
    "realty executive",
    "health executive",
    "mining executive",
    "casino executive",
    "gaming executive",
    "travel executive",
    "dairy executive",
    "chief executive",
    "glass executive",
    "steel executive",
    "spice executive",
    "sugar executive",
    "hotel executive",
    "executive coach",
    "audio executive",
    "cattle rancher",
    "rail executive",
    "bank executive",
    "coal executive",
    "food executive",
    "taxi executive",
    "RCA executive",
    "toy executive",
    "and rancher",
    "rancher and",
    "executive",
    "rancher",
]

academia_humanities = [
    "chief executive of the Museum of Te Papa Tongarewa",  # before business_farming
    "chief executive of the Philadelphia Museum of Art",
    "university executive",
    "executive director",
    "museum executive",
]
law_enf_military_operator = [
    "first black Navy diver",  # before sports
    "Air Force scuba diver",
    "turned bounty hunter",
    "prison executive",  # before business_farming
    "sea captain",
]
spiritual = [
    "Pentecostal preacher and televangelist",
    "Pentecostal televangelist",
    "televangelist",
]
social = [
    "executive director of Casa Alianza",  # before business_farming  # before academia_humanities
    "charity executive and campaigner",
    "rescue foundation executive",
    "non profit executive",
    "charity executive",
]
crime = []
event_record_other = []
other_species = []

<IPython.core.display.Javascript object>

In [1111]:
# Hard-coding info_2 for entries with non-specific 'executive' value if not business_farming
index = df[df["link"] == "https://en.wikipedia.org/wiki/Ned_Tanen"].index
df.loc[index, "info_2"] = "media executive"

index = df[df["link"] == "https://en.wikipedia.org/wiki/Robert_Pauley"].index
df.loc[index, "info_2"] = "media executive"

index = df[df["link"] == "https://en.wikipedia.org/wiki/Douglas_B._Leeds"].index
df.loc[index, "info_2"] = "media executive"

index = df[df["link"] == "https://en.wikipedia.org/wiki/Robert_W._Edgar"].index
df.loc[index, "info_2"] = ""  # already in politics_govt_law

index = df[
    df["link"] == "https://en.wikipedia.org/wiki/Jo%C3%A3o_N%C3%ADlson_Zunino"
].index
df.loc[index, "info_2"] = "football executive"  # football executive added to dict

index = df[df["link"] == "https://en.wikipedia.org/wiki/Victor_Watson"].index
df.loc[index, "info_2"] = "publishing executive"

index = df[
    df["link"] == "https://en.wikipedia.org/wiki/Matthew_Young_(civil_servant)"
].index
df.loc[index, "info_2"] = "publishing executive"

index = df[df["link"] == "https://en.wikipedia.org/wiki/Steve_Beck_(chairman)"].index
df.loc[index, "info_2"] = "football executive"

index = df[df["link"] == "https://en.wikipedia.org/wiki/Beppo_Mauhart"].index
df.loc[index, "info_2"] = "football executive executive"

index = df[df["link"] == "https://en.wikipedia.org/wiki/Janet_Lewis-Jones"].index
df.loc[index, "info_2"] = "media executive"

index = df[df["link"] == "https://en.wikipedia.org/wiki/Avie_Bennett"].index
df.loc[index, "info_2"] = "publishing executive"

index = df[df["link"] == "https://en.wikipedia.org/wiki/Andrew_Paulson"].index
df.loc[index, "info_2"] = "media executive"

index = df[df["link"] == "https://en.wikipedia.org/wiki/A._Daniel_O%27Neal"].index
df.loc[
    index, "info_2"
] = "government executive executive"  # government executive added to dict

index = df[df["link"] == "https://en.wikipedia.org/wiki/Derek_Keys"].index
df.loc[index, "info_2"] = "government executive executive"

index = df[
    df["link"] == "https://en.wikipedia.org/wiki/Thomas_J._Moran_(businessman)"
].index
df.loc[
    index, "info_2"
] = "university executive executive"  # university executive added to dict

index = df[df["link"] == "https://en.wikipedia.org/wiki/Kalevi_Tuominen"].index
df.loc[index, "info_2"] = "sports executive executive"  # sports executive added to dict

index = df[df["link"] == "https://en.wikipedia.org/wiki/Pierre_Viot"].index
df.loc[
    index, "info_2"
] = "government executive cultural executive"  # cultural executive added to dict

<IPython.core.display.Javascript object>

In [1128]:
# # Example code to quickly sort list in correct descending length search order to copy to dictionary
# temp = sorted(list(set(politics_govt_law)), key=lambda x: len(x), reverse=True)
# temp

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [1129]:
# Combining separate lists into one dictionary
known_for_dict = {
    "sciences": sciences,
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
    "business_farming": business_farming,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [1130]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['num_categories']!=0].sample(2)

CPU times: total: 3min 34s
Wall time: 3min 34s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
32542,20,Jennifer Rardin,", 45, American author, known for the Jaz Parks series of fantasy novels.",https://en.wikipedia.org/wiki/Jennifer_Rardin,16,2010,September,,,,known for the Jaz Parks series of fantasy novels,,,,,,,,,45.0,,United States of America,,,2.833213,0,0,0,0,0,1,0,0,0,0,0,0,1
51959,8,Sir David Watson,", 65, British academic and educationalist .",https://en.wikipedia.org/wiki/David_Watson_(academic),14,2015,February,University of Oxford,,,,,,,,,,,,65.0,,United Kingdom of Great Britain and Northern Ireland,,University of Oxford,2.70805,0,0,0,1,0,0,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [1131]:
# Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

There are 13383 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [1230]:
# # Obtaining values for column and their counts
# roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [1229]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [1228]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_2"].notna()].index
#             if "murderer" in df.loc[index, "info_2"]
#         ],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [1227]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_list, key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [1226]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "suspected murderer and victim"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [1223]:
# Creating lists for each category
politics_govt_law = [
    "environmentalist and co founder of Greenpeace International",
    "trade union leader and human rights campaigner",
    "of the Court of Appeals for the First Circuit",
    "environmentalist and senior Cabinet adviser",
    "social reformer and environmentalist",
    "communist and trade union leader",
    "and environmental economics",
    "and environment campaigner",
    "and environmental pioneer",
    "environmental campaigner",
    "and environmental leader",
    "labor union leader and",
    "and trade union leader",
    "and environmentalist",
    "environmentalist and",
    "trade union leader",
    "labor union leader",
    "environmentalist",
    "environmental",
]
arts = [
    "jazz harmonica player often credited as the world first",
    "Grammy Award winning jazz fusion keyboardist",
    "jazz tenor saxophone and clarinet player",
    "Latin jazz percussionist and bandleader",
    "crossover jazz and bossa nova flutist",
    "jazz trumpeter and flugelhorn player",
    "rock and jazz bassoonist and oboist",
    "speaking Creole Zydeco accordionist",
    "swing jazz bandleader and trumpeter",
    "jazz vibraphone and marimba player",
    "graphic and environmental designer",  # before politics_govt_law
    "jazz trumpet player and bandleader",
    "jazz clarinet and saxophone player",
    "jazz trumpeter and big band leader",
    "jazz trumpeter and flugelhornist",
    "jazz percussionist and vocalist",
    "Southern jazz radio disc jockey",
    "jazz clarinetist and bandleader",
    "jazz trombonist and band leader",
    "swing and bebop jazz trumpeter",
    "tenor saxophone jazz performer",
    "jazz trumpeter and chairman of",
    "jazz baritone saxophone player",
    "jazz bandleader and trombonist",
    "jazz trumpeter and band leader",
    "jazz trumpeter and bandleader",
    "jazz bassist and sousaphonist",
    "jazz trombonist and arranger",
    "vocalist and jazz bandleader",
    "jazz trumpeter and cornetist",
    "jazz arranger and bandleader",
    "jazz trombonist and vocalist",
    "big band era jazz trumpeter",
    "jazz bassist and bandleader",
    "accordionist and bandleader",
    "jazz multi instrumentalist",
    "swing music jazz trumpeter",
    "New Orleans jazz trumpeter",
    "modern jazz double bassist",
    "West coast jazz trumpeter",
    "Latin jazz percussionist",
    "jazz and blues organist",
    "jazz and blues vocalist",
    "jazz saxophone player",
    "jazz valve trombonist",
    "jazz clarinet player",
    "concert accordionist",
    "jazz nightclub owner",
    "jazz upright bassist",
    "accordion player and",
    "jazz big band leader",
    "disc jockey and jazz",
    "jazz fusion bassist",
    "jazz bagpipe player",
    "jazz double bassist",
    "jazz trumpet player",
    "creole accordionist",
    "jazz percussionist",
    "free jazz bassist",
    "jazz clarinettist",
    "jazz vibraphonist",
    "jazz accordionist",
    "jazz tuba player",
    "jazz clarinetist",
    "jazz keyboardist",
    "accordion player",
    "jazz club owner",
    "jazz bandleader",
    "jazz trombonist",
    "jazz trumpeter",
    "jazz cornetist",
    "jazz presenter",
    "jazz promoter",
    "jazz organist",
    "jazz vocalist",
    "jazz arranger",
    "jazz producer",
    "jazz flautist",
    "accordionist",
    "jazz hornist",
    "jazz bassist",
    "jazz flutist",
    "jazz pioneer",
    "jazz critic",
    "jazz expert",
    "jazz music",
    "jazz dance",
    "jazz  DJ",
    "jazz and",
    "jazz",
]


sports = [
    "Olympic bronze medalist in fencing and oldest living former Olympian",
    "ice hockey goaltending coach for Winter Olympics gold medal team",
    "captain of the Nordic skiing team at the Winter Olympics",
    "long distance runner who competed in the Summer Olympics",
    "Olympic trap shooter and Commonwealth Games champion",
    "women volleyball team captain and Olympic competitor",
    "Olympic bronze and silver medallist handball player",
    "Hall of Fame volleyball player and Olympic coach",
    "silver medalist in decathlon at Summer Olympics",
    "Olympics and Commonwealth Games athletics coach",
    "Olympic silver and bronze medalist weightlifter",
    "× m relay gold medallist at the Summer Olympics",
    "olympian and president of the Olympic Committee",
    "Olympic gold medal winning field hockey player",
    "cross country skier and Olympic Games champion",
    "Olympic gold medal winning cross country skier",
    "Olympic gold and silver medal winning gymnast",
    "hurdler and the nation first Olympic champion",
    "Olympic bronze medal winning alpine ski racer",
    "Olympic bronze medallist field hockey player",
    "Olympic silver medal winning handball player",
    "Olympic weightlifter and world record holder",
    "Olympic and professional road bicycle racer",
    "water polo player and Olympic gold medalist",
    "Olympic silver medal winning triple jumper",
    "Olympic silver medal winning sprint canoer",
    "Olympic bronze medal winning steeplechaser",
    "figure skater and Olympic silver medallist",
    "Olympic silver medal winning figure skater",
    "Olympic speed skating competitor and coach",
    "race walker and Olympic gold medal winner",
    "Olympic silver medal winning weightlifter",
    "Olympic silver and bronze medalist fencer",
    "Olympic silver medal winning speed skater",
    "Olympic medal winning cross country skier",
    "world champion and Olympic figure skater",
    "bronze medal winning Olympic high jumper",
    "m relay gold medalist at Summer Olympics",
    "weightlifter and Olympic silver medalist",
    "Olympic bronze medallist slalom canoeist",
    "Olympic silver medal winning race walker",
    "paraplegic archer and Olympic competitor",
    "Olympic gold medal winning weightlifter",
    "gymnast and three time Olympic medalist",
    "Olympic bronze medallist* sprint canoer",
    "figure skater and Olympic gold medalist",
    "Olympic middle and long distance runner",
    "Olympic shot putter and athletics coach",
    "Olympic silver medallist in ice hockey",
    "Olympic and World Cup alpine ski racer",
    "International Olympic Committee member",
    "gymnast and World and Olympic Champion",
    "Olympic runner and world record holder",
    "West long distance runner and Olympian",
    "Olympic gold medal winning foil fencer",
    "Olympic gold medal winning snowboarder",
    "shot put champion at the and Olympics",
    "coach of Olympic track and field team",
    "world champion and Olympic bobsledder",
    "Olympic skeleton racer and bobsledder",
    "Olympic water polo champion and coach",
    "sprint canoeist and Olympic champion",
    "Olympic champion field hockey player",
    "Olympic silver medal winning gymnast",
    "Olympic champion cross country skier",
    "triple Olympic champion speed skater",
    "Olympic weightlifter and shot putter",
    "Olympic silver medal winning hurdler",
    "first time track and field Olympian",
    "Olympic silver medal winning fencer",
    "Olympic volleyball player and coach",
    "Olympic silver medal winning sailor",
    "Olympic bronze medalist in shot put",
    "Olympic Hall of Fame swimming coach",
    "triple jumper and Olympic champion",
    "Olympic gold medal skiing champion",
    "Olympic gold medal winning gymnast",
    "decathlon Olympic bronze medallist",
    "Olympic handball player and coach",
    "pole vaulter and Olympic champion",
    "gymnast and Olympic gold medalist",
    "alpine skier and mountain climber",
    "weightlifter and Olympic champion",
    "Olympic gold medal winning fencer",
    "sabre fencer and Olympic champion",
    "East Olympic champion long jumper",
    "Olympic gold medal winning sailor",
    "long distance runner and Olympian",
    "Olympic silver medallist gymnast",
    "and Olympic long distance runner",
    "Olympic sailor and administrator",
    "Olympic long distance runner and",
    "Olympian Olympedia Nick Mohammed",
    "Olympic gymnast who won one gold",
    "Olympic fencer and fencing coach",
    "field hockey player and Olympian",
    "East Olympic committee president",
    "Olympic High Jump gold medalist",
    "Olympic bronze medallist runner",
    "Olympic canoeing gold medallist",
    "figure skater and Olympic coach",
    "runner and Olympic long jumper",
    "Olympic taekwondo practitioner",
    "Olympic silver medalist fencer",
    "Olympic silver medalist sailor",
    "Olympic middle distance runner",
    "four time Olympic speed skater",
    "Olympic fencer and illustrator",
    "Olympic champion weightlifter",
    "alpine ski racer and Olympian",
    "Olympic sailor and IOC member",
    "Olympic track and field coach",
    "Olympic shot putter and coach",
    "Olympic long distance runner",
    "Olympic medal winning fencer",
    "West bobsledder and Olympian",
    "gymnast and Olympic Champion",
    "Olympic silver medal winning",
    "Olympic Association official",
    "and horse rider and Olympian",
    "Olympic bobsledder and luger",
    "Olympic champion shot putter",
    "gymnast and Olympic champion",
    "Olympic bronze medal winning",
    "West ski jumper and Olympian",
    "Olympic bobsledder and coach",
    "Olympic field hockey player",
    "Olympic champion in fencing",
    "Olympic basketball official",
    "fencer and Olympic champion",
    "Olympic cross country skier",
    "sailor and Olympic champion",
    "runner and Olympic champion",
    "figure skater and Olympian",
    "bicycle racer and Olympian",
    "Olympic gold medalist in m",
    "Olympic gold medal hurdler",
    "Olympic show jumping rider",
    "skier and Olympic champion",
    "Olympic gold medal winning",
    "Olympic wrestling champion",
    "Olympic hurdler and coach",
    "Olympic volleyball player",
    "and Olympic sport shooter",
    "pole vaulter and Olympian",
    "long jumper and Olympian",
    "Olympic rhythmic gymnast",
    "Olympic skater and coach",
    "Olympic rowing medallist",
    "Olympic volleyball coach",
    "shot putter and Olympian",
    "Olympic alpine skier and",
    "Olympic sailing champion",
    "épée fencer and Olympian",
    "and Olympic sharpshooter",
    "Olympic slalom canoeist",
    "Olympic hurdle medalist",
    "Olympic distance runner",
    "racewalker and Olympian",
    "Olympic champion sailor",
    "Olympic handball player",
    "Olympic champion fencer",
    "Paralympic alpine skier",
    "Olympic bronze medalist",
    "Olympic pistol shooter",
    "and Olympic ski jumper",
    "Olympic biathlon skier",
    "Olympic mountain biker",
    "Olympic swimming coach",
    "Gwich'in Olympic skier",
    "Olympic sport shooter",
    "Olympic skeet shooter",
    "Olympic silver fencer",
    "Nordic combined skier",
    "Olympic figure skater",
    "Olympic steeplechaser",
    "Olympic sprint canoer",
    "Olympic triple jumper",
    "Olympic hockey player",
    "Olympic bicycle racer",
    "Olympic trap shooter",
    "gymnast and Olympian",
    "retired alpine skier",
    "Olympic weightlifter",
    "Olympic pole vaulter",
    "Olympic sharpshooter",
    "Olympic speed skater",
    "sailor and Olympiian",
    "Olympic Nordic skier",
    "Olympic alpine skier",
    "Olympic high jumper",
    "Olympic show jumper",
    "cross country skier",
    "sailor and Olympian",
    "Olympic race walker",
    "and Olympic shooter",
    "Olympic snowboarder",
    "Olympic shot putter",
    "Olympic speedskater",
    "Olympic long jumper",
    "runner and Olympian",
    "Olympian windsurfer",
    "Olympic pair skater",
    "Olympic handballer",
    "cross county skier",
    "Olympic sailor and",
    "Olympic racewalker",
    "Olympic ice skater",
    "Olympic ski jumper",
    "Olympic bobsledder",
    "Olympic windsurfer",
    "Olympic fencer and",
    "professional skier",
    "Olympic clergyman",
    "alpine free skier",
    "para alpine skier",
    "Olympic canoeist",
    "and alpine skier",
    "free style skier",
    "Olympic official",
    "Olympic horseman",
    "Olympic shooter",
    "Olympic gymnast",
    "newschool skier",
    "freestyle skier",
    "Olympic hurdler",
    "Olympic canoer",
    "champion skier",
    "Olympic runner",
    "Olympic skater",
    "Olympic judoka",
    "Olympic sailor",
    "Olympic fencer",
    "Olympic archer",
    "Olympic skier",
    "extreme skier",
    "Olympic luger",
    "alpine skier",
    "Olympian []",
    "water skier",
    "Olympian[]",
    "Olympian",
    "Olympic",
    "skier",
]


sciences = [
    "environmental toxicologist and ornithologist",  # before politics_govt_law
    "aircraft designer and aviation pioneer",  # before law_enf_military_operator
    "medicine pioneer",
]

business_farming = [
    "Rarámuri farmer",
]
academia_humanities = []
law_enf_military_operator = [
    "the first female aviator and the first female combat pilot of the world",
    "aviator and test pilot who twice held the world flight altitude record",
    "security guard wrongly accused of the Atlanta Olympics bombing",
    "aviator who set several aviation records in his teens",
    "RAF officer and Distinguished Flying Cross recipient",
    "Navy aviator and War prisoner of war escapee",
    "aviator who set several world speed records",
    "first female aviator in the Forest Service",
    "Army Major General and division commander",
    "Marine Corps officer and Naval aviator",
    "spymaster and director of the Mossad",
    "military aviator anf fighter ace",
    "Naval and commercial aviator",
    "aviator and airplane racer",
    "RAF officer and flying ace",
    "aviator and record holder",
    "aviator and test pilot",
    "Army Major General and",
    "military patrol skier",
    "pilot and RAF officer",
    "Hall of Fame aviator",
    "Marine Corps aviator",
    "Army Major General",
    "WWII RAF officer",
    "military aviator",
    "aviation pioneer",
    "aviation trainer",
    "aviation officer",
    "pioneer aviator",
    "RAF officer and",
    "and spymaster",
    "KGB spymaster",
    "WRAF officer",
    "WWII aviator",
    "RAF officer",
    "aviator and",
    "spymaster",
    "aviatrix",
    "aviation",
    "pentito",
    "aviator",
]


spiritual = [
    "mahant",
]
social = [
    "air safety consultant",
]
crime = [
    "mobster and member of the Kansas City crime family",
    "mobster and member of the Patriarca crime family",
    "mobster and member of the Bonanno crime family",
    "mobster and member of the Gambino crime family",
    "mobster and boss of the Lucchese crime family",
    "mobster and Lucchese crime family associate",
    "spree killer and murderer of Gianni Versace",
    "mobster and member of the Chicago Outfit",
    "and suspected murderer of Olof Palme",
    "Chicago Outfit and Las Vegas mobster",
    "and father murderer of Marvin Gaye",
    "mobster with the Chicago Outfit",
    "triple murderer and rapist",
    "fugitive and mass murderer",
    "murderer and sex offender",
    "murderer and spree killer",
    "mobster and FBI informant",
    "jewel thief and murderer",
    "organized crime mobster",
    "suspected murderer and",
    "murderer and kidnapper",
    "mobster and bookmaker",
    "gangster and murderer",
    "murderer and convict",
    "rapist and murderer",
    "murderer and robber",
    "murderer and rapist",
    "suspected murderer",
    "vigilante murderer",
    "mass murderer and",
    "double murderer",
    "child murderer",
    "a and mobster",
    "mass murderer",
    "and murderer",
    "mobster and",
    "and mobster",
    "murderer",
    "mobster",
]


event_record_other = [
    "survivor of the Munich massacre",
]
other_species = [
    "Olympic champion dressage horse",  # before sports
    "Olympic eventing horse",
]

<IPython.core.display.Javascript object>

In [1224]:
# # Example code to quickly sort list in correct descending length search order to copy to dictionary
# temp = sorted(list(set(crime)), key=lambda x: len(x), reverse=True)
# temp

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [1225]:
# Combining separate lists into one dictionary
known_for_dict = {
    "other_species": other_species,
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "arts": arts,
    "sports": sports,
    "politics_govt_law": politics_govt_law,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [1231]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['num_categories']!=0].sample(2)

CPU times: total: 4min 26s
Wall time: 4min 27s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
82993,18,Paul Kiener,", 74, American director and cinematographer.",https://en.wikipedia.org/wiki/Paul_Kiener,6,2020,July,,,,,,,,,,,,,74.0,,United States of America,,,1.94591,0,0,0,0,0,1,0,0,0,0,0,0,1
40224,11,Frank Alamo,", 70, French singer, amyotrophic lateral sclerosis.",https://en.wikipedia.org/wiki/Frank_Alamo,3,2012,October,,,,amyotrophic lateral sclerosis,,,,,,,,,70.0,,France,,,1.386294,0,0,0,0,0,1,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [1232]:
# Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

There are 12064 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [1315]:
# # Obtaining values for column and their counts
# roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [1314]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [1313]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_2"].notna()].index
#             if "skeptic" in df.loc[index, "info_2"]
#         ],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [1312]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_list, key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [1311]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "IAS officer"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [1308]:
# Creating lists for each category
politics_govt_law = [
    "advisor to President Franklin Delano Roosevelt and defector",
    "civil rights advocate and widow of Malcolm X",
    "who was Commissar of all concentration camps",
    "critic of President Ramzan Kadyrov",  # before arts
    "aide de camp to King George VI",
    "Administrative Service officer",
    "and critic of Vladimir Putin",
    "Engineering Service officer",
    "LGBT civil rights advocate",
    "and civil rights advocate",
    "civil rights advocate and",
    "press officer during War",
    "Lord Lieutenant of Moray",
    "public affairs officer",  # before law_enf_military_operator
    "civil rights advocate",
    "later East y official",
    "government officer",
    "cabinet member",
    "social critic",
    "and defector",
    "defector and",
    "IAS officer",
    "defector",
]


arts = [
    "rock critic who worked for Dolls and who signed the Mercury Records while working for",
    "critic and winner of the Pulitzer Prize",
    "longtime theater and cinema critic for",
    "theater critic and television host",
    "magician and television performer",
    "literary and cultural critic",
    "theater critic and editor of",
    "newspaper editor and critic",
    "Boston based theater critic",
    "magazine editor and critic",
    "magician and ventriloquist",
    "designer and dance critic",
    "media and cultural critic",
    "theatre critic and editor",
    "escapologist and magician",
    "magician and entertainer",
    "theatre and dance critic",
    "media critic and blogger",
    "music and theatre critic",
    "classical music critic",
    "critic of literature",
    "theatre critic for",
    "rock music critic",
    "restaurant critic",
    "music critic and",
    "and food critic",
    "cultural critic",
    "stage magician",
    "fashion critic",
    "theatre critic",
    "press officer",
    "gaming critic",
    "ballet critic",
    "card magician",
    "music critic",
    "and magician",
    "dance critic",
    "opera critic",
    "manga critic",
    "movie critic",
    "magician and",
    "media critic",
    "wine critic",
    "food critic",
    "and critic",
    "magician",
    "critic",
]

sports = [
    "and former President of the ICSD",
]
sciences = [
    "critic of alternative medicine",
    "cybersecurity innovator",
    "video game developer",
    "bioweaponeer",
    "skeptic and",
    "skeptic",
]


business_farming = []
academia_humanities = [
    "salvage expert",
    "Urdu",
]
law_enf_military_operator = [
    "Army Air Forces officer and recipient of the Medal of Honor",
    "Directorate of Operations officer for the CIA stationed in",
    "officer in the Army and recipient of the Victoria Cross",
    "Royal Naval officer and recipient of the George Cross",
    "Naval officer and recipient of the Medal of Honor",
    "Marine Corps officer and Medal of Honor recipient",
    "officer and international relations specialist",
    "Air Forces officer during the Second World War",
    "officer in the Coast and Geodetic Survey Corps",
    "air force officer and Medal of Honor recipient",
    "Navy officer and Medal of Honor recipient",
    "highly decorated Navy submarine officer",
    "Navy officer during the Missile Crisis",
    "officer in the Army of the Republic of",
    'CIA officer known as "the Blond Ghost"',
    "freedom fighter and military commander",
    "Royal Air Force officer and flying ace",
    "Special Operations Executive officer",
    "air force officer and fighter pilot",
    "officer during the Second World War",
    "Army military intelligence officer",
    "CIA and Naval Intelligence officer",
    "security and intelligence officer",
    "military and intelligence officer",
    "and officer and Hero of the Union",
    "senior officer in the Royal Navy",
    "and former Marine Corps officer",
    "operations officer in the C I A",
    "officer of the Pontifical Guard",
    "military  intelligence officer",
    "military intelligence officer",
    "officer in the Wehrmacht of y",
    "spy and intelligence officer",
    "Gandhian and freedom fighter",
    "officer and National Hero of",
    "Royal Observer Corps officer",
    "counter intelligence officer",
    "infantry officer in the Army",
    "Air Forces pilot and officer",
    "Navy officer and flying ace",
    "Merchant Marine officer and",
    "SS non commissioned officer",
    "counterintelligence officer",
    "submariner and Navy officer",
    "chief intelligence officer",
    "army and air force officer",
    "navy officer and informant",
    "Air Force officer and spy",
    "officer with Easy Company",
    "army intelligence officer",
    "officer in the Royal Navy",
    "military warrant officer",
    "Army Nurse Corps officer",
    "Women Army Corps officer",
    "non commissioned officer",
    "and intelligence officer",
    "intelligence officer and",
    "Americane Marine officer",
    "spy and freedom fighter",
    "Army Air Forces officer",
    "noncommissioned officer",
    "law enforcement officer",
    "foreign service officer",
    "MI intelligence officer",
    "secret service officer",
    "officer in People Army",
    "Army Air Corps officer",
    "Special Forces officer",
    "and a freedom fighter",
    "Army infantry officer",
    "seaman and a officer",
    "intelligence officer",
    "Armed Forces officer",
    "correctional officer",
    "Marine Corps officer",
    "Navy U boat officer",
    "officer in the Navy",
    "officer during WWII",
    "y Wehrmacht officer",
    "and freedom fighter",
    "coast guard officer",
    "officer in the Army",
    "freedom fighter and",
    "Coast Guard officer",
    "Royal Navy officer",
    "government officer",
    "scientific officer",
    "resistance officer",
    "Waffen SS officer",
    "Navy flag officer",
    "CIA field officer",
    "Air Force officer",
    "air force officer",
    "Naval officer and",
    "nazi officer and",
    "colonial officer",
    "commando officer",
    "CIA case officer",
    "security officer",
    "freedom fighter",
    "liaison officer",
    "KGB officer and",
    "customs officer",
    "and KGB officer",
    "officer of arms",
    "prison officer",
    "officer of the",
    "Forces officer",
    "SS officer and",
    "officer in the",
    "Naval officer",
    "nazi officer",
    "Navy officer",
    "navy officer",
    "KGB officer",
    "air officer",
    "officer and",
    "SOE officer",
    "CIA officer",
    "and officer",
    "MI officer",
    "SS officer",
    "officer",
]


spiritual = [
    "Catholic archbishop and the acting head of the UGCC",
    "born Catholic bishop and Nobel Peace Prize nominee",
    "primate and Archbishop of the Anglican Church of",
    "bishop of the Episcopal Diocese of Massachusetts",
    "first bishop of the Catholic Diocese of Alleppey",
    "Jewish Catholic Archbishop Emeritus of Paris",
    "Christian preacher and Pentecostal bishop",
    "Catholic auxiliary bishop of Grand Rapids",
    "Episcopal bishop of Dallas and Fort Worth",
    "Anglican and st Archbishop of Canterbury",
    "Catholic bishop and prelate of Opus Dei",
    "bishop of the Episcopal Diocese of West",
    "Melkite Catholic Archbishop of Baalbek",
    "Catholic archbishop of Ribeirão Preto",
    "bishop of the United Methodist Church",
    "Syro Malabar archbishop and cardinal",
    "former Catholic Archbishop of Vienna",
    "primate and Archbishop of Cape Town",
    "catholic prelate and Archbishop of",
    "Catholic Archbishop of Birmingham",
    "former inter and Church of bishop",
    "Archbishop of the Orthodox Church",
    "bishop of Church of God in Christ",
    "Catholic Archbishop of Kingston",
    "Catholic archbishop of Verapoly",
    "Archbishop Emeritus of Karachi",
    "bishop of the Methodist Church",
    "bishop of the Episcopal Church",
    "oldest Catholic bishop in the",
    "bishop in the Catholic Church",
    "Catholic sedevacantist bishop",
    "bishop of the Catholic Church",
    "Catholic clergyman and bishop",
    "Anglican bishop of Willochra",
    "Chaldean Catholic archbishop",
    "retired Anglican archbishop",
    "Catholic bishop of Pueblo",
    "Archbishop of Canterbury",
    "Catholic bishop of Macau",
    "Episcopal Church bishop",
    "Catholic titular bishop",
    "bishop in the Church of",
    "ian Catholic bishop of",
    "archbishop of Papeete",
    "First Catholic bishop",
    "fifth Catholic bishop",
    "Archbishop of Palermo",
    "Archbishop of Utrecht",
    "Archbishop of Mariana",
    "Orthodox Archbishop",
    "Catholic Archbishop",
    "Catholic bishop and",
    "Anglican archbishop",
    "suffragan bishop in",
    "Catholic archbishop",
    "Orthodox archbishop",
    "bishop of Owando",
    "Methodist bishop",
    "Episcopal bishop",
    "Catholic bishop",
    "Lutheran bishop",
    "Orthodox bishop",
    "Anglican bishop",
    "textbook critic",
    "catholic bishop",
    "Archbishop of",
    "Archbishop",
    "bishop and",
    "archbishop",
    "bishop",
]


social = []
crime = [
    "commander of the Treblinka extermination camp",
]
event_record_other = [
    "woman murdered by a Minneapolis Police officer",  # before law_enf_military_operator
]
other_species = []

<IPython.core.display.Javascript object>

In [1309]:
# # Example code to quickly sort list in correct descending length search order to copy to dictionary
# temp = sorted(list(set(spiritual)), key=lambda x: len(x), reverse=True)
# temp

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [1310]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
    "law_enf_military_operator": law_enf_military_operator,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [None]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['num_categories']!=0].sample(2)

#### Checking the Number of Rows without a First Category

In [None]:
# Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

In [1233]:
print("dunzo!")

# Sound notification when cell executes
chime.success()

dunzo!


<IPython.core.display.Javascript object>

#### Finding `known_for` Roles in `info_2`

In [None]:
# Obtaining values for column and their counts
roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

In [None]:
# Code to check each value
roles_list.pop()

In [None]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_2"].notna()].index
#             if "general" in df.loc[index, "info_2"]
#         ],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

In [None]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_list, key=lambda x: len(x), reverse=True)

In [None]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "outlaw country music singer songwriter"]

#### Creating Lists for Each `known_for` Category

In [None]:
# Creating lists for each category
politics_govt_law = []

arts = []
sports = [


]
sciences = []

business_farming = []
academia_humanities = []
law_enf_military_operator = []
spiritual = []
social = []
crime = []
event_record_other = []
other_species = []

In [856]:
# # Example code to quickly sort list in correct descending length search order to copy to dictionary
# temp = sorted(list(set(politics_govt_law)), key=lambda x: len(x), reverse=True)
# temp

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [None]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

#### Extracting Category from `info_2`

In [None]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['num_categories']!=0].sample(2)

#### Checking the Number of Rows without a First Category

In [None]:
# Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Observations:
- It is time to export our dataframe and start a new notebook.

### Exporting Dataset to SQLite Database [wp_life_expect_clean11.db]()

In [None]:
# # Exporting dataframe

# # Saving dataset in a SQLite database
# conn = sql.connect("wp_life_expect_clean.db")
# df.to_sql("wp_life_expect_clean", conn, index=False)

# # Chime notification when cell executes
# chime.success()

# [Proceed to Data Cleaning Part 11]()