# Wikipedia Notable Life Expectancies
# [Notebook 12: Data Cleaning Part 11](https://github.com/teresahanak/wikipedia-life-expectancy/blob/main/wp_life_expect_data_clean11_thanak_2022_07_26.ipynb)
### Context

The
### Objective

The
### Data Dictionary
- Feature: Description

### Importing Libraries

In [1]:
# To structure code automatically
%load_ext nb_black

# To import/export sqlite databases
import sqlite3 as sql

# To save/open python objects in pickle file
import pickle

# To help with reading, cleaning, and manipulating data
import pandas as pd
import numpy as np
import re

# To define maximum number of columns to be displayed in a dataframe
pd.set_option("display.max_columns", None)
# To define the maximum number of rows to be displayed in a dataframe
pd.set_option("display.max_rows", 200)

# To supress warnings
# import warnings

# warnings.filterwarnings("ignore")

# To set some visualization attributes
pd.set_option("max_colwidth", 150)

# To play auditory cue when cell has executed, has warning, or has error and set chime theme
import chime

chime.theme("zelda")

<IPython.core.display.Javascript object>

## Data Overview

### Reading, Sampling, and Checking Data Shape

In [2]:
# Reading the dataset
conn = sql.connect("wp_life_expect_clean10.db")
data = pd.read_sql("SELECT * FROM wp_life_expect_clean10", conn)

# Making a working copy
df = data.copy()

# Checking the shape
print(f"There are {df.shape[0]} rows and {df.shape[1]} columns.")

# Checking first 2 rows of the data
df.head(2)

There are 98056 rows and 38 columns.


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
0,1,William Chappell,", 86, British dancer, ballet designer and director.",https://en.wikipedia.org/wiki/William_Chappell_(dancer),21,1994,January,,,dancer,ballet designer and director,,,,,,,,,86.0,,United Kingdom of Great Britain and Northern Ireland,,,3.091042,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,Raymond Crotty,", 68, Irish economist, writer, and academic.",https://en.wikipedia.org/wiki/Raymond_Crotty,12,1994,January,,,,writer,and academic,,,,,,,,68.0,,Ireland,,,2.564949,0,0,0,0,0,0,0,0,1,0,0,0,1


<IPython.core.display.Javascript object>

In [3]:
# Checking last 2 rows of the data
df.tail(2)

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
98054,9,Aamir Liaquat Hussain,", 50, Pakistani journalist and politician, MNA .",https://en.wikipedia.org/wiki/Aamir_Liaquat_Hussain,99,2022,June,", since",,,MNA,,,,,,,,,50.0,,Pakistan,,"2002 2007, since 2018",4.60517,0,0,0,0,0,1,0,0,1,0,0,0,2
98055,9,Zou Jing,", 86, Chinese engineer, member of the Chinese Academy of Engineering.",https://en.wikipedia.org/wiki/Zou_Jing_(engineer),3,2022,June,,,,member of the Academy of Engineering,,,,,,,,,86.0,,"China, People's Republic of",,,1.386294,1,0,0,0,0,0,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

In [4]:
# Checking a sample of the data
df.sample(5)

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
30733,4,Etta Cameron,", 70, Bahamian-born Danish gospel singer, cancer.",https://en.wikipedia.org/wiki/Etta_Cameron,3,2010,March,,,,cancer,,,,,,,,,70.0,,The Bahamas,Denmark,,1.386294,0,0,0,0,0,1,0,0,0,0,0,0,1
42105,18,"Alger Chapman, Jr.",", 81, American finance executive, CEO and Chairman of the Chicago Board Options Exchange , heart failure.","https://en.wikipedia.org/wiki/Alger_Chapman,_Jr.",4,2013,February,,,finance executive,CEO and Chairman of the Chicago Board Options Exchange,heart failure,,,,,,,,81.0,,United States of America,,1986 1997,1.609438,0,0,0,0,0,0,0,0,0,0,0,0,0
35254,21,Ashleigh Connor,", 21, Australian soccer player, car accident.",https://en.wikipedia.org/wiki/Ashleigh_Connor,5,2011,July,,,soccer player,car accident,,,,,,,,,21.0,,Australia,,,1.791759,0,0,0,0,0,0,0,0,0,0,0,0,0
88811,30,Josefina Cuesta,", 74, Spanish historian and academic.",https://en.wikipedia.org/wiki/Josefina_Cuesta,8,2021,March,,,,,,,,,,,,,74.0,,Spain,,,2.197225,0,0,0,1,0,0,0,0,0,0,0,0,1
52962,19,Michael J. D. Powell,", 78, British mathematician.",https://en.wikipedia.org/wiki/Michael_J._D._Powell,11,2015,April,,,,,,,,,,,,,78.0,,United Kingdom of Great Britain and Northern Ireland,,,2.484907,1,0,0,0,0,0,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

### Checking Data Types, Duplicates, and Null Values

In [5]:
# Checking data types and null values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98056 entries, 0 to 98055
Data columns (total 38 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   day                        98056 non-null  object 
 1   name                       98056 non-null  object 
 2   info                       98056 non-null  object 
 3   link                       98056 non-null  object 
 4   num_references             98056 non-null  int64  
 5   year                       98056 non-null  int64  
 6   month                      98056 non-null  object 
 7   info_parenth               36661 non-null  object 
 8   info_1                     22 non-null     object 
 9   info_2                     98024 non-null  object 
 10  info_3                     48896 non-null  object 
 11  info_4                     10264 non-null  object 
 12  info_5                     1265 non-null   object 
 13  info_6                     181 non-null    obj

<IPython.core.display.Javascript object>

#### Observations:
- With our dataset loaded, we can pick up where we left off with extracting known_for values by rebuilding `known_for_dict`.

### Extracting `known_for` Continued

#### Finding `known_for` Roles in `info_2`

In [6]:
# Obtaining values for column and their counts
roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [76]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [75]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_2"].notna()].index
#             if "publisher" in df.loc[index, "info_2"]
#         ],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [74]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_list, key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [73]:
# # Example code to quick-screen values that may overlap categories
# df.loc[[index for index in df.index if "prima ballerina" in df.loc[index, "info"]]]

<IPython.core.display.Javascript object>

In [72]:
# # Example code to quick-screen values that may overlap categories
# df.loc[[index for index in df.index if "Holocaust denial" in df.loc[index, "info"]]]

<IPython.core.display.Javascript object>

In [71]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "Bible publisher"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [82]:
# Creating lists for each category
politics_govt_law = [
    "who fought two First Amendment battles during the s",
    "Amazon environmentalist and conservationist",
    "conservationist and environmentalist",
    "environmentalist and conservationist",
    "wildlife conservationist and",
    "wildlife conservationist",
    "animal conservationist",
    "nature conservationist",
    "marine conservationist",
    "canal conservationist",
    "and conservationist",
    "conservationist",
    "university student",  # before academia_humanities
    "royal of the Pahlavi dynasty and elder sister of the last Shah of",
    "royal and sister of Queen Elizabeth II",
    "royal prince and Communist leader",
    "father of royal divorcée Sarah",
    "member of the royal family",
    "claimant of royalty",  # also ambassador of Navy League of US
    "traditional royal",
    "Māori royal elder",
    "Holocaust denial",
    "Manchurian royal",
    "royal courtier",
    "royal consort",
    "royal steward",
    "royal servant",
    "former royal",
    "and royal",
    "royal and",
    "royalist",
    "royalty",
    "royal",
]

arts = [
    "guitarist for the heavy metal band Great White; victim of the Station nightclub fire",
    "guitarist who played with Frank Sinatra and on The Tonight Show orchestra",
    "blues guitarist who influenced Otis Redding and Jimi Hendrix",
    "guitarist and a founder of rock band Molly Hatchet",
    "bass guitarist for the punk band The Dead Milkmen",
    "guitarist for Madrugada and My Midnight Creeps",
    "rock guitarist who founded The Replacements",
    "jazz and blues rock guitarist and vocalist",
    "rockabilly guitarist and record producer",
    "post punk guitarist and record producer",
    "guitarist and founding member of Foghat",
    "guitarist with The Marshall Tucker Band",
    "jazz double bassist and bass guitarist",
    "free improvising avant garde guitarist",
    "guitarist from rock outfit Rose Tattoo",
    "hardcore punk drummer and guitarist",
    "heavy metal bassist and guitarist",
    "bluegrass guitarist and banjoist",
    "guitarist and surf music pioneer",
    "Hall of Fame bluegrass guitarist",
    "guitarist and founding member of",
    "guitarist for the Cosmic Psychos",
    "rock and country music guitarist",
    "guitarist and ethnomusicologist",
    "blues guitarist and keyboardist",
    "rhythm guitarist for Body Count",
    "rock keyboardist and guitarist",
    "rockabilly and swing guitarist",
    "guitarist and record producer",
    "Hall of Fame blues guitarist",
    "bass guitarist and vocalist",
    "lead guitarist and vocalist",
    "guitarist of rock band Ratt",
    "rhythm and blues guitarist",
    "guitarist and chess master",
    "keyboardist and guitarist",
    "electric blues guitarist",
    "Piedmont blues guitarist",
    "blues and rock guitarist",
    "guitarist and bookseller",
    "punk and metal guitarist",
    "piedmont blues guitarist",
    "rock and roll guitarist",
    "jazz and soul guitarist",
    "country music guitarist",
    "Hall of Fame guitarist",
    "guitarist and vocalist",
    "jazz and R&B guitarist",
    "guitarist and arranger",
    "inventor and guitarist",
    "bass guitarist for EMF",
    "guitarist and inventor",
    "heavy metal guitarist",
    "pedal steel guitarist",
    "bassist and guitarist",
    "Head Charge guitarist",
    "death metal guitarist",
    "blues rock guitarist",
    "rockabilly guitarist",
    "bluegrass guitarist",
    "rock bass guitarist",
    "primitive guitarist",
    "punk rock guitarist",
    "Lap steel guitarist",
    "slack key guitarist",
    "flamenco guitarist",
    "country guitarist",
    "session guitarist",
    "rhythm guitarist",
    "studio guitarist",
    "reggae guitarist",
    "blues guitarist",
    "steel guitarist",
    "metal guitarist",
    "roots guitarist",
    "bass guitarist",
    "rock guitarist",
    "folk guitarist",
    "soul guitarist",
    "guitarist",
    "drummer for the bands Trouble and Zoetrope",
    "Hall of Fame record producer and drummer",
    "drummer and co founder of the band Space",
    "drummer for rock band Jefferson Airplane",
    "drummer for rock band Doobie Brothers",
    "drummer for rock band Bleed the Dream",
    "drummer for the glam rock band Sweet",
    "drummer with s soft rock band Bread",
    "Hall of Fame drummer and lyricist",
    "hard rock and heavy metal drummer",
    "drummer for glam rock band Mud",
    "jazz drummer and bandleader on",
    "jazz drummer and vibraphonist",
    "Cheyenne drummer and vocalist",
    "one time drummer for The Who",
    "drummer and record producer",
    "drummer with The Searchers",
    "Hall of Fame rock drummer",
    "drummer and percussionist",
    "rhythm and blues drummer",
    "session drummer for ABBA",
    "avant garde jazz drummer",
    "former Shadows drummer",
    "funk and jazz drummer",
    "hardcore punk drummer",
    "Hall of Fame drummer",
    "jazz and R&B drummer",
    "drummer and vocalist",
    "swing jazz drummer",
    "free jazz drummer",
    "drummer; recorded",
    "jazz pop drummer",
    "New Wave drummer",
    "session drummer",
    "reggae drummer",
    "studio drummer",
    "former drummer",
    "blues drummer",
    "metal drummer",
    "Vodou drummer",
    "swing drummer",
    "jazz drummer",
    "rock drummer",
    "drummer",
    "animator for the Walt Disney Company",
    "animator and television producer",
    "animator and animation director",
    "animation director and producer",
    "Academy Award winning animator",
    "animator and a director",
    "and animation director",
    "animator and director",
    "animator and producer",
    "stop motion animator",
    "background animator",
    "animation director",
    "Disney animator",
    "animator and",
    "and animator",
    "animator",
    "prima ballerina and",
    "and prima ballerina",
    "prima ballerina",
    "comic book publisher and comic book character creator",
    "publisher of auto industry and enthusiast magazines",
    "newspaper executive and publisher of from to",
    "former publisher of the Lewy body disease",
    "theater producer and newspaper publisher",
    "fashion magazine publisher and editor",
    "businesswoman and newspaper publisher",
    "science fiction editor and publisher",
    "record producer and music publisher",
    "pornographic magazine publisher and",
    "newspaper executive and publisher",
    "comic book producer and publisher",
    "science fiction fanzine publisher",
    "editor and publisher of magazine",
    "comic book editor and publisher",
    "newspaper editor and publisher",
    "newspaper publisher and editor",
    "cookbook editor and publisher",
    "magazine publisher and editor",
    "publisher and chairman of the",
    "music publisher and executive",
    "game designer and publisher",
    "music manager and publisher",
    "publisher and pornographer",
    "book editor and publisher",
    "science fiction publisher",
    "publisher and pamphleteer",
    "and numismatic publisher",
    "rock magazine publisher",
    "newspaper publisher and",
    "publisher and producer",
    "independent publisher",
    "publisher and editor",
    "editor and publisher",
    "comic book publisher",
    "newspaper publisher",
    "magazine publisher",
    "literary publisher",
    "cookbook publisher",
    "website publisher",
    "music publisher",
    "media publisher",
    "manga publisher",
    "Bible publisher",
    "book publisher",
    "publisher and",
    "and publisher",
    "publisher of",
    "publisher",
]
sports = [
    "canoe maker",
    "international rugby player",
    "rugby player and executive",
    "wrestler and rugby player",
    "cricket and rugby player",
    "wheelchair rugby player",
    "rugby player and coach",
    "rugby player for and",
    "rugby player and",
    "and rugby player",
    "rugby player",
    "Olympic bronze medal winning track cyclist",
    "Paralympic bronze medal winning cyclist",
    "Olympic silver medal winning cyclist",
    "bronze medal winning Olympic cyclist",
    "Olympic cyclist and silver medalist",
    "Olympic cyclist and cycling coach",
    "Olympic silver medallist cyclist",
    "Olympic cyclist and rally driver",
    "Olympic and professional cyclist",
    "Olympic silver medalist cyclist",
    "Olympic cyclist gold medalist",
    "cyclist and Olympic medallist",
    "speed skater and road cyclist",
    "Olympic silver medal cyclist",
    "long distance motorcyclist",
    "track cyclist and Olympian",
    "track and road cyclist",
    "Olympic track cyclist",
    "cross country cyclist",
    "long distance cyclist",
    "professional cyclist",
    "cyclist and Olympian",
    "swimmer and cyclist",
    "rally motorcyclist",
    "cyclist and coach",
    "endurance cyclist",
    "rower and cyclist",
    "Olympic cyclist",
    "track cyclist",
    "motorcyclist",
    "road cyclist",
    "BMX cyclist",
    "and cyclist",
    "cyclist",
]

sciences = [
    "biologist who made important contributions to fisheries science",
    "biologist and a pioneer in the field of vitro fertilization",
    "molecular biologist at the National Cancer Institute",
    "behavioral ecologist and evolutionary biologist",
    "and director of the Fish and Wildlife Service",
    "marine biologist and comparative physiologist",
    "microbiologist suspected of anthrax attacks",
    "Nobel Prize winning molecular biologist",
    "geneticist and developmental biologist",
    "ornithologist and molecular biologist",
    "geneticist and evolutionary biologist",
    "immunologist and molecular biologist",
    "cell biologist and cancer researcher",
    "molecular biologist and virologist",
    "ichthyologist and marine biologist",
    "geneticist and molecular biologist",
    "palaeobiologist and astrobiologist",
    "microbiologist and epidemiologist",
    "microbiologist and Nobel laureate",
    "biologist and plant physiologist",
    "cell and developmental biologist",
    "biologist and evolution theorist",
    "oneirologist and neurobiologist",
    "immunologist and microbiologist",
    "microbiologist and immunologist",
    "nutritionist and microbiologist",
    "biologist and environmentalist",
    "biologist and cryptozoologist",
    "biologist and skull collector",
    "microbiologist and geneticist",
    "microbiologist and virologist",
    "virologist and microbiologist",
    "molecular and cell biologist",
    "biologist and epistemologist",
    "botanist and field biologist",
    "botanist and microbiologist",
    "biologist and oceanographer",
    "ornithologist and biologist",
    "microbiologist and botanist",
    "biologist and physiologist",
    "microbiologist at Stanford",
    "who founded Monkey World",
    "biologist and geneticist",
    "biologist and researcher",
    "developmental biologist",
    "biologist and zoologist",
    "computational biologist",
    "biologist and ecologist",
    "World War II  biologist",
    "ecologist and biologist",
    "and molecular biologist",
    "wildlife biologist and",
    "evolutionary biologist",
    "medical microbiologist",
    "biologist and botanist",
    "reproductive biologist",
    "theoretical biologist",
    "agriculture biologist",
    "and marine biologist",
    "structural biologist",
    "veterinary biologist",
    "molecular biologist",
    "fisheries biologist",
    "radiation biologist",
    "microbiologist and",
    "cell biologist and",
    "wildlife biologist",
    "cellular biologist",
    "geomicrobiologist",
    "genetic biologist",
    "crocodile expert",
    "marine biologist",
    "chronobiologist",
    "plant biologist",
    "giraffe expert",
    "microbiologist",
    "cell biologist",
    "neurobiologist",
    "astrobiologist",
    "radiobiologist",
    "paleobiologist",
    "oncobiologist",
    "biologist and",
    "and biologist",
    "forester and",
    "biologist",
    "forester",
    "botanist and pioneer of plant ecology and environmental studies",
    "botanist specialising in South flora and fauna",
    "botanist and plant pathologist",
    "botanist and plant taxonomist",
    "botanist and plant ecologist",
    "botanist and plant collector",
    "botanist and lepidopterist",
    "Jesuit priest and botanist",
    "botanist and lichenologist",
    "botanist and pteridologist",
    "botanist and entomologist",
    "botanist and a mycologist",
    "botanist and taxonomist",
    "botanist and mycologist",
    "botanist and geneticist",
    "naturalist and botanist",
    "botanist and bryologist",
    "pharmacist and botanist",
    "botanist and ecologist",
    "biologist and botanist",
    "taxonomic botanist",
    "archaeobotanist",
    "palaeobotanist",
    "ethnobotanist",
    "paleobotanist",
    "botanist and",
    "and botanist",
    "botanist",
]

business_farming = ["sherry trader", "property magnate"]
academia_humanities = [
    "educationalist and university administrator",
    "and university administrator",
    "university vice chancellor",
    "and university president",
    "university administrator",
    "and university lecturer",
    "university president",
    "university executive",
    "university director",
    "university official",
    "university",
]
law_enf_military_operator = []
spiritual = [
    "Capuchin friar",
]
social = []
crime = []
event_record_other = []
other_species = [
    "royal cocker spaniel",  # before politics_govt_law
]

<IPython.core.display.Javascript object>

In [83]:
# Hard-coding cause_of_death for entries with value in info_2
index = df[df["link"] == "https://en.wikipedia.org/wiki/Ty_Longley"].index
df.loc[index, "cause_of_death"] = "nightclub fire"

index = df[df["link"] == "https://en.wikipedia.org/wiki/Otis_Chandler"].index
df.loc[index, "cause_of_death"] = "Lewy body disease"

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [84]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "business_farming": business_farming,
    "sciences": sciences,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
    "politics_govt_law": politics_govt_law,
    "academia_humanities": academia_humanities,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [85]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['num_categories']!=0].sample(2)

CPU times: total: 6min 11s
Wall time: 6min 11s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
96075,13,Fabio Restrepo,", 62, Colombian actor , COVID-19.",https://en.wikipedia.org/wiki/Fabio_Restrepo_(actor),4,2022,February,",",,,COVID,,,,,,,,,62.0,,Colombia,,",",1.609438,0,0,0,0,0,1,0,0,0,0,0,0,1
5942,27,Samuel L. Devine,", 81, American politician, cancer.",https://en.wikipedia.org/wiki/Samuel_L._Devine,5,1997,June,,,,cancer,,,,,,,,,81.0,,United States of America,,,1.791759,0,0,0,0,0,0,0,0,1,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [86]:
# Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

There are 22092 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [88]:
# Obtaining values for column and their counts
roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [163]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [142]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_2"].notna()].index
#             if "convicted" in df.loc[index, "info_2"]
#         ],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [162]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_list, key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [161]:
# # Example code to quick-screen values that may overlap categories
# df.loc[[index for index in df.index if "latinist" in df.loc[index, "info"]]]

<IPython.core.display.Javascript object>

In [160]:
# # Example code to quick-screen values that may overlap categories
# df.loc[[index for index in df.index if "injustice claimant" in df.loc[index, "info"]]]

<IPython.core.display.Javascript object>

In [103]:
# # Example code to quick-screen values that may overlap categories
# df.loc[
#     [
#         index
#         for index in df.index
#         if "Special Operations Executive agent" in df.loc[index, "info"]
#     ]
# ]

<IPython.core.display.Javascript object>

In [159]:
# # Example code to quick-check a specific entry
# df[
#     df["info_2"]
#     == 'convicted and later pardoned of being World War II propagandist "Tokyo Rose"'
# ]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [157]:
# Creating lists for each category
politics_govt_law = [
    "solicitor wrongly convicted of killing two of her sons",  # before crime
    "convicted and later pardoned of being World War II",
    "man wrongfully convicted of rape and abduction",
    "civil rights leader and a black nationalist",
    "World War II Vichy government official",
    "janitor wrongly convicted of murder",
    "attorney and civil rights leader",
    "labor and civil rights leader",
    "Romani civil rights leader",
    "mass media advocate",
    "civil rights leader",
    "injustice claimant",
    "anti fascist",
]

arts = [
    "cinematographer and two time Academy Award winner",
    "second unit director and cinematographer",
    "cinematographer and second unit director",
    "television producer and cinematographer",
    "cinematographer and camera operator",
    "music business executive and agent",  # before business_farming
    "cinematographer for Orson Welles",
    "cinematographer for the movie",
    "cinematographer and director",
    "director and cinematographer",
    "cinematographer and producer",
    "music business executive",
    "cinematographer Oscar",
    "and cinematographer",
    "cinematographer and",
    "cinematographer",
    "early video game designer",
    "video game designer and",
    "video game designer",
    "rapper better known as 'Proof' of D",
    "hardcore rapper and record producer",
    "rapper and harmonica player",
    "rapper and record producer",
    'propagandist "Tokyo Rose"',
    "rapper and Onyx affiliate",
    "rapper and TV presenter",
    "rapper and disc jockey",
    "rapper and producer",
    "Freestyle rapper",
    "rapper producer",
    "DJ and rapper",
    "rapper and DJ",
    "rapper and",
    "rapper",
]
sports = [
    "co owner of the New Nets",
    "trainer and owner of Thoroughbred racehorses",  # before other_species
    "Hall of Fame Thoroughbred racehorse trainer",
    "Hall of Fame racetrack and racehorse owner",
    "Thoroughbred racehorses trainer and owner",
    "Thoroughbred racehorse owner and breeder",
    "property developer and racehorse owner",
    "football club and racehorse owner",
    "thoroughbred racehorses trainer",
    "thoroughbred racehorse trainer",
    "Thoroughbred racehorse breeder",
    "hall of fame racehorse trainer",
    "Thoroughbred racehorse trainer",
    "jockey and racehorse trainer",
    "thoroughbred racehorse owner",
    "racehorse jockey and trainer",
    "racehorse trainer and owner",
    "racehorse owner and breeder",
    "racehorse trainer",
    "racehorse owner",
    "freeskier",
]
sciences = [
    "creator of the Game Boy",
]

business_farming = [
    "Hall of Fame business executive",
    "business executive and",
    "and business executive",
    "business executive",
]
academia_humanities = [
    "latinist",
]
law_enf_military_operator = [
    "Special Operations Executive agent in World War II",
    "World War II messenger pigeon trainer",
    "Resistance fighter during World War II",
    "Special Operations Executive agent",
    "jihadist and ISIL commander",
    "Resistance fighter",
    "jihadist militant",
    "jihadist",
]
spiritual = [
    "Hasidic rebbe",
    "Anglican priest and chaplain",
    "nun and Anglican priest",
    "Anglican priest and",
    "Anglican priest",
    "believed to be oldest living Catholic priest",
    "Catholic priest and marriage counselor",
    "excommunicated Hare Krishna leader",
    "Catholic priest and Sedevacantist",
    "Traditionalist Catholic priest",
    "Catholic priest and archbishop",
    "Catholic priest and missionary",
    "Catholic priest and exorcist",
    "Catholic priest and Jesuit",
    "Chaldean Catholic priest",
    "Jesuit Catholic priest",
    "Catholic priest and",
    "Catholic priest",
    "interfaith",
]
social = [
    "humanitarian aid worker and",
    "Wiradjuri humanitarian",
    "turned humanitarian",
]
crime = [
    "Klansman and convicted accomplice in the Freedom Summer murders",
    "gangster and suspected murderer of rapper Tupac Shakur",  # before arts
    "convicted of deporting Jews to death camps",
    "convicted war criminalconvicted fraudster",
    "convicted spammer and prison escapee",
    "convicted kidnapper and carjacker",
    "and convicted attempted murderer",
    "convicted of spying for the USSR",
    "convicted of drug trafficking in",
    "convicted of Lillehammer murder",
    "mobster and convicted racketeer",
    "convicted in Ipperwash shooting",
    "convicted child sex abuser and",
    "convicted in Watergate scandal",
    "convicted kidnapper and rapist",
    "and convicted state terrorist",
    "and convicted triple murderer",
    "and convicted drug trafficker",
    "convicted robber and murderer",
    "convicted of drug trafficking",
    "and convicted manslaughterer",
    "convicted child sex offender",
    "and convicted child abuser",
    "and convicted sex offender",
    "and convicted war criminal",
    "convicted double murderer",
    "convicted people smuggler",
    "convicted malice murderer",
    "convicted  and kidnapper",
    "and convicted fraudster",
    "and convicted terrorist",
    "and convicted fraudster",
    "man convicted of murder",
    "convicted serial rapist",
    "convicted mass murderer",
    "and convicted criminal",
    "convicted child rapist",
    "convicted war criminal",
    "convicted spree killer",
    "convicted child rapist",
    "convicted sex offender",
    "convicted extortionist",
    "convicted drug dealer",
    "convicted conspirator",
    "convicted of treason",
    "and convicted killer",
    "and convicted felon",
    "convicted fraudster",
    "convicted drug lord",
    "convicted terrorist",
    "convicted insurgent",
    "convicted kidnapper",
    "convicted criminal",
    "convicted procurer",
    "convicted con man",
    "convicted plotter",
    "convicted killer",
    "convicted felon",
    "convicted spy",
    "and convicted",
]
event_record_other = [
    "Holocaust survivor following escape from Sobibór",
    "Jewish Holocaust survivor",
    "Holocaust survivor and",
    "Holocaust survivor",
    "kidnap victim",
    "ISIS hostage",
]
other_species = [
    "thoroughbred racehorse and Hall of Fame inductee",
    "Thoroughbred racehorse and champion sire",
    "trained thoroughbred racehorse and sire",
    "Thoroughbred racehorse and active sire",
    "Thoroughbred racehorse and broodmare",
    "Hall of Fame Thoroughbred racehorse",
    "Hall of Fame thoroughbred racehorse",
    "thoroughbred racehorse and sire",
    "Thoroughbred racehorse and sire",
    "Champion Thoroughbred racehorse",
    "Hall of Fame racehorse and sire",
    "trained Thoroughbred racehorse",
    "trained thoroughbred racehorse",
    "champion racehorse and sire",
    "racehorse and sire old age",
    "racehorse and broodmare",
    "National Hunt racehorse",
    "Thoroughbred racehorse",
    "thoroughbred racehorse",
    "Standardbred racehorse",
    "Hall of Fame racehorse",
    "standardbred racehorse",
    "racehorse and sire",
    "champion racehorse",
    "trained racehorse",
    "harness racehorse",
    "racehorse",
]

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [158]:
# Combining separate lists into one dictionary
known_for_dict = {
    "politics_govt_law": politics_govt_law,
    "crime": crime,
    "arts": arts,
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "law_enf_military_operator": law_enf_military_operator,
    "event_record_other": event_record_other,
    "sports": sports,
    "other_species": other_species,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [164]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['num_categories']!=0].sample(2)

CPU times: total: 1min 59s
Wall time: 2min


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
19239,10,Humbert Balsan,", 50, French film producer.",https://en.wikipedia.org/wiki/Humbert_Balsan,5,2005,February,,,,,,,,,,,,,50.0,,France,,,1.791759,0,0,0,0,0,1,0,0,0,0,0,0,1
26563,2,George Anselevicius,", 85, Lithuanian-born American architect.",https://en.wikipedia.org/wiki/George_Anselevicius,8,2008,October,,,,,,,,,,,,,85.0,,Lithuania,United States of America,,2.197225,0,0,0,0,0,1,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [165]:
# Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

There are 21093 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [222]:
# # Obtaining values for column and their counts
# roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [221]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [220]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_2"].notna()].index
#             if "peer" in df.loc[index, "info_2"]
#         ],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [219]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_list, key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [218]:
# # Example code to quick-screen values that may overlap categories
# df.loc[[index for index in df.index if "executive and peer" in df.loc[index, "info"]]]

<IPython.core.display.Javascript object>

In [217]:
# # Example code to quick-screen values that may overlap categories
# df.loc[
#     [
#         index
#         for index in df.index
#         if "Royal Naval Volunteer Reserve officer" in df.loc[index, "info"]
#     ]
# ]

<IPython.core.display.Javascript object>

In [216]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "chief executive and peer"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [215]:
# Creating lists for each category
politics_govt_law = [
    "Daughter of fascist dictator Benito Mussolini",
    "former dictator of",
    "dictator and",
    "dictator",
    "courtier of Queen Elizabeth II",
    "Minister of the Interior",
    "anti Zionist campaigner",
    "aristocrat and courtier",
    "aristocrat and Senator",
    "hereditary peer",
    "aristocrat and",
    "aristocrat",
    "hereditary peer of the Peerage of",
    "peer and former Northern Senator",
    "peer and Governor of the Bank of",
    "peer and son of  John Buchan",
    "and Conservative life peer",
    "and Liberal Democrat peer",
    "Labour MP and life peer",
    "barrister and life peer",
    "and hereditary peer",
    "peer and landowner",
    "landowner and peer",
    "peer and solicitor",
    "hereditary peer",
    "and a life peer",
    "life peer and",
    "and life peer",
    "life peer",
    "peeress",
    "peer and",
    "and peer",
    "peer",
]

arts = [
    "talent scout and agent known for her impact on",  # before social
    "modeling agent and scout",
    "model scout",
]
sports = [
    "track and field athlete who won four gold medals at the Summer Olympics",
    "track and field athlete and hammer throw world record holder",
    "athlete at the Summer Olympics and oldest surviving Olympian",
    "sport shooter and Olympic silver medal winning pentathlete",
    "Olympic gold medal winning modern pentathlete",
    "track and field athlete and Olympic champion",
    "track and field athlete and olympic champion",
    "athlete and Olympic gold medallist in discus",
    "athlete and the first Olympic gold medalist",
    "Olympic bronze medallist shot put athlete",
    "athlete and winner of the first Olympic m",
    "olympic athlete and long distance runner",
    "modern pentathlete and Olympic champion",
    "track athlete and Olympic gold medalist",
    "Olympic gold medal winning decathlete",
    "swimming coach and modern pentathlete",
    "Olympic modern pentathlete and fencer",
    "racing manager to Queen Elizabeth II",
    "Olympic bronze medal winning athlete",
    "track and field athlete and Olympian",
    "Olympic gold medal winning biathlete",
    "Olympic champion modern pentathlete",
    "triathlete and long distance runner",
    "centenarian track and field athlete",
    "Olympic gold medal winning athlete",
    "modern pentathlete and horse rider",
    "international athlete and Olympian",
    "athlete and Olympic hockey player",
    "Olympic athlete and administrator",
    "ski jumping athlete and official",
    "Olympic track and field athlete",
    "Olympic middle distance athlete",
    "masters track and field athlete",
    "Olympic bronze medalist athlete",
    "athlete and field hockey coach",
    "mountain biker and triathlete",
    "athlete and Olympic champion",
    "athlete and graphic designer",
    "high jumper and pentathlete",
    "Olympic modern pentathlete",
    "biathlon athlete and coach",
    "long jumper and decathlete",
    "Olympic athlete and coach",
    "Olympic long jump athlete",
    "Olympic decathlon athlete",
    "Olympic champion athlete",
    "athlete and book subject",
    "track and field athlete",
    "Senior Olympics athlete",
    "nonagenarian triathlete",
    "middle distance athlete",
    "and multi sport athlete",
    "modern pentathlete and",
    "Olympic track athlete",
    "Hall of Fame athlete",
    "athlete and Olympian",
    "steeplechase athlete",
    "college athlete and",
    "Olympic pentathlete",
    "racewalking athlete",
    "champion decathlete",
    "modern pentathlete",
    "Olympic decathlete",
    "Paralympic athlete",
    "paralympic athlete",
    "paraplegic athlete",
    "Olympic triathlete",
    "polo administrator",
    "athlete and coach",
    "Olympic biathlete",
    "versatile athlete",
    "Olympic athlete",
    "masters athlete",
    "Masters athlete",
    "college athlete",
    "sprint athlete",
    "track athlete",
    "field athlete",
    "heptathlete",
    "decathlete",
    "triathlete",
    "biathlete",
    "athlete and",
    "athlete",
    "Hall of Fame cricket player and coach",
    "cricket player and administrator",
    "cricket player and manager",
    "cricket player and umpire",
    "cricket player and coach",
    "test cricket player",
    "cricket player and",
    "cricket player",
    "master archer",
    "baseball scout and executive",  # before social
    "ice hockey coach and scout",
    "football manager and scout",
    "football scout",
    "Baseball scout",
    "baseball scout",
]
sciences = [
    "microbiology",
]

business_farming = [
    "former chairman of Rolls Royce",
    "proprietor of the",
]
academia_humanities = [
    "President of Bar Ilan University",
    "Bible researcher",
    "rosh yeshiva",
    "and lecturer",
    "tour guide",
    "lecturer",
]
law_enf_military_operator = [
    "chief military",
    "Army officer during World War II and Medal of Honor recipient",
    "Army officer and recipient of the Param Vir Chakra",
    "Army officer and recipient of the Maha Vir Chakra",
    "Army officer and recipient of the Medal of Honor",
    "Army officer associated with the Mỹ Lai massacre",
    "Army officer and Chief of the General Staff",
    "Royal Naval Volunteer Reserve officer",
    "Army officer and Commander in Chief",
    "th Army officer and Chief of Staff",
    "Army officer and OSS CIA operative",
    "Army officer during World War II",
    "Army officer and Chief of Staff",
    "People Liberation Army officer",
    "World War II Army officer",
    "Army officer and military",
    "and Army officer",
    "Army officer and",
    "Army officer",
    "cipher clerk",
]
spiritual = [
    "Orthodox rabbi;",
    "renowned Sephardic Orthodox rabbi and kabbalist",
    "Chasidic rabbi of the Boston Hasidic dynasty",
    "rabbi and founder of Masorti movement",
    "Orthodox Religious Zionist rabbi and",
    "rabbi of the Western Wall for years",
    "Breslover Hasid and rabbi",
    "chief rabbi of Cincinnati",
    "Orthodox rabbi and posek",
    "settler and chief rabbi",
    "zionist orthodox rabbi",
    "Salvation Army officer",
    "Orthodox Jewish rabbi",
    "rabbi and sect leader",
    "Chabad Chasidic rabbi",
    "chief rabbi of Haifa",
    "Conservative rabbi",
    "rabbi and chaplain",
    "and Orthodox rabbi",
    "Orthodox rabbi and",
    "rabbi and settler",
    "Hasidic rabbi and",
    "Haredi rabbi and",
    "Sephardic rabbi",
    "Orthodox rabbi",
    "orthodox rabbi",
    "Hasidic rabbi",
    "Haredi rabbi",
    "Reform rabbi",
    "Chabad rabbi",
    "reform rabbi",
    "Jewish rabbi",
    "rabbi and",
    "and rabbi",
    "rabbi",
]
social = [
    "Chief Scout Executive of the Boy Scouts of",
    "deputy chief scout of Scouts",
    "scouting leader",
    "scout leader",
    "Chief Scout",  # before politics_govt_law
    "scouter",
    "scout",
]
crime = []
event_record_other = [
    "holocaust survivor",
]
other_species = [
    "rabbit and book subject",  # before spiritual
]

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [214]:
# Combining separate lists into one dictionary
known_for_dict = {
    "arts": arts,
    "sports": sports,
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [223]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['num_categories']!=0].sample(2)

CPU times: total: 3min 6s
Wall time: 3min 6s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
90276,24,Habibullah Siraji,", 72, Bangladeshi poet, director general of the Bangla Academy .",https://en.wikipedia.org/wiki/Habibullah_Siraji,11,2021,May,since,,,director general of the Bangla Academy,,,,,,,,,72.0,,Bangladesh,,since 2018,2.484907,0,0,0,0,0,1,0,0,0,0,0,0,1
85780,7,Dawn Lindberg,", 75, South African folk singer, actress and theatre producer, COVID-19.",https://en.wikipedia.org/wiki/Dawn_Lindberg,9,2020,December,,,,actress and theatre producer,COVID,,,,,,,,75.0,,South Africa,,,2.302585,0,0,0,0,0,1,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [224]:
# Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

There are 20231 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [333]:
# # Obtaining values for column and their counts
# roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [332]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [331]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_2"].notna()].index
#             if "bursar" in df.loc[index, "info_2"]
#         ],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [330]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_list, key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [329]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "admiral and Black Rod"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [335]:
# Creating lists for each category
politics_govt_law = [
    "governor of the North West Frontier Province",
    "marijuana advocate",
    "language proponent",
    "FEMA director",
    "political figure and a dissident",
    "and political figure",
    "political figure and ",
    "political figure",
    "and public health whistleblower",
    "congressional aide and oil industry whistleblower",
    "nuclear espionage whistleblower",
    "government whistleblower",
    "and whistleblower",
    "whistleblower",
    "clan chief",
    "Royal courtier",
    "courtier and",
    "and courtier",
    "courtier",
    "Master of the Household to the Sovereign",
    "Marshal of the Diplomatic Corps",
    "colonial administrator and Governor of Northern",
    "and a colonial administrator",
    "and colonial administrator",
    "colonial administrator",
    "Governor of Western",
    "and Black Rod",
    "Black Rod",
]

arts = [
    "jazz tenor saxophonist and musical arranger",
    "jazz tenor saxophonist and jazz club owner",
    "hard bop and post bop tenor saxophonist",
    "jazz alto saxophonist and clarinetist",
    "jazz tenor saxophonist and flautist",
    "jazz and big band tenor saxophonist",
    "jazz saxophonist and musicologist",
    "saxophonist and free jazz pioneer",
    "jazz saxophonist and clarinetist",
    "jazz saxophonist and band leader",
    "Grammy Award winning saxophonist",
    "alto saxophonist and clarinetist",
    "hard bop jazz tenor saxophonist",
    "record producer and saxophonist",
    "jazz saxophonist and bandleader",
    "jazz tenor and alto saxophonist",
    "jazz saxophonist and flautist",
    "jazz saxophonist and flutist",
    "saxophonist and clarinetist",
    "jazz and blues saxophonist",
    "jazz baritone saxophonist",
    "Hall of Fame saxophonist",
    "saxophonist and flautist",
    "jazz and R&B saxophonist",
    "saxophonist and arranger",
    "funk and R&B saxophonist",
    "saxophonist and vocalist",
    "flautist and saxophonist",
    "jazz tenor saxophonist",
    "jazz alto saxophonist",
    "broadcasting adviser",
    "street saxophonist",
    "tenor saxophonist",
    "blues saxophonist",
    "jazz saxophonist",
    "alto saxophonist",
    "rock saxophonist",
    "R&B saxophonist",
    "saxophonist and",
    "saxophonist",
    "clown and children television personality",
    "clown and son of Emmett Kelly",
    "clown and television host",
    "clown and circus director",
    "baseball clown",
    "rodeo clown",
    "and clown",
    "clown",
    "television producer and media executive",
    "Emmy Award winning television producer",
    "motion picture and television producer",
    "television producer and music director",
    "game show host and television producer",
    "television producer and script editor",
    "television producer and talent agent",
    "entertainer and television producer",
    "television producer and executive",
    "puppeteer and television producer",
    "television producer and director",
    "caster and television producer",
    "radio and television producer",
    "television producer and host",
    "and television producer",
    "television producer and",
    "television producer",
]
sports = [
    "basketball coach of the University of Tennessee Volunteers",
    "Hall of Fame college basketball coach and administrator",
    "college basketball coach and athletic administrator",
    "college basketball coach for Wake Forest University",
    "college basketball coach and athletic director",
    "basketball coach for the Harlem Globetrotters",
    "NBA player and high school basketball coach",
    "professional basketball coach and executive",
    "basketball coach and athletic administrator",
    "women basketball coach at Military Academy",
    "Hall of Fame high school basketball coach",
    "basketball coach and athletics director",
    "Hall of Fame college basketball coach",
    "basketball coach and baseball coach",
    "college basketball coach and player",
    "basketball coach and executive",
    "Hall of Fame basketball coach",
    "football and basketball coach",
    "baseball and basketball coach",
    "high school basketball coach",
    "basketball coach and referee",
    "college basketball coach and",
    "tennis and basketball coach",
    "basketball coach and player",
    "college basketball coach",
    "women basketball coach",
    "NBA basketball coach",
    "and basketball coach",
    "basketball coach",
    "Olympic water polo player and swimmer",
    "Olympic water polo player and",
    "Olympic water polo player",
    "wilderness guide",
    "fly fisherman",
    "Olympic bronze medal winning equestrian",
    "equestrian and Olympic champion",
    "javelin thrower and equestrian",
    "equestrian at the Asian Games",
    "Olympic champion equestrian",
    "national equestrian coach",
    "Olympic equestrian rider",
    "Hall of Fame equestrian",
    "Olympic equestrian and",
    "paralympic equestrian",
    "Paralympic equestrian",
    "equestrian competitor",
    "dressage equestrian",
    "Olympic equestrian",
    "equestrian eventer",
    "equestrian and",
    "equestrian",
    "hurler and hurling manager",
    "and hurler",
    "hurler ·",
    "hurler",
]
sciences = [
    "psychiatrist specializing in psychic phenomena",
    "psychiatrist and developer of reality therapy",
    "psychiatrist and reincarnation researcher",
    "psychiatrist and pioneer LSD experimenter",
    "psychiatrist and Tourette syndrome expert",
    "child psychiatrist and Jungian analyst",
    "psychiatrist and addiction specialist",
    "psychiatrist and science communicator",
    "psychiatrist and student of Carl Jung",
    "psychiatrist and research director",
    "psychiatrist and dream researcher",
    "psychiatrist and sleep researcher",
    "psychiatrist and psychotherapist",
    "psychiatrist and psychoanalyst",
    "neurologist and psychiatrist",
    "psychiatrist and neurologist",
    "sexologist and psychiatrist",
    "orthomolecular psychiatrist",
    "psychiatrist and researcher",
    "internist and psychiatrist",
    "developmental psychiatrist",
    "experimental psychiatrist",
    "psychiatrist known as the",
    "psychiatrist and surgeon",
    "forensic psychiatrist",
    "child psychiatrist",
    "neuropsychiatrist",
    "psychiatrist and",
    "and psychiatrist",
    "psychiatrist",
    "ornithologist and naturalist",
    "naturalist and",
    "and naturalist",
    "naturalist",
    "nursing researcher",
]

business_farming = []
academia_humanities = ["and philatelist", "philatelist", "and bursar", "bursar"]
law_enf_military_operator = [
    "army colonel and",
    "and army colonel",
    "army colonel",
    "senior army officer and Commissioner of the Federal Police",
    "World War II army officer and Military Cross recipient",
    "army officer and World War II prisoner of war escapee",
    "WWII army officer and Military Cross recipient",
    "army officer and intelligence analyst",
    "army officer and World War II veteran",
    "army officer and Chief of Staff",
    "World War II army officer",
    "military  army officer",
    "WWII army officer",
    "and army officer",
    "army officer and",
    "army officer",
    "retired Navy vice admiral and Pentagon official",
    "Navy vice admiral and Medal of Honor recipient",
    "four star admiral and World War II aviator",
    "admiral who was Commander in Chief in the",
    "World War II veteran and rear admiral",
    "vice admiral and naval secretary",
    "admiral and Chief of Naval Staff",
    "admiral and resistance fighter",
    "four star admiral in the Navy",
    "admiral and naval aviator",
    "naval rear admiral",
    "Navy vice admiral",
    "Navy rear admiral",
    "navy vice admiral",
    "four star admiral",
    "navy rear admiral",
    "vice admiral and",
    "vice admiral",
    "Navy admiral",
    "rear admiral",
    "navy admiral",
    "admiral and",
    "and admiral",
    "admiral",
]
spiritual = [
    "Buddhist Sangharaja and Nobel Peace Prize nominee",
    "meditation master and Buddhist monk",
    "Buddhist spiritual leader",
    "Theravada Buddhist monk",
    "Sinhalese Buddhist monk",
    "Mahayana Buddhist monk",
    "Tuvan Buddhist lama",
    "Buddhist Zen master",
    "Zen Buddhist priest",
    "Buddhist missionary",
    "Zen Buddhist monk",
    "Buddhist monk and",
    "Buddhist prelate",
    "Buddhist clergy",
    "Buddhist abbess",
    "Buddhist leader",
    "Buddhist priest",
    "Buddhist monk",
    "Buddhist  nun",
    "Shin Buddhist",
    "Buddhist Lama",
    "Buddhist nun",
    "and Buddhist",
    "Buddhist",
    "member of the Church of Scientology",
    "and critic of Scientology",
    "Scientology",
]
social = [
    "humanitarian and",
    "and humanitarian",
    "humanitarian",
    "aid worker credited who saved over Jewish children during World War II",
    "World Health Organization aid worker",
    "aid worker and",
    "and aid worker",
    "aid worker",
]
crime = []
event_record_other = [
    "killing spree victim",
]
other_species = []

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [336]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [337]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['num_categories']!=0].sample(2)

CPU times: total: 4min 7s
Wall time: 4min 8s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
37744,14,Ray Barlow,", 85, English footballer .",https://en.wikipedia.org/wiki/Ray_Barlow,9,2012,March,West Bromwich Albion,,,,,,,,,,,,85.0,,United Kingdom of Great Britain and Northern Ireland,,West Bromwich Albion,2.302585,0,0,0,0,0,0,1,0,0,0,0,0,1
44008,10,Walter McCaffrey,", 64, American activist and politician , complications from a traffic collision.",https://en.wikipedia.org/wiki/Walter_McCaffrey,3,2013,July,"City Council,",,,complications from a traffic collision,,,,,,,,,64.0,,United States of America,United States of America,"City Council, 1985 2001",1.386294,0,0,0,0,0,0,0,0,1,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [338]:
# Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

There are 19018 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [412]:
# # Obtaining values for column and their counts
# roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [411]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [410]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_2"].notna()].index
#             if "turned informant" in df.loc[index, "info_2"]
#         ],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [409]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_list, key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [None]:
# # Example code to quick-screen values that may overlap categories
# df.loc[[index for index in df.index if "and science writer" in df.loc[index, "info"]]]

In [408]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "criminal law"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [413]:
# Creating lists for each category
politics_govt_law = [
    "criminal defendant diagnosed with multiple personality disorder",
    "World War II conscientious objector  with War Resisters League",
    "fascist leader of the Independent State of in World War II",
    "private secretary of Adolf Hitler during World War II",
    "campaigner for safety glass",
    "criminal defense attorney",  # before crime
    "Health Minister in",
    "figure and civic leader",
    "anti war campaigner",
    "civic leader and ",
    "and civic leader",
    "heir and member",
    "criminal law",
    "civic leader",
]

arts = [
    "radio and TV presenter",
    "TV presenter of MTV",
    "and TV presenter",
    "TV presenter",
]
sports = [
    "disability sport administrator",
    "Olympic gold medal winning marathon runner",
    "marathon runner and athletics coach",
    "and marathon runner",
    "Olympic marathon runner",
    "ultramarathon runner",
    "marathon runner",
    "owner of Coolmore Stud",
]
sciences = [
    "orthopaedic surgeon and pioneer in combatting polio throughusing penicillin",
    "veterinary surgeon who was the inspiration for",
    "surgeon and ulcerative colitis surgery pioneer",
    "surgeon and inventor of parenteral nutrition",
    "cardiovascular surgeon and medical pioneer",
    "surgeon and vascular surgery pioneer",
    "medical practitioner and surgeon",
    "podiatric surgeon and ufologist",
    "and renowned plastic surgeon",
    "pediatric orthopedic surgeon",
    "medical doctor and surgeon",
    "consultant cardiac surgeon",
    "oral and plastic surgeon",
    "paediatric neurosurgeon",
    "cardio thoracic surgeon",
    "cardiovascular surgeon",
    "surgeon and oncologist",
    "hepatobiliary surgeon",
    "World War II surgeon",
    "college  and surgeon",
    "plastic surgeon and",
    "cardiac surgeon and",
    "orthopaedic surgeon",
    "orthopedic surgeon",
    "transplant surgeon",
    "doctor and surgeon",
    "veterinary surgeon",
    "military surgeon",
    "vascular surgeon",
    "thoracic surgeon",
    "neurosurgeon and",
    "cardiac surgeon",
    "plastic surgeon",
    "pioneer surgeon",
    "trauma surgeon",
    "cancer surgeon",
    "dental surgeon",
    "naval surgeon",
    "neurosurgeon",
    "hand surgeon",
    "Army surgeon",
    "eye surgeon",
    "war surgeon",
    "surgeon and",
    "and surgeon",
    "surgeon",
    "alleged doctor and",
    "alleged doctor",
]

business_farming = []
academia_humanities = [
    "founder of St Antony College",
]
law_enf_military_operator = [
    "Army Air Forces airman and recipient of the Medal of Honor for his actions in World War II",
    "fighter ace during World War II and recipient of the Knight Cross of the Iron Cross",
    "fighter ace and recipient of the Knight Cross of the Iron Cross during World War II",
    "flying ace and recipient of the Knight Cross of the Iron Cross during World War II",
    "flying ace during World War II and recipient of the Knight Cross of the Iron Cross",
    "World War II veteran and hero of the battle for the Hurtgen Forest on November",
    "nazi collaborator during World War II and founding member of Front National",
    "military aviator and member of the Tuskegee Airmen during World War II",
    "SS officer and Auschwitz concentration camp doctor during World War II",
    "Navy submarine commander and Medal of Honor recipient in World War II",
    "submarine commander awarded the Medal of Honor during World War II",
    "World War II dissident who led Jews over the Pyrenees to freedom",
    "who save hundreds of Jews from the Holocaust during World War II",
    "pilot during World War II and later an officer in the Air Force",
    "pilot and navigator during World War II and Hero of the Union",
    "flying ace during World War II and record setting test pilot",
    "Air Force officer and squadron commander during World War II",
    "intelligence officer and SOE operative during World War II",
    "air marshal and an ace nightfighter pilot in World War II",
    "Medal of Honor recipient for actions during World War II",
    "Navy officer and destroyer commander during World War II",
    "naval officer and U boat commander during World War II",
    "communist leader of the Resistance during World War II",
    "officer and Colditz Castle escapee during World War II",
    "World War II veteran and recipient of the Silver Star",
    "Air Force officer and flying ace during World War II",
    "fighter pilot who served in the during World War II",
    "Air Defence Forces officer and World War II veteran",
    "rifleman with the Marine Corps during World War II",
    "U boat commander of the sunken during World War II",
    "Air Force pilot and flying ace during World War II",
    "World War II veteran and Medal of Honor recipient",
    "Navy officer and World War II submarine commander",
    "Waffen SS member and official during World War II",
    "ace during World War II and Iron Cross recipient",
    "fighter pilot and flying ace during World War II",
    "sniper during World War II and Hero of the Union",
    "flying ace during the Civil War and World War II",
    "y commander in the Waffen SS during World War II",
    "fighter ace of the Air Force during World War II",
    "Special Operations Executive during World War II",
    "World War II commanding officer of Easy Company",
    "planner of the Great Escape during World War II",
    "Imperial Army World War II intelligence officer",
    "SS officer and war criminal during World War II",
    "commander of the Waffen SS during World War II",
    "RAF officer and flying ace during World War II",
    "Air marshal and flying ace during World War II",
    "Army Air Forces flying ace during World War II",
    "World War II Jewish  fighter and anti avenger",
    "World War II ambulance driver and interpreter",
    "fighter pilot during World War II and the War",
    "Air Force officer and World War II flying ace",
    "member of the Resistance during World War II",
    "World War II veteran acquitted of war crimes",
    "Navy submarine commander during World War II",
    "bomb and mine specialist during World War II",
    "commander and flying ace during World War II",
    "Marine Corps infantryman during World War II",
    "Vice Admiral in the Navy during World War II",
    "officer in the Wehrmacht during World War II",
    "Navajo code talker and World War II veteran",
    "U boat commander in the during World War II",
    "military volunteer and World War II veteran",
    "and pilot and navigator during World War II",
    "leader of the Bielski s during World War II",
    "fighter ace of the RAAF during World War II",
    "World War II Tuskegee Airman fighter pilot",
    "Airforce Service pilot during World War II",
    "officer and flying ace during World War II",
    "Navajo prisoner of war during World War II",
    "Navy dive bomber pilot during World War II",
    "Navy officer during and after World War II",
    "RAF fighter pilot during World War II and",
    "and Chetnik commander during World War II",
    "air marshal and World War II bomber pilot",
    "Air Forces flying ace during World War II",
    "World War II fighter pilot and flying ace",
    "Oberstleutnant in the during World War II",
    "decorated bomber ace during World War II",
    "Seminole Code Talker during World War II",
    "Air Force flying ace during World War II",
    "and anti resisister during World War II",
    "and prisoner of war during World War II",
    "night fighter pilot during World War II",
    "codebreaker at Park during World War II",
    "submarine commander during World War II",
    "resistance fighter during World War II",
    "World War II bomber pilot and war hero",
    "military commander during World War II",
    "Wehrmacht officer during World War II",
    "World War II Special Operations agent",
    "Royal Marine Commando in World War II",
    "Resistance member during World War II",
    "flight lieutenant during World War II",
    "World War II Medal of Honor recipient",
    "World War II non commissioned officer",
    "war correspondent during World War II",
    "paratroop officer during World War II",
    "night fighter ace during World War II",
    "resistance leader during World War II",
    "U boat commander during World War II",
    "World War II anti Hitler conspirator",
    "military frogman during World War II",
    "flight navigator during World War II",
    "Resistance agent during World War II",
    "of the Waffen SS during World War II",
    "medical orderly during World War II",
    "World War II fighter and test pilot",
    "nazi camp guard during World War II",
    "freedom fighter during World War II",
    "communist and World War II  fighter",
    "test pilot and World War II veteran",
    "World War II veteran and fundraiser",
    "and SS captain during World War II",
    "highly decorated World War II hero",
    "Navajo code talker in World War II",
    "World War II and War fighter pilot",
    "military  during World War II and",
    "fighter pilot during World War II",
    "naval officer during World War II",
    "World War II Secret Service agent",
    "y fighter ace during World War II",
    "World War II Spitfire fighter ace",
    "pilot and World War II flying ace",
    "naval aviator during World War II",
    "Commando during World War II and",
    "collaborator during World War II",
    "SS commander during World War II",
    "distinguished World War II pilot",
    "pilot in the during World War II",
    "U boat commander in World War II",
    "bomber pilot during World War II",
    "submariner and World War II hero",
    "fighter ace during World War II",
    "World War II resistance fighter",
    "codebreaker during World War II",
    "field medic during World War II",
    "World War II nightfighter pilot",
    "Navajo World War II code talker",
    "World War II fighter pilot and",
    "and World War II fighter pilot",
    "flying ace during World War II",
    "World War II Resistance member",
    "SS officer during World War II",
    "World War II resistance leader",
    "World War II resistance worker",
    "WASP pilot during World War II",
    "World War II Waffen SS officer",
    "pilot and World War II veteran",
    "World War II air force officer",
    "World War II spy for the Union",
    "World War II Navy fighter ace",
    "SOE agent during World War II",
    "officer and World War II hero",
    "SEO agent during World War II",
    "World War II RAF airman and ",
    "World War II Air Force pilot",
    "World War II veteran and war",
    "and World War II flying ace",
    "fighter ace in World War II",
    "World War II military pilot",
    "World War II tank commander",
    "naval  World War II veteran",
    "World War II Panzer captain",
    "y pilot during World War II",
    "officer during World War II",
    "World War II fighter pilot",
    "World War II naval officer",
    "World War II cryptographer",
    "pilot during World War II",
    "World War II glider pilot",
    "Army  during World War II",
    "World War II veteran and",
    "and World War II veteran",
    "and World War II veteran",
    "World War II fighter ace",
    "World War II codebreaker",
    "World War II RAF officer",
    "World War II paratrooper",
    "World War II flying ace",
    "spy during World War II",
    "World War II air gunner",
    "officer in World War II",
    "World War II RAF airman",
    "World War II submariner",
    "World War II Flying ace",
    "World War II navigator",
    "World War II pilot ace",
    "World War II combatant",
    "World War II hero and",
    "and World War II hero",
    "World War II commando",
    "World War II spy and",
    "nun and World War II",
    "World War II veteran",
    "World War II General",
    "World War II officer",
    "World War II aviator",
    "World War II air ace",
    "World War II airman",
    "World War II marine",
    "World War II during",
    "during World War II",
    "World War II pilot",
    "World War II Army",
    "World War II ace",
    "turned informant",
    "World War II",
    "criminalist",  # before crime
]


spiritual = [
    "evangelical preacher and missionary",
    "Catholic bishop and missionary",
    "Catholic missionary and bishop",
    "Jesuit priest and missionary",
    "Mormon leader and missionary",
    "pastor and missionary",
    "priest and missionary",
    "Christian missionary",
    "Catholic missionary",
    "Salesian missionary",
    "catholic missionary",
    "Jesuit missionary",
    "Mormon missionary",
    "missionary and",
    "and missionary",
    "missionary in",
    "missionary",
]
social = []
crime = [
    "criminal and inmate of Alcatraz Penitentiary",
    "member of the Camorra criminal organisation",
    "criminal and twin brother of Reggie Kray",
    "SS officer and war criminal",
    "suspected war criminal",
    "criminal and fugitive",
    "criminal and smuggler",
    "criminal and gangster",
    "alleged war criminal",
    "computer criminal",
    "pardoned criminal",
    "Serb war criminal",
    "serial criminal",
    "cyber criminal",
    "war criminal",
    "and criminal",
    "criminal and",
    "criminalist",
    "criminal",
]
event_record_other = [
    "Siberian gulag survivor",
]
other_species = []

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [415]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
    "crime": crime,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [416]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['num_categories']!=0].sample(2)

CPU times: total: 4min 42s
Wall time: 4min 42s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
94426,4,Mohammed Inuwa Wushishi,", 81, Nigerian general, chief of Army staff .",https://en.wikipedia.org/wiki/Mohammed_Inuwa_Wushishi,8,2021,December,,,,chief of Army staff,,,,,,,,,81.0,,Nigeria,,1981 1983,2.197225,0,0,0,0,0,0,0,1,0,0,0,0,1
4778,29,Robert Levin,", 84, Norwegian classical pianist and composer.",https://en.wikipedia.org/wiki/Robert_Levin_(Norwegian_pianist),13,1996,October,,,,,,,,,,,,,84.0,,Norway,,,2.639057,0,0,0,0,0,1,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [417]:
# Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

There are 18400 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [527]:
# # Obtaining values for column and their counts
# roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [526]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [525]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_2"].notna()].index
#             if "and later a nationalist" in df.loc[index, "info_2"]
#         ],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [523]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_list, key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [522]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "social worker and later a nationalist"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [520]:
# Creating lists for each category
politics_govt_law = [
    "attorney and one of the first female District Judges",
    "attorney and Associate Justice of the Supreme Court",
    "disability rights advocate and attorney",
    "attorney and political campaign manager",
    "attorney who represented James Earl Ray",
    "civil rights attorney and Circuit Judge",
    "attorney and mentor to Erin Brockovich",
    "entertainment attorney and litigator",
    "attorney and death penalty advocate",
    "attorney and civil rights advocate",
    "attorney and intelligence expert",
    "attorney and bonsai enthusiast",
    "attorney and tax policy expert",
    "state legislator and attorney",
    "attorney and defense expert",
    "disability rights attorney",
    "real estate attorney and",
    "attorney and billionaire",
    "First Amendment attorney",
    "civil rights attorney",
    "prosecuting attorney",
    "defense attorney",
    "labor attorney",
    "and attorney",
    "attorney and",
    "attorney",
    "neoconservative advocate and",
    "traditionalist conservative",
    "conservative campaigner",
    "conservative",
    "spokesman in parliament",
    "Politician",
    "State propagandist",
    "Baloch nationalist leader",
    "black nationalist leader",
    "and later a nationalist",
    "and Islamist",
    "Islamist",
    "coupist",
    "far left",
    "neo nazi and",
    "neo nazi",
    "nationalist",
]

arts = [
    "fashion model and beauty queen",
    "beauty queen and st Miss",
    "model and beauty queen",
    "child beauty queen",
    "beauty queen and",
    "and beauty queen",
    "beauty queen",
    "newspaper columnist for the San Francisco Chronicle",
    "newspaper columnist and radio television critic",
    "television personality and magazine columnist",
    "crossword compiler and advice columnist",
    "newspaper correspondent and columnist",
    "columnist and television personality",
    "advice columnist and media celebrity",
    "syndicated gossip columnist for the",
    "newspaper columnist and humorist",
    "Pulitzer Prize winning columnist",
    "columnist and newspaper director",
    "syndicated newspaper columnist",
    "blogger and magazine columnist",
    "editor and humor columnist",
    "and newspaper columnist",
    "entertainment columnist",
    "humorist and columnist",
    "columnist and blogger",
    "newspaper columnist",
    "magazine columnist",
    "tabloid columnist",
    "gossip columnist",
    "advice columnist",
    "humor columnist",
    "and columnist",
    "columnist",
    "record producer and member of the Country Music Hall of Fame",
    "record producer who discovered Sinéad O'Connor and The Pale",
    "record producer and production company executive",
    "Hall of Fame music executive and record producer",
    "record producer and founder of Barclay Records",
    "two time Grammy Award winning record producer",
    "record producer and member of Dschinghis Khan",
    "record producer and record company executive",
    "record company executive and record producer",
    "multi instrumentalist and record producer",
    "jazz music impresario and record producer",
    "radio station owner and record producer",
    "jazz double bassist and record producer",
    "record producer and record label owner",
    "record producer and company executive",
    "Grammy Award winning record producer",
    "record producer and musical producer",
    "jazz trombonist and record producer",
    "defence analyst and record producer",
    "record producer and music executive",
    "record producer and band manager",
    "disc jockey and record producer",
    "bass player and record producer",
    "hip hop record producer and DJ",
    "hip hop record producer and MC",
    "record producer and executive",
    "Hall of Fame record producer",
    "record producer and promoter",
    "record producer and arranger",
    "record producer and lyricist",
    "jazz and R&B record producer",
    "bassist and record producer",
    "record producer and manager",
    "manager and record producer",
    "punk rock record producer",
    "hip hop record producer",
    "DJ and record producer",
    "record producer and DJ",
    "jazz record producer",
    "R&B record producer",
    "and record producer",
    "record producer and",
    "record producer",
    "harmonium and tabla player",
    "and lutenist",
    "lutenist",
]
sports = [
    "long distance swimmer and first woman to swim the Channel in both directions",
    "Olympic silver and bronze medal winning swimmer",
    "competition swimmer and Olympic silver medalist",
    "Olympic gold and silver medal winning swimmer",
    "swimmer and first Olympic swimming medallist",
    "swimmer and first woman to swim the Channel",
    "backstroke swimmer and world record holder",
    "triple gold medal winning Olympic swimmer",
    "Paralympian swimmer and wheelchair racer",
    "backstroke swimmer and Olympic champion",
    "Olympic swimmer and water polo player",
    "water polo player and Olympic swimmer",
    "Olympic silver medal winning swimmer",
    "swimmer and Olympic bronze medalist",
    "Olympic gold medal winning swimmer",
    "long distance swimmer and Olympian",
    "Olympic swimmer and swimming coach",
    "Olympic champion freestyle swimmer",
    "Olympic swimmer and national coach",
    "swimmer and Olympic gold medalist",
    "Olympic silver medalist swimmer",
    "swimmer ad world record holder",
    "Olympic long distance swimmer",
    "swimmer and Olympic champion",
    "swimmer and Olympic medalist",
    "Olympic breaststroke swimmer",
    "Olympic synchronised swimmer",
    "swimmer and swimming coach",
    "Olympic backstroke swimmer",
    "Olympic champion swimmer",
    "long distance swimmer",
    "Hall of Fame swimmer",
    "swimmer and Olympian",
    "Olympic gold swimmer",
    "breaststroke swimmer",
    "competition swimmer",
    "competitive swimmer",
    "paralympian swimmer",
    "Paralympic swimmer",
    "backstroke swimmer",
    "freestyle swimmer",
    "swimmer and coach",
    "butterfly swimmer",
    "swimmer and diver",
    "marathon swimmer",
    "Olympic swimmer",
    "Masters swimmer",
    "era swimmer",
    "swimmer",
]
sciences = [
    "medical practitioner",
]

business_farming = [
    "gambling tycoon",
]
academia_humanities = [
    "grammarian",
    "musicologist and expert on Erik Satie",
    "ethnomusicologist and musicologist",
    "folklorist and enthomusicologist",
    "theorist and musicologist",
    "musicologist and ethnographer",
    "hymnologist and musicologist",
    "musicologist and folklorist",
    "folklorist and musicologist",
    "librarian and musicologist",
    "and ethnomusicologist",
    "ethnomusicologist and",
    "ethnomusicologist",
    "and musicologist",
    "musicologist and",
    "musicologist",
    "Islamicist",  # before spiritual
    "Scholer and",
]
law_enf_military_operator = [
    "militant leader of the Vilayat Dagestan",
    "militant and bodyguard",
    "and militant",
    "revolutionary and  militant",
    "militant in al Qaeda wing",
    "independentist militant",
    "independence militant",
    "nationalist militant",
    "extremist militant",
    "Montoneros militant",
    "militant separatist",
    "militant commander",
    "Lehi militant and",
    "Taliban militant",
    "militant leader",
    "ISIS militant",
    "pro militant",
    "and militant",
    "militant",
    "and leader of the Boricua Popular Army",
    "and leader of the Grey Wolves",
]
spiritual = [
    "tarot card reader",
    "Catholic cardinal and former archbishop of Manila",
    "Catholic cardinal and archbishop",
    "Coptic Catholic cardinal",
    "Catholic cardinal",
    "Islamic  religious leader",
    "Islamic spiritual leader",
    "Arabian imam and Islamic",
    "Islamic religious leader",
    "Arabian Islamic cleric",
    "and Islamic preacher",
    "Shi'a Islamic leader",
    "Tatar Islamic cleric",
    "Islamic Sufi leader",
    "Islamic leader and",
    "Islamic preacher",
    "Arabian Islamic",
    "Islamic  cleric",
    "Islamic science",
    "Islamic cleric",
    "Islamic leader",
    "Islamic legal",
    "Islamic",
    "Sunni Muslim  mufti",
    "Sunni Arab cleric",
    "Sunni Muslim",
    "Sunni Islam",
    "Sunni",
]
social = []
crime = [
    "terrorist and bomb maker",
    "stock market fraudster",
    "fraudster and kidnapper",
    "suspected fraudster",
    "game show fraudster",
    "fraudster",
]
event_record_other = [
    "civilian kidnapped and murdered by militants in the West Bank city of Ramallah",  # before law_enfor_military_operator
    "Islamic jihad hostage",  # before spiritual
]
other_species = []

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [521]:
# Combining separate lists into one dictionary
known_for_dict = {
    "event_record_other": event_record_other,
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [None]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['num_categories']!=0].sample(2)

#### Checking the Number of Rows without a First Category

In [None]:
# Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

In [418]:
print("dunzo!")

# Sound notification when cell executes
chime.success()

dunzo!


<IPython.core.display.Javascript object>

#### Finding `known_for` Roles in `info_2`

In [None]:
# Obtaining values for column and their counts
roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

In [None]:
# Code to check each value
roles_list.pop()

In [None]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_2"].notna()].index
#             if "general" in df.loc[index, "info_2"]
#         ],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

In [None]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_list, key=lambda x: len(x), reverse=True)

In [None]:
# # Example code to quick-screen values that may overlap categories
# df.loc[[index for index in df.index if "and science writer" in df.loc[index, "info"]]]

In [None]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "outlaw country music singer songwriter"]

#### Creating Lists for Each `known_for` Category

In [None]:
# Creating lists for each category
politics_govt_law = []

arts = []
sports = [


]
sciences = []

business_farming = []
academia_humanities = []
law_enf_military_operator = []
spiritual = []
social = []
crime = []
event_record_other = []
other_species = []

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [None]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

#### Extracting Category from `info_2`

In [None]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['num_categories']!=0].sample(2)

#### Checking the Number of Rows without a First Category

In [None]:
# Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Observations:
- It is time to export our dataframe and start a new notebook.

### Exporting Dataset to SQLite Database [wp_life_expect_clean11.db]()

In [None]:
# # Exporting dataframe

# # Saving dataset in a SQLite database
# conn = sql.connect("wp_life_expect_clean.db")
# df.to_sql("wp_life_expect_clean", conn, index=False)

# # Chime notification when cell executes
# chime.success()

# [Proceed to Data Cleaning Part 11]()