# Wikipedia Notable Life Expectancies
# [Notebook  14: Data Cleaning Part 13](https://github.com/teresahanak/wikipedia-life-expectancy/blob/main/wp_life_expect_data_clean13_thanak_2022_08_07.ipynb)
### Context

The
### Objective

The
### Data Dictionary
- Feature: Description

### Importing Libraries

In [1]:
# To structure code automatically
%load_ext nb_black

# To import/export sqlite databases
import sqlite3 as sql

# To save/open python objects in pickle file
import pickle

# To help with reading, cleaning, and manipulating data
import pandas as pd
import numpy as np
import re

# To define maximum number of columns to be displayed in a dataframe
pd.set_option("display.max_columns", None)
# To define the maximum number of rows to be displayed in a dataframe
pd.set_option("display.max_rows", 200)

# To supress warnings
# import warnings

# warnings.filterwarnings("ignore")

# To set some visualization attributes
pd.set_option("max_colwidth", 150)

# To play auditory cue when cell has executed, has warning, or has error and set chime theme
import chime

chime.theme("zelda")

<IPython.core.display.Javascript object>

## Data Overview

### [Reading](), Sampling, and Checking Data Shape

In [2]:
# Reading the dataset
conn = sql.connect("wp_life_expect_clean12.db")
data = pd.read_sql("SELECT * FROM wp_life_expect_clean12", conn)

# Making a working copy
df = data.copy()

# Checking the shape
print(f"There are {df.shape[0]} rows and {df.shape[1]} columns.")

# Checking first 2 rows of the data
df.head(2)

There are 98056 rows and 38 columns.


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
0,1,William Chappell,", 86, British dancer, ballet designer and director.",https://en.wikipedia.org/wiki/William_Chappell_(dancer),21,1994,January,,,,ballet designer and director,,,,,,,,,86.0,,United Kingdom of Great Britain and Northern Ireland,,,3.091042,0,0,0,0,0,1,0,0,0,0,0,0,1
1,1,Raymond Crotty,", 68, Irish economist, writer, and academic.",https://en.wikipedia.org/wiki/Raymond_Crotty,12,1994,January,,,,writer,and academic,,,,,,,,68.0,,Ireland,,,2.564949,0,0,0,0,0,0,0,0,1,0,0,0,1


<IPython.core.display.Javascript object>

In [3]:
# Checking last 2 rows of the data
df.tail(2)

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
98054,9,Aamir Liaquat Hussain,", 50, Pakistani journalist and politician, MNA .",https://en.wikipedia.org/wiki/Aamir_Liaquat_Hussain,99,2022,June,", since",,,MNA,,,,,,,,,50.0,,Pakistan,,"2002 2007, since 2018",4.60517,0,0,0,0,0,1,0,0,1,0,0,0,2
98055,9,Zou Jing,", 86, Chinese engineer, member of the Chinese Academy of Engineering.",https://en.wikipedia.org/wiki/Zou_Jing_(engineer),3,2022,June,,,,member of the Academy of Engineering,,,,,,,,,86.0,,"China, People's Republic of",,,1.386294,1,0,0,0,0,0,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

In [4]:
# Checking a sample of the data
df.sample(5)

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
31734,21,Russell Ash,", 64, British writer and publisher .",https://en.wikipedia.org/wiki/Russell_Ash,3,2010,June,,,,,,,,,,,,,64.0,,United Kingdom of Great Britain and Northern Ireland,,,1.386294,0,0,0,0,0,1,0,0,0,0,0,0,1
65977,6,Karin Dor,", 79, German actress .",https://en.wikipedia.org/wiki/Karin_Dor,6,2017,November,", ,",,,,,,,,,,,,79.0,,Germany,,", ,",1.94591,0,0,0,0,0,1,0,0,0,0,0,0,1
83593,15,Dick Coury,", 90, American football coach .",https://en.wikipedia.org/wiki/Dick_Coury,11,2020,August,Philadelphia Eagles,,,,,,,,,,,,90.0,,United States of America,,Philadelphia Eagles,2.484907,0,0,0,0,0,0,1,0,0,0,0,0,1
80426,1,Jaxon Buell,", 5, American microhydranencephaly victim.",https://en.wikipedia.org/wiki/Jaxon_Buell,4,2020,April,,,,,,,,,,,,,5.0,,United States of America,,,1.609438,0,0,0,0,0,0,0,0,0,0,1,0,1
90486,4,Kalipatnam Ramarao,", 96, Indian poet and writer.",https://en.wikipedia.org/wiki/Kalipatnam_Ramarao,7,2021,June,,,,,,,,,,,,,96.0,,India,,,2.079442,0,0,0,0,0,1,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

### Checking Data Types, Duplicates, and Null Values

In [5]:
# Checking data types and null values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98056 entries, 0 to 98055
Data columns (total 38 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   day                        98056 non-null  object 
 1   name                       98056 non-null  object 
 2   info                       98056 non-null  object 
 3   link                       98056 non-null  object 
 4   num_references             98056 non-null  int64  
 5   year                       98056 non-null  int64  
 6   month                      98056 non-null  object 
 7   info_parenth               36661 non-null  object 
 8   info_1                     23 non-null     object 
 9   info_2                     98024 non-null  object 
 10  info_3                     48896 non-null  object 
 11  info_4                     10264 non-null  object 
 12  info_5                     1265 non-null   object 
 13  info_6                     181 non-null    obj

<IPython.core.display.Javascript object>

#### Observations:
- With our dataset loaded, we can pick up where we left off with extracting known_for values by rebuilding `known_for_dict`.

### Extracting `known_for` Continued

#### Finding `known_for` Roles in `info_2`

In [6]:
# Obtaining values for column and their counts
roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [7]:
# Code to check each value
roles_list.pop()

''

<IPython.core.display.Javascript object>

In [8]:
# Create specific_roles_list for above popped value
specific_roles_list = (
    df.loc[
        [
            index
            for index in df[df["info_2"].notna()].index
            if "nazi" in df.loc[index, "info_2"]
        ],
        "info_2",
    ]
    .value_counts()
    .index.tolist()
)

<IPython.core.display.Javascript object>

In [9]:
# Viewing list sorted by descending length to copy to dictionary below and screen values
sorted(specific_roles_list, key=lambda x: len(x), reverse=True)

['nazi eugenicist', 'nazi']

<IPython.core.display.Javascript object>

In [10]:
# Example code to quick-check a specific entry
df[df["info_2"] == "nazi"]

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
5177,21,Hans Egon Holthusen,", 83, German nazi, writer and academic.",https://en.wikipedia.org/wiki/Hans_Egon_Holthusen,4,1997,January,,,nazi,writer and academic,,,,,,,,,83.0,,Germany,,,1.609438,0,0,0,0,0,0,0,0,0,0,0,0,0
7609,15,Gunter d'Alquen,", 87, German nazi correspondent.",https://en.wikipedia.org/wiki/Gunter_d%27Alquen,6,1998,May,,,nazi,,,,,,,,,,87.0,,Germany,,,1.94591,0,0,0,0,0,1,0,0,0,0,0,0,1
11847,28,Heinrich Schmidt,", 88, German nazi physician.",https://en.wikipedia.org/wiki/Heinrich_Schmidt_(SS_doctor),5,2000,November,,,nazi,,,,,,,,,,88.0,,Germany,,,1.791759,1,0,0,0,0,0,0,0,0,0,0,0,1
12948,18,Karl Friedrich Titho,", 90, German nazi criminal.",https://en.wikipedia.org/wiki/Karl_Friedrich_Titho,5,2001,June,,,nazi,,,,,,,,,,90.0,,Germany,,,1.791759,0,0,0,0,0,0,0,0,0,1,0,0,1


<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [11]:
# Creating lists for each category
politics_govt_law = [
    "advocate for raising awareness of epidermolysis bullosa",
    "advocate for the homeless and mentally ill",
    "CDC spokesperson and anti tobacco advocate",
    "advocate for women and transgender rights",
    "atheism and reproductive rights advocate",
    "advocate for the rights of prostitutes",
    "advocate for women and migrant workers",
    "cystic fibrosis assistance advocate",
    "councillor and community advocate",
    "breast cancer awareness advocate",
    "advocate of freedom of the press",
    "migrant workers' rights advocate",
    "and anti death penalty advocate",
    "and mental health care advocate",
    "disabled people rights advocate",
    "advocate for disability rights",
    "and indigenous rights advocate",
    "nuclear arms control advocate",
    "advocate for homeless rights",
    "transgender rights advocate",
    "consumer and women advocate",
    "endangered species advocate",
    "disability rights advocate",
    "prisoners' rights advocate",
    "indigenous rights advocate",
    "renewable energy advocate",
    "Rohingya rights advocate",
    "workers' rights advocate",
    "Tolowa cultural advocate",
    "consumer rights advocate",
    "women equality advocate",
    "social justice advocate",
    "cancer patient advocate",
    "women literacy advocate",
    "migrant worker advocate",
    "patient rights advocate",
    "and Men rights advocate",
    "animal rights advocate",
    "mental health advocate",
    "tribal rights advocate",
    "independence advocate",
    "advocate for the deaf",
    "anti smoking advocate",
    "euthanasia advocate",
    "insurance advocate",
    "advocate for Jews",
    "cycling advocate",
    "health advocate",
    "rights advocate",
    "autism advocate",
    "women advocate",
    "peace advocate",
    "LGBT advocate",
    "advocate",
    "chairwoman of the Nottawaseppi Huron Band of Potawatomi since",
    "first woman Treasurer of the",
    "trans woman pioneer",
    "Warumungu woman",
    "stateswoman",
    "acting Chief Justice of the Supreme Court of",  # before arts
    "nazi eugenicist",
    "nazi",
]

arts = [
    "traditional waka builder",
    "theatrical set builder",
    "shoemaker and bootmaker",
    "traditional cheesemaker",
    "guitar equipment maker",
    "waterfowl decoy maker",
    "and furniture maker",
    "MC and track maker",
    "paper doll maker",
    "organ maker and",
    "woodwind maker",
    "recorder maker",
    "bootmaker and",
    "bagpipe maker",
    "guitar maker",
    "cabinetmaker",
    "cheesemaker",
    "print maker",
    "knife maker",
    "movie maker",
    "screenmaker",
    "watchmaker",
    "kite maker",
    "glassmaker",
    "moviemaker",
    "sign maker",
    "shoemaker",
    "toy maker",
    "dollmaker",
    "trans woman and performer",
    "anchorwoman",
    "and acting coach",
    "acting coach",
    "acting",
]

sports = [
    "builder and team owner",
    "founder of Lola Cars",
    "sharpshooter",
    "racewalker",
    "Hall of Fame track and field coach",
    "track and field coach and",
    "track and field coach",
]
sciences = [
    "Hall of Fame NASCAR engine",
    "race car builder and",
    "yacht and boat builder",
    "framebuilder",
    "frame maker",
    "macrobiotic diet advocate",  # before politics_govt_law
    "privacy",
    "primatologist",
    "optometrist",
    "lichenologist",
]

business_farming = [
    "Chicago area home builder",
    "watch manufacturer",
    "importer",
    "wine maker",
    "gunmaker",
    "carmaker",
    "vegetarianism advocate",  # before politics_govt_law
    "co founder of Nike",
]
academia_humanities = ["polymath"]
law_enf_military_operator = [
    "chief bombmaker of Hamas",
    "chief executioner of Virginia",
    "only woman to serve in the Foreign Legion",
    "servicewoman during WWII",
    "former comfort woman",
    "servicewoman",
    'battalion commander known as "the executioner of Fort Zeelandia"',
    "former commander in chief of the Strategic Air Command",
    "former commander of the Corps of Gendarmerie of City",
    "warlord and commander of Federal Security Service",
    "divisional commander in the Waffen SS during WWII",
    "commander of the Waffen SS during Worls War II",
    "commander of the SAS during WWII",
    "commander of the Liberation Army",
    "commander in the Royal Air Force",
    "Revolutionary Guard commander",
    "former labor camp commander",
    "separatist field commander",
    "Provisional IRA commander",
    "DPR separatist commander",
    "Army brigade commander",
    "rebel field commander",
    "mujahideen commander",
    "air force commander",
    "WWII tank commander",
    "Navy SEAL commander",
    "al Qaeda commander",
    "guerilla commander",
    "militia commander",
    "Taliban commander",
    "U boat commander",
    "rebel commander",
    "field commander",
    "South commander",
    "army commander",
    "Lehi commander",
    "ISIL commander",
    "PIRA commander",
    "FARC commander",
    "IRA commander",
    "SAS commander",
    "commander",
    "field marshal and chief of Army Staff",
    "Army field marshal",
    "field marshal",
    "firefighter",
    "Māori navigator and",
]
spiritual = ["occultist  practitioner", "occultist", "occult", "spiritualist"]
social = [
    "youth movement founder",
    "literacy advocate",  # before politics_govt_law
]
crime = [
    "concentration camp",
]
event_record_other = [
    "homemaker",
    "domestic abuse survivor",
    "Guinness World Record holder for heaviest woman",
    "woman whose adoption led to ethnic riots in in",
    "brain dead woman kept alive to give birth",
    "woman believed to be world oldest mother",
    "woman with rare slow aging condition",
    "heaviest woman in the world",
    "tallest woman in the world",
    "transgender woman",
    "tattooed woman",
    "mauled woman",
]
other_species = []

<IPython.core.display.Javascript object>

In [12]:
# # Example code to quickly sort list in correct descending length search order to copy to dictionary
# temp = sorted(list(set(law_enf_military_operator)), key=lambda x: len(x), reverse=True)
# temp

<IPython.core.display.Javascript object>

In [13]:
# Hard-coding info_2 value for entry to categorize as business_farming
index = df[df["link"] == "https://en.wikipedia.org/wiki/Harry_Henshel"].index
df.loc[index, "info_2"] = "watch manufacturer"

# Hard-coding info_2 value for entry to categorize as crime
index = df[df["link"] == "https://en.wikipedia.org/wiki/Venero_Mangano"].index
df.loc[index, "info_2"] = "mafia"

<IPython.core.display.Javascript object>

In [14]:
# Dropping entry that points to event page rather than individual's page and resetting index
index = df[
    df["link"] == "https://en.wikipedia.org/wiki/Death_of_Paolo_Gislimberti"
].index
df.drop(index, inplace=True)
df.reset_index(inplace=True, drop=True)

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [15]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "sports": sports,
    "politics_govt_law": politics_govt_law,
    "arts": arts,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [16]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['num_categories']!=0].sample(2)

CPU times: total: 1min 34s
Wall time: 1min 34s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
43952,6,Senji Yamaguchi,", 82, Japanese atomic bomb survivor and anti-war activist.",https://en.wikipedia.org/wiki/Senji_Yamaguchi,3,2013,July,Nagasaki,,,,,,,,,,,,82.0,,Japan,,Nagasaki,1.386294,0,0,0,0,0,0,0,0,1,0,0,0,1
64566,22,Haddon Robinson,", 86, American author and academic, interim president of Gordon-Conwell Theological Seminary.",https://en.wikipedia.org/wiki/Haddon_Robinson,7,2017,July,,,,interim president of Gordon Conwell Theological Seminary,,,,,,,,,86.0,,United States of America,,,2.079442,0,0,0,1,0,1,0,0,0,0,0,0,2


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [17]:
# Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

There are 2537 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [18]:
# # Obtaining values for column and their counts
# roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [19]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [20]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_2"].notna()].index
#             if "ace" in df.loc[index, "info_2"]
#         ],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [21]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_list, key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [22]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "race"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [23]:
# Creating lists for each category
politics_govt_law = [
    "politologist",
    "banking regulator",
    "econo",
    "elder",
    "social reformer",
    "and member of n State Duma",
    "claimant to headship of the House of Bourbon Two Sicilies",
    "claimant to the headship of the House of Romanov",
    "labor mediator",
    "plaintiff in landmark 'palimony' lawsuit",
    "international law expert",
    "born constitutional law",
    "constitutional law",
    "international law",
    "Labour Member of Parliament for Dunfermline and West Fife",
    "Minister of Labour",
    "Labour",
    "public",
    "Economics",
    "co founder of Greenpeace",
    "peace",
]

arts = [
    "antique dealer and ologist",
    "horologist",
    "variety performer with his brother Bob as half of Bob and Alf Pearson",
    "hustler and nightclub performer",
    "San Francisco street performer",
    "circus performer and owner",
    "Villu Paatu performer",
    "vaudeville performer",
    "burlesque performer",
    "gamelan performer",
    "circus performer",
    "kabuki performer",
    "Kunqu performer",
    "drag performer",
    "performer",
    "website pioneer of ethnic media in",
    "media proprietor",
    "Cheyenne flutist",
    "concert flutist",
    "flutist",
    "newspaper proprietor",
    "newsreel narrator",
    "newspaper founder",
    "newspaper website",
    "newspaper pioneer",
    "newspaper owner",
    "news cameraman",
    "and newsreader",
    "newspaperman",
    "newsreader",
    "newspaper",
]
sports = [
    "figure skating competitor",
    "and figure skating coach",
    "figure skating trainer",
    "figure skating coach",
    "ice skating trainer",
    "speed skating coach",
    "race caller",
]
sciences = [
    "jet engine pioneer",
    "conchologist and malacologist",
    "taxonomist and carcinologist",
    "pulmonologist and internist",
    "cytologist and phycologist",
    "paediatric rheumatologist",
    "echinodermologist and",
    "vintner and",
    "pomologist and apple",
    "dendrochronologist",
    "seed technologist",
    "food technologist",
    "nanotechnologist",
    "biogerontologist",
    "anesthesiologist",
    "geomorphologist",
    "biotechnologist",
    "helminthologist",
    "phenomenologist",
    "parasitologist",
    "rheumatologist",
    "ophthamologist",
    "gerontologist",
    "pulmonologist",
    "carcinologist",
    "orchidologist",
    "methodologist",
    "suicidologist",
    "malacologist",
    "nematologist",
    "hepatologist",
    "technologist",
    "polemologist",
    "dendrologist",
    "speleologist",
    "limnologist",
    "hydrologist",
    "cosmologist",
    "phycologist",
    "metrologist",
    "leprologist",
    "andrologist",
    "petrologist",
    "audiologist",
    "sexologist",
    "ethologist",
    "oenologist",
    "pedologist",
    "metrician",
    "obstetrician",
    "futurist",
    "software developer",
    "opinion pollster",
    "pollster",
    "physical therapist",
    "family therapist",
    "physiotherapist",
    "sex therapist",
    "therapist",
    "spacesuit technician",
    "space",
]

business_farming = [
    "bookmaker",
    "vineyard owner and vintner",
    "Napa Valley vintner",
    "vintner",
    "duty free retailer",
    "clothing retailer",
    "luxury retailer",
    "retailer",
    "venture capitalist and",
    "venture capitalist",
    "shipowner",
    "pharmaceutical company founder",
    "pharmaceutical",
]
academia_humanities = [
    "museum specialist",
    "Slavist and Balkanologist",
    "Kremlinologist",
    "vexillologist",
    "theatrologist",
    "Assyriologist",
    "Hittitologist",
    "assyriologist",
    "hittitologist",
    "Tibetologist",
    "Albanologist",
    "egyptologist",
    "Japanologist",
    "papyrologist",
    "polemologist",
    "Buddhologist",
    "sumerologist",
    "tibetologist",
    "lexicologist",
    "Iranologist",
    "museologist",
    "patrologist",
    "mythologist",
    "semiologist",
    "Turkologist",
    "anthologist",
    "indologist",
    "logologist",
    "genealogist and heraldist",
    "heraldist",
    "Arabist",
    "mediaevalist",
    "Wikimedian",
    "phonetician",
]
law_enf_military_operator = [
    "cryptologist",
    "operative",
    "Ulster loyalist and UVF member",
    "Ulster loyalist",
    "loyalist",
    "warlord",
    "retired Navy Vice Admiral",
    "Navy Vice Admiral",
    "Vice Admiral",
    "brother in law of Osama bin Laden",
    "law enforcement official",
    "outlaw biker",
    "lawman",
    "member of the Resistance during WW II",
    "member of the Resistance during WWII",
    "a member of the Resistance",
    "member of the Resistance",
    "anti Castro mercenary",
    "mercenary",
    "bodyguard",
    "last surviving World War I fighter ace",
    "fighter ace during World War ||",
    "Air Force major and flying ace",
    "flying ace during World War I",
    "Army Air Forces fighter ace",
    "Army Air Force fighter ace",
    "Army Air Forces flying ace",
    "flying ace of the War",
    "Air Force flying ace",
    "WWII fighter ace",
    "WWII flying ace",
    "War flying ace",
    "Air Force ace",
    "fighter ace",
    "flying ace",
]
spiritual = [
    "demonologist",
    "spiritual guardian of Mount Merapi",
    "faith healer",
    "spiritual guru",
    "Grand Ayatollah",
]
social = []
crime = []
event_record_other = [
    "woman",
    "claimant to the title of world oldest person",
    "unverified claimant for world oldest person",
    "alien abduction claimant",
    "longevity claimant",
    "girl whose severe swimming pool injury led to tougher laws",
    "lawn mower traveler",
    "airplane passenger fatally shot by Air Marshals after allegedly claiming he had placed a bomb aboard",
    "face transplant recipient",
]
other_species = [
    "Hindu sacred bull",
    "oldest living polar bear",
    "grizzly bear",
    "polar bear",
    "tortoise claimant for world oldest animal",
    "Thoroughbred race horse and sire",
    "Thoroughbred race horse",
    "thoroughbred race horse",
    "race horse",
]

<IPython.core.display.Javascript object>

In [24]:
# # Example code to quickly sort list in correct descending length search order to copy to dictionary
# temp = sorted(list(set(law_enf_military_operator)), key=lambda x: len(x), reverse=True)
# temp

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [25]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [26]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['num_categories']!=0].sample(2)

CPU times: total: 1min 49s
Wall time: 1min 49s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
22534,18,Ruth Bernhard,", 101, American photographer, natural causes.",https://en.wikipedia.org/wiki/Ruth_Bernhard,32,2006,December,,,,natural causes,,,,,,,,,101.0,,United States of America,,,3.496508,0,0,0,0,0,1,0,0,0,0,0,0,1
40923,4,Paul Marcotte,", 84, American politician, member of the Kentucky House of Representatives .",https://en.wikipedia.org/wiki/Paul_Marcotte,3,2012,December,,,,member of the House of Representatives,,,,,,,,,84.0,,United States of America,,2005 2007,1.386294,0,0,0,0,0,0,0,0,1,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [27]:
# Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

There are 2225 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [28]:
# # Obtaining values for column and their counts
# roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [29]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [30]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_2"].notna()].index
#             if "car dealer" in df.loc[index, "info_2"]
#         ],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [31]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_list, key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [32]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "milliner"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [33]:
# Creating lists for each category
politics_govt_law = [
    "regulator",
    "law",
    "constitutional",
    "hacker",
    "former press secretary for Richard Nixon during the Watergate Scandal",
    "secretary and mistress of Reichsführer SS Heinrich Himmler",
    "secretary who took Adolf Hitler last will and testament",
    "national security adviser and former foreign secretary",
    "Special Assistant and secretary to John F Kennedy",
    "private secretary to Queen Elizabeth II",
    "first secretary of the Leningrad obkom",
    "secretary of foreign affairs",
    "press secretary",
    "heir to the Sarawakan throne",
    "colonial official",
    "colonial governor",
    "anti colonialist",
    "colonial",
    "Trotskyist and one of the founders of the Revolutionary Socialist League",
    "Trotskyist",
    "international relations",
    "Foreign Service Officer and ambassador to and",
    "ambassador to the United Nations",
    "United Nations ambassador",
    "former ambassador to",
    "Arabian ambassador",
    "ambassador to the",
    "and ambassador",
    "ambassador to",
    "ambassador",
    "detainee in Guantanamo Bay Detention Camp",
    "Arabian Guantanamo Bay detainee",
    "former Guantanamo Bay detainee",
    "ex detainee",
    "detainee",
    "Liberal Democrat Member of Parliament",
    "Conservative Member of Parliament",
    "Member of Parliament",
]

arts = [
    "maker",
    "organ builder",
    "yurt builder",
    "glass engraver",
    "media",
    "opera",
    "fashion house owner",
    "fashion promoter",
    "fashion stylist",
    "fashion pioneer",
    "fashion",
    "ballet master and",
    "ballet master",
    "and ballet",
    "ballet",
    "Māori muralist",
    "muralist",
    "pioneer in visual effects and computer animation",
    "virtuoso harmonica player",
    "harmonica player",
    "SeaWorld trainer",
    "animal trainer",
    "litterateur",
    "pornographer",
    "R&B",
    "textile weaver and dyer",
    "Māori master weaver",
    "tapestry weaver",
    "Tlingit weaver",
    "carpet weaver",
    "basketweaver",
    "weaver",
    "punk impresario and club owner",
    "theatrical impresario",
    "nightclub impresario",
    "nightlife impresario",
    "cabaret impresario",
    "impresario",
    "wildlife cameraman",
    "Reuters cameraman",
    "cameraman",
    "sarod player",
    "hairstylist",
    "classic Broadway star and father of Bonnie Raitt",
    "action movie star",
    "reality TV star",
    "movie star",
    "co star of",
    "porn star",
    "bassoonist",
    "screenwriting guru",
    "screen",
    "milliner",
]
sports = [
    "strength coach",
    "Muay fighter",
    "MMA fighter",
    "Muay master and trainer",
    "harness racing trainer",
    "Thoroughbred trainer",
    "greyhound trainer",
    "fitness trainer",
    "fencing trainer",
    "rowing coxswain",
    "rowing champion",
    "former National Football League player",
    "ex National Football League player",
    "National Football League player",
    "ski mogul",
    "former Football League player and NHL referee",
    "former National Football League player",
    "ex National Football League player",
    "National Football League player",
    "Football League player",
    "professional basketball referee",
    "basketball referee and coach",
    "basketball referee",
    "Hall of Fame softball player",
    "softball player",
    "motor racing team owner and constructor",
    "drag racing and hot rod pioneer",
    "motor racing team owner",
    "auto racing team owner",
    "car racing team owner",
    "auto racing pioneer",
    "car racing promoter",
    "racing team owner",
    "racing crew chief",
    "mountain biker",
    "mountain guide",
    "basketball star and innovator",
    "Hall of Fame rodeo cowboy",
    "rodeo cowboy",
    "rodeo rider",
    "rodeo",
]
sciences = [
    "car builder",
    "and enologist",
    "anatomist",
    "Internet software pioneer and computer prodigy",
    "computer expert",
    "quantum physics",
    "physics",
    "elephant expert",
    "CERN secretary",
    "herbalist",
    "Scientist and Medical Researcher",
    "mathematical",
    "game developer",
    "clinician",
    "Nurse Corps",
    "hospital corpsman",
    "genealogist",
]

business_farming = [
    "oilman",
    "real estate tycoon",
    "timber tycoon",
    "tire tycoon",
    "cattle baron",
    "real estate mogul",
    "automotive dealer",
    "insurance broker",
    "clothier and",
    "clothier",
    "whisky distiller",
    "drink distiller",
    "distiller",
    "car dealer",
]
academia_humanities = [
    "heraldry",
    "Esperantist",
    "Germanist",
    "Hellenist",
    "ethnographer",
]
law_enf_military_operator = [
    "East border guard",
    "Waffen SS  guard",
    "security guard",
    "guard",
    "security detective",
    "founder of Al Qaeda",
    "Al Qaeda",
    "Field Marshal",
    "Tzotzil Zapatista rebel",
    "rebel",
    "al Qaeda fighter in Chechnya",
    "oil well fire fighter",
    "independence fighter",
    "WW Special Forces saboteur and trainer of Violette Szabó",
    "resistance trainer",
    "ISIS trainer",
    "sergeant and recipient of the Victoria Cross",
    "war hero and recipient of the Victoria Cross",
    "Gurkha and recipient of the Victoria Cross",
    "Pashtun recipient of the Victoria Cross",
    "recipient of the Victoria Cross",
    "Navy chief cryptologic technician",
    "Navy Commander in Chief",
    "Secretary of the Navy",
    "Director of the Navy",
    "Navy Master Diver",
    "Navy SEAL sniper",
    "head of the Navy",
    "Navy captain",
    "Navy SEAL",
    "Navy",
    "bibliographer",
    "intelligence official who was Deputy Director of the Central Intelligence Agency",
    "intelligence official in the Central Intelligence Agency",
    "former head of East y secret intelligence service",
    "army Green Beret and intelligence sergeant major",
    "intelligence chief of the National Authority",
    "Chief of intelligence of South",
    "head of intelligence services",
    "imagery intelligence analyst",
    "intelligence specialist",
    "intelligence official",
    "intelligence analyst",
    "intelligence chief",
    "Air Force navigator",
    "flight navigator",
    "navigator",
    "marshal of the air force",
    "Air Force marshal",
    "air vice marshal",
    "vice marshal",
    "marshal",
]
spiritual = [
    "Hindu reformer",
    "Hindu guru",
    "Hinduism",
    "Hindu",
    "fortune teller made famous in Bruce Springsteen",
    "fortune teller",
    "ecclesiastical",
    "grand ayatollah",
    "Shi'i ayatollah",
    "ayatollah",
    "and a founding father of the New Age movement",
    "yogi and guru",
    "yogi",
    "Chabad Lubavitch Rabbi and secretary to Menachem Schneerson",
    "Lakota medicine man",
    "Twelver Marja'",
    "Marja'",
    "New Testament",
]
social = []
crime = [
    "failed suicide bomber",
    "organized crime figure in Detroit",
    "organized crime figure",
    "organised crime figure",
    "crime figure",
]
event_record_other = ["homeless man"]
other_species = [
    "Humboldt penguin",
    "penguin",
    "foaled thoroughbred horse",
    "thoroughbred horse",
    "Standardbred harness racing stallion",
    "breeding stallion and racing trotter",
    "barrel racing horse",
    "racing thoroughbred",
    "dressage horse and sire",
    "dressage horse",
    "canine star of sit com Eddie",
    "Great Dane",
]

<IPython.core.display.Javascript object>

In [34]:
# # Example code to quickly sort list in correct descending length search order to copy to dictionary
# temp = sorted(list(set(law_enf_military_operator)), key=lambda x: len(x), reverse=True)
# temp

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [35]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [36]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['num_categories']!=0].sample(2)

CPU times: total: 2min 8s
Wall time: 2min 8s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
80318,28,Thomas Schäfer,", 54, German politician, Minister of Finance in Hesse , suicide by train.",https://en.wikipedia.org/wiki/Thomas_Sch%C3%A4fer,11,2020,March,since,,,Minister of Finance in Hesse,suicide by train,,,,,,,,54.0,,Germany,,since 2010,2.484907,0,0,0,0,0,0,0,0,1,0,0,0,1
1744,5,Jimmy Allen,", 85, English footballer and football manager.","https://en.wikipedia.org/wiki/Jimmy_Allen_(footballer,_born_1909)",8,1995,February,,,,,,,,,,,,,85.0,,United Kingdom of Great Britain and Northern Ireland,,,2.197225,0,0,0,0,0,0,1,0,0,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [37]:
# Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

There are 1901 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [38]:
# # Obtaining values for column and their counts
# roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [39]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [40]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_2"].notna()].index
#             if "protest" in df.loc[index, "info_2"]
#         ],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [41]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_list, key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [42]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "protest"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [43]:
# Creating lists for each category
politics_govt_law = [
    "herald",
    "foreign policy analyst",
    "policy analyst",
    "food analyst",
    "baroness and",
    "baroness",
    "baronet",
    "baron",
    "transportation official",
    "transportation and agri",
    "Minister of Transport",
    "Supreme Court justice",
    "first female justice",
    "High Court justice",
    "associate justice",
    "justice",
    "secretary",
    "deputy tribal chief of the Nation",
    "traditional tribal chief",
    "Athabascan tribal chief",
    "Seminole tribal chief",
    "Mohegan tribal chief",
    "Wayana tribal chief",
    "tribal chief",
    "women rights pioneer",
    "activist",
    "implicated in the murder of Daniel Pearl",
    "industry communicator",
    "proliferation expert",
    "food security expert",
    "Minister of Communications",
    "Commissioner of Education",
    "Minister of Education",
    "Education Minister",
    "government adviser of Milton Obote and Idi Amin",
    "scientific adviser",
    "adviser",
    "libertarian",
    "tax protestor",
    "protestor",
    "butler and staffer",
    "butler",
    "finance official",
    "protest",
]

arts = [
    "ologist",
    "cultural analyst",
    "antiques expert",
    "rock concert promoter",
    "concert promoter",
    "landscape gardener",
    "gardener",
    "woodcarver",
    "Carna",
    "beer expert",
    "tailor",
    "blacksmith",
    "luthier who buildt archtop guitars",
    "luthier",
    "rock multi instrumentalist",
    "promoter of punk rock",
    "rock 'n' roll groupie",
    "rock concert promoter",
    "rock and roll",
    "promoter of LGBTQ cinema",
    "cinema  pioneer",
    "cinema",
    "festival organizer",
    "horn player",
    "photography",
    "actor",
    "dialect coach",
]
sports = [
    "eng",
    "archer",
    "bicycle",
    "basketball pioneer who popularized the sport in City",
    "and sport franchise owner",
    "extreme sport practitioner",
    "motor sport",
    "sporting",
    "alpinist",
    "world champion field handball team player",
    "handball and beach handball coach",
    "handball coach and player",
    "handball referee",
    "handball coach",
    "handballer",
    "motorcycling",
    "center on the University of Arizona women basketball team",
    "coach of Naval Academy men and women basketball teams",
    "One of the first women to swim the Channel",
    "MLB All Star catcher",
    "CFL coach",
    "freestyle motocross rider",
    "motocross champion",
    "motocross rider",
    "shogi player",
    "track coach",
    "VFL player",
    "wheelchair marathoner and paralympian",
    "paralympian",
    "professional bull rider",
    "backgammon player",
    "BMX rider",
    "coxswain",
]
sciences = [
    "computer",
    "ine builder",
    "systems analyst",
    "practitioner of traditional medicine",
    "alternative medicine practitioner",
    "hyperbaric medicine specialist",
    "forensic medicine",
    "clinical medicine",
    "social medicine",
    "medicine",
    "zookeeper",
    "biostatistics",
    "statistics",
    "automotive expert and",
    "automotive",
    "motoring",
    "anaesthetist",
    "nuclear",
    "bee expert",
    "rocket expert",
    "developer of the bulletproof Kevlar vest",
]

business_farming = [
    "home builder",
    "wallpaper heir and stepfather of Diana",
    "publishing heir and",
    "heir and",
    "heir",
    "casino owner",
    "arms dealer",
]
academia_humanities = [
    "Hispanist",
    "Mayanist",
    "pioneer of women studies",
]
law_enf_military_operator = [
    "munitions analyst",
    "seaman",
    "gendarme",
    "CIA official",
    "nuclear strategist",  # before sciences
]
spiritual = ["catechist", "Salvation Army General"]
social = [
    "cancer research fundraiser",
    "hospice fundraiser",
    "and fundraiser",
    "fundraiser",
    "co founder of the Findhorn Foundation",
]
crime = [
    "drug dealer",
    "Genovese family crime boss",
    "organized crime boss",
    "crime boss",
]
event_record_other = [
    "last surviving Kindertransport mother",
    "pensioner",
    "cattle grazier",
    "obese man",
    "primordial dwarf",
    "dwarf",
]
other_species = [
    "Asian elephant",
    "elephant",
    "Groundhog Day prognosticator",
    "world longest domestic cat",
    "pet cat of George W Bush",
    "Downing Street cat",
    "domestic cat in",
    "polydactyl cat",
    "Himalayan cat",
    "Ragdoll cat",
    "Library cat",
    "calico cat",
    "tabby cat",
    "obese cat",
    "stray cat",
    "victim",
    "Thoroughbred horse",
    "world champion bucking bull",
    "bucking bull",
    "show jumping horse",
    "eventing horse",
    "thoroughbred stallion",
    "thoroughbred",
]

<IPython.core.display.Javascript object>

In [44]:
# # Example code to quickly sort list in correct descending length search order to copy to dictionary
# temp = sorted(list(set(law_enf_military_operator)), key=lambda x: len(x), reverse=True)
# temp

<IPython.core.display.Javascript object>

In [45]:
# Hard-coding info_2 value for entry to correctly categorize
index = df[df["link"] == "https://en.wikipedia.org/wiki/Norm_Grabowski"].index
df.loc[index, "info_2"] = "engine builder"

# Hard-coding info_2 value for entry to correctly categorize
index = df[df["link"] == "https://en.wikipedia.org/wiki/Dimitris_Christoulas"].index
df.loc[index, "info_2"] = "activist"  # added to dict

# Hard-coding info_2 value for entry to correctly categorize
index = df[df["link"] == "https://en.wikipedia.org/wiki/Waldo_McBurney"].index
df.loc[index, "info_2"] = ""

# Hard-coding info_2 value for entry to correctly categorize
index = df[df["link"] == "https://en.wikipedia.org/wiki/Murder_of_Biswajit_Das"].index
df.loc[index, "info_2"] = "victim"  # added to dict

# Hard-coding info_2 value for entry to correctly categorize
index = df[df["link"] == "https://en.wikipedia.org/wiki/Eric_the_Actor"].index
df.loc[index, "info_2"] = "actor"  # added to dict

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [46]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "sciences": sciences,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [47]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['num_categories']!=0].sample(2)

CPU times: total: 1min 28s
Wall time: 1min 28s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
16402,16,Jewell Young,", 90, American professional basketball player .",https://en.wikipedia.org/wiki/Jewell_Young,4,2003,April,"Purdue University, Indianapolis Kautskys, Oshkosh All Stars",,,,,,,,,,,,90.0,,United States of America,,"Purdue University, Indianapolis Kautskys, Oshkosh All Stars",1.609438,0,0,0,0,0,0,1,0,0,0,0,0,1
72371,1,Freddie Glidden,", 91, Scottish footballer .",https://en.wikipedia.org/wiki/Freddie_Glidden,5,2019,January,"Hearts, Dumbarton",,,,,,,,,,,,91.0,,Scotland,,"Hearts, Dumbarton",1.791759,0,0,0,0,0,0,1,0,0,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [48]:
# Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

There are 1697 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [49]:
# # Obtaining values for column and their counts
# roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [50]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [51]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_2"].notna()].index
#             if "rhinoceros" in df.loc[index, "info_2"]
#         ],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [52]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_list, key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [53]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "flight attendant"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [54]:
# Creating lists for each category
politics_govt_law = [
    "Surveyor of thepretender to the throne of the Kingdom of Araucania and Patagonia",
    "known for campaigns against fast food chains and food processors",
    "press official under Richard M Nixon and Gerald R Ford",
    "former Imperial Wizard of the Ku Klux Klan",
    "Guantanamo Bay detention camp prisoner",
    "Ku Klux Klan member and segregationist",
    "United Nations disarmament official",
    "Queen of as the wife of King Talal",
    "sister of King Abdullah of Arabia",
    "Queen consort of Frederick IX of",
    "Queen consort of King Zog I of",
    "queen consort of King Baudouin",
    "th Emperor King of the people",
    "servant to HM Queen Elizabeth",
    "wrongly imprisoned prisoner",
    "who prosecuted Jimmy Hoffa",
    "consort of King George VI",
    "foreign policy official",
    "sister of King Abdullah",
    "foreign policy expert",
    "Ku Klux Klan member",
    "exonerated prisoner",
    "juridical official",
    "prisoner in gulag",
    "the last Queen of",
    "health reformer",
    "King of Wallis",
    "state official",
    "communicator",
    "door to door",
    "Arabian King",
    "ex prisoner",
    "suffragist",
    "Queen of",
    "marxist",
    "King of",
    "women",
    "King",
    "egal",
]

arts = [
    "traditional Māori instrumentalist",
    "health and fitness pioneer",
    "bluegrass mandolin player",
    "silversmith and goldsmith",
    "multi instrumentalist",
    "burlesque stripteaser",
    "bluegrass mandolinist",
    "classical mandolinist",
    "silver and goldsmith",
    "Emmy Award winning",
    "burlesque stripper",
    "pageant contestant",
    "circus ringmaster",
    "euphonium player",
    "antiques dealer",
    "mandolin player",
    "instrumentalist",
    "uilleann piper",
    "pageant winner",
    "pageant queen",
    "watercolorist",
    "silversmith",
    "goldsmith",
    "sitarist",
    "stripper",
    "cantor",
    "rock",
]
sports = [
    "National Football League coach and a member of the Pro Football Hall of Fame",
    "Football League coach with the Toronto Argonauts and Ottawa Rough Riders",
    "former coach of the Toronto Rock of the National Lacrosse League",
    "Virgin Islander Baltimore Orioles coach",
    "National Rugby League player and coach",
    "sprint kayaker and Paralympic coach",
    "National Football League official",
    "Basketball Hall of Fame coach",
    "cross country running coach",
    "coach of the Boston Celtics",
    "basketball official scorer",
    "swimming and diving coach",
    "draughts player and coach",
    "yoga as exercise pioneer",
    "fencing master and coach",
    "coach for A&M basketball",
    "futsal player and coach",
    "rugbyplayer and coach",
    "basketball official",
    "rugby league coach",
    "hammer throw coach",
    "ski jumping coach",
    "equitation coach",
    "water polo coach",
    "trampoline coach",
    "high jump coach",
    "croquet player",
    "hurling coach",
    "fencing coach",
    "qigong master",
    "cycling coach",
    "skiing coach",
    "rugby coach",
    "yoga master",
    "rejoneador",
    "judo coach",
    "swim coach",
    "yoga guru",
    "angler",
    "coach",
]
sciences = [
    "holistic health practitioner and naturopath",
    "mental health professional",
    "health care consultant",
    "aquarium official",
    "fluid dynamicist",
    "climate analyst",
    "forestry expert",
    "Jungian analyst",
    "mineralogist",
    "orthopedist",
    "forestry",
]

business_farming = [
    "founder of Hardee fast food restaurant",
    "investment advisor",
    "fast food pioneer",
    "land developer",
    "internet",
    "salesman",
    "finance",
    "builder",
    "grocer",
]
academia_humanities = [
    "Queen Works of Art",
    "Queen Pictures",
    "conservator",
    "transport",
]
law_enf_military_operator = [
    "Army master sergeant and recipient of the Medal of Honor",
    "Army sergeant and a recipient of the Medal of Honor",
    "Army Special Forces staff sergeant during the War",
    "Army Air Forces bombardier aboard the atomic bomb",
    "former sheriff of San Francisco and Cleveland",
    "Federal Aviation Administration official",
    "Lieutenant General of the People Army of",
    "Army combat medic and an armed member of",
    "Head of the Secret Intelligence Service",
    "head of the Secret Intelligence Service",
    "Central Intelligence Agency official",
    "Director of the Intelligence Bureau",
    "Major General in the Army Reserves",
    "Federal Judicial Police official",
    "chief air accident investigator",
    "Army command sergeant major",
    "sheriff of Dallas County",
    "Army Private First Class",
    "Army Lieutenant General",
    "ninth chief of the Army",
    "NCO and prisoner of war",
    "retired Army lieutenant",
    "Admiral of the Fleet",
    "private investigator",
    "Federal investigator",
    "Army first Sergeant",
    "Army Sergeant Major",
    "Officer in the Army",
    "technical sergeant",
    "deputy sheriff and",
    "fire investigator",
    "security official",
    "security expert",
    "Coast Guardsman",
    "prisoner of war",
    "Army brigadier",
    "county sheriff",
    "bomber crewman",
    "Army Sergeant",
    "Army corporal",
    "Rear Admiral",
    "investigator",
    "Army General",
    "Army Officer",
    "Army major",
    "serviceman",
    "paramedic",
    "maritime",
    "sheriff",
    "Admiral",
]
spiritual = [
    "founder of The Jesus Army",
    "evangelical Old Testament",
    "Sōtō Zen master",
    "evangelical",
    "Zen master",
    "religious",
    "of Islam",
    "mystic",
]
social = ["co creator of Trick or Treat for UNICEF"]
crime = ["waitress who converted to Islam", "serial burglar", "felon"]
event_record_other = [
    "Shortest adult human",
    "flight attendant",
    "health worker",
    "WWII prisoner",
]
other_species = [
    "trained chimpanzee believed to be first non human to acquire human language",
    "Baringo giraffe at the Santa Barbara Zoo with neck deformity",
    "and last surviving female northern white rhinoceros",
    "northern white rhinoceros",
    "giant panda",
    "chimpanzee",
    "rhinoceros",
    "amnesiac",
    "giraffe",
    "panda",
]

<IPython.core.display.Javascript object>

In [55]:
# # Example code to quickly sort list in correct descending length search order to copy to dictionary
# temp = sorted(list(set(other_species)), key=lambda x: len(x), reverse=True)
# temp

<IPython.core.display.Javascript object>

In [56]:
# Hard coding info_2 value for entries to correctly categorize
index = df[df["link"] == "https://en.wikipedia.org/wiki/Lykourgos_Angelopoulos"].index
df.loc[index, "info_2"] = "religious cantor"  # religious added to dict

index = df[df["link"] == "https://en.wikipedia.org/wiki/Esther_Ghan_Firestone"].index
df.loc[index, "info_2"] = "religious cantor"

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [57]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [58]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['num_categories']!=0].sample(2)

CPU times: total: 1min 39s
Wall time: 1min 39s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
68700,22,Dave Nelson,", 73, American baseball player , liver cancer.",https://en.wikipedia.org/wiki/Dave_Nelson,9,2018,April,"Rangers, Cleveland Indians and broadcaster Milwaukee Brewers",,,liver cancer,,,,,,,,,73.0,,United States of America,United States of America,"Rangers, Cleveland Indians and broadcaster Milwaukee Brewers",2.302585,0,0,0,0,0,0,1,0,0,0,0,0,1
19569,14,Benny Bailey,", 79, American jazz trumpeter.",https://en.wikipedia.org/wiki/Benny_Bailey,3,2005,April,,,,,,,,,,,,,79.0,,United States of America,,,1.386294,0,0,0,0,0,1,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [59]:
# Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

There are 1476 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [60]:
# # Obtaining values for column and their counts
# roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [61]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [62]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_2"].notna()].index
#             if "Hall of Fame" in df.loc[index, "info_2"]
#         ],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [63]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_list, key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [64]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "Hall of Fame"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [65]:
# Creating lists for each category
politics_govt_law = [
    "emir of ISIL in",
    "emir of",
    "emir",
    "Emir of Agaie",
    "Emir",
    "opium",
    "drug counselor",
    "labor organizer",
    "Associate Justice of the Supreme Court of the",
    "th Chief Justice of the Supreme Court",
    "Chief Justice of the Supreme Court of",
    "Chief Justice of the High Court of",
    "Chief Justice of the Supreme Court",
    "Justice of the Supreme Court",
    "Jurist and Chief Justice of",
    "Chief Justice from June",
    "Lord Chief Justice of",
    "Minister of Justice",
    "Chief Justice of",
    "Justice Minister",
    "Chief Justice",
    "Surveyor of the",
    "Maryland State Senator since",
    "Senator from Wyoming since",
    "Senator for Missouri",
    "State Senator",
    "Senator from",
    "Senator",
    "Union Gulag inmate",
    "sultan of the Aussa Sultanate",
    "foreign policy consultant",
    "Assistant Majority Leader in the Illinois House of Representatives",
    "member of the House of Representatives from Virginia since",
    "member of the House of Representatives from Indiana since",
    "member of the House of Representatives from Ohio since",
    "member of the House of Representatives from Illinois",
    "member of the House of Representatives from Nebraska",
    "member of the House of Representatives from New",
    "member of the Alabama House of Representatives",
    "member of the House of Representatives from",
    "member of the House of Representatives for",
    "Speaker of the House of Representatives",
    "member of the House of Representatives",
    "former Democratic Representative from",
    "Representative from Ohio since",
    "Representative from Washington",
    "Representative from Tennessee",
    "Democratic Representative for",
    "Representative from Michigan",
    "Representative for Wisconsin",
    "Representative from Maryland",
    "Representative for Oklahoma",
    "Democratic Representative",
    "Representative from since",
    "Representative from",
    "Representative for",
    "official",
    "governor of the Gila River Community",
    "governor of Bermuda from to",
    "Central Bank governor",
    "provincial governor",
    "governor",
    "black separatist",
    "pro separatist",
    "separatist",
]

arts = [
    "analyst",
    "adult",
    "Academy Award winning",
    "Bodil Award winning",
    "wargamer",
    "co founder of Filmation animation studios",
    "co creator of",
    "perfumer",
    "woodworker",
    "interior design consultant",
    "consultancy",
    "game show contestant",
]
sports = [
    "sport",
    "three cushion billiards player",
    "three cushion billiard player",
    "champion billiards player",
    "pocket billiards player",
    "billiards player",
    "goalkeeper",
    "cycling",
    "Lions rugby league captain",
    "rugby league referee",
    "rugby",
    "camogie player",
]
sciences = [
    "physical",
    "agriculturalist and recipient of the World Food Prize",
    "agriculturalist",
    "hydrographer",
    "Packard",
    "co founder of ECD Ovonics",
    "sociology and",
    "sociology",
    "emergency care",
    "quality consultant",
    "health",
    "pathology",
]

business_farming = [
    "co founder of Snapple Beverage Corporation",
    "co founder of Häagen Dazs ice cream",
    "co founder of Gaydar dating website",
    "co founder of E & J Gallo Winery",
    "co founder of Hewlett",
    "insurance consultant",
]
academia_humanities = ["co founder of the River and Rowing Museum", "anthropology"]
law_enf_military_operator = [
    "recipient of Righteous Among the Nations award",
    "Holocaust heroine and award winner",
    "war hero  Metropolitan",
    "war hero and",
    "airline stewardess",
    "warlady",
    "war heroine",
    "combat medic",
    "combat",
    "Scout and recipient of the Medal of Honor",
    "Army",
    "museum board member",
    "museum owner",
    "Air Force loadmaster and recipient of the Medal of Honor",
    "Commander of the National Air Force",
    "Chief of Staff of the Air Force",
    "General in the Air Force",
    "Air Force Commander",
    "Air Force Colonel",
    "Air Force",
]
spiritual = [
    "Old Testament",
    "Islam",
    "second counselor in the First Presidency of the LDS Church",
    "Cardinal Deacon of San Francesco Saverio alla Garbatella",
    "Cardinal for San Pietro in Vincoli",
    "Cardinal",
    "motivational consultant",
    "Mormon",
]
social = [
    "substance abuse counselor",
    "family counselor",
    "drug counselor",
    "co founder of Habitat for Humanity International",
    "co founder of Twin Oaks Community",
    "professional Scouter for the Boy Scouts of",
    "oldest Girl Scout",
    "Scouter",
    "philanthropy consultant",
]
crime = [
    "death row inmate",
    "master counterfeiter",
    "counterfeiter",
    "vigilante",
    "polygamist",
]
event_record_other = []
other_species = [
    "war canine",
]

<IPython.core.display.Javascript object>

In [66]:
# # Example code to quickly sort list in correct descending length search order to copy to dictionary
# temp = sorted(list(set(law_enf_military_operator)), key=lambda x: len(x), reverse=True)
# temp

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [67]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [68]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['num_categories']!=0].sample(2)

CPU times: total: 1min 14s
Wall time: 1min 14s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
13021,2,James P. Vreeland,", 91, American Republican Party politician.",https://en.wikipedia.org/wiki/James_P._Vreeland,13,2001,July,,,,,,,,,,,,,91.0,,United States of America,,,2.639057,0,0,0,0,0,0,0,0,1,0,0,0,1
17469,14,Jeanne Crain,", 78, American actress, heart attack.",https://en.wikipedia.org/wiki/Jeanne_Crain,16,2003,December,,,,heart attack,,,,,,,,,78.0,,United States of America,,,2.833213,0,0,0,0,0,1,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [69]:
# Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

There are 1338 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [70]:
# # Obtaining values for column and their counts
# roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [71]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [72]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_2"].notna()].index
#             if "premier" in df.loc[index, "info_2"]
#         ],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [73]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_list, key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [74]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "premier"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [75]:
# Creating lists for each category
politics_govt_law = [
    "former Deputy Chief Minister of Maharashtra",
    "Chief Minister of Arunachal Pradesh",
    "stenographer to Winston Churchill",
    "former Chief Minister of Haryana",
    "who was first mayor of ancestry",
    "former premier of New Brunswick",
    "long serving mayor of Rosemont",
    "Minister for Foreign Affairs",
    "Deputy Premier of New South",
    "Federal Appeals Court Judge",
    "District and Circuit Judge",
    "mayor of Ocean Breeze Park",
    "Minister for Shipping and",
    "Foreign Affairs Minister",
    "Chief Minister of Delhi",
    "mayor elect of Westlake",
    "mayor of Atlantic City",
    "mayor of Nefteyugansk",
    "Public Works Minister",
    "Premier of East Sepik",
    "mayor of Vladikavkaz",
    "Minister of Industry",
    "member of Parliament",
    "mayor of La Gacilly",
    "nd mayor of Atlanta",
    "Minister for Sport",
    "Minister of Power",
    "mayor of Nagasaki",
    "county councillor",
    "Interior Minister",
    "Parliament member",
    "Cabinet Minister",
    "Foreign Minister",
    "boy demonstrator",
    "mayor of Buffalo",
    "mayor of Raleigh",
    "mayor of Dallas",
    "Chief Minister",
    "District Judge",
    "mayor of Flint",
    "Circuit Judge",
    "Vice Premier",
    "Premier of",
    "kidnapping",
    "prisoner",
    "mayor of",
    "premier",
    "inmate",
    "miner",
    "mayor",
]
arts = [
    "counter cultural figure",
    "counterculture figure",
    "counterculture",
    "country and",
    "comedy club owner",
    "club owner",
    "vedette",
    "B boy",
    "wilderness",
    "co founder of",
    "bagpiper",
    "bookbinder and craftsman",
    "craftsman",
    "wood carver",
    "carver",
    "timpanist",
    "hymnist",
    "script supervisor",
    "lithographer",
    "saxophone player",
    "muse",
    "food",
    "oud player",
    "conga player",
    "pioneering record promoter",
    "record company founder",
    "recording studio owner",
    "record label founder",
    "record label owner",
    "record store owner",
    "sound recording",
    "record promoter",
    "record dealer",
    "record",
    "steel guitar player for Vince Gill and Conway Twitty",
    "classical slide guitar player",
    "steel guitar player",
    "bass guitar player",
    "classical guitar",
    "guitar player",
    "guitar",
    "comedienne",
    "tuba player",
    "founder of Judges Guild",
    "boogie woogie",
    "santoor player",
    "winner of Nobel Prize for Literature",
]
arts = sorted(
    list(set(arts)), key=lambda x: len(x), reverse=True
)  # Sorting by decreasing length and removing duplicates
sports = [
    "member of the National Cowgirl Museum and Hall of Fame",
    "National Basketball Association Hall of Famer",
    "Hall of Fame show jumping rider",
    "Hall of Fame NASCAR team owner",
    "NASCAR Hall of Fame team owner",
    "Football Hall of Fame member",
    "Hall of Fame horsebreeder",
    "Pro Rodeo Hall of Famer",
    "cheerleading fan of the Dallas Cowboys",
    "survivalist",
    "multi discipline pistol shooter",
    "Paralympic shooter",
    "skeet shooter",
    "rifle shooter",
    "trap shooter",
    "futsal player",
]
sports = sorted(list(set(sports)), key=lambda x: len(x), reverse=True)
sciences = [
    "probabilist",
    "forensic laboratory technician",
    "forensic expert",
    "forensic",
    "IT expert",
    "wildlife expert",
    "orthodontist",
    "pioneer in the field of magnetic tape sound",
    "Computers and Information at the University of Cambridge",
    "Computer Science",
    "biology",
]
sciences = sorted(list(set(sciences)), key=lambda x: len(x), reverse=True)
business_farming = [
    "developer of a country club that did not admit black members",
    "accountancy",
    "accounting",
    "sex club owner",
    "multi millionaire",
    "multimillionaire",
    "millionaire",
    "ship owner",
    "founding head of Air Tahiti Nui",
    "CEO of Northwest Airlines",
    "accounting",
]
business_farming = sorted(
    list(set(business_farming)), key=lambda x: len(x), reverse=True
)

academia_humanities = [
    "analytic",
    "bibliophile",
    "semiotician",
    "consultant",
    "historical",
    "toponymist",
    "historiograph",
]
academia_humanities = sorted(
    list(set(academia_humanities)), key=lambda x: len(x), reverse=True
)

law_enf_military_operator = [
    "encounter specialist with Mumbai Police",
    "war",
    "schoolboy",
    "survivor of Bataan Death March",
    "former KGB chief",
    "head of the KGB",
    "code breaker",
    "Air Vice Marshal",
    "RAAF Air Marshal",
    "Air Marshal",
    "hero of Liberation War",
    "first Minister of Defense",
    "Minister for Defence",
    "Minister of Defence",
    "Defense Minister",
    "chief of staff of the Provisional IRA",
    "navy chief of staff",
    "chief of staff",
    "member of the Tuskegee Airmen",
    "Chief of the Air Staff",
    "Tuskegee Airman",
    "informant for the FBI",
    "informant",
]
law_enf_military_operator = sorted(
    list(set(law_enf_military_operator)), key=lambda x: len(x), reverse=True
)

spiritual = [
    "Chief Rabbi of and of",
    "Chief Rabbi",
    "Rabbi",
    "salvationist",
    "Bishop of the Episcopal Diocese of Northern Michigan",
    "th Bishop of Massachusetts in The Episcopal Church",
    "Episcopal Bishop of West Tennessee",
    "Episcopal Bishop of Northwestern",
    "Auxiliary Bishop of Cincinnati",
    "Anglican Bishop of the Arctic",
    "Bishop of Mashonaland from to",
    "Bishop of Knaresborough",
    "Church of Bishop of",
    "Bishop of Malolos",
    "Bishop of Dallas",
    "Bishop of Oxford",
    "Anglican Bishop",
    "Bishop of Ripon",
    "Bishop of Selby",
    "Bishop of Como",
    "Bishop",
    "Saivite guru",
    "guru",
    "Executive Minister of the Iglesia ni Cristo",
    "oldest member of the Quorum of the Twelve Apostles in the history of The Church of Jesus Christ of Latter day Saints",
    "member of the Quorum of the Twelve Apostles of The Church of Jesus Christ of Latter day Saints",
    "patriarch emeritus of The Church of Jesus Christ of Latter day Saints",
    "apostle of The Church of Jesus Christ of Latter day Saints",
    "Elder of The Church of Jesus Christ of Latter day Saints",
    "Secretary General of the General Synod of the Church of",
    "Orthodox patriarch of the Orthodox Church of Antioch",
    "Supreme Primate of the Malankara Orthodox Church",
    "Pope of the Coptic Orthodox Church of Alexandria",
    "Metropolitan of Malankara Orthodox Church",
    "patriarch of the Orthodox Tewahedo Church",
    "primate of the Malankara Orthodox Church",
    "Protopresbyter of the Orthodox Church",
    "primate of the Orthodox Church since",
    "th Patriarch of the Orthodox Church",
    "Patriarch of the Orthodox Church",
    "Orthodox Church hierarch",
    "primate of the Church of",
    "Church of Bishop of",
    "LDS Church",
    "Wiccan",
    "spiritual healer",
    "spiritual figure",
    "spiritual",
]
spiritual = sorted(list(set(spiritual)), key=lambda x: len(x), reverse=True)

social = []
crime = [
    "bank robber and prison escapee of the s",
    "bank robber and con man",
    "bank robber",
    "reputed mafia enforcer",
    "mafia boss",
    "mafia",
    "suicide bomber and assassin of Beant Singh",
    "suicide bomber",
    "hotel bomber",
    "bomber",
    "'Ndrangheta boss",
    "school shooter",
    "alleged Mafia boss of Pittsburgh",
    "member of the Mafia",
    "former Mafia boss",
    "Mafia underboss",
    "Mafia hit man",
    "Mafia member",
    "Mafia boss",
]
crime = sorted(list(set(crime)), key=lambda x: len(x), reverse=True)

event_record_other = [
    "survivor of Hiroshima and Nagasaki atomic bombings",
    "survivor of the Sant'Anna di Stazzema massacre",
    "survivor of the sinking of RMS",
    "oldest living survivor of the",
    "last survivor of the sinking",
    "survivor of the Explosion",
    "Sinti Porajmos survivor",
    "survivor of Auschwitz",
    "shipwreck survivor",
    "abortion survivor",
    "gunshot survivor",
    "persistent vegetative state patient",
]
event_record_other = sorted(
    list(set(event_record_other)), key=lambda x: len(x), reverse=True
)

other_species = [
    "cat",
]

<IPython.core.display.Javascript object>

In [76]:
# Dropping entry with link that points to event page rather than individual page
index = df[df["link"] == "https://en.wikipedia.org/wiki/Barbara_McDermott"].index
df.drop(index, inplace=True)
df.reset_index(inplace=True, drop=True)

# Dropping entry with link that points to event page rather than individual page
index = df[
    df["link"]
    == "https://en.wikipedia.org/wiki/2005_French_riots#Murders_of_Jean-Claude_Irvoas_and_Jean-Jacques_Le_Chenadec"
].index
df.drop(index, inplace=True)
df.reset_index(inplace=True, drop=True)

# Dropping entry with link that points to event page rather than individual page
index = df[
    df["link"] == "https://en.wikipedia.org/wiki/Permanente_Quarry#2011_shooting"
].index
df.drop(index, inplace=True)
df.reset_index(inplace=True, drop=True)

# Dropping entry with link that points to event page rather than individual page
index = df[df["link"] == "https://en.wikipedia.org/wiki/Glen_Doherty"].index
df.drop(index, inplace=True)
df.reset_index(inplace=True, drop=True)

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [77]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [78]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['num_categories']!=0].sample(2)

CPU times: total: 1min 58s
Wall time: 1min 58s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
75809,18,Bob Frank,", 75, American singer-songwriter.",https://en.wikipedia.org/wiki/Bob_Frank,6,2019,July,,,,,,,,,,,,,75.0,,United States of America,,,1.94591,0,0,0,0,0,1,0,0,0,0,0,0,1
45359,29,John Spence,", 95, American World War II veteran, first combat frogman .",https://en.wikipedia.org/wiki/John_Spence_(frogman),3,2013,October,diver,,,first combat frogman,,,,,,,,,95.0,,United States of America,,diver,1.386294,0,0,0,0,0,0,0,1,0,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [79]:
# Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

There are 1082 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [80]:
# # Obtaining values for column and their counts
# roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [81]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [82]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_2"].notna()].index
#             if "Black Panther" in df.loc[index, "info_2"]
#         ],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [83]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_list, key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [84]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "Sultan of Sokoto"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [85]:
# Creating lists for each category and sorting by decreasing length and removing duplicates

politics_govt_law = [
    "sultan",
    "count",
    "Legislative Council member and chair of the DAB",
    "member of the Legislative Assembly of Alberta",
    "first female Lord Mayor of",
    "Lord Mayor of Birmingham",
    "first Mayor of Atlanta",
    "Lord Mayor of Brisbane",
    "Mayor of Pittsburgh",
    "Mayor of Auckland",
    "Mayor of Edmonton",
    "rd Mayor of City",
    "th Mayor of City",
    "Lord Mayor of",
    "Mayor of",
    "Chief of Staff and Treasury Secretary during the Reagan administration",
    "Secretary General of Karen National Union",
    "Secretary of the Interior since",
    "Assistant Secretary of Health",
    "Secretary of Commerce",
    "Secretary of Interior",
    "Secretary of State",
    "Finance Secretary",
    "Press Secretary",
    "Conservative member of the House of Lords",
    "crossbench member of the House of Lords",
    "Lord Lieutenant of Lincolnshire since",
    "Lord Lieutenant of Buckinghamshire",
    "Lord Lieutenant of Merseyside",
    "Member of the House of Lords",
    "first female Lord Mayor of",
    "Lord Provost of Aberdeen",
    "Lord Mayor of Birmingham",
    "Lord Mayor of Brisbane",
    "Lord Mayor of",
    "Law Lord",
    "founder of VHP",
    "Attorney General of Alabama",
    "Attorney General of",
    "Assistant Attorney",
    "member of the North Carolina General Assembly",
    "member of the Legislative Assembly of Alberta",
    "congressman from Tennessee and",
    "first openly gay congressman",
    "congressman from Indiana",
    "congressman from Idaho",
    "fourth and last wife of Sir Sultan Muhammad Shah",
    "Sultan of Sokoto",
    "Black Panther member",
    "Black Panther",
]
politics_govt_law = sorted(
    list(set(politics_govt_law)), key=lambda x: len(x), reverse=True
)

arts = [
    "Hall of Fame",
    "player of cello and viola da gamba",
    "flute and tin whistle player",
    "classical surbahar player",
    "bass and trombone player",
    "Creole fiddle player",
    "shakuhachi player",
    "didgeridoo player",
    "harmonium player",
    "gayageum player",
    "tárogató player",
    "bouzouki player",
    "requinto player",
    "clarinet player",
    "sarangi player",
    "ukulele player",
    "fiddle player",
    "cuatro player",
    "yidaki player",
    "sitar player",
    "gyile player",
    "veena player",
    "brass player",
    "gusle player",
    "piano player",
    "flute player",
    "pipa player",
    "dhol player",
    "bureau chief in Washington for the",
    "owner of Drury Lane Theatre",
    "owner of Fy Records",
    "bookshop owner",
    "boutique owner",
    "press owner",
    "bar owner",
    "bookstore proprietor",
    "audiobook narrator",
    "bookshop owner",
    "book reviewer",
    "bookbinder",
    "book",
    "controller of BBC Radio and The Proms",
    "controller of the BBC in Northern",
    "creator of Dudley Do Right and The Munsters",
]
arts = sorted(list(set(arts)), key=lambda x: len(x), reverse=True)

sports = [
    "former MLB player for the Brooklyn Dodgers",
    "National Basketball Association player",
    "MLB player with the Boston Braves",
    "basketball and streetball player",
    "professional video game player",
    "Magic: The Gathering player",
    "shinty player and referee",
    "Paralympic boccia player",
    "Rugby Union player and",
    "world champion player",
    "racquetball player",
    "Euroleague player",
    "checkers player",
    "goalball player",
    "pétanque player",
    "kabaddi player",
    "pelota player",
    "bandy player",
    "ballplayer",
    "ice player",
    "NHL player",
    "go player",
    "and the Columbus Blue Jackets",
    "former owner of the Philadelphia ers who signed Julius Erving",
    "primary owner of the Cincinnati Reds",
    "owner of the NHL Chicago Blackhawks",
    "majority owner of St Louis Rams",
    "owner of Kansas City Chiefs",
    "owner of the Boston Red Sox",
    "Formula One team owner",
    "owner of hot rod shop",
    "stock car team owner",
    "owner of the Jets",
    "basketball owner",
    "Giants co owner",
    "CEO of the International Rugby Board",
    "CEO of NASCAR",
    "NHL goaltender and Vezina Trophy winner",
    "owner of the NHL Chicago Blackhawks",
    "NHL player",
    "promoter of Lucha Libre AAA World Wide",
]
sports = sorted(list(set(sports)), key=lambda x: len(x), reverse=True)

sciences = [
    "recipient of the Nobel Prize in Physiology or Medicine",
    "who created APT and led MIT CAD project",
    "of Science and Technology",
    "disk drive pioneer",
    "death from a fall off a bridge",
    "founder of freenode",
    "creator of Snobol and Icon programming languages",
]
sciences = sorted(list(set(sciences)), key=lambda x: len(x), reverse=True)

business_farming = [
    "owner of Worthington Industries",
    "owner of Taillevent restaurant",
    "owner of Ameriquest Mortgage",
    "construction company owner",
    "steel company owner",
    "coffee house owner",
    "theme park owner",
    "gun shop owner",
    "company owner",
    "brothel owner",
    "airline owner",
    "estate owner",
    "zoo owner",
    "mine owner",
    "bar owner",
    "founder and CEO of Cubic Corporation",
    "founder and former CEO of ARCO",
    "CEO of Manufacturers Hanover",
    "CEO of The Hertz Corporation",
    "CEO of Xerox Corporation",
    "CEO of Jaguar Land Rover",
    "CEO of Sigma TV",
    "CEO of Chemoil",
]
business_farming = sorted(
    list(set(business_farming)), key=lambda x: len(x), reverse=True
)

academia_humanities = [
    "owner of the Military Vehicle Technology Foundation",
    "traditional owner",
    "founder of the SAIS Studies program",
]
academia_humanities = sorted(
    list(set(academia_humanities)), key=lambda x: len(x), reverse=True
)

law_enf_military_operator = [
    "recipient of the George Cross and George Medal",
    "recipient of the Distinguished Conduct Medal",
    "recipient of the George Cross",
    "Victoria Cross recipient",
    "Military Medal recipient",
    "George Cross recipient",
    "Secretary of Defense under Reagan",
    "Watergate scandal principal",
    "Chief Constable of Greater Manchester Police",
    "Chief Constable for West Yorkshire",
    "Woman Police Constable",
    "small arms expert",
    "alleged Holocaust perpetrator",
    "last surviving Comanche code talker",
    "captain of the USS",
]
law_enf_military_operator = sorted(
    list(set(law_enf_military_operator)), key=lambda x: len(x), reverse=True
)

spiritual = []
spiritual = sorted(list(set(spiritual)), key=lambda x: len(x), reverse=True)

social = [
    "and  and daughter of General Secretary Leonid Brezhnev",
    "founder of the Emmaüs movement",
]
social = sorted(list(set(social)), key=lambda x: len(x), reverse=True)

crime = ["perpetrator", "stalker", "who hijacked a plane to"]
crime = sorted(list(set(crime)), key=lambda x: len(x), reverse=True)

event_record_other = [
    "survivor",
    "boy",
    "oldest man in the at the time of his death",
    "who was the recognised world oldest person",
    "oldest living person of a possession",
    "and Manitoban oldest verified person",
    "oldest recognized person in the",
    "oldest living man",
    "oldest man",
    "owner of the Amityville Horror house",
    "barbecue stand owner",
]
event_record_other = sorted(
    list(set(event_record_other)), key=lambda x: len(x), reverse=True
)

other_species = [
    "tortoise believed to be the third oldest animal in the world and allegedly owned by Charles Darwin",
    "Jack Russell terrier",
    "dachshund terrier",
    "terrier",
]
other_species = sorted(list(set(other_species)), key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [86]:
# Dropping entry with link that points to husband's page
index = df[df["link"] == "https://en.wikipedia.org/wiki/Walter_Gretzky"].index
df.drop(index, inplace=True)
df.reset_index(inplace=True, drop=True)

# Hard-coding cause_of_death for entry with value in info_2
index = df[df["link"] == "https://en.wikipedia.org/wiki/Alan_MacDiarmid"].index
df.loc[index, "cause_of_death"] = "fall off bridge"

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [87]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [88]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['num_cat egories']!=0].sample(2)

KeyError: 'num_cat egories'

<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [89]:
# Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

There are 902 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [90]:
# Obtaining values for column and their counts
roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [91]:
# Code to check each value
roles_list.pop()

''

<IPython.core.display.Javascript object>

In [92]:
# Create specific_roles_list for above popped value
specific_roles_list = (
    df.loc[
        [
            index
            for index in df[df["info_2"].notna()].index
            if "Prince" in df.loc[index, "info_2"]
        ],
        "info_2",
    ]
    .value_counts()
    .index.tolist()
)

<IPython.core.display.Javascript object>

In [93]:
# Viewing list sorted by descending length to copy to dictionary below and screen values
sorted(specific_roles_list, key=lambda x: len(x), reverse=True)

['Prince and Grand Master of the Sovereign Military Order of',
 'Princess and daughter of the Shah of',
 'reigning Prince of since',
 'former Princess of',
 'Prince']

<IPython.core.display.Javascript object>

In [94]:
# Example code to quick-check a specific entry
df[df["info_2"] == "advisor to Henry Kis"]

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories


<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [95]:
# Creating lists for each category and sorting by decreasing length and removing duplicates

politics_govt_law = [
    "Law and Dean of Law School",
    "Law School",
    "expert on Latin politics",
    "drug control expert",
    "agricultural expert",
    "welfare expert",
    "Head of the House of Hohenzollern Sigmaringen",
    "member of the House of Bourbon Two Sicilies",
    "member of the House of Hohenzollern",
    "last member of the House of Romanov",
    "Speaker of the House of Commons",
    "member of the House of Commons",
    "member of House of Commons",
    "hereditary chief of the Golden Hill Paugussett Nation since",
    "th chief of the Clan MacLeod",
    "Temagami First Nation chief",
    "traditional chief",
    "Neskonlith chief",
    "Crow Creek chief",
    "Blackfeet chief",
    "Catawba chief",
    "Māori chief",
    "at the College of Arms in",
    "member of the State Legislature since",
    "Business School",
    "queen consort of as wife of Shah Mohammad Reza Pahlavi",
    "rain queen of the Balobedu people of",
    "Māori queen",
    "Tuʻi Pelehake",
    "Ambassador to the Holy See",
    "Ambassador to",
    "Princess and daughter of the Shah of",
    "reigning Prince of since",
    "former Princess of",
]
politics_govt_law = sorted(
    list(set(politics_govt_law)), key=lambda x: len(x), reverse=True
)

arts = [
    "expert on the pipe organ",
    "gardening expert",
    "origami expert",
    "culinary",
    "flavorist who created the coating for Cap'n Crunch",
    "Playmate of the Year",
    "founder of Portmeirion Pottery",
    "talk",
    "pop queen",
    "Dean of Hypnotists",
    "late night broadcast pioneer",
    "reality series cast member",
    "broadcasting pioneer",
    "cast member of",
    "quilting",
    "Drummer for The Beau Brummels and Harpers Bizarre",
    "founder of Sonnabend Gallery",
    "creator of the Howdy Doody puppet",
]
arts = sorted(list(set(arts)), key=lambda x: len(x), reverse=True)

sports = [
    "player",
    "survival expert",
    "five time world bridge champion",
    "contract bridge champion",
    "contract bridge",
    "dual medalist in slalom",
    "and national team captain",
    "NASCAR crew chief",
    "National Boxing Association middleweight champion",
    "National Football League header with the Pittsburgh Steelers and the Tampa Bay Buccaneers",
    "former defensive back for the Football Leagues Saskatchewan Roughriders",
    "commissioner of the National Football League",
    "Commissioner of the Football League",
    "casteller",
]
sports = sorted(list(set(sports)), key=lambda x: len(x), reverse=True)

sciences = [
    "co developer of the Billings ovulation method",
    "infectious disease expert credited with the HPV and rotavirus vaccines",
    "expert on survey methodology",
    "Mac OS X Server expert",
    "scientific expert",
    "acoustics expert",
    "fisheries expert",
    "snake expert",
    "shoe expert",
    "periodontist",
    "Nobel Prize in Medicine",
    "Medicine",
    "Sciences",
    "Microsoft pioneer",
]
sciences = sorted(list(set(sciences)), key=lambda x: len(x), reverse=True)

business_farming = [
    "oil and gas expert",
    "wine expert",
    "last original tenant of the Empire State Building",
    "second richest man of and founder of Takefuji Corporation",
    "founder of Bob Evans Restaurants",
    "founder of Crazy John mobile phone retail chain",
]
business_farming = sorted(
    list(set(business_farming)), key=lambda x: len(x), reverse=True
)

academia_humanities = [
    "founder of Tennessee Temple University",
    "world renowned literacy expert",
    "expert on porcelain",
    "expert on history",
    "Esperanto expert",
    "Ferrari expert",
    "long serving Officer of Arms",
    "former Provost of Oriel College",
    "principal of Wolfson College",
    "fifth dean of the",
    "dean of the Yale University School of",
    "University dean of Arts and",
    "fifth dean of the Business School",
    "dean of the Academy",
]
academia_humanities = sorted(
    list(set(academia_humanities)), key=lambda x: len(x), reverse=True
)

law_enf_military_operator = [
    "security and defence expert",
    "bomb disposal expert",
    "fingerprint expert",
    "polygraph expert",
    "additional commissioner of Mumbai Police",
    "Federal Police assistant commissioner",
    "first City Police Commissioner",
    "with Greater Manchester Police",
    "National Police Commissioner",
    "captain of the Hokulea first voyage from Hawaii to Tahiti",
    "Antarctic research ship captain",
    "captain and Hero of the Union",
    "captain of the SS",
    "airline captain",
    "SS captain and",
    "captain of",
    "chief of the Mumbai Anti Terrorist Squad",
    "secret service chief",
    "Secret Service chief",
    "air force chief",
    "defence chief",
    "militia chief",
    "army chief",
    "navy chief",
    "fire chief",
    "Former Chief Military",
]
law_enf_military_operator = sorted(
    list(set(law_enf_military_operator)), key=lambda x: len(x), reverse=True
)

spiritual = [
    "Dean of York from",
    "Prince and Grand Master of the Sovereign Military Order of",
    "founding abbot of Monks of New Skete",
    "abbot",
]
spiritual = sorted(list(set(spiritual)), key=lambda x: len(x), reverse=True)

social = []
social = sorted(list(set(social)), key=lambda x: len(x), reverse=True)

crime = [
    "executed for the rape and murder of two teenage girls in",
    "who arranged the murder of his mistress",
    "domestic worker executed in for murder",
    "executed for a double murder",
    "executed for murder in",
    "executed for the rape and murder of two teenage girls in",
]
crime = sorted(list(set(crime)), key=lambda x: len(x), reverse=True)

event_record_other = [
    "key witness in Emmett Till murder trial",
    "brain injured girl claimed to have performed miracles",
    "hermit",
]
event_record_other = sorted(
    list(set(event_record_other)), key=lambda x: len(x), reverse=True
)

other_species = [
    "orca at SeaWorld San Antonio",
    "orca of and fame",
    "held orca",
    "orca",
]
other_species = sorted(list(set(other_species)), key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [96]:
# Dropping entry with link that points to a list of individuals rather than one individual
index = df[df["link"] == "https://en.wikipedia.org/wiki/Camille_Loiseau"].index
df.drop(index, inplace=True)
df.reset_index(inplace=True, drop=True)

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [97]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [98]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['num_categories']!=0].sample(2)

CPU times: total: 1min 10s
Wall time: 1min 10s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
34951,19,Tom Hungerford,", 96, Australian author.",https://en.wikipedia.org/wiki/Tom_Hungerford,4,2011,June,,,,,,,,,,,,,96.0,,Australia,,,1.609438,0,0,0,0,0,1,0,0,0,0,0,0,1
41259,28,Fyodor Arkhipenko,", 91, Soviet-Belorussian pilot.",https://en.wikipedia.org/wiki/Fyodor_Arkhipenko,5,2012,December,,,,,,,,,,,,,91.0,,United Socialist Soviet Republic,,,1.791759,0,0,0,0,0,0,0,1,0,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [99]:
# Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

There are 785 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [553]:
# # Obtaining values for column and their counts
# roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [552]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [551]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_2"].notna()].index
#             if "founder of" in df.loc[index, "info_2"]
#         ],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [550]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_list, key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [549]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "ist"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [546]:
# Creating lists for each category and sorting by decreasing length and removing duplicates

politics_govt_law = [
    "Teachta Dála for Cork East",
    "Fianna Fáil Teachta Dála",
    "congress member",
    "cultural figure",
    "accused of involvement in attempt by Mehmet Ali Ağca to kill Pope John Paul II",
    "son of the last Emperor Bảo Đại",
    "Clerk of Courts for Middlesex County",
    "member of the Birmingham Six",
    "National Farmers' Federation Chief Executive for eight years",
    "son of Emperor Charles I of",
    "Commissioner of Social Security",
    "founder of the Popular Front for the Liberation of",
    "Director of the Office of Management and Budget",
    "last known native speaker of the Eyak language",
    "deputy head of Ingushetia Supreme Court",
    "deputy",
    "pretender to the title Emperor of",
    "founding member of Black Consciousness Movement",
    "who supported democracy",
    "last surviving son of Nelson Mandela",
    "sister of John F Kennedy",
    "confidant to Vladimir Putin",
    "envoy in",
    "Civil Rights pioneer",
    "Prince",
    "member of City Council",
    "embalmer",
    "Mohegan tribal matriarch",
    "Civil Rights Activist",
    "Activist",
    "Fianna Fáil Teachta Dála",
    "NGO founder",
    "Yang di Pertuan Agong",
    "municipal",
    "self proclaimed head of state of the Principality of Seborga micronation",
    "younger son of Ayatollah Ruhollah Khomeini and father of Hassan Khomeini",
    "member of the Senate",
    "interpreter at the Nuremberg Trials",
    "MLA for Pictou West",
    "rd Head of the Imperial Dynasty",
    "member of the Fair Play Committee",
    "opposition figure",
    "Foreign Service Officer",
    "holocaust denier",
]
politics_govt_law = sorted(
    list(set(politics_govt_law)), key=lambda x: len(x), reverse=True
)

arts = [
    "videographer",
    "era cultural functionary",
    "cultural functionary",
    "cultural worker",
    "horticultural",
    "CBS News sound technician",
    "motion picture special effects pioneer",
    "journalism at Columbia University",
    "Executive Editor of for years",
    "head of NBC daytime",
    "founder of Village Roadshow Limited",
    "transgender showgirl",
    "professional transsexual whistler",
    "founder of the Stratford Festival of",
    "Bass with the City Metropolitan Opera",
    "theatrical flying effects specialist",
    "sound effects creator",
    "horror  of the s",
    "Poet Laureate",
    "s on Southern cuisine",
    "TV and",
    "pantomime dame",
    "founder of ITN News at Ten",
    "and scenarist",
    "first female",
    "founder of the San Diego Comic Con",
    "saloon keeper",
    "lion tamer",
    "founder of Pacific News Service",
]
arts = sorted(list(set(arts)), key=lambda x: len(x), reverse=True)

sports = [
    "bush walker who rediscovered the Takahē in",
    "sumo yokozuna",
    "yokozuna",
    "World Rally champion",
    "Mets fan known for his signs in the crowd",
    "competitive eater",
    "shortstop",
    "MLB All Star and League batting champion",
    "MLB All Star",
    "MLB umpire",
    "Baltimore Orioles",
    "NBA basketball",
    "NBA referee",
    "former National Hockey League and World Hockey Association defenseman",
    "Wimbledon winner",
    "fan of the Denver Broncos",
    "go master",
    "CEO",
    "hillclimbing competitor",
    "founder of the Peach Bowl",
    "molecular genetics",
    "first class wicketkeeper",
    "stadium groundskeeper",
    "Quizzing world champion",
]
sports = sorted(list(set(sports)), key=lambda x: len(x), reverse=True)

sciences = [
    "video game pioneer",
    "electronics innovator",
    "pioneer in cognitive behavioral therapy",
    "developer of Clean Language",
    "pioneer in anaesthetics",
    "specialist in infectious diseases",
    "Nobel Laureate in Physics",
    "creator of the Apple Macintosh",
    "Nobel Laureate in Chemistry",
    "natural",
    "optician and",
    "games developer",
]
sciences = sorted(list(set(sciences)), key=lambda x: len(x), reverse=True)

business_farming = [
    "retail pioneer",
    "founder of Carl Jr restaurants",
    "developer active in City and Aspen",
    "City developer",
    "shopping mall developer",
    "innkeeper",
    "founder of Brookstone",
    "corporate raider",
    "prostitute",
]
business_farming = sorted(
    list(set(business_farming)), key=lambda x: len(x), reverse=True
)

academia_humanities = [
    "pioneer of Dark Age archaeology",
    "founder of the Holocaust Memorial Museum",
    "railway preservation pioneer",
    "hispanist",
]
academia_humanities = sorted(
    list(set(academia_humanities)), key=lambda x: len(x), reverse=True
)

law_enf_military_operator = [
    "Chief of Staff of the IRA and the Official IRA",
    "member of the Provisional IRA",
    "Chief of Staff of the Defense Forces",
    "Chief of Staff of the Defence Forces",
    "Deputy Chief of Staff",
    "first Chief of Staff",
    "representative of the Abraham Lincoln Brigade",
    "FARC second in command",
    "first Defence Force service person killed in",
    "army lieutenant imprisoned by the NKVD and purported escapee",
    "army Major General",
    "army reservist",
    "army medic",
    "army major",
    "army",
    "former head of Sinn Féin at Stormont",
    "member of the Citizens' Commission to Investigate the FBI",
    "former Director of the FBI",
    "who spied for the Union during the Cold War",
    "Chief of Special Operations for the Joint Chiefs of Staff",
    "wife of Lieutenant General Hal Moore",
    "Director General of the Red Cross",
    "Director General of MI",
    "army Major General",
    "Lieutenant General",
    "General",
    "bombardier",
    "Colditz Castle escapee",
    "private first class",
    "Z Special Unit member",
    "patriot",
]
law_enf_military_operator = sorted(
    list(set(law_enf_military_operator)), key=lambda x: len(x), reverse=True
)

spiritual = [
    "ceremonial head of the Nyingma school of Buddhism",
    "Grand Master of the Order of",
    "founder of Transcendental Meditation movement",
    "former primate of the Orthodox Archdiocese of",
    "Anglican co primate",
    "s Anglican primate",
    "antipope self proclaimed Gregory XVII in",
    "zen master",
    "Orthodox Patriarch of Alexandria",
    "first hierarch of the ROCOR",
    "Syro Malabar hierarch",
    "Apostolic hierarch",
    "Maronite hierarch",
    "hierarch",
    "Samaritan High Priest",
    "Head of the Ecclesiastical Mission in of the ROCOR",
    "Lutheran",
    "Grand Mufti",
    "Hierarch in Patriarchate of Constantinople",
    "Orthodox Metropolitan of Florina",
    "Grand Marabout of the Mourides",
]
spiritual = sorted(list(set(spiritual)), key=lambda x: len(x), reverse=True)

social = [
    "founder of Children World charity",
    "founder of the Vegan Society",
    "animal sanctuary keeper",
    "hippie",
]
social = sorted(list(set(social)), key=lambda x: len(x), reverse=True)

crime = [
    'bandit known as "Jungle Cat"',
    "bankrobber",
    "last surviving conspirator in the assassination of Mahatma Gandhi",
    "technical mastermind of the and Bali bombings",
    "electrician and mechanic",
    "brothel keeper",
]
crime = sorted(list(set(crime)), key=lambda x: len(x), reverse=True)

event_record_other = [
    "phone phreak",
    "graduate",
    "Son of Chris Benoit and Nancy Benoit",
    "murder",
    "allegedly killed for refusing to wear hijab",
    "Student at E O Green Junior High School",
    "girl born with two heads",
    "backpacker",
    "college junior",
    "lottery winner",
    "alleged illegitimate daughter of Warren G Harding",
    "shop assistant",
    "savant",
    "municipal clerk",
    "father of Michael Jackson molestation accuser",
    "girl with rare sirenomelia condition",
    "blind mechanic",
    "housekeeper and killer",
    "World Cup oracle octopus",
    "embassy worker",
]
event_record_other = sorted(
    list(set(event_record_other)), key=lambda x: len(x), reverse=True
)

other_species = [
    "grey parrot",
    "hybrid orangutan who learned Sign Language",
    "orangutan",
    "Siberian tiger at San Francisco Zoo",
    "Chief Mouser to the Cabinet Office",
    "Pit Bull Terrier",
    "protected lion",
    "stallion",
    "sea lion",
    "dolphin with a prosthetic tail",
    "dolphin",
]
other_species = sorted(list(set(other_species)), key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [547]:
# Hard-coding cause_of_death for entry with value in info_2
index = df[df["link"] == "https://en.wikipedia.org/wiki/Jacob_Kovco"].index
df.loc[index, "cause_of_death"] = "firearm accident"

# Dropping entry with link that points to event page rather than individual page
index = df[df["link"] == "https://en.wikipedia.org/wiki/Thor_Hesla"].index
df.drop(index, inplace=True)
df.reset_index(inplace=True, drop=True)

# Dropping entry with link that points to husband's page rather than individual's page
index = df[df["link"] == "https://en.wikipedia.org/wiki/Don_Geronimo#Family"].index
df.drop(index, inplace=True)
df.reset_index(inplace=True, drop=True)

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [548]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [None]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['num_categories']!=0].sample(2)

#### Checking the Number of Rows without a First Category

In [None]:
# Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

In [None]:
print("dunzo!")

# Sound notification when cell executes
chime.success()

#### Finding `known_for` Roles in `info_2`

In [None]:
# Obtaining values for column and their counts
roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

In [None]:
# Code to check each value
roles_list.pop()

In [None]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_2"].notna()].index
#             if "general" in df.loc[index, "info_2"]
#         ],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

In [None]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_list, key=lambda x: len(x), reverse=True)

In [None]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "outlaw country music singer songwriter"]

#### Creating Lists for Each `known_for` Category

In [None]:
# Creating lists for each category and sorting by decreasing length and removing duplicates

politics_govt_law = []
politics_govt_law = sorted(list(set(politics_govt_law)), key=lambda x: len(x), reverse=True)  

arts = []
arts = sorted(list(set(arts)), key=lambda x: len(x), reverse=True)  

sports = []
sports = sorted(list(set(sports)), key=lambda x: len(x), reverse=True) 

sciences = []
sciences = sorted(list(set(sciences)), key=lambda x: len(x), reverse=True) 

business_farming = []
business_farming = sorted(list(set(business_farming)), key=lambda x: len(x), reverse=True)  

academia_humanities = []
academia_humanities = sorted(list(set(academia_humanities)), key=lambda x: len(x), reverse=True)  

law_enf_military_operator = []
law_enf_military_operator = sorted(list(set(law_enf_military_operator)), key=lambda x: len(x), reverse=True)  

spiritual = []
spiritual = sorted(list(set(spiritual)), key=lambda x: len(x), reverse=True)  

social = []
social = sorted(list(set(social)), key=lambda x: len(x), reverse=True)  

crime = []
crime = sorted(list(set(crime)), key=lambda x: len(x), reverse=True)  

event_record_other = []
event_record_other = sorted(list(set(event_record_other)), key=lambda x: len(x), reverse=True)  

other_species = []
other_species = sorted(list(set(other_species)), key=lambda x: len(x), reverse=True)  

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [None]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

#### Extracting Category from `info_2`

In [None]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['num_categories']!=0].sample(2)

#### Checking the Number of Rows without a First Category

In [None]:
# Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Observations:
- It is time to export our dataframe and start a new notebook.

### Exporting Dataset to SQLite Database [wp_life_expect_clean13.db]()

In [None]:
# # Exporting dataframe

# # Saving dataset in a SQLite database
# conn = sql.connect("wp_life_expect_clean13.db")
# df.to_sql("wp_life_expect_clean13", conn, index=False)

# # Chime notification when cell executes
# chime.success()

# [Proceed to Data Cleaning Part 14]()