# Wikipedia Notable Life Expectancies
# [Notebook  : Data Cleaning Part 6](https://github.com/teresahanak/wikipedia-life-expectancy/blob/main/wp_life_expect_data_clean6_thanak_2022_07_26.ipynb)
### Context

The
### Objective

The
### Data Dictionary
- Feature: Description

### Importing Libraries

In [1]:
# To structure code automatically
%load_ext nb_black

# To import/export sqlite databases
import sqlite3 as sql

# To save/open python objects in pickle file
import pickle

# To help with reading, cleaning, and manipulating data
import pandas as pd
import numpy as np
import re

# To define maximum number of columns to be displayed in a dataframe
pd.set_option("display.max_columns", None)
# To define the maximum number of rows to be displayed in a dataframe
pd.set_option("display.max_rows", 200)

# To supress warnings
# import warnings

# warnings.filterwarnings("ignore")

# To set some visualization attributes
pd.set_option("max_colwidth", 150)

# To play auditory cue when cell has executed, has warning, or has error and set chime theme
import chime

chime.theme("zelda")

<IPython.core.display.Javascript object>

## Data Overview

### [Reading](https://github.com/teresahanak/wikipedia-life-expectancy/blob/main/wp_life_expect_clean5.db), Sampling, and Checking Data Shape

In [2]:
# Reading the dataset
conn = sql.connect("wp_life_expect_clean5.db")
data = pd.read_sql("SELECT * FROM wp_life_expect_clean5", conn)

# Making a working copy
df = data.copy()

# Checking the shape
print(f"There are {df.shape[0]} rows and {df.shape[1]} columns.")

# Checking first 2 rows of the data
df.head(2)

There are 98041 rows and 44 columns.


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,info_3_0,info_3_1,info_3_2,info_4_0,info_4_1,info_4_2,info_5_0,info_5_1,info_5_2,info_6_0,info_6_1,info_7_0,info_8_0,info_8_1,info_9_0,info_10_0,info_11_0,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
0,1,William Chappell,", 86, British dancer, ballet designer and director.",https://en.wikipedia.org/wiki/William_Chappell_(dancer),21,1994,January,,86.0,,United Kingdom of Great Britain and Northern Ireland,,,3.091042,ballet designer,director,,,,,,,,,,,,,,,,0,0,0,0,0,1,0,0,0,0,0,0,1
1,1,Raymond Crotty,", 68, Irish economist, writer, and academic.",https://en.wikipedia.org/wiki/Raymond_Crotty,12,1994,January,,68.0,,Ireland,,,2.564949,writer,,,and academic,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,1,0,0,0,1


<IPython.core.display.Javascript object>

In [3]:
# Checking last 2 rows of the data
df.tail(2)

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,info_3_0,info_3_1,info_3_2,info_4_0,info_4_1,info_4_2,info_5_0,info_5_1,info_5_2,info_6_0,info_6_1,info_7_0,info_8_0,info_8_1,info_9_0,info_10_0,info_11_0,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
98039,9,Aamir Liaquat Hussain,", 50, Pakistani journalist and politician, MNA .",https://en.wikipedia.org/wiki/Aamir_Liaquat_Hussain,99,2022,June,", since",50.0,,Pakistan,,"2002 2007, since 2018",4.60517,MNA,,,,,,,,,,,,,,,,,0,0,0,0,0,1,0,0,1,0,0,0,2
98040,9,Zou Jing,", 86, Chinese engineer, member of the Chinese Academy of Engineering.",https://en.wikipedia.org/wiki/Zou_Jing_(engineer),3,2022,June,,86.0,,"China, People's Republic of",,,1.386294,member of the Academy of Engineering,,,,,,,,,,,,,,,,,1,0,0,0,0,0,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

In [4]:
# Checking a sample of the data
df.sample(5)

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,info_3_0,info_3_1,info_3_2,info_4_0,info_4_1,info_4_2,info_5_0,info_5_1,info_5_2,info_6_0,info_6_1,info_7_0,info_8_0,info_8_1,info_9_0,info_10_0,info_11_0,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
34675,22,Bob Gould,", 74, Australian activist and bookseller.",https://en.wikipedia.org/wiki/Bob_Gould_(activist),15,2011,May,,74.0,,Australia,,,2.772589,,,,,,,,,,,,,,,,,,0,0,0,0,0,1,0,0,1,0,0,0,2
74873,20,Mira Zakai,", 76, Israeli opera singer, complications from a stroke.",https://en.wikipedia.org/wiki/Mira_Zakai,7,2019,May,,76.0,,Israel,,,2.079442,complications from a stroke,,,,,,,,,,,,,,,,,0,0,0,0,0,1,0,0,0,0,0,0,1
66363,2,Mundell Lowe,", 95, American jazz guitarist and composer.",https://en.wikipedia.org/wiki/Mundell_Lowe,6,2017,December,,95.0,,United States of America,,,1.94591,,,,,,,,,,,,,,,,,,0,0,0,0,0,1,0,0,0,0,0,0,1
55769,26,Jerrold Kemp,", 94, American academic.",https://en.wikipedia.org/wiki/Jerrold_Kemp,12,2015,November,,94.0,,United States of America,,,2.564949,,,,,,,,,,,,,,,,,,0,0,0,1,0,0,0,0,0,0,0,0,1
82010,2,Jacques Noyer,", 93, French Roman Catholic prelate, Bishop of Amiens .",https://en.wikipedia.org/wiki/Jacques_Noyer,5,2020,June,,93.0,,France,Italy,1987 2003,1.791759,Bishop of Amiens,,,,,,,,,,,,,,,,,0,0,1,0,0,0,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

### Checking Data Types, Duplicates, and Null Values

In [5]:
# Checking data types and null values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98041 entries, 0 to 98040
Data columns (total 44 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   day                        98041 non-null  object 
 1   name                       98041 non-null  object 
 2   info                       98041 non-null  object 
 3   link                       98041 non-null  object 
 4   num_references             98041 non-null  int64  
 5   year                       98041 non-null  int64  
 6   month                      98041 non-null  object 
 7   info_parenth               36660 non-null  object 
 8   age                        98041 non-null  float64
 9   cause_of_death             17 non-null     object 
 10  place_1                    97888 non-null  object 
 11  place_2                    8115 non-null   object 
 12  info_parenth_copy          36660 non-null  object 
 13  log_num_references         98041 non-null  flo

<IPython.core.display.Javascript object>

#### Observations:
- With our dataset loaded, we can pick up where we left off with extracting known_for values by rebuilding `known_for_dict` and starting the search of `info_3` columns.
- We will need to adjust our approach at this step, in order to also capture `cause_of_death` values.
- Prior to this point we have hard-coded the few `cause_of_death` values encountered, but we expect a much higher proportion of them for the remaining numbered columns.
- We will add a new list `cause_of_death` to collect those values and we will add a separate loop to extract them to the `cause_of_death` column.

### Extracting Remaining `known_for` and `cause_of_death` Values

#### Finding `known_for` Roles and `cause_of_death` in `info_3_0`

In [6]:
# # Obtaining values for column and their counts
# roles_cause_list = df["info_3_0"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [7]:
# # Code to check each value
# roles_cause_list.pop()

<IPython.core.display.Javascript object>

In [8]:
# # Create specific_roles_cause_list for above popped value
# specific_roles_cause_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_3_0"].notna()].index
#             if "shot" in df.loc[index, "info_3_0"]
#         ],
#         "info_3_0",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [9]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_cause_list, key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [10]:
# # Example code to quick-check a specific entry
# df[df["info_3_0"] == "shot Eddie Waitkus"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category and for `cause_of_death`

In [11]:
# Creating lists for each category and sorting by decreasing length and removing duplicates

politics_govt_law = [
    "campaigner for breast cancer drug Herceptin",
    "MPP of the Legislative Assembly of for Beaches—Woodbine",
    "MP for Secretary General of the Council of and",
    "MPP of the Legislature for Waterloo North",
    "MP of the Karnataka Legislative Assembly",
    "MP of the Odisha Legislative Assembly",
    "MP for Medicine Hat—Cardston—Warner",
    "MP for Newcastle upon Tyne Central",
    "MP for Newcastle upon Tyne North",
    "MP for Perth—Wellington—Waterloo",
    "MP for North West Leicestershire",
    "MP for Northumberland—Miramichi",
    "MP for Wolverhampton South East",
    "MP for Newcastle upon Tyne East",
    "MP for South East St Elizabeth",
    "MP for North Central Clarendon",
    "MP for Manchester Wythenshawe",
    "first honorary MPLA President",
    "MP for Movement of the People",
    "MP for Beauharnois—Salaberry",
    "MP for Birmingham Perry Barr",
    "MP for Birmingham Handsworth",
    "MP for Birmingham Hall Green",
    "MP for Scarborough—Agincourt",
    "MP for Northumberland—Durham",
    "MP for Battleford—Kindersley",
    "MP for Birmingham Northfield",
    "MP for Sheffield Brightside",
    "MP for Leicester North West",
    "MP for City of Buenos Aires",
    "Queensland MP for Mackenzie",
    "MP for Amman fifth district",
    "MP for Stoke on Trent South",
    "MP for Leicester South East",
    "MP for Manchester Blackley",
    "MP for Windsor—Walkerville",
    "MP for Carleton Gloucester",
    "MP for Manchester Openshaw",
    "MP for Coventry North East",
    "MP for Croydon North West",
    "MP for East Aberdeenshire",
    "MP for Vancouver Kingsway",
    "MP for Birmingham Yardley",
    "MP for Nottingham Central",
    "MP for Afram Plains North",
    "MP for Bristol North West",
    "MP for Wandsworth Central",
    "MP for West Aberdeenshire",
    "MP for Manchester Ardwick",
    "MP for Manchester Central",
    "Conservative MP for South",
    "MP for Glasgow Queen Park",
    "MP for Weston super Mare",
    "MP for East Renfrewshire",
    "MP for Ambunti Dreikikir",
    "MP for Barrow in Furness",
    "MP for Mission—Coquitlam",
    "federal MP for St George",
    "MP for Belfast Shankill",
    "MPP for Dufferin Simcoe",
    "MPP for Wentworth North",
    "MP for Sheffield Heeley",
    "MP for Glasgow Cathcart",
    "MP for Clwyd North West",
    "co developer of the IMP",
    "MP for Paddington North",
    "MP for Bouches du Rhône",
    "MP for Grenville—Dundas",
    "MP for Bournemouth East",
    "MP for Stockholm County",
    "MP for Southampton West",
    "MP for Glenrothes since",
    "MP for Mid Bedfordshire",
    "MP for Brome—Missisquoi",
    "former MPP for Kingston",
    "MP for South Manchester",
    "MP for Sheffield Hallam",
    "MP for Naogaon District",
    "MP for Hemel Hempstead",
    "Khyber Pakhtunkhwa MPA",
    "MP for South West Nova",
    "MP for Bury St Edmunds",
    "MP for Bishop Auckland",
    "MP for Nottingham West",
    "MP for Ottawa—Carleton",
    "MP for Montgomeryshire",
    "MP for Coimbatore East",
    "MP for Kingsford Smith",
    "MP for The Battlefords",
    "former Labour Party MP",
    "MP for Blackpool South",
    "MP for Leeds—Grenville",
    "MP for Sault Ste Marie",
    "MP for Central Honiara",
    "NSW MP for Burrinjuck",
    "MP for Wellingborough",
    "MP for City of Durham",
    "MP for North Cornwall",
    "MP for Sarnia—Lambton",
    "MPP for Niagara Falls",
    "MP for Hatay Province",
    "MP for Aberdeen South",
    "MP for Knowsley South",
    "MPP of from Cambridge",
    "MP for Stockton North",
    "MP for Winnipeg South",
    "MP for Dalarna County",
    "MP for Brandon—Souris",
    "MP for Glasgow Pollok",
    "MPP for Ottawa Rideau",
    "speaker of MPR DPR in",
    "MP for Uppsala County",
    "MP for Berettyóújfalu",
    "MP for Glasgow Provan",
    "MP for Ikaroa Rāwhiti",
    "MP for Bradford West",
    "MP for Hamilton West",
    "Labour Party list MP",
    "MP for Otago Central",
    "MP for Ottawa—Vanier",
    "MP for Dunedin North",
    "MP for Pas de Calais",
    "MP for Kajiado North",
    "MP for Lewisham West",
    "Labour MP for Newark",
    "MP for Edmonton East",
    "MP for Middlesbrough",
    "MP for Glasgow Govan",
    "MP for North Malaita",
    "MP for Chennai North",
    "MPP for Durham West",
    "MP for South Antrim",
    "MP for Vaipae Tautu",
    "MP for Swansea West",
    "MP for Newport West",
    "MP for North Sydney",
    "NSW MP for Corrimal",
    "MP for Invercargill",
    "MP for Western Hutt",
    "MP for West Lothian",
    "MP for Cardiff West",
    "MP for North Imenti",
    "MPP for Essex South",
    "MP for Rowley Regis",
    "MP for Gelang Patah",
    "MP for Guruve South",
    "MP for Gainsborough",
    "MP for Basingstoke",
    "MP for Cooch Behar",
    "MP for North Shore",
    "MP for Eden Monaro",
    "MP for Oldham West",
    "MP for West Dorset",
    "MP for Temotu Pele",
    "MP for Maharashtra",
    "head of the UOC MP",
    "MP for Capricornia",
    "MP from Saint John",
    "MP for Londonderry",
    "MP for Regina East",
    "MP for West Tyrone",
    "MP for Mount Royal",
    "MP for Harrow West",
    "MP for Grey—Simcoe",
    "MP for Port Arthur",
    "MP for Bexleyheath",
    "MP for Marijampolė",
    "MP for Fredericton",
    "MP for Hull—Aylmer",
    "MP for Wythenshawe",
    "MP for Clackmannan",
    "President of AMPAS",
    "MP for North Devon",
    "MP for Essex South",
    "MP for Banaskantha",
    "MP for Billericay",
    "MP for Guanajuato",
    "MP for Banffshire",
    "MP for Eastbourne",
    "MP for West Derby",
    "MP for Heretaunga",
    "Northern Cape MPL",
    "MP for Repentigny",
    "MP for Bromsgrove",
    "MP for Accrington",
    "MP for Island Bay",
    "MP for Clydesdale",
    "MP for The Wrekin",
    "MP for Vijayawada",
    "MP for Leominster",
    "MP for Kilmarnock",
    "MP for Hull North",
    "MP for Carshalton",
    "MP for Midlothian",
    "MP for York North",
    "MP for Rushcliffe",
    "MPP for Bellwoods",
    "MP for South Down",
    "MP for Mahasamund",
    "MP for Deggendorf",
    "MP for Buckingham",
    "MP for Coimbatore",
    "MP for Nuevo León",
    "MP for Charlevoix",
    "MP for Srikakulam",
    "MP for Pontefract",
    "MP for Winchester",
    "MPP for York East",
    "MP for Brentford",
    "MP for Wairarapa",
    "MP for Lyttelton",
    "MP for Faversham",
    "MP for Warringah",
    "MP for Cambridge",
    "MP for Orpington",
    "MP for Rochester",
    "MP for Frontenac",
    "MP for Hampstead",
    "MP for Szigetvár",
    "MP for Tongariro",
    "MP for Churchill",
    "MP for Easington",
    "MP for Kaohsiung",
    "MP for Brighouse",
    "MP for Greenwich",
    "MP for Tongatapu",
    "MP for Robertson",
    "MP for Waitakere",
    "MP for Pencarrow",
    "MP for Worcester",
    "MP for Stretford",
    "MP for Smethwick",
    "MP for Tottenham",
    "MP for Nagercoil",
    "MP for Nizamabad",
    "MP for Lancaster",
    "MP for Cuddalore",
    "MP for Ashburton",
    "MP for Wakefield",
    "MP for Penistone",
    "MP for Saarlouis",
    "MP for Mt Albert",
    "MP for Hastings",
    "MP for Richmond",
    "MP for Barnsley",
    "MP for Coahuila",
    "MP for La Trobe",
    "MP for Wide Bay",
    "MP for Falmouth",
    "MP for McMillan",
    "MP for Galloway",
    "MP for Värmland",
    "Balochistan MPA",
    "MP for Waterloo",
    "MP for Solihull",
    "MP for Hereford",
    "MP for Greenock",
    "MP for Hyndburn",
    "MP for El Koura",
    "MP for Palliser",
    "MP for Finchley",
    "MP for Tiverton",
    "MP for Sherwood",
    "MP for Bilaspur",
    "MP for Jelutong",
    "MP for Fallujah",
    "MP for Keighley",
    "MP for Chittoor",
    "MPP for Welland",
    "MP for Heywood",
    "MP for Dalarna",
    "MP for Denison",
    "MP for Badulla",
    "MP for Lasalle",
    "MP for Jalisco",
    "MP for Makueni",
    "MP for Bangaon",
    "MP for Western",
    "MP for Entally",
    "MP for Nyakach",
    "MP for Newbury",
    "MP for Taunton",
    "MP for Dum Dum",
    "MP for Feltham",
    "MP for Mitcham",
    "MP for Wallace",
    "MP of Northern",
    "independent MP",
    "MP for Blaydon",
    "MP for Trinity",
    "MP for Consett",
    "MP for Romford",
    "MP for Spadina",
    "MP for Mercier",
    "MP for Sudbury",
    "MP for Ipswich",
    "MP for Oxford",
    "Queensland MP",
    "MP for Jorhat",
    "MP for Ukonga",
    "MP for Hughes",
    "MP for Dungun",
    "MP for Rompin",
    "MPP for South",
    "MP for Dawson",
    "MP for Argyll",
    "three time MP",
    "MP for Halton",
    "MP of Gujarat",
    "MP for Kabete",
    "MP for Belper",
    "MP for Oaxaca",
    "MP for Melton",
    "MP for Boston",
    "MP for Conway",
    "MP for Mannar",
    "MP for Kigoma",
    "MP for Khulna",
    "MP for Araria",
    "MP for Batley",
    "MP for Leyton",
    "MP for Bowman",
    "MP for Kenema",
    "MP for Jarrow",
    "MP for Ngella",
    "MP for Maldon",
    "MP for Gwydir",
    "MP for Butere",
    "MP for Ndhiwa",
    "MP for Brecon",
    "MP for Fraser",
    "MP for Hunter",
    "MP for Heston",
    "MP for Athens",
    "MP for Viborg",
    "MP for Tumkur",
    "MP for Dudley",
    "MP for Kapiti",
    "MP for Howrah",
    "MP for Napier",
    "MP for Henty",
    "MP for Poole",
    "MP for Geita",
    "New South MP",
    "MP for Brant",
    "MP for Ranau",
    "MP for Waipa",
    "MP for Alwar",
    "MP for Acton",
    "MP for Perth",
    "MP for Truro",
    "MP for Royal",
    "MP for Udupi",
    "MP for Wells",
    "MP for Conwy",
    "MP for Elgin",
    "MP for Nketa",
    "MP for Monor",
    "five time MP",
    "MP for Swan",
    "MP for Swat",
    "MP of Sabah",
    "MP for Sibu",
    "Gauteng MPL",
    "MP for Raub",
    "MP for Hutt",
    "MP for Lowe",
    "Liberal MPP",
    "MP for Aska",
    "MP for York",
    "MP for Buem",
    "Punjab MPA",
    "MP for Bow",
    "MP for Ayr",
    "Sindh MPA",
    "List MP",
    "NSW MP",
    "MPP of",
    "MP for",
    "MPP",
    "MPA",
    "MPL",
    "MP",
]
politics_govt_law = sorted(
    list(set(politics_govt_law)), key=lambda x: len(x), reverse=True
)

arts = [
    "shot Andy Warhol Shot Marilyns paintings",
]
arts = sorted(list(set(arts)), key=lambda x: len(x), reverse=True)

sports = [
    "and Olympic shot putter",
    "shot putter",
]
sports = sorted(list(set(sports)), key=lambda x: len(x), reverse=True)

sciences = [
    "known as one of the world leading authorities on cancer research",
    "co developer of ultrasound use in cancer detection",
    "treated herself for breast cancer on Antarctica in",
    "co discoverer of drugs that fight cancer",
    "pioneer in breast cancer treatment",
    "expert in breast cancer treatment",
]
sciences = sorted(list(set(sciences)), key=lambda x: len(x), reverse=True)

business_farming = []
business_farming = sorted(
    list(set(business_farming)), key=lambda x: len(x), reverse=True
)

academia_humanities = []
academia_humanities = sorted(
    list(set(academia_humanities)), key=lambda x: len(x), reverse=True
)

law_enf_military_operator = []
law_enf_military_operator = sorted(
    list(set(law_enf_military_operator)), key=lambda x: len(x), reverse=True
)

spiritual = []
spiritual = sorted(list(set(spiritual)), key=lambda x: len(x), reverse=True)

social = [
    "pediatric cancer advocate",
]
social = sorted(list(set(social)), key=lambda x: len(x), reverse=True)

crime = []
crime = sorted(list(set(crime)), key=lambda x: len(x), reverse=True)

event_record_other = [
    "shot Eddie Waitkus",
]
event_record_other = sorted(
    list(set(event_record_other)), key=lambda x: len(x), reverse=True
)

other_species = []
other_species = sorted(list(set(other_species)), key=lambda x: len(x), reverse=True)

cause_of_death = [
    "stroke following decade long battle with breast cancer",
    "died during treatment for testicular cancer in",
    "kidney failure associated with colon cancer",
    "complications of treatment for lung cancer",
    "pneumonia as a complication of lung cancer",
    "meningitis complicated from breast cancer",
    "complications from kidney cancer surgery",
    "complications of breast cancer treatment",
    "complications of prostate cancer surgery",
    "complications from bowel cancer surgery",
    "complications from gall bladder cancer",
    "pancreatic cancer",
    "heart attack due to pancreatic cancer",
    "complications from bone marrow cancer",
    "pneumonia as a complication of cancer",
    "complication following cancer surgery",
    "complications from esophageal cancer",
    "complications from pancreatic cancer",
    "complications from colorectal cancer",
    "heart attack during cancer treatment",
    "complications from bile duct cancer",
    "complications from prostate cancer",
    "cardiac arrest due to colon cancer",
    "as a complication of breast cancer",
    "complications of colorectal cancer",
    "complications of pancreatic cancer",
    "complications from bladder cancer",
    "complications from stomach cancer",
    "complications from cancer surgery",
    "liver cancer complicated by COVID",
    "complications from throat cancer",
    "complications from breast cancer",
    "complications of prostate cancer",
    "kidney complications from cancer",
    "complications from liver cancer",
    "complications from colon cancer",
    "complications from brain cancer",
    "complications of ovarian cancer",
    "euthanised for abdominal cancer",
    "complications from lung cancer",
    "complications of breast cancer",
    "complications from skin cancer",
    "complications from oral cancer",
    "complications of brain cancer",
    "complications of liver cancer",
    "complications of colon cancer",
    "complications of lung cancer",
    "metastatic pancreatic cancer",
    "metastatic esophageal cancer",
    "throat cancer",
    "complications from cancer",
    "anaplastic thyroid cancer",
    "urothelial bladder cancer",
    "metastasized liver cancer",
    "metastatic breast cancer",
    "medullary thyroid cancer",
    "euthanized due to cancer",
    "cancer related pneumonia",
    "complications of cancer",
    "metastatic colon cancer",
    "gastrointestinal cancer",
    "small cell lung cancer",
    "small intestine cancer",
    "neuroendrocrine cancer",
    "cancer related illness",
    "salivary gland cancer",
    "neuroendocrine cancer",
    "hepatocellular cancer",
    "metastatic eye cancer",
    "cancerous peritonitis",
    "periampullary cancer",
    "spinal fluid cancer",
    "cancer of the spine",
    "renal pelvic cancer",
    "bone marrow cancer",
    "gallbladder cancer",
    "oesophageal cancer",
    "endometrial cancer",
    "male breast cancer",
    "nasopharynx cancer",
    "pancreatic cancer",
    "esophageal cancer",
    "colorectal cancer",
    "intestinal cancer",
    "testicular cancer",
    "peritoneal cancer",
    "pharyngeal cancer",
    "metastatic cancer",
    "mandibular cancer",
    "Leka Zogu; cancer",
    "bile duct cancer",
    "laryngeal cancer",
    "abdominal cancer",
    "lymphatic cancer",
    "esophagus cancer",
    "carcinoid cancer",
    "ampullary cancer",
    "prostate cancer",
    "cervical cancer",
    "appendix cancer",
    "pancreas cancer",
    "lymphoma cancer",
    "thoracic cancer",
    "urethral cancer",
    "stomach cancer",
    "ovarian cancer",
    "bladder cancer",
    "uterine cancer",
    "thyroid cancer",
    "adrenal cancer",
    "gastric cancer",
    "myeloma cancer",
    "vaginal cancer",
    "of lung cancer",
    "throat cancer",
    "kidney cancer",
    "spinal cancer",
    "tongue cancer",
    "tonsil cancer",
    "thymic cancer",
    "rectal cancer",
    "vulvar cancer",
    "thymus cancer",
    "pelvic cancer",
    "liver cancer",
    "brain cancer",
    "colon cancer",
    "bowel cancer",
    "blood cancer",
    "renal cancer",
    "heart cancer",
    "sinus cancer",
    "mouth cancer",
    "spine cancer",
    "Liver cancer",
    "lung cancer",
    "bone cancer",
    "skin cancer",
    "oral cancer",
    "anal cancer",
    "nose cancer",
    "heart attack following a cerebral haemorrhage",
    "heart attack during Olympic marathon trials",
    "suspected heart attack while hillwalking",
    "heart attack caused by anorexia nervosa",
    "cardiac arrest following a heart attack",
    "complications following a heart attack",
    "heart attack following a hunger strike",
    "heart attack brought about by diabetes",
    "heart attack caused by a drug overdose",
    "suffered a heart attack while swimming",
    "complications following a\xa0heart attack",
    "heart attack following spinal surgery",
    "heart attack due to pancreatic cancer",
    "apparent heart attack while teaching",
    "heart failure following heart attack",
    "brain haemorrhage after heart attack",
    "heart attack during cancer treatment",
    "complications from a heart attack",
    "heart attack triggered by asthma",
    "complications from heart attack",
    "complications of a heart attack",
    "heart attack following beating",
    "complications of heart attack",
    "series of small heart attacks",
    "heart attack aboard aircraft",
    "heart attack as a result of",
    "heart attack while jogging",
    "post surgery heart attack",
    "heart attack after race",
    "suspected heart attack",
    "apparent heart attack",
    "probable heart attack",
    "possible heart attack",
    "heart attack",
    "heart failure resulting from hypertrophic cardiomyopathy",
    "heart failure as a complication from cardiac surgery",
    "heart failure after surgery following a knockout",
    "heart failure after a botched suicide attempt",
    'known as "Crazy" Luke Graham; heart failure',
    "complications from congestive heart failure",
    "heart failure following accidental overdose",
    "heart failure caused by anorexia nervosa",
    "heart failure due to pulmonary embolism",
    "heart failure due to cardiac arrhythmia",
    "complications related to heart failure",
    "heart failure following heart attack",
    "pneumonia congestive heart failure",
    "heart failure related to pneumonia",
    "heart failure due to polymyositis",
    "complications from heart failure",
    "complications of heart failure",
    "progeria related heart failure",
    "hypertensive heart failure",
    "heart failure from sepsis",
    "congestive heart failure",
    "suspected heart failure",
    "apparent heart failure",
    "acute heart failure",
    "dheart failure",
    "heart failure",
    "liver failure reportedly complicated by COVID",
    "pulmonary fibrosis complicated by COVID",
    "cardiovascular illness related to COVID",
    "Parkinson disease complicated by COVID",
    "multiple organ failure caused by COVID",
    "kidney problems aggravated by COVID",
    "renal failure complicated by COVID",
    "kidney failure brought on by COVID",
    "sepsis as a complication of COVID",
    "long illness complicated by COVID",
    "liver cancer complicated by COVID",
    "pneumonia complicated by COVID",
    "heart complications from COVID",
    "viral pneumonia from COVID",
    "cardiac arrest from COVID",
    "complications from COVID",
    "COVID related pneumonia",
    "complications of COVID",
    "post COVID pneumonia",
    "pneumonia from COVID",
    "COVID",
    "hospitalized with pneumonia since his evacuation several days after",
    "pneumonia as a complication from a kidney infection",
    "pneumonia with complications from Alzheimer disease",
    "pneumonia as a complication of multiple myeloma",
    "pneumonia induced corticobasal degeneration",
    "pneumonia as a complication of lung cancer",
    "pneumonia as a complication from a stroke",
    "complications of pneumonia from surgery",
    "pneumonia as a complication of a stroke",
    "pneumonia as a complication of cancer",
    "bronchopneumonia following a stroke",
    "heart failure related to pneumonia",
    "pneumonia congestive heart failure",
    "respiratory failure from pneumonia",
    "complications following pneumonia",
    "complications of viral pneumonia",
    "pneumonia complicated by COVID",
    "complications from pneumonia",
    "pneumonia following a stroke",
    "complication from pneumonia",
    "complications of pneumonia",
    "viral pneumonia from COVID",
    "pneumonia related illness",
    "cancer related pneumonia",
    "COVID related pneumonia",
    "AIDS related pneumonia",
    "interstitial pneumonia",
    "aspiration pneumonia",
    "pneumonia from COVID",
    "hypostatic pneumonia",
    "post COVID pneumonia",
    "bronchial pneumonia",
    "bilateral pneumonia",
    "pleural pneumonia",
    "bronchopneumonia",
    "double pneumonia",
    "pneumonia during",
    "viral pneumonia",
    "acute pneumonia",
    "pneumonia",
    "stroke following decade long battle with breast cancer",
    "stroke as a complication of an aortic aneurysm",
    "cardiac arrest as a complication from a stroke",
    "stroke during treatment of pulmonary embolism",
    "stroke related to acute myelogenous leukemia",
    "stroke as a complication from heart surgery",
    "pneumonia as a complication from a stroke",
    "stroke from vertebral artery dissection",
    "pneumonia as a complication of a stroke",
    "complications from a series of strokes",
    "complications following a heat stroke",
    "complications of a stroke suffered in",
    "complications from multiple strokes",
    "bronchopneumonia following a stroke",
    "declining health following stroke",
    "complications following a stroke",
    "complications of massive stroke",
    "complications following strokes",
    "complications from heat stroke",
    "complications from heatstroke",
    "following a series of strokes",
    "cerebral atrophy from stroke",
    "pneumonia following a stroke",
    "complications after a stroke",
    "complications from a stroke",
    "aneurysm following a stroke",
    "complications from strokes",
    "complications of a stroke",
    "complications from stroke",
    "complications of strokes",
    "hemorrhagic brain stroke",
    "complications of stroke",
    "stroke following a fall",
    "consequences of stroke",
    "stroke complications",
    "died of a stroke in",
    "hemorrhagic stroke",
    "series of strokes",
    "multiple strokes",
    "watershed stroke",
    "cerebral stroke",
    "heat stroke",
    "heatstroke",
    "strokes",
    "stroke",
]
cause_of_death = sorted(list(set(cause_of_death)), key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [12]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

<IPython.core.display.Javascript object>

#### Extracting `known_for` Categories and `cause_of_death` Values from `info_3_0`

In [13]:
%%time

# Column to check
column = 'info_3_0'

# Start dataframe
dataframe = df[df[column].notna()]

# For loop to find cause in column and extract it to cause_of_death
for cause in cause_of_death:
    for index in dataframe.index:
        item = df.loc[index, column]
        if item:
            if cause in item:
                df.loc[index, 'cause_of_death'] = cause
                df.loc[index, column] = item.replace(cause, '').strip()
                
# For loop to find role in column and extract it as category
for category, category_lst in known_for_dict.items():
    for role in category_lst:
        for index in dataframe.index:
                item = df.loc[index, column]
                if item:
                    if role in item:
                        df.loc[index, category] = 1
                        df.loc[index, column] = item.replace(role, '').strip()

# Calculating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking remaining number of missing cause_of_death values
print(f'There are {df["cause_of_death"].notna().sum()} values in cause_of_death column.\n')

There are 10084 values in cause_of_death column.

CPU times: total: 3min 31s
Wall time: 3min 31s


<IPython.core.display.Javascript object>

#### Checking Updated `num_categories` Value Counts

In [14]:
# Checking Updated num_categories Value Counts
df["num_categories"].value_counts()

1    88120
2     9682
3      194
0       45
Name: num_categories, dtype: int64

<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` and `cause_of_death` for the next iteration.

#### Finding `known_for` Roles and `cause_of_death` in `info_3_0`

In [15]:
# # Obtaining values for column and their counts
# roles_cause_list = df["info_3_0"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [16]:
# # Code to check each value
# roles_cause_list.pop()

<IPython.core.display.Javascript object>

In [17]:
# # Create specific_roles_cause_list for above popped value
# specific_roles_cause_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_3_0"].notna()].index
#             if "Olymp" in df.loc[index, "info_3_0"]
#         ],
#         "info_3_0",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [18]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_cause_list, key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [19]:
# # Example code to quick-check a specific entry
# df[df["info_3_0"] == "Mr Olympia"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category and for `cause_of_death`

In [20]:
# Creating lists for each category and sorting by decreasing length and removing duplicates

politics_govt_law = [
    "member of the House of Representatives from Massachusetts th congressional district",
    "member of the House of Representatives from South Dakota nd congressional district",
    "member of the House of Representatives from Wisconsin rd congressional district",
    "member of the House of Representatives from Illinois rd congressional district",
    "member of the House of Representatives from Arkansas th congressional district",
    "member of the House of Representatives from Michigan rd congressional district",
    "member of the House of Representatives from Illinois th congressional district",
    "member of the House of Representatives for Wisconsin th congressional district",
    "member of the House of Representatives from Maryland th congressional district",
    "member of the House of Representatives from Indiana th congressional district",
    "Member of the House of Representatives from Indiana th congressional district",
    "member of the House of Representatives for Alabama th congressional district",
    "member of the House of Representatives from Ohio th congressional district",
    "member of the House of Representatives for Idaho nd congressional district",
    "member of the House of Representatives from Utah st congressional district",
    "having served as a Labour House of Representatives of Member of Parliament",
    "member of the House of Representatives from th congressional district",
    "member of the House of Representatives from st congressional district",
    "member of the House of Representatives for North Carolina th district",
    "Member of the House of Representatives from th congressional district",
    "House of Representatives from Wyoming at large congressional district",
    "member of the House of Representatives for th congressional district",
    "member of the House of Representatives for West Virginia rd district",
    "member of the House of Representatives for Vermont at large district",
    "member of the House of Representatives from Tennessee th district",
    "member of the House of Representatives from Virginia th district",
    "member of the House of Representatives from Arkansas rd district",
    "member of the House of Representatives for Minnesota th district",
    "member of the House of Representatives for Minnesota th District",
    "member of the House of Representatives for Maryland nd district",
    "member of the House of Representatives for Missouri th district",
    "Permanent Representative to the United Nations Office at Geneva",
    "member of the House of Representatives for Illinois nd district",
    "member of the House of Representatives from Montana nd district",
    "member of the House of Representatives for Maryland th District",
    "member of the House of Representatives from Hawaii nd district",
    "member of the House of Representatives for Alabama th district",
    "member of the House of Representatives from Nevada nd district",
    "four time member of the New Hampshire House of Representatives",
    "member of the House of Representatives for Hawaii st district",
    "longest serving female member of the House of Representatives",
    "member of the Jakarta Regional People Representative Council",
    "member of the House of Representatives from Iowa rd district",
    "member of the House of Representatives for Ohio th district",
    "Representative from West Virginia th congressional district",
    "member of the House of Representatives from New th district",
    "member of the House of Representatives for the st District",
    "member of the House of Representatives for New th district",
    "member of the House of Representatives from th districts",
    "Representative from the th District of Negros Occidental",
    "member of the House of Representatives from th district",
    "member of the House of Representatives from Michigan th",
    "member of the House of Representatives from Arkansas th",
    "member of the House of Representatives for Wisconsin th",
    "member of the House of Representatives from Illinois nd",
    "Representative from Tennessee rd congressional district",
    "member of the House of Representatives from Connecticut",
    "member of the House of Representatives for th district",
    "member of House of Representatives for the st district",
    "member of the House of Representatives from Arizona nd",
    "former Republican Representative from Illinois from to",
    "former Republican Representative from Michigan from to",
    "member of the House of Representatives for st district",
    "member of the North Carolina House of Representatives",
    "member of the House of Representatives from Minnesota",
    "Member of the North Carolina House of Representatives",
    "former Democratic Representative from Washington from",
    "former Republican Representative from Colorado served",
    "Speaker of the Massachusetts House of Representatives",
    "member of the Massachusetts House of Representatives",
    "member of the New Hampshire House of Representatives",
    "member of the House of Representatives from Colorado",
    "member of the House of Representatives of the Senate",
    "member of the House of Representatives for St George",
    "member of the House of Representatives from Michigan",
    "member of the House of Representatives from Missouri",
    "member of the House of Representatives from Illinois",
    "member of the South Dakota House of Representatives",
    "member of the North Dakota House of Representatives",
    "member of the House of Representatives from Indiana",
    "Member of the South Dakota House of Representatives",
    "former Republican Representative from Maine from to",
    "member of the House of Representatives for Illinois",
    "member of the Connecticut House of Representatives",
    "member of the House of Representatives for Gilmore",
    "member of the House of Representatives from Oregon",
    "member of the House of Representatives for Batanes",
    "member of the House of Representatives for Ohio st",
    "delegate to the House of Representatives from Guam",
    "member of the House of Representatives for Forrest",
    "Permanent Representative to the Economic Community",
    "member of the House of Representatives for Berowra",
    "member of the House of Representatives for Phillip",
    "member of the Washington House of Representatives",
    "member of House of Representatives from Wisconsin",
    "member of the House of Representatives for Cowper",
    "member of the House of Representatives for Sydney",
    "member of the House of Representatives for Oregon",
    "speaker of the Tennessee House of Representatives",
    "Speaker of the Tennessee House of Representatives",
    "member of North Carolina House of Representatives",
    "member of the Minnesota House of Representatives",
    "member of the Tennessee House of Representatives",
    "member of the Missouri Houses of Representatives",
    "member of the House of Representatives from Ohio",
    "member of the House of Representatives from Utah",
    "Speaker of the Missouri House of Representatives",
    "member of the House of Representatives for Bohol",
    "member of the Illinois House of Representatives",
    "member of the Missouri House of Representatives",
    "member of the Michigan House of Representatives",
    "member of the Arkansas House of Representatives",
    "member of the Oklahoma House of Representatives",
    "member of the Colorado House of Representatives",
    "member of the Delaware House of Representatives",
    "Senator of the Indiana House of Representatives",
    "Permanent Representative of Polisario to the UN",
    "member of the House of Representatives for Reid",
    "member of the House of Representatives from New",
    "former liberal Democrat Representative from and",
    "former Republican Representative from Iowa from",
    "member of the House of Representatives for Ohio",
    "last Doorkeeper of the House of Representatives",
    "member of South Dakota House of Representatives",
    "member of the Alabama House of Representatives",
    "member of the Wyoming House of Representatives",
    "member of the Arizona House of Representatives",
    "member of the Indiana House of Representatives",
    "member of the Vermont House of Representatives",
    "member of the Montana House of Representatives",
    "member of the House of Representatives from th",
    "Permanent Representative to the United Nations",
    "Speaker of the Hawaii House of Representatives",
    "senator of the Alaska House of Representatives",
    "Member of the Vermont House of Representatives",
    "member of the House of Representatives from rd",
    "Member of the House of Representatives from th",
    "first Delegate to the House of Representatives",
    "former Democratic Representative from Michigan",
    "Speaker of the Alaska House of Representatives",
    "member of the Alaska House of Representatives",
    "member of the Kansas House of Representatives",
    "member of the Oregon House of Representatives",
    "member of the Hawaii House of Representatives",
    "member of the House of Representatives for th",
    "former member of the House of Representatives",
    "member of the Nevada House of Representatives",
    "Member of the Hawaii House of Representatives",
    "member of the Idaho House of Representatives",
    "member of the Maine House of Representatives",
    "Member of Minnesota House of Representatives",
    "member of the House of Representatives since",
    "Democratic Congressional Representative from",
    "Speaker of the Utah House of Representatives",
    "member of the Ohio House of Representatives",
    "member of the Iowa House of Representatives",
    "member of the Utah House of Representatives",
    "member of the House of Representatives from",
    "Member of the Ohio House of Representatives",
    "member of the People Representative Council",
    "member of Oklahoma House of Representatives",
    "husband of Representative Carolyn B Maloney",
    "member of the New House of Representatives",
    "member of the P R House of Representatives",
    "former Republican Representative from from",
    "member of the House of Representatives for",
    "member of Wyoming House of Representatives",
    "president of the House of Representatives",
    "member of Hawaii House of Representatives",
    "Representative from Minnesota th district",
    "member of the House of Representatives of",
    "Historian of the House of Representatives",
    "member of the Chamber of Representatives",
    "member of the Council of Representatives",
    "member of Maine House of Representatives",
    "Representative from Illinois th district",
    "Delegate to the House of Representatives",
    "first Permanent Representative to the UN",
    "Speaker of the House of Representatives",
    "speaker of the House of Representatives",
    "member of the Houses of Representatives",
    "member of House of Representatives from",
    "member of the House of Representatives",
    "Member of the House of Representatives",
    "Massachusetts House of Representatives",
    "former Democratic Representative from",
    "former Republican Representative from",
    "Speaker of House of Representatives",
    "member of House of Representatives",
    "Representative from North Carolina",
    "Tennessee House of Representatives",
    "Representative from New Hampshire",
    "Permanent Representative to NATO",
    "Representative from North Dakota",
    "Representative from South Dakota",
    "Representative from Connecticut",
    "Representative for Connecticut",
    "Republican Representative from",
    "Representative from Wisconsin",
    "Utah House of Representatives",
    "Representative from Tennessee",
    "Arkansas State Representative",
    "Colorado State Representative",
    "Representative from Illinois",
    "Representative from Virginia",
    "Representative from Michigan",
    "Representative from Missouri",
    "Representative from Maryland",
    "Representative from Indiana",
    "Representative Co Prince of",
    "Representative for Michigan",
    "Representative from Arizona",
    "Representative from Alabama",
    "Representative from Vermont",
    "Kansas State Representative",
    "Representative for Virginia",
    "Representative from Hawaii",
    "Representative from Alaska",
    "Representative from Oregon",
    "Representative from Kansas",
    "Representative from Maine",
    "Representative from Ohio",
    "Representative from Iowa",
    "Representative from New",
    "Representative for Ohio",
    "Representative for Utah",
    "Representative for Iowa",
    "Arkansas Representative",
    "State Representative",
    "Representative elect",
    "Trade Representative",
    "Queen Representative",
    "Representative from",
    "Representative for",
    "Representative",
    "co writer of the Endangered Species Act of",
]
politics_govt_law = sorted(
    list(set(politics_govt_law)), key=lambda x: len(x), reverse=True
)

arts = [
    "first female writer awarded full press credentials at",
    "member of the Nashville Songwriters Hall of Fame",
    "a prolific writer of language text books",
    "widow of science fiction writer",
    "award winning Ontarian writer",
    "and television screenwriter",
    "presidential speechwriter",
    "speech writer",
    "detective story writer",
    "songwriter arranger",
    "nationalized writer",
    "short story writer",
    "esotericism writer",
    "non‑fiction writer",
    "singer songwriter",
    "television writer",
    "songwriter winner",
    "technical writer",
    "cookbook writer",
    "children writer",
    "dialogue writer",
    "fiction writer",
    "and songwriter",
    "travel writer",
    "script writer",
    "comics writer",
    "comedy writer",
    "screenwriter",
    "scriptwriter",
    "music writer",
    "story writer",
    "food writer",
    "songwriter",
    "and writer",
    "art writer",
    "Director General of the Olympia",
]
arts = sorted(list(set(arts)), key=lambda x: len(x), reverse=True)

sports = [
    "only Olympian handball player to represent three countries",
    "won the gold medal in hurdles at the Summer Olympics",
    "winner of seven Olympic gold medals for the Union",
    "president of the International Olympic Committee",
    "bronze medalist in the Summer Olympics marathon",
    "International Olympic Committee whistleblower",
    "first black woman to become Olympic champion",
    "national team member for the Winter Olympics",
    "President of the National Olympic Committee",
    "most medaled athlete at the Summer Olympics",
    "gold medallist in the Winter Olympic Games",
    "winner of the Olympic demonstration event",
    "led push to return tennis to Olympics in",
    "represented in soccer at Summer Olympics",
    "brother of Olympic Judo star Neil Adams",
    "silver medallist at the Summer Olympics",
    "silver medalist at the Summer Olympics",
    "gold medallist at the Summer Olympics",
    "Olympic gold medallist for the Union",
    "lit cauldron at the Summer Olympics",
    "president of Olympique de Marseille",
    "president of the Olympic Committee",
    "represented in six Chess Olympiads",
    "longest living Olympic competitor",
    "Chairman of the Olympic Committee",
    "gold medallist at Summer Olympics",
    "oldest known Olympic medal winner",
    "two time Olympic silver medalist",
    "Olympic gold medal winning boxer",
    "double medallist at the Olympics",
    "Olympics high jump gold medalist",
    "Winter Olympics silver medalist",
    "three time Olympic medal winner",
    "founder of the Special Olympics",
    "four time Olympic gold medalist",
    "Summer Olympics silver medalist",
    "president of Olympic Committee",
    "Olympic double silver medalist",
    "oldest living Olympic champion",
    "first female Olympic champion",
    "seven time Olympic competitor",
    "Youth Olympic silver medalist",
    "Olympic middleweight champion",
    "Olympic bronze medal winner",
    "three time Olympic champion",
    "Olympic silver medal winner",
    "twice Olympic gold medalist",
    "Olympic champion in javelin",
    "Olympic champion under \xa0kg",
    "first Olympic medal winner",
    "multiple Olympic medallist",
    "four time Olympic champion",
    "two time Olympic champion",
    "fourfold Olympic champion",
    "Olympic bronze medallist",
    "Olympic silver medallist",
    "Olympic silver medalist",
    "Olympic bronze medalist",
    "double Olympic champion",
    "triple Olympic champion",
    "Olympic gold medallist",
    "member of Olympic team",
    "Olympic sports shooter",
    "Youth Olympic champion",
    "Olympics gold medalist",
    "Olympic relay champion",
    "Olympic gold medalist",
    "Olympic wrestler for",
    "Olympic silver medal",
    "five time Olympian",
    "Olympic medallist",
    "Olympic champion",
    "Olympic medalist",
    "Olympic Champion",
    "Olympic silver",
    "triple Olympic",
    "Olympic gold",
    "Mr Olympia",
    "Olympian",
]
sports = sorted(list(set(sports)), key=lambda x: len(x), reverse=True)

sciences = [
    "inventor of the implantable cardiac pacemaker",
    "developed the Rho immune globulin vaccine for Rh disease",
    "discoverer of Kawasaki disease",
]
sciences = sorted(list(set(sciences)), key=lambda x: len(x), reverse=True)

business_farming = []
business_farming = sorted(
    list(set(business_farming)), key=lambda x: len(x), reverse=True
)

academia_humanities = []
academia_humanities = sorted(
    list(set(academia_humanities)), key=lambda x: len(x), reverse=True
)

law_enf_military_operator = [
    "parachutist at Summer Olympics opening ceremony",
]
law_enf_military_operator = sorted(
    list(set(law_enf_military_operator)), key=lambda x: len(x), reverse=True
)

spiritual = []
spiritual = sorted(list(set(spiritual)), key=lambda x: len(x), reverse=True)

social = [
    "rescued people from suicide",
]
social = sorted(list(set(social)), key=lambda x: len(x), reverse=True)

crime = [
    "planned Summer Olympics Munich massacre",
]
crime = sorted(list(set(crime)), key=lambda x: len(x), reverse=True)

event_record_other = [
    "disguised as female to compete for y at Summer Olympics",
]
event_record_other = sorted(
    list(set(event_record_other)), key=lambda x: len(x), reverse=True
)

other_species = []
other_species = sorted(list(set(other_species)), key=lambda x: len(x), reverse=True)

cause_of_death = [
    "breast cancer",
    "of cancer",
    "cancer",
    "shot with pepper spray projectile by Boston Police",
    "killed by a head shot together with friend",
    "shot whilst covering the Oaxaca protests",
    "shot during attack on José Ramos Horta",
    "shot down during the Battle of Kyiv",
    "self inflicted gunshot to the head",
    "complications from gunshot wounds",
    "shot in the Rajnandgaon ambush",
    "shot by his Chief of Security",
    "suspected suicide by gunshot",
    "gunshot by Brandon McInerney",
    "shot during domestic dispute",
    "apparent suicide by gunshot",
    "possible suicide by gunshot",
    "shot by the Defense Forces",
    "shot by Special Task Force",
    "shot during prison escape",
    "complications of gunshot",
    "shot by record producer",
    "shot during carjacking",
    "shot by police officer",
    "injuries from gunshot",
    "shot in an army raid",
    "homicide by gunshot",
    "shot during robbery",
    "gunshot to the head",
    "shot by the Army in",
    "suicide by gunshot",
    "accidental gunshot",
    "shot while hunting",
    "shot dead in Basra",
    "shot in East Timor",
    "murder by gunshot",
    "shot in Mogadishu",
    "shot by soldiers",
    "shot by the IRA",
    "shot by robbers",
    "shot by police",
    "gunshot wounds",
    "shot to death",
    "gunshot wound",
    "shot times",
    "shot dead",
    "shot down",
    "gunshot",
    "shot",
    "complications of cerebral hemorrhage from traffic collision",
    "complications from a race collision during Hours of Le Mans",
    "complications from injuries sustained in traffic collision",
    "multiple organ failure following traffic collision",
    "complications from a collision during competition",
    "head injuries sustained in a traffic collision",
    "cardiac arrest due to collision with teammate",
    "head injuries sustained in a race collision",
    "blunt force neck injury from race collision",
    "head injury sustained in traffic collision",
    "injuries sustained in a traffic collision",
    "injuries received in a traffic collision",
    "injuries sustained in traffic collision",
    "cardiac arrest after on field collision",
    "complications from a traffic collision",
    "spinal injuries from traffic collision",
    "injuries sustained in a race collision",
    "complications after traffic collision",
    "complications from traffic collision",
    "motor vehicle collision with a moose",
    "head injuries from traffic collision",
    "injuries from a traffic collision",
    "head injuries from race collision",
    "injuries from traffic collision",
    "injuries sustained in collision",
    "vehicle pedestrian collision",
    "suicide by traffic collision",
    "collision during practice",
    "base jumping collision",
    "motorcycle collision",
    "dirt bike collision",
    "racetrack collision",
    "bobsleigh collision",
    "boat race collision",
    "traffic collision ·",
    "training collision",
    "wingsuit collision",
    "traffic collision",
    "tractor collision",
    "cycling collision",
    "vehicle collision",
    "bicycle collision",
    "balloon collision",
    "plane collision",
    "train collision",
    "stunt collision",
    "race collision",
    "boat collision",
    "ATV collision",
    "car collision",
    "cardiac arrest stemming from decompression sickness",
    "cardiac arrhythmia stemming from atherosclerosis",
    "cardiac arrest due to abdominal aortic aneurysm",
    "cardiac arrest due to collision with teammate",
    "possible cardiac arrest during Dakar Rally",
    "brain hemorrhage following cardiac arrest",
    "cardiac arrest after a spinal cord injury",
    "complications following cardiac surgery",
    "cardiac arrest from respiratory failure",
    "cardiac arrest after on field collision",
    "complications from a cardiac condition",
    "complications following cardiac arrest",
    "cardiac arrest following car accident",
    "complications of a cardiac condition",
    "complications from cardiac surgery",
    "cardiac arrest as a result of COPD",
    "complications from cardiac arrest",
    "complications of cardiac surgery",
    "complications of cardiac arrest",
    "cardiac arrest during surgery",
    "cardiac arrest due to sepsis",
    "cardiac arrest from shooting",
    "pulmonary cardiac arrest",
    "cardiac complications",
    "cardiac arrest due to",
    "cardiac amyloidosis",
    "cardiac dysfunction",
    "cardiac dysrhythmia",
    "cardiac arrhythmia",
    "cardiac arrest and",
    "cardiac arrythmia",
    "cardiac aneurysm",
    "cardiac problems",
    "cardiac failure",
    "cardiac ailment",
    "cardiac illness",
    "cardiac arrest",
    "chronic kidney disease caused by type diabetes",
    "of natural causes after a lengthy illness",
    "respiratory failure caused by brain tumor",
    "discovered cause of sickle cell anemia",
    "liver failure caused by Hepatitis C",
    "brain death caused by knockout",
    "complications caused by AIDS",
    "apparent natural causes",
    "AIDS related causes",
    "natural causes",
    "unknown causes",
    "body found on this date after suicide by carbon monoxide poisoning",
    "possible suicide by overdose of prescription painkillers",
    "and became the West first woman suicide bomber",
    "fall from height ruled a suicide by the police",
    "suspected suicide by carbon monoxide poisoning",
    "suicide by overdose of prescription medication",
    "apparent suicide by carbon monoxide poisoning",
    "suicide by self inflicted blunt force trauma",
    "suspected suicide by fall from a building",
    "suicide by carbon monoxide inhalation",
    "suicide by carbon monoxide poisoning",
    "intercepted suicide bomber at school",
    "assisted suicide by lethal injection",
    "apparent suicide in front of a train",
    "suicide by blunt force head injury",
    "killed by suicide bomb in Northern",
    "suicide by inert gas asphyxiation",
    "suicide by jumping from building",
    "apparent suicide by dehydration",
    "injuries from a suicide attempt",
    "suicide by barbiturate overdose",
    "suicide by self defenestration",
    "suicide by jumping from bridge",
    "suicide by jumping from cliff",
    "suspected suicide by hanging",
    "suspected suicide by gunshot",
    "suicide by alcohol poisoning",
    "suicide by helium inhalation",
    "suicide by traffic collision",
    "suicide by cyanide poisoning",
    "suicide by grenade explosion",
    "apparent suicide by stabbing",
    "apparent suicide by hanging",
    "apparent suicide by gunshot",
    "apparent suicide by jumping",
    "suspected suicide by poison",
    "possible suicide by gunshot",
    "physician assisted suicide",
    "suicide by self immolation",
    "suicide prior to execution",
    "suicide by throat cutting",
    "victim of suicide bombing",
    "suicide by defenestration",
    "suicide by drug overdose",
    "charcoal burning suicide",
    "suicide by strangulation",
    "suicide by wrist cutting",
    "suicide by asphyxiation",
    "suicide by train impact",
    "suicide by hand grenade",
    "suicide by poisoning",
    "suicide by car crash",
    "suicide by drowning",
    "suicide by overdose",
    "suicide bomb attack",
    "suicide by stabbing",
    "suicide bomb victim",
    "suicide by hanging",
    "suicide by gunshot",
    "suicide by jumping",
    "suicide by fasting",
    "officially suicide",
    "suicide by alcohol",
    "suspected suicide",
    "assisted suicide",
    "apparent suicide",
    "suicide by train",
    "suicide by pilot",
    "possible suicide",
    "suicide bombing",
    "suicide by fire",
    "suicide by drug",
    "suicide attack",
    "murder suicide",
    "suicide by",
    "Complications of liver disease",
    "hypertensive atherosclerotic cardiovascular disease",
    "complications of a chronic neurological disease",
    "complications from Charcot Marie Tooth disease",
    "complications from peripheral vascular disease",
    "chronic kidney disease caused by type diabetes",
    "complications of Charcot Marie Tooth disease",
    "complications from polycystic kidney disease",
    "complications related to Alzheimer disease",
    "complications from a neuromuscular disease",
    "complications from cardiovascular disease",
    "lung complications from Alzheimer disease",
    "complications from motor neurone disease",
    "active euthanasia for pulmonary disease",
    "complications related to kidney disease",
    "from complications of Parkinson disease",
    "atherosclerotic cardiovascular disease",
    "complications from Parkinson’s disease",
    "complications from respiratory disease",
    "euthanization following a lung disease",
    "chronic obstructive pulmonary disease",
    "liver disease complicated by diabetes",
    "complications from Parkinsons disease",
    "complications from Alzheimer disease",
    "complications from Parkinson disease",
    "euthanized following adrenal disease",
    "complications from Lewy body disease",
    "f complications of Alzheimer disease",
    "complications from pulmonary disease",
    "complications of respiratory disease",
    "complications of a vascular disease",
    "complications of Parkinson disease",
    "complications of Alzheimer disease",
    "complications due to heart disease",
    "complications of Lewy body disease",
    "complications from kidney disease",
    "complication of Parkinson disease",
    "complications from heart disease",
    "complications from liver disease",
    "pulmonary veno occlusive disease",
    "combination of Parkinson disease",
    "complications from lung disease",
    "complications of kidney disease",
    "gastroesophageal reflux disease",
    "complications of heart disease",
    "arteriosclerotic heart disease",
    "complications of Crohn disease",
    "complications of liver disease",
    "cardiovascular renal disease",
    "hypertensive heart disease",
    "hypertrophic heart disease",
    "degenerative brain disease",
    "Creutzfeldt Jakob disease",
    "interstitial lung disease",
    "neurodegenerative disease",
    "cerebral vascular disease",
    "inflammatory lung disease",
    "acute respiratory disease",
    "chronic pulmonary disease",
    "degenerative lung disease",
    "congenital heart disease",
    "gastrointestinal disease",
    "cerebrovascular disease",
    "coronary artery disease",
    "cardiopulmonary disease",
    "ischaemic heart disease",
    "Erdheim Chester disease",
    "discovered Lyme disease",
    "suspected heart disease",
    "cardiovascular disease",
    "chronic kidney disease",
    "coronary heart disease",
    "ischemic heart disease",
    "motor neurone disease",
    "legionnaires' disease",
    "chronic liver disease",
    "meningococcal disease",
    "motor neuron disease",
    "neurological disease",
    "degenerative disease",
    "Lou Gehrig’s disease",
    "respiratory disease",
    "ebola virus disease",
    "Parkinson’s disease",
    "Ebola virus disease",
    "undisclosed disease",
    "Legionnaire disease",
    "sickle cell disease",
    "Lou Gehrig disease",
    "Huntington disease",
    "infectious disease",
    "autoimmune disease",
    "intestinal disease",
    "Alzheimer disease",
    "Parkinson disease",
    "pulmonary disease",
    "alzheimer disease",
    "vascular disease",
    "coronary disease",
    "Addison disease",
    "kidney disease",
    "muscle disease",
    "heart disease",
    "liver disease",
    "blood disease",
    "renal disease",
    "Crohn disease",
    "nerve disease",
    "brain disease",
    "lung disease",
    "Pick disease",
]
cause_of_death = sorted(list(set(cause_of_death)), key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [21]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

<IPython.core.display.Javascript object>

#### Extracting `known_for` Categories and `cause_of_death` Values from `info_3_0`

In [22]:
%%time

# Column to check
column = 'info_3_0'

# Start dataframe
dataframe = df[df[column].notna()]

# For loop to find cause in column and extract it to cause_of_death
for cause in cause_of_death:
    for index in dataframe.index:
        item = df.loc[index, column]
        if item:
            if cause in item:
                df.loc[index, 'cause_of_death'] = cause
                df.loc[index, column] = item.replace(cause, '').strip()
                
# For loop to find role in column and extract it as category
for category, category_lst in known_for_dict.items():
    for role in category_lst:
        for index in dataframe.index:
                item = df.loc[index, column]
                if item:
                    if role in item:
                        df.loc[index, category] = 1
                        df.loc[index, column] = item.replace(role, '').strip()

# Calculating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking number of cause_of_death values
print(f'There are {df["cause_of_death"].notna().sum()} values in cause_of_death column.\n')

There are 17342 values in cause_of_death column.

CPU times: total: 3min 24s
Wall time: 3min 24s


<IPython.core.display.Javascript object>

#### Checking Updated `num_categories` Value Counts

In [23]:
# Checking Updated num_categories Value Counts
df["num_categories"].value_counts()

1    88099
2     9704
3      195
0       43
Name: num_categories, dtype: int64

<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` and `cause_of_death` for the next iteration.

#### Finding `known_for` Roles and `cause_of_death` in `info_3_0`

In [24]:
# # Obtaining values for column and their counts
# roles_cause_list = df["info_3_0"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [267]:
# # Code to check each value
# roles_cause_list.pop()

<IPython.core.display.Javascript object>

In [262]:
# # Create specific_ro


<IPython.core.display.Javascript object>

In [268]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_cause_list, key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [269]:
# # Example code to quick-check a specific entry
# df[df["info_3_0"] == "United Airlines Flight crash survivor"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category and for `cause_of_death`

In [270]:
# Creating lists for each category and sorting by decreasing length and removing duplicates

politics_govt_law = [
    "evidence led to ban on sex segregated classified advertising",
    "leading authority on the Constitution of",
    "Acting Prime Minister of the Democratic Republic of the",
    "Prime Minister of the Socialist Federal Republic of",
    "wife of the Prime Minister Konstantinos Mitsotakis",
    "widow of former Prime Minister Sir William McMahon",
    "Principal Private Secretary to the Prime Minister",
    "Press Secretary to Prime Minister Brian Mulroney",
    "last Prime Minister of the People Republic of",
    "son of former Prime Minister Lester B Pearson",
    "brother of former Prime Minister John Major",
    "wife of Prime Minister Alexander Bustamante",
    "Secretary of the Prime Minister Department",
    "Prime Minister of the Federal Republic of",
    "widow of former Prime Minister Takeo Miki",
    "wife of Prime Minister Eddie Fenech Adami",
    "Prime Minister of the Socialist Republic",
    "press secretary for Prime Minister Petre",
    "first wife of Prime Minister Bob Hawke",
    "Chief of Staff to the Prime Minister",
    "Prime Minister of Northern from to",
    "Prime Minister of Czechoslovakia",
    "Spouse of the Prime Minister of",
    "spokesperson for Prime Minister",
    "former Labour Prime Minister of",
    "Prime Minister of Lower Saxony",
    "Prime Minister of the Antilles",
    "Spouse of the Prime Minister",
    "former Prime Minister of the",
    "Prime Minister of Tanganyika",
    "Prime Minister of Macedonia",
    "Prime Minister of Northern",
    "Prime Minister of Bunyoro",
    "Prime Minister of Krajina",
    "former Prime Minister of",
    "Prime Minister of Crimea",
    "Prime Minister of the s",
    "twice Prime Minister of",
    "Prime Minister of South",
    "Prime Minister of Azad",
    "Deputy Prime Minister",
    "Acting Prime Minister",
    "acting Prime Minister",
    "st Prime Minister of",
    "th Prime Minister of",
    "Prime Minister of",
    "argued that social problems contributed to mental illness",
    "former executive director of the Republican National Committee",
    "executive director of the United Nations Population Fund",
    "executive mayor of the George Local Municipality",
    "chief executive of the Medical Research Council",
    "county executive for Prince George County",
    "chief executive of Te Rūnanga o Ngāi Tahu",
    "executive director of the NAACP",
    "executive director of the NHCP",
    "chief executive councillor of",
    "executive director of NFLPA",
    "non profit executive",
    "opposition figure who ran unsuccessfully for President losing to Bakili Muluzi",
    "former President of the Federal Republic",
    "President of the Senate during the Jamaat al Muslimeen coup attempt",
    "President of the International Tribunal for the Law of the Sea",
    "Secretary of Commerce under President Lyndon B Johnson from to",
    "President of the Democratic National Construction Association",
    "President of the Southern Christian Leadership Conference",
    "President of the Regional Council of Languedoc Roussillon",
    "first President of the Grandmothers of the Plaza de Mayo",
    "Assistant Labor Secretary under President Richard Nixon",
    "first President of the Federated States of Micronesia",
    "Vice President of the International Court of Justice",
    "President of Himachal Pradesh Bharatiya Janata Party",
    "mentioned in President Obama election victory speech",
    "President of the Public Establishment of the Palace",
    "President of the Legislative Assembly of Pernambuco",
    "President pro tempore of the North Carolina Senate",
    "President of who murdered thousands of adversaries",
    "President of the Houthi Supreme Political Council",
    "Minister President of the Brussels Capital Region",
    "President of the United Nations General Assembly",
    "President of the United Nations Security Council",
    "President of the National Organization for Women",
    "President of the International Alliance of Women",
    "President of the Union of Democratic Mineworkers",
    "President of the New South Industrial Commission",
    "former Vice President of the People Republic of",
    "President of the New South Legislative Council",
    "President of the Provincial Deputation of Lugo",
    "last President of the People Republic in exile",
    "first female President of the Mescalero Apache",
    "President of University of Student Federation",
    "President of the Federal Constitutional Court",
    "President of the Regional Council of Brittany",
    "President of the Regional Junta of the Azores",
    "as wife of President Frederik Willem de Klerk",
    "President of the Alaska Federation of Natives",
    "President of the Parliament of the Community",
    "member of the Presidential Advisory Council",
    "President pro tempore of the Montana Senate",
    "mother of ex President Luis Alberto Lacalle",
    "Vice President of the National Assembly of",
    "President of the Inter Commission of Women",
    "wife of former President Ibrahim Babangida",
    "President of Akhil Bharatiya Gorkha League",
    "second wife of President Fulgencio Batista",
    "widow of former Vice President Spiro Agnew",
    "President of the South Legislative Council",
    "third party candidate for President of the",
    "Vice President of the Chamber of Deputies",
    "President of the Presidium of the Supreme",
    "President of the Armed Forces of Cabinda",
    "President of the Islamic Salvation Front",
    "President of the Parliament of Cantabria",
    "Vice President of the Court of Appeal of",
    "President of the Executive Council of SR",
    "Minister President of Baden Württemberg",
    "Chief of Staff to President Alpha Condé",
    "Presidential Medal of Freedom recipient",
    "Vice President of the National Assembly",
    "Prohibition Party nominee for President",
    "President of the Bharatiya Janata Party",
    "President of the Women Organization of",
    "President of the National Action Party",
    "President of the Trades Union Congress",
    "President of the Court of Human Rights",
    "President of the Government of Navarre",
    "President of the Political Association",
    "Vice President of the Council of State",
    "President of the Republic of Macedonia",
    "former Counsel under Presidents Carter",
    "first democratically elected President",
    "President of the Congress of Deputies",
    "President of Assembly of the Republic",
    "President of the Constitutional Court",
    "President of the Social Liberal Party",
    "President of the Government of Aragon",
    "President of the Constituent Assembly",
    "Acting President of the Presidency of",
    "President of the Supreme People Court",
    "Vice President of Zhejiang University",
    "President of the Province of Grosseto",
    "President of Broadcasting Corporation",
    "President of the Chamber of Deputies",
    "Principal Secretary to the President",
    "President of the United Steelworkers",
    "President of the Federal Republic of",
    "President of the National Parliament",
    "President of the Province of Naples",
    "President of the University of Guam",
    "Lord President of the Supreme Court",
    "President pro tempore of the Senate",
    "President of Parliamentary Party of",
    "Minister President of the Community",
    "widow of President Salvador Allende",
    "Minister President of Saxony Anhalt",
    "President of University of Southern",
    "President of the National Assembly",
    "President of the National Congress",
    "President of the Province of Udine",
    "Secretary General of the President",
    "President of the courts of appeals",
    "President of the Regional Council",
    "President of the Council of State",
    "President of Province of Alicante",
    "President of the National Council",
    "President of Bharatiya Jana Sangh",
    "President of the Examination Yuan",
    "advisor to President Jimmy Carter",
    "first President of the Federation",
    "President of the Legislative Yuan",
    "President of the Liberal Party of",
    "President of the Mescalero Apache",
    "President of the Seneca Nation of",
    "brother of President Hamid Karzai",
    "th President of the Presidency of",
    "Executive Vice President of Yukos",
    "President of the Xunta of Galicia",
    "spokesman of the President Office",
    "President of the Illinois Senate",
    "President of the Bar Association",
    "President of Deutsche Bundesbank",
    "President of the Udmurt Republic",
    "President of the Family Division",
    "President of the Supreme Council",
    "separatist President of Abkhazia",
    "President of the Court of Appeal",
    "President of the Central Bank of",
    "President of Chamber of Deputies",
    "President of the Pan Parliament",
    "candidate for President of a in",
    "first President of Azad Kashmir",
    "President of the People Chamber",
    "two time Presidential candidate",
    "President of the Navajo Nation",
    "President of National Assembly",
    "President of the Liberal Party",
    "President of the Free Alliance",
    "President of the Federation of",
    "President of the Supreme Court",
    "President of PEN International",
    "President of the Confederation",
    "President of the Victorian RSL",
    "President of Valencian Courts",
    "President of National Council",
    "President of the People Party",
    "President of the Maine Senate",
    "President of the State Senate",
    "President of Jamaat e Islami",
    "President of the Republic of",
    "President of the Cook County",
    "candidate for Vice President",
    "advisor to President Kennedy",
    "President of Bar Association",
    "President of the Parliament",
    "President of the New Senate",
    "President of the Commission",
    "President of Bophuthatswana",
    "President of the Royal Mint",
    "Borough President of Queens",
    "President of the Bundesbank",
    "fourth President of from to",
    "President of Pan Parliament",
    "President of Czechoslovakia",
    "President of the Bundestag",
    "President of the Authority",
    "Vice President of the USSR",
    "President of Oromia Region",
    "President of the Senate of",
    "Vice President of FRELIMO",
    "former President of North",
    "President of the Republic",
    "th Vice President of the",
    "eighth Vice President of",
    "President of Upper Volta",
    "President of the Senate",
    "President of Parliament",
    "President of Polynesia",
    "first female President",
    "President of y from to",
    "Second Vice President",
    "President of Zanzibar",
    "President of Puntland",
    "President of Hadassah",
    "former President of y",
    "de facto President of",
    "President of Chechnya",
    "President of Sardinia",
    "President of Madeira",
    "President of Krajina",
    "President of Galicia",
    "Third Vice President",
    "President of MCA Inc",
    "President of the NPU",
    "first President of a",
    "Vice State President",
    "first Vice President",
    "Federal President of",
    "President of the ZOA",
    "President of the NRC",
    "President of the PCA",
    "President of Sicily",
    "President of West y",
    "Temporary President",
    "former President of",
    "President of North",
    "President of SCBAP",
    "President of Kasaï",
    "President of since",
    "President of South",
    "President of USDAW",
    "Vice President of",
    "rd President of a",
    "st President of s",
    "Interim President",
    "Acting President",
    "acting President",
    "Senate President",
    "President of RAI",
    "nd President of",
    "Arunachal Pradesh MLA for Changlang North",
    "Columbia MLA for Vancouver Point Grey",
    "Prince Edward Island MLA for rd Kings",
    "Andhra Pradesh MLA for Chintalapudi",
    "MLA for Rockwood Iberville Lakeside",
    "Alberta MLA for Edmonton Mill Woods",
    "MLA for Dalhousie Restigouche East",
    "MLA from the Northwest Territories",
    "MLA for Vancouver Little Mountain",
    "Andhra Pradesh MLA for Allagadda",
    "Madhya Pradesh MLA for Raghogarh",
    "Alberta MLA for Calgary Glenmore",
    "Alberta MLA for Lethbridge West",
    "Madhya Pradesh MLA for Kasrawad",
    "Victorian MLA for Ballarat West",
    "Queensland MLA for Ipswich West",
    "Alberta MLA for Calgary Currie",
    "Columbia MLA for Prince Rupert",
    "Queensland MLA for Port Curtis",
    "Yukon MLA for McIntyre Takhini",
    "MLA for Edmonton Sherwood Park",
    "Nova Scotia MLA for West Kings",
    "Uttar Pradesh MLA for Deoband",
    "Manitoba MLA for Rupertsland",
    "Punjab MLA for Amritsar West",
    "Chhattisgarh MLA for Kharsia",
    "Uttar Pradesh MLA for Bilari",
    "MLA for Boundary Similkameen",
    "Victoria MLA for Mooroolbark",
    "Odisha MLA for Kissan Nagar",
    "MLA for Saskatoon Northwest",
    "West Bengal MLA for Amdanga",
    "Columbia MLA for Mackenzie",
    "Alberta MLA for Clover Bar",
    "Western MLA for Swan Hills",
    "MLA for Fundy River Valley",
    "Northwest Territories MLA",
    "Manitoba MLA for Kildonan",
    "MLA for Tuensang Sadar II",
    "MLA for Whitehorse Centre",
    "MLA for South Peace River",
    "Nadu MLA for Kadayanallur",
    "Prince Edward Island MLA",
    "Nadu MLA for Tiruchendur",
    "MLA for East Londonderry",
    "Columbia MLA for Saanich",
    "four time Karnataka MLA",
    "Kerala MLA for Kuttanad",
    "Queensland MLA for Cook",
    "New South MLA for Bulli",
    "MLA for Dartmouth South",
    "Northern Territory MLA",
    "MLA for Calgary Currie",
    "MLA for Rossland Trail",
    "MLA for Nellore Rural",
    "Arunachal Pradesh MLA",
    "Sindh MLA for Karachi",
    "MLA for Okanagan West",
    "Sarawak MLA for Krian",
    "Himachal Pradesh MLA",
    "MLA for Antrim North",
    "Gilgit Baltistan MLA",
    "MLA for Pandavapura",
    "MLA for Cape Centre",
    "six time Odisha MLA",
    "MLA for Alipurduars",
    "Nova Scotia MLA for",
    "MLA for Calgary Bow",
    "Andhra Pradesh MLA",
    "Madhya Pradesh MLA",
    "Rio de Janeiro MLA",
    "Santa Catarina MLA",
    "MLA for Surat East",
    "MLA for Memramcook",
    "MLA for Upper Bann",
    "MLA for North Down",
    "Uttar Pradesh MLA",
    "New Brunswick MLA",
    "Malacca State MLA",
    "MLA for Nathdwara",
    "NSW MLA for Dubbo",
    "MLA for Inverness",
    "MLA for St Albert",
    "MLA for Fermanagh",
    "Saskatchewan MLA",
    "Chhattisgarh MLA",
    "MLA for Waverley",
    "Minas Gerais MLA",
    "Maharashtra MLA",
    "West Bengal MLA",
    "Nova Scotia MLA",
    "Uttarakhand MLA",
    "MLA for Noorpur",
    "Mato Grosso MLA",
    "Perak State MLA",
    "MLA of Columbia",
    "MLA for Burrows",
    "MLA for Palghar",
    "MLA for Caracas",
    "Queensland MLA",
    "Puducherry MLA",
    "MLA of Alberta",
    "MLA for Altona",
    "Karnataka MLA",
    "Victorian MLA",
    "Rajasthan MLA",
    "São Paulo MLA",
    "New South MLA",
    "Meghalaya MLA",
    "Telangana MLA",
    "Jharkhand MLA",
    "Columbia MLA",
    "Manitoba MLA",
    "Selangor MLA",
    "Maranhão MLA",
    "Nagaland MLA",
    "Kelantan MLA",
    "Alberta MLA",
    "Haryana MLA",
    "Gujarat MLA",
    "Western MLA",
    "Paraíba MLA",
    "Sarawak MLA",
    "Kerala MLA",
    "Odisha MLA",
    "Paraná MLA",
    "Punjab MLA",
    "Bihar MLA",
    "Delhi MLA",
    "Assam MLA",
    "Yukon MLA",
    "Piauí MLA",
    "Johor MLA",
    "Perak MLA",
    "Goiás MLA",
    "Nadu MLA",
    "Goa MLA",
    "ACT MLA",
    "WA MLA",
    "AP MLA",
    "four time MLA",
    "MLA",
    "State Senator for Connecticut th District",
    "former Democratic Senator from Minnesota",
    "former Republican Senator from Maryland",
    "former Republican Senator for Nevada",
    "former North Carolina state Senator",
    "Senator of the College of Justice",
    "Democratic Senator from Wisconsin",
    "Senator from Illinois from to",
    "Senator for Dublin University",
    "North Carolina State Senator",
    "Senator for Loire Atlantique",
    "Senator for Alpes Maritimes",
    "Senator from Massachusetts",
    "former Labor Party Senator",
    "Senator from New Hampshire",
    "Senator for Borno Central",
    "Senator from Haute Savoie",
    "former Democratic Senator",
    "Senator for New Hampshire",
    "Washington State Senator",
    "Senator for Kaduna South",
    "Senator for Kaduna State",
    "Tennessee State Senator",
    "Minnesota State Senator",
    "Republican Senator from",
    "Senator for Nova Scotia",
    "Wisconsin State Senator",
    "Senator from Tennessee",
    "Senator for Queensland",
    "Nebraska State Senator",
    "Oklahoma State Senator",
    "served as Senator from",
    "Senator for Abia North",
    "Missouri State Senator",
    "Maryland State Senator",
    "Michigan State Senator",
    "Senator for New South",
    "Senator from Illinois",
    "Arizona State Senator",
    "Indiana State Senator",
    "New Hampshire Senator",
    "Senator from Maryland",
    "Vermont State Senator",
    "Senator from Alabama",
    "Senator for Victoria",
    "Senator from Montana",
    "Senator for Manitoba",
    "Senator for Rondônia",
    "Senator from Wyoming",
    "Oregon State Senator",
    "Nevada State Senator",
    "Senator for Western",
    "Senator from Kansas",
    "Senator from Hawaii",
    "Senator from Alaska",
    "Utah State Senator",
    "Iowa State Senator",
    "Senator from Ohio",
    "Senator from Utah",
    "Senator for Osun",
    "former Senator",
    "State Senator",
    "Senator from",
    "Senator for",
    "New Senator",
    "Ambassador to the United Nations Commission on the Status of Women",
    "Ambassador of the Arab League to the United Nations",
    "Ambassador to the United Nations for",
    "Ambassador to the United Nations",
    "editor in chief of Ambassador to",
    "served as Ambassador to from to",
    "Ambassador to the Holy See",
    "Ambassador to the Republic",
    "Ambassador to Upper Volta",
    "Ambassador to the UN and",
    "Ambassador to the USSR",
    "Ambassador to the and",
    "former Ambassador to",
    "Deputy Ambassador to",
    "Ambassador to West y",
    "Ambassador to East y",
    "Ambassador to Arabia",
    "Ambassador of to the",
    "Ambassador to istan",
    "Ambassador to Union",
    "Ambassador at large",
    "Ambassador to the",
    "Ambassador to and",
    "Ambassador to y",
    "Ambassador to s",
    "Ambassador to",
    "known for playing a major role in Secretary of State Henry Kissinger shuttle diplomacy",
    "known for Munsinger Affair",
    "Director of the Office of Nuclear Reactor Regulation",
    "segregationist politician",
    "Forza Europa politician",
    "barrister politician",
    "and politician",
    "politician",
]
politics_govt_law = sorted(
    list(set(politics_govt_law)), key=lambda x: len(x), reverse=True
)

arts = [
    "writer",
    "executive director of the Philharmonic",
    "executive producer of the soap opera",
    "chief executive of VF Corporation",
    "film & TV actor; Motown executive",
    "NFLPA executive director",
    "studio executive",
    "executive at CNN",
    "media executive",
    "influential in language poetry movement",
    'called the "People poet of Dagestan"',
    "poet laureate of Iowa",
    "Iowa poet laureate",
    "poet",
    "known as a leading journalist opposed to",
    "first woman to work as a photojournalist",
    "adventure photojournalist",
    "and newspaper journalist",
    "television journalist",
    "cultural journalist",
    "journalist artist",
    "music journalist",
    "radio journalist",
    "photojournalist",
    "and journalist",
    "by singer Bertrand Cantat",
    "singer in the Buena Vista Social Club",
    "ex husband of singer Tina Turner",
    "backup singer for Elvis Presley",
    "mother of singer Stevie Wonder",
    "playback singer",
    "country singer",
    "radio singer",
    "opera singer",
    "cuplé singer",
    "folk singer",
    "and singer",
    "folksinger",
    "widow of composer Sir William Walton",
    "composer of the national anthem",
    "and film composer",
    "music composer",
    "composer",
    "benefactor of the Mondavi Center",
    "first black actor to appear on",
    "Academy Award nominated actor",
    "son of actors Margo",
    "theatre actor",
    "voice actor",
    "comic actor",
    "film actor",
    "and actor",
    "playwright",
]
arts = sorted(list(set(arts)), key=lambda x: len(x), reverse=True)

sports = [
    "son of Prime Minister Pierre Trudeau",
    "executive chairman of the Asian Tour",
    "executive director of the MLBPA",
    "minor league baseball executive",
    "President of the Green Bay Packers",
    "President of FIBA",
    "owner of the Ottawa Senators",
]
sports = sorted(list(set(sports)), key=lambda x: len(x), reverse=True)

sciences = [
    "worked on leukemia treatments",
    "first chief executive of Amgen",
    "health supplement executive",
    "natural resource executive",
]
sciences = sorted(list(set(sciences)), key=lambda x: len(x), reverse=True)

business_farming = [
    "chief executive of the Shipowners' Association",
    "executive director of the Business Roundtable",
    "chief executive of the Eastman Kodak Company",
    "and the chief executive of Aerospace",
    "manufacturing executive",
    "executive with Converse",
    "construction executive",
    "chief executive of IBM",
    "advertising executive",
    "financial executive",
    "business executive",
    "airline executive",
    "casino executive",
    "steel executive",
    "auto executive",
    "oil executive",
    "President of Amtrak",
    "founder of Tanger Factory Outlet Centers",
]
business_farming = sorted(
    list(set(business_farming)), key=lambda x: len(x), reverse=True
)

academia_humanities = [
    "leading authority on the art of medieval",
    "a leading authority on the Union and",
    "authority on secret societies",
    "authority on Native culture",
    "authority on literature",
    "authority on art",
    "executive dean of Nelson Mandela University",
    "executive director of",
    "President of McMaster University",
    "President of Wesleyan University",
    "President of Longwood University",
    "President of Columbia University",
    "President of Lingnan University",
    "President of Peking University",
    "President of Tehran University",
]
academia_humanities = sorted(
    list(set(academia_humanities)), key=lambda x: len(x), reverse=True
)

law_enf_military_operator = [
    "deputy political commissar of Shenyang Military Region",
    "deputy inspector general of the National Police",
    "deputy commander of RAF Strike Command",
    "deputy commander of the PLA Air Force",
    "deputy chief of staff of the Army",
    "deputy director of Shin Bet",
    "deputy chief of staff",
]
law_enf_military_operator = sorted(
    list(set(law_enf_military_operator)), key=lambda x: len(x), reverse=True
)

spiritual = [
    "first Native to become a general authority of The Church of Jesus Christ of Latter day Saints",
    "general authority of The Church of Jesus Christ of Latter day Saints",
    "general authority of the LDS Church",
    "chief executive of Concern Worldwide",
]
spiritual = sorted(list(set(spiritual)), key=lambda x: len(x), reverse=True)

social = [
    "first executive director of Gay Men Health Crisis",
]
social = sorted(list(set(social)), key=lambda x: len(x), reverse=True)

crime = [
    "killer of dissident journalist Henry Liu",
]
crime = sorted(list(set(crime)), key=lambda x: len(x), reverse=True)

event_record_other = [
    "after killing his old autistic son",
    "of standing trial for war crimes",
    "whose accident photos were released onto internet",
    "world record holder for longest fall",
    "USAID contractor kidnapped by al Qaeda",
    "United Airlines Flight crash survivor",
]
event_record_other = sorted(
    list(set(event_record_other)), key=lambda x: len(x), reverse=True
)

other_species = [
    "co owned by Prime Minister Jacinda Ardern",
]
other_species = sorted(list(set(other_species)), key=lambda x: len(x), reverse=True)

cause_of_death = [
    "suicide",
    "chronic lymphocytic leukemia",
    "complications from leukemia",
    "chronic lymphomic leukemia",
    "acute myelogenous leukemia",
    "complications of leukemia",
    "acute myeloid leukemia",
    "lymphoblastic leukemia",
    "acute myeloma leukemia",
    "lymphocytic leukemia",
    "plasma cell leukemia",
    "myelogenous leukemia",
    "lymphoid leukemia",
    "myeloid leukemia",
    "chronic leukemia",
    "acute leukemia",
    "complications from respiratory illness",
    "complications from a prolonged illness",
    "complications of respiratory illness",
    "infection from a chronic illness",
    "respiratory related illness",
    "following a long illness",
    "after prolonged illness",
    "after a lengthy illness",
    "kidney related illness",
    "after a short illness",
    "age related illnesses",
    "liver related illness",
    "after a brief illness",
    "after serious illness",
    "AIDS related illness",
    "after a long illness",
    "drug related illness",
    "degenerative illness",
    "after short illness",
    "respiratory illness",
    "after brief illness",
    "HIV related illness",
    "after long illness",
    "pulmonary illness",
    "prolonged illness",
    "lingering illness",
    "after an illness",
    "stomach illness",
    "kidney illness",
    "short illness",
    "heart illness",
    "liver illness",
    "brief illness",
    "after illness",
    "long illness",
    "lung illness",
    "malignant hypertension leading to kidney failure",
    "pulmonary failure as a complication of diabetes",
    "massive organ failure after being imprisoned",
    "respiratory failure due to diverticulitis",
    "dementia with complicating kidney failure",
    "liver failure as a result of hepatitis C",
    "complications from respiratory failure",
    "kidney failure after wrestling injury",
    "liver failure from substance abuse",
    "complications from kidney failure",
    "organ failure due to dengue fever",
    "liver failure after hunger strike",
    "complications from renal failure",
    "complications from liver failure",
    "euthanized after kidney failure",
    "complications of kidney failure",
    "complications of organ failure",
    "chronic respiratory failure",
    "hypoxic respiratory failure",
    "power failure to iron lung",
    "cardio respiratory failure",
    "cardiorespiratory failure",
    "acute respiratory failure",
    "left ventricular failure",
    "suspected kidney failure",
    "cardiopulmonary failure",
    "multiple organ failure",
    "respiratory failure",
    "bone marrow failure",
    "acute renal failure",
    "pulmonary failure",
    "failure to thrive",
    "bronchial failure",
    "kidney failure",
    "renal failure",
    "liver failure",
    "organ failure",
    "lung failure",
    "complications from injuries sustained in traffic accident",
    "from injuries sustained in a motorcycle accident",
    "head injuries resulting from domestic accident",
    "fatal neck injury sustained in an accident in",
    "injuries sustained in campdrafting accident",
    "injuries sustained in a motorcycle accident",
    "complications following a traffic accident",
    "injuries sustained in skydiving accident",
    "killed in an accident during a tire test",
    "injuries sustained in a jetboat accident",
    "complications following skiing accident",
    "diffuse axonal injury from car accident",
    "complications after a bicycle accident",
    "injuries from a motor scooter accident",
    "injuries sustained in bicycle accident",
    "accidental prescription drug overdose",
    "accidental combined drug intoxication",
    "injuries sustained in skiing accident",
    "injuries sustained in a car accident",
    "injuries sustained from car accident",
    "accidental overdose of pain killers",
    "accidental overdose of hydrocodone",
    "complications from skiing accident",
    "consequences of a vehicle accident",
    "accidental overdose of barbiturate",
    "injuries sustained in an accident",
    "accidental hydromorphone overdose",
    "complications from a car accident",
    "apparent accidental drug overdose",
    "head injury from bicycle accident",
    "complications after car accident",
    "complications from ATV accident",
    "accidental overdose of alcohol",
    "injuries sustained in accident",
    "accidental medication overdose",
    "injuries from traffic accident",
    "complications of car accident",
    "injuries from accidental fall",
    "accidental alcohol poisoning",
    "accidental fentanyl overdose",
    "accidental cocaine overdose",
    "accidental heroin overdose",
    "motorcycle stunt accident",
    "Formula Two race accident",
    "accidental drug overdose",
    "ski accident in the Alps",
    "accident during practice",
    "hot air balloon accident",
    "mountaineering accident",
    "accidental asphyxiation",
    "rock climbing accident",
    "swimming pool accident",
    "accidental head trauma",
    "conveyor belt accident",
    "drunk driving accident",
    "accidental suffocation",
    "BASE jumping accident",
    "scuba diving accident",
    "hang gliding accident",
    "falling accident on K",
    "kiteboarding accident",
    "team chasing accident",
    "accidental impalement",
    "road traffic accident",
    "work related accident",
    "base jumping accident",
    "free diving accident",
    "paragliding accident",
    "agriculture accident",
    "motorcycle accident",
    "automobile accident",
    "accidental overdose",
    "accidental drowning",
    "industrial accident",
    "accidental fentanyl",
    "helicopter accident",
    "snowmobile accident",
    "gyrocopter accident",
    "single car accident",
    "automotive accident",
    "skydiving accident",
    "accidental choking",
    "workplace accident",
    "submarine accident",
    "vehicular accident",
    "wrestling accident",
    "rally car accident",
    "climbing accident",
    "domestic accident",
    "aviation accident",
    "drowning accident",
    "swimming accident",
    "railroad accident",
    "air show accident",
    "traffic accident",
    "boating accident",
    "bicycle accident",
    "cycling accident",
    "farming accident",
    "hunting accident",
    "tractor accident",
    "jet ski accident",
    "fishing accident",
    "paddock accident",
    "choking accident",
    "cricket accident",
    "surfing accident",
    "gliding accident",
    "jumping accident",
    "scooter accident",
    "racing accident",
    "accidental fall",
    "skiing accident",
    "hiking accident",
    "diving accident",
    "marine accident",
    "accidental drug",
    "motor accident",
    "crane accident",
    "road accident",
    "race accident",
    "auto accident",
    "fire accident",
    "car accident",
    "ATV accident",
    "bus accident",
    "gun accident",
    "murdered while attempting to arrest an Islamic terrorist",
    "murdered during the Melbourne gangland killings",
    "murdered by the United Self Defense Forces of",
    "murdered by the Loyalist Volunteer Force",
    "murdered by serial killer Andrew Cunanan",
    "murdered during a voyage on a catamaran",
    "murdered while working at McDonald",
    "murdered along with his family",
    "murdered by a Hutu rebel group",
    "murdered by organized crime",
    "murdered in line of duty",
    "murdered during burglary",
    "murdered during robbery",
    "murdered in a shooting",
    "murdered in Jasenovac",
    "murdered around May",
    "murdered in prison",
    "murdered in Anapu",
    "murdered by a mob",
    "murdered abroad",
    "complications following neck injuries from a fall",
    "complications of a brain injury after a fall",
    "complications from surgery following a fall",
    "blunt force trauma after fall from vehicle",
    "complications of a head injury from fall",
    "brain injury from fall during beating",
    "complications of injuries from a fall",
    "brain injury from competition fall",
    "injuries sustained in racing fall",
    "surgical complications after fall",
    "cerebral hemorrhage after falling",
    "head injuries sustained in a fall",
    "fall when climbing Mount Everest",
    "cerebral hemorrhage from a fall",
    "head trauma from treadmill fall",
    "complications following a fall",
    "injuries sustained from a fall",
    "head injuries following a fall",
    "cerebral hemorrhage after fall",
    "fall from a piece of equipment",
    "neck injury from bicycle fall",
    "injuries from accidental fall",
    "injuries sustained in a fall",
    "injuries sustained from fall",
    "brain hemorrhage after fall",
    "fall during Paris Nice race",
    "complications after a fall",
    "fall from apartment window",
    "euthanized after race fall",
    "euthanised after race fall",
    "complications from a fall",
    "head injuries from a fall",
    "complications after fall",
    "broken neck after a fall",
    "crushed by falling horse",
    "brain injury from a fall",
    "complications of a fall",
    "complications from fall",
    "head injuries from fall",
    "struck by falling tree",
    "head injury from fall",
    "complications of fall",
    "falling accident on K",
    "injuries from a fall",
    "fall from a building",
    "trauma due to a fall",
    "fall on Nanga Parbat",
    "hit by falling tree",
    "fall while training",
    "fall from a bicycle",
    "fall from building",
    "injuries from fall",
    "fall from mountain",
    "injury from a fall",
    "fall from balcony",
    "rockclimbing fall",
    "fall from a cliff",
    "fall from window",
    "accidental fall",
    "fall from horse",
    "fall from cliff",
    "fall from roof",
    "fall from boat",
    "climbing fall",
    "falling tree",
    "fall at home",
    "rock fall",
    "lung infection while battling a bone lymphoma",
    "lymphoma due to chronic active EBV infection",
    "complications of non Hodgkin lymphoma",
    "complications from B cell lymphoma",
    "central nervous system lymphoma",
    "non Hodgkin follicular lymphoma",
    "chronic lymphocytic lymphoma",
    "T cell lymfoblastic lymphoma",
    "complications from lymphoma",
    "from lymphoma complications",
    "complications of lymphoma",
    "cutaneous B cell lymphoma",
    "T lymphoblastic lymphoma",
    "lymphoma of brain cells",
    "non Hodgkin’s lymphoma",
    "AIDS induced lymphoma",
    "non Hodgkin lymphoma",
    "mantle cell lymphoma",
    "Non hodgkin lymphoma",
    "Non Hodgkin lymphoma",
    "malignant lymphoma",
    "cerebral lymphoma",
    "Hodgkin lymphoma",
    "Burkitt lymphoma",
    "T cell lymphoma",
    "B cell lymphoma",
    "lymphoma",
    "stabbed in the chest by a stingray barb",
    "stabbed during Westminster attack",
    "stabbed to death by his son",
    "stabbed to death",
    "stabbed",
    "injuries sustained in the Lokomotiv Yaroslavl plane crash",
    "crash during race practice for the Superbike Championship",
    "drowned in helicopter crash while filming",
    "head injuries sustained in a car crash",
    "victim of the Martinsville plane crash",
    "injuries sustained in a plane crash",
    "injuries sustained in a car crash",
    "after crashing at Suzuka on April",
    "injuries sustained in race crash",
    "bicycle crash during the Tour de",
    "head injuries from a race crash",
    "plane crash of Swissair Flight",
    "brain injuries from race crash",
    "survivor of LaMia Flight crash",
    "victim of the City plane crash",
    "head injury from car crash",
    "injuries from a race crash",
    "crash during Daytona race",
    "injuries from a car crash",
    "car crash in Menlo Park",
    "Farnborough air crash",
    "crash during practice",
    "survivor of crash",
    "paragliding crash",
    "competition crash",
    "helicopter crash",
    "motorcycle crash",
    "spaceplane crash",
    "snowmobile crash",
    "automobile crash",
    "racetrack crash",
    "airplane crash",
    "wingsuit crash",
    "training crash",
    "aircraft crash",
    "bicycle crash",
    "airshow crash",
    "jet car crash",
    "glider crash",
    "rocket crash",
    "racing crash",
    "train crash",
    "rally crash",
    "race crash",
    "car crash",
    "air crash",
    "bus crash",
    "aircrash",
]
cause_of_death = sorted(list(set(cause_of_death)), key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [271]:
# Hard-coding cause_of_death for entry for clarity of value
index = df[df["link"] == "https://en.wikipedia.org/wiki/Nikki_Catsouras"].index
df.loc[index, "cause_of_death"] = "car accident"

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [272]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

<IPython.core.display.Javascript object>

#### Extracting `known_for` Categories and `cause_of_death` Values from `info_3_0`

In [273]:
%%time

# Column to check
column = 'info_3_0'

# Start dataframe
dataframe = df[df[column].notna()]

# For loop to find cause in column and extract it to cause_of_death
for cause in cause_of_death:
    for index in dataframe.index:
        item = df.loc[index, column]
        if item:
            if cause in item:
                df.loc[index, 'cause_of_death'] = cause
                df.loc[index, column] = item.replace(cause, '').strip()
                
# For loop to find role in column and extract it as category
for category, category_lst in known_for_dict.items():
    for role in category_lst:
        for index in dataframe.index:
                item = df.loc[index, column]
                if item:
                    if role in item:
                        df.loc[index, category] = 1
                        df.loc[index, column] = item.replace(role, '').strip()

# Calculating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking number of cause_of_death values
print(f'There are {df["cause_of_death"].notna().sum()} values in cause_of_death column.\n')

There are 19841 values in cause_of_death column.

CPU times: total: 5min 19s
Wall time: 5min 19s


<IPython.core.display.Javascript object>

#### Checking Updated `num_categories` Value Counts

In [274]:
# Checking Updated num_categories Value Counts
df["num_categories"].value_counts()

1    87836
2     9956
3      207
0       42
Name: num_categories, dtype: int64

<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` and `cause_of_death` for the next iteration.

#### Finding `known_for` Roles and `cause_of_death` in `info_3_0`

In [276]:
# # Obtaining values for column and their counts
# roles_cause_list = df["info_3_0"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [657]:
# # Code to check each value
# roles_cause_list.pop()

<IPython.core.display.Javascript object>

In [658]:
# # Create specific_roles_cause_list for above popped value
# specific_roles_cause_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_3_0"].notna()].index
#             if "Minister" in df.loc[index, "info_3_0"]
#         ],
#         "info_3_0",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [642]:
# df.loc[
#     [
#         index
#         for index in df[df["info_3_0"].notna()].index
#         if "member of the National Assembly" in df.loc[index, "info_3_0"]
#         and df.loc[index, "politics_govt_law"] == 0
#     ],
#     :,
# ]

<IPython.core.display.Javascript object>

In [659]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_cause_list, key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [660]:
# # Example code to quick-check a specific entry
# df[df["info_3_0"] == "Minister without portfolio"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category and for `cause_of_death`

In [661]:
# Creating lists for each category and sorting by decreasing length and removing duplicates

politics_govt_law = [
    "co author of the Constitution of the Islamic Republic of",
    "co author of the Short Doyle Mental Health Act",
    "Prime Minister",
    "Senator",
    "deputy director general of the Secretariat of the Pacific Community",
    "deputy chairman of the Karnataka Legislative Council",
    "deputy chair of the Republican State Committee of",
    "federal deputy in the LXI Legislature of Congress",
    "deputy president of the Supreme Court of Appeal",
    "deputy chairman of Auckland Regional Council",
    "deputy speaker of the Constituent Assembly",
    "deputy governor of the Central Bank of",
    "deputy minister of internal affairs",
    "deputy minister of higher education",
    "deputy minister to the president",
    "deputy governor of Kaduna State",
    "first deputy prime minister",
    "deputy leader of Venstre",
    "deputy attorney general",
    "deputy prime minister",
    "deputy First Minister",
    "deputy premier of",
    "three term deputy",
    "four time deputy",
    "deputy minister",
    "deputy premier",
    "deputy of",
    "Deputy director general of the Civil Affairs Department of Hubei Province",
    "deputy director general of the Secretariat of the Pacific Community",
    "director of the Administrative Office of the Courts",
    "secretary director of Board of Film Classification",
    "director of the Military Intelligence Directorate",
    "director general of the Department of Scientific",
    "director of the Office of Legislative Services",
    "director of the State Taxation Administration",
    "director of the Industrial Areas Foundation",
    "director of National Bureau of Statistics",
    "director of the Government Pension Fund",
    "director of Centers for Disease Control",
    "director of Rainforest Action Network",
    "Liberal Party communications director",
    "director of the Office of Management",
    "director of Office of Net Assessment",
    "director of the Orangi Pilot Project",
    "director of the Max Planck Society",
    "director of the Patent",
    "director of nuclear",
    "director of USIA",
    "budget director",
    "Deputy Political Commissar of COSTIND",
    "Deputy Secretary of Veterans Affairs",
    "chairman of the Assembly of Experts",
    "member of the Assembly of Experts",
    "member of the House of Assembly",
    "member of the National Assembly",
    "senator\xa0 of the Parliament of La Rioja",
    "senator from North Carolina",
    "Minnesota state senator",
    "Tennessee state senator",
    "Arkansas state senator",
    "state senator since",
    "Ohio state senator",
    "New state senator",
    "former senator",
    "shadow senator",
    "state senator",
    "senator from",
    "senator",
    "first black woman to serve in the Senate of North Carolina",
    "minority leader of the Massachusetts Senate",
    "first openly gay member of Minnesota Senate",
    "minority leader of the West Virginia Senate",
    "longest serving member of the State Senate",
    "member of the Senate from until his death",
    "member of the Senate representing Western",
    "member of the Senate for the th District",
    "member of the Massachusetts State Senate",
    "member of the Senate for North Carolina",
    "member of the Senate from Massachusetts",
    "first woman to run for a Senate seat in",
    "member of the Connecticut State Senate",
    "member of the Senate for Delta Central",
    "member of the Washington State Senate",
    "member of the Wisconsin State Senate",
    "member of the Tennessee State Senate",
    "Member of the Minnesota State Senate",
    "member of the North Carolina Senate",
    "member of the Illinois State Senate",
    "member of the Delaware Senate since",
    "member of the Massachusetts Senate",
    "member of the West Virginia Senate",
    "member of the New Hampshire Senate",
    "president of the Utah State Senate",
    "member of the Senate from Michigan",
    "member of the Senate from Virginia",
    "member of the South Dakota Senate",
    "member of the North Dakota Senate",
    "member of the Oregon State Senate",
    "Member of the South Dakota Senate",
    "first Asian elected to the Senate",
    "member of the Connecticut Senate",
    "member of the Senate of Virginia",
    "member of the State Senate since",
    "member of the Washington Senate",
    "member of the Utah State Senate",
    "member of Illinois State Senate",
    "member of the Iowa Senate since",
    "member of the Ohio Senate since",
    "member of the Minnesota Senate",
    "member of the Tennessee Senate",
    "member of the Wisconsin Senate",
    "member of West Virginia Senate",
    "years in Maryland State Senate",
    "member of the Michigan Senate",
    "member of the Illinois Senate",
    "member of the Oklahoma Senate",
    "member of the Maryland Senate",
    "member of the Missouri Senate",
    "member of the Virginia Senate",
    "member of the Arkansas Senate",
    "member of the Colorado Senate",
    "parliamentarian of the Senate",
    "member of the Delaware Senate",
    "member of the Montana Senate",
    "member of the Indiana Senate",
    "member of the Wyoming Senate",
    "member of the Alabama Senate",
    "member of the Arizona Senate",
    "member of the Senate for New",
    "member of the Vermont Senate",
    "member of the Alaska Senate",
    "member of the Kansas Senate",
    "member of the Nevada Senate",
    "member of the Oregon Senate",
    "member of the Hawaii Senate",
    "member of the State Senate",
    "member of the Maine Senate",
    "member of the Idaho Senate",
    "Member of the State Senate",
    "member of the Ohio Senate",
    "member of the Iowa Senate",
    "Member of the Ohio Senate",
    "member of Virginia Senate",
    "member of the Utah Senate",
    "member of the New Senate",
    "member of the Senate of",
    "president of the Senate",
    "Chaplain of the Senate",
    "Speaker of the Senate",
    "member of the Senate",
    "Member of the Senate",
    "Clerk of the Senate",
    "member of Senate",
    "Maryland Senate",
    "Colorado Senate",
    "Alabama Senate",
    "prime minister of the National Assembly",
    "spouse of the deputy prime minister",
    "prime minister of Kurdistan Region",
    "spouse of the prime minister",
    "first deputy prime minister",
    "former prime minister of",
    "prime minister of Crimea",
    "prime minister of Hesse",
    "prime minister of South",
    "deputy prime minister",
    "former prime minister",
    "acting prime minister",
    "prime minister",
    "ambassador to the United Nations for Special Political Affairs",
    "ambassador of the Sovereign Military Order of to St Lucia",
    "ambassador to during capture of Adolf Eichmann",
    "ambassador to the Democratic Republic of the",
    "former ambassador to the United Nations and",
    "ambassador to the Asian Development Bank",
    "ambassador to the United Nations",
    "first female Arab ambassador",
    "ambassador to Czechoslovakia",
    "ambassador to the UN Geneva",
    "ambassador to the Holy See",
    "ambassador to the Republic",
    "ambassador of South to the",
    "last ambassador of to the",
    "ambassador extraordinary",
    "ambassador to East y and",
    "ambassador to the Union",
    "and ambassador to the",
    "ambassador to The and",
    "ambassador to Arabia",
    "second ambassador to",
    "ambassador to West y",
    "ambassador to land",
    "ambassador to the",
    "ambassador to and",
    "and ambassador to",
    "ambassador to ia",
    "ambassador to s",
    "ambassador to y",
    "ambassador to",
    "UN ambassador",
    "ambassador",
    "patriarch of family of jazz musicians",
    "First Deputy Minister of Internal Affairs of the Union",
    "Minister of Agriculture in the Walloon Government",
    "Chairman of the Council of Ministers of the SSR",
    "Minister of the Central Office of Coordination",
    "National Authority Minister for Jewish Affairs",
    "Interior Minister for the Republic of Dagestan",
    "member of the Council of Ministers of East y",
    "Chief Minister of the North Central Province",
    "first post Communist Minister of Agriculture",
    "Finance Minister of the National Authority",
    "Chief Minister of the Northern Territory",
    "Minister for Foreign Affairs of the GDR",
    "Minister Plenipotentiary of the Estates",
    "Minister for International Cooperation",
    "first Deputy Chief Minister of Sarawak",
    "Minister of Human Resource Development",
    "Minister of Justice of Czechoslovakia",
    "Minister of State for Cabinet Affairs",
    "Minister of State for Foreign Affairs",
    "secretary of the Council of Ministers",
    "Minister of the Electronics Industry",
    "Minister of Foreign Affairs in exile",
    "Minister of the Department of Health",
    "Minister of Environmental Protection",
    "former Chief Minister of Johor state",
    "Minister of the Emirate for Aviation",
    "Chairman of the Council of Ministers",
    "Chief Minister of Arunachal Pradesh",
    "Federal Minister for Social Affairs",
    "three time Chief Minister of Kerala",
    "first female Minister of Transport",
    "New South Minister for Agriculture",
    "Minister of Cooperative Governance",
    "Minister of Environmental Affairs",
    "Minister of Parliamentary Affairs",
    "Minister for Economic Development",
    "Minister for Overseas Departments",
    "Nova Scotia Minister of Transport",
    "Chief Minister of Madhya Pradesh",
    "Vice Minister of Foreign Affairs",
    "Minister of Tourism for Columbia",
    "Minister of Northern Development",
    "State Minister for Legal Affairs",
    "Minister of Regional Integration",
    "Minister of Finance of the Union",
    "Minister of Agriculture of Lévis",
    "Chief Minister of Andhra Pradesh",
    "Minister of International Trade",
    "Finance Minister of Uttarakhand",
    "Chief Minister of Uttar Pradesh",
    "Federal Minister of Information",
    "Minister of Social Development",
    "Minister of Religious Services",
    "Foreign Minister of Kingdom of",
    "Minister of National Education",
    "Deputy Minister of Agriculture",
    "Minister of State for Railways",
    "Federal Minister for Economics",
    "Minister of External Relations",
    "Minister of Religious Affairs",
    "Chief Minister of Maharashtra",
    "Minister of Rural Development",
    "Chief Minister of West Bengal",
    "Minister of Economic Planning",
    "Québec Minister for Transport",
    "Deputy Minister for Transport",
    "Chief Minister of Uttarakhand",
    "Chief Minister of Balochistan",
    "Federal Minister of Education",
    "Deputy Minister for Hospitals",
    "Minister for Foreign Affairs",
    "Minister of Internal Affairs",
    "Minister of Local Government",
    "Chief Minister of Puducherry",
    "Minister of Economic Affairs",
    "Minister of Higher Education",
    "Minister of External Affairs",
    "Minister of Defence Industry",
    "Manitoba Minister of Finance",
    "Minister of Finance in Hesse",
    "Minister for the Environment",
    "Deputy Chief Minister of Goa",
    "Victorian Minister for Sport",
    "Minister of Buddhist Affairs",
    "Minister of Military Affairs",
    "Minister of People Education",
    "Minister of Foreign Affairs",
    "Minister of the Environment",
    "Chief Minister of Karnataka",
    "Chief Minister of Meghalaya",
    "Minister for Social Affairs",
    "Chief Minister of Gazankulu",
    "Minister for Administration",
    "former Minister of Industry",
    "Minister of Social Affairs",
    "Minister of Transportation",
    "Minister of Communications",
    "Minister of Public Service",
    "Minister without portfolio",
    "Minister of Civil Aviation",
    "Maharashtra Chief Minister",
    "Minister of Earth Sciences",
    "Federal Minister of Health",
    "Minister of Light Industry",
    "Deputy Minister of Culture",
    "State Minister of Industry",
    "former NWFP Chief Minister",
    "Chief Minister of Nagaland",
    "Minister of Domestic Trade",
    "Chief Minister of Manipur",
    "Chief Minister of Sarawak",
    "Chief Minister of Haryana",
    "Chief Minister of Gujarat",
    "Chief Minister of Tripura",
    "former Minister for Sport",
    "Minister of Communication",
    "Minister of Union Affairs",
    "Minister of Coal Industry",
    "Minister for Home Affairs",
    "Minister of the Fisheries",
    "Chief Minister of Mizoram",
    "Minister of the Interior",
    "Minister of Home Affairs",
    "Chief Minister of Odisha",
    "Chief Minister of Punjab",
    "Vice Minister of Tourism",
    "first Senior Minister of",
    "Chief Minister of Sikkim",
    "Deputy Chief Minister of",
    "Minister of Public Works",
    "Minister for Development",
    "Minister for the Economy",
    "Chief Minister of Kerala",
    "Chief Minister of Penang",
    "Sindh Minister of Sports",
    "former Interior Minister",
    "Minister for Agriculture",
    "Minister of Agriculture",
    "Minister of Environment",
    "Minister of Information",
    "Chief Minister of Assam",
    "Chief Minister of Bihar",
    "Chief Minister of Delhi",
    "Minister of Integration",
    "Minister of Immigration",
    "Chief Minister of Sabah",
    "Minister for Minorities",
    "Social Affairs Minister",
    "Transportation Minister",
    "Minister for Education",
    "Chief Minister of Nadu",
    "Minister of Presidency",
    "Minister for Economics",
    "Minister for Community",
    "Minister of Technology",
    "NSW Minister for Sport",
    "Minister of Education",
    "Minister of Transport",
    "Minister of Petroleum",
    "Minister of Fisheries",
    "deputy First Minister",
    "Minister of Economics",
    "Chief Minister of Goa",
    "Minister of Industry",
    "Minister of Interior",
    "Minister for Finance",
    "Minister of Commerce",
    "Minister of Railways",
    "Minister for Housing",
    "Minister of Children",
    "Information Minister",
    "Agriculture Minister",
    "Minister of Forestry",
    "Interior Minister of",
    "Minister of Finance",
    "Minister of Justice",
    "Minister of Culture",
    "Minister of Economy",
    "Minister of Tourism",
    "Minister for Health",
    "Minister of Science",
    "Minister of Affairs",
    "Minister of Housing",
    "Minister for Energy",
    "Minister of Primary",
    "Minister of culture",
    "Minister of Health",
    "Minister of Energy",
    "Minister of Labour",
    "Minister of Gender",
    "Minister of Supply",
    "Petroleum Minister",
    "Interior Minister",
    "Minister of State",
    "Minister of Trade",
    "Chief Minister of",
    "Minister of Awqaf",
    "Minister of Labor",
    "Minister of Works",
    "Minister of Steel",
    "Minister of Mines",
    "Minister of Taxes",
    "Minister of Posts",
    "Minister of Youth",
    "Commerce Minister",
    "Minister of Lands",
    "Minister of Water",
    "Foreign Minister",
    "Finance Minister",
    "Cabinet Minister",
    "Minister of Food",
    "Minister of Land",
    "Minister for Air",
    "Tourism Minister",
    "Minister of Law",
    "Health Minister",
    "Minister of Oil",
    "Prime Minister",
    "Chief Minister",
    "State Minister",
    "Home Minister",
    "Law Minister",
    "Tax Minister",
]
politics_govt_law = sorted(
    list(set(politics_govt_law)), key=lambda x: len(x), reverse=True
)

arts = [
    "inadvertently alerted authorities to the Profumo affair",
    'and author of the enduring song "Bésame mucho"',
    'author of the "Surudi Milli" national anthem',
    "author of definitive work on Pride Purge",
    "author of atlases of fictional worlds",
    "author of book about The Soccer War",
    "author on ancient maritime history",
    'author of "Last Exit to Brooklyn"',
    'author of "The Cone Gatherers"',
    "author of over children books",
    "prize winning author",
    "non fiction author",
    "game author",
    "and author",
    "journalist",
    "son of The O'Jays lead singer Eddie Levert",
    "singer",
    "actor",
    "director general of GEM TV",
    "BBC first female producer",
    "producer of The Chiffons",
    "Oscar winning producer",
    "television producer",
    "theatrical producer",
    "audio book producer",
    "and record producer",
    "CBS News producer",
    "record producer",
    "music producer",
    "radio producer",
    "opera producer",
    "film producer",
    "and producer",
    "TV producer",
    "producer",
    "known for the Jaz Parks series of fantasy novels",
    "television adaptation of novel:",
    "wrote first romance novel",
    "detective novels",
    "fantasy novelist",
    "wrote novels",
    "novelist",
    "novels",
    'considered "the most beautiful face in the history of cinema"',
    "vocal coach",
]
arts = sorted(list(set(arts)), key=lambda x: len(x), reverse=True)

sports = [
    "President of the International Gymnastics Federation",
    "President of the International Boxing Association",
    "President of the International Cricket Council",
    "President of International Cricket Council",
    "President of the Marylebone Cricket Club",
    "President of the Basketball Association",
    "President of World Taekwondo Federation",
    "President of the Athletics Association",
    "President of the Athletic Association",
    "President of the Paralympic Committee",
    "President of the World Boxing Council",
    "President of the Football Federation",
    "President of Ligue corse de football",
    "President of the All Judo Federation",
    "President of the Archery Federation",
    "President of Minor League Baseball",
    "President of the Hockey League",
    "President of Neuchâtel Xamax",
    "President of Real Madrid",
    "President of Bundesliga",
    "President of CONMEBOL",
    "President of the NZRU",
    "President of the UIAA",
    "President of the LPGA",
    "President of the UCI",
    "President of the NHL",
    "President of Avaí FC",
    "President of Auxerre",
    "President of FIFA",
    "President of FIDE",
    "President of CFA",
    "President of AFA",
    "football director",
    "four time Grand Slam tournament singles champion",
    "seven time NASCAR Whelen Modified Tour champion",
    "member of Aston Villa championship winning team",
    "four time Iditarod Trail Sled Dog Race champion",
    "coached five teams to national championships",
    "Muay thai world championship silver medalist",
    "NASCAR Busch Grand National Series champion",
    "two time NASCAR car owner points champion",
    "member of World Series championship teams",
    "WBC youth silver featherweight champion",
    "Amateur Athletic Union hurdle champion",
    "three time major championship winner",
    "two time welterweight world champion",
    "world champion in motor paced racing",
    "two time middleweight world champion",
    "WIBF world light flyweight champion",
    "WBA WBC world welterweight champion",
    "world championship silver medalist",
    "world championship bronze medalist",
    "world amateur heavyweight champion",
    "former world bantamweight champion",
    "six time Grand Prix world champion",
    "triple Formula One world champion",
    "WBA junior welterweight champion",
    "WBA super featherweight champion",
    "undisputed middleweight champion",
    "NABF super welterweight champion",
    "world light heavyweight champion",
    "WBC light middleweight champion",
    "WBC light welterweight champion",
    "WBA Light Middleweight champion",
    "four time ECW tag team champion",
    "four time Stanley Cup champion",
    "WBC super lightweight champion",
    "Winston Racing Series champion",
    "five time Stanley Cup champion",
    "female UCI Road World champion",
    "eight time Muay world champion",
    "two time Indianapolis champion",
    "WBA WBC featherweight champion",
    "six time Stanley Cup champion",
    "four time Paralympic champion",
    "championship bronze medalist",
    "national individual champion",
    "half marathon world champion",
    "track pursuit world champion",
    "middleweight boxing champion",
    "five times national champion",
    "UCI World Road Race champion",
    "championship silver medalist",
    "Commonwealth Games champion",
    "world welterweight champion",
    "super middleweight champion",
    "K World Grand Prix champion",
    "world bantamweight champion",
    "national road race champion",
    "four time national champion",
    "Tournament Players champion",
    "Formula One world champion",
    "four weight world champion",
    "WBA featherweight champion",
    "world cyclo cross champion",
    "first female SCCA champion",
    "time Stanley Cup champion",
    "three time chess champion",
    "Formula TT world champion",
    "and Commonwealth champion",
    "National doubles champion",
    "world championship winner",
    "three time world champion",
    "world championship silver",
    "WBC middleweight champion",
    "undisputed world champion",
    "WBA bantamweight champion",
    "five time world champion",
    "world flyweight champion",
    "nine time world champion",
    "WBA heavyweight champion",
    "Greco wrestling champion",
    "six time world champion",
    "two time world champion",
    "championed LCD displays",
    "WBA flyweight champion",
    "Formula Three champion",
    "time national champion",
    "WBA IBF world champion",
    "World Juniors champion",
    "triples world champion",
    "Commonwealth champion",
    "middleweight champion",
    "Titleholders champion",
    "ten time NBA champion",
    "welterweight champion",
    "bantamweight champion",
    "world junior champion",
    "Keirin world champion",
    "Labatt Brier champion",
    "Asian Games champion",
    "heavyweight champion",
    "world rally champion",
    "world youth champion",
    "women chess champion",
    "South Games champion",
    "Paralympic champion",
    "Grand Prix champion",
    "road world champion",
    "USSR chess champion",
    "GP Series champion",
    "Wimbledon champion",
    "four time champion",
    "flyweight champion",
    "Badminton champion",
    "high jump champion",
    "Pan Games champion",
    "national champion",
    "PGA Tour champion",
    "six time champion",
    "amateur champion",
    "champion of USSR",
    "indoor champion",
    "world champion",
    "Rally champion",
    "Guksu champion",
    "World champion",
    "South champion",
    "Open champion",
    "team champion",
    "PGA champion",
    "WBA champion",
    "WBC champion",
    "champion",
    "electronic musician",
    "session musician",
    "folk musician",
    "retired in as the winningest high school football coach in the nation",
    "head coach for eight different NHL teams from to",
    "interim head coach at UCLA for three games in",
    "coached five teams to national championships",
    "head coach of New Hampshire Wildcats from to",
    "first Division I college basketball coach",
    "coached national team to only World Cup",
    "former national football team coach",
    "father of ers coach Mike Nolan",
    "former NFL head coach",
    "and MLB hitting coach",
    "coach in three sports",
    "and athletics coach",
    "goalkeeping coach",
    "football coach",
    "pitching coach",
    "and coach",
]
sports = sorted(list(set(sports)), key=lambda x: len(x), reverse=True)

sciences = [
    "one of the world foremost authorities on",
    "father of islet cell transplantation for treatment of Type I diabetes",
    "contributed to the understanding of neurotransmitters in the brain",
    "discovered further role of LIF in brain functioning",
    "Nobel Prize winner in physics for the development of neutron spectroscopy",
    "recipient of the Nobel Prize for Physiology or Medicine",
    "recipient of the Nobel Prize in Physiology or Medicine",
    "laureate of the Nobel Prize in Physiology or Medicine",
    "Nobel Prize laureate in Physiology or Medicine",
    "and co winner of the Nobel Prize for Chemistry",
    "recipient of the Nobel Prize in Chemistry",
    "Nobel laureate in Physiology or Medicine",
    "laureate of the Nobel Prize in Chemistry",
    "recipient of the Nobel Prize in Physics",
    "announced as Nobel Laureate in Medicine",
    "laureate of the Nobel Prize in Physics",
    "laureate of Nobel Prize in Chemistry",
    "winner of the Nobel Prize in Physics",
    "winner of Nobel Prize for Physics",
    "winner of Nobel Prize in Physics",
    "chairman of the Nobel Committee",
    "Nobel Prize laureate in Physics",
    "Nobel Prize winner in chemistry",
    "Nobel laureate for work on DNA",
    "won Nobel Prize in Physics in",
    "Nobel laureate in Chemistry",
    "Nobel Laureate in Chemistry",
    "Nobel laureate in medicine",
    "Nobel Laureate in Medicine",
    "Nobel laureate in physics",
    "Nobel Prize in Medicine",
    "Nobel Prize in Physics",
    "co creator of the Saffir Simpson Hurricane Scale;",
    "and a pioneer in organ transplant surgery",
    "father of modern orthognathic surgery",
    "specialist in Tommy John surgery",
    "invented Tommy John surgery",
    "pioneer of heart surgery",
]
sciences = sorted(list(set(sciences)), key=lambda x: len(x), reverse=True)

business_farming = [
    "chief executive",
    "corporate director",
]
business_farming = sorted(
    list(set(business_farming)), key=lambda x: len(x), reverse=True
)

academia_humanities = [
    "co author of the broken windows theory",
    "President of State University",
    "deputy party secretary of Tsinghua University",
    "director of the Nehru Memorial Museum & Library",
    "director of the Revolution Museum",
    "director general of ITER",
    "helped to establish social history as a major area of study",
    "architectural historian",
    "historian of ",
    "Holocaust historian",
    "literary historian",
    "military historian",
    "historian of ideas",
    "social historian",
    "media historian",
    "oral historian",
    "art historian",
    "historian",
    "academician of the Academy of Engineering",
    "academician of the Academy of Sciences",
    "one of leading political academics",
    "academic",
]
academia_humanities = sorted(
    list(set(academia_humanities)), key=lambda x: len(x), reverse=True
)

law_enf_military_operator = [
    "executioner of Che Guevara",
    "deputy Minister of Defense",
    "director of General Intelligence Directorate",
    "director of the National Security Service",
    "director general of the Civil Guard",
    "director general of ASIO",
    "director general of MI",
    "Deputy Chief of Staff of the People Liberation Army",
    "Deputy Chief Constable of Greater Manchester Police",
    "Deputy Commissioner of Police of the Metropolis",
    "Deputy Director of the National Security Agency",
    "Deputy Supreme Allied Commander Atlantic",
    "Deputy Commander in Chief Strike Command",
    "Deputy Commander of RAF Strike Command",
    "Deputy Inspector General of the Army",
    "Deputy Chief of the Defence Staff",
    "Deputy Supreme Allied Commander",
    "Deputy Director of the NSA",
    "Deputy Minister of Defence",
    "Deputy Chief of Defence",
    "Vice Minister of the State Security Department",
    "last Defence Minister of Czechoslovakia",
    "Minister of Narcotics Control",
    "Minister for National Defense",
    "Minister of National Security",
    "Minister of National Defence",
    "Deputy Minister of Defence",
    "deputy Minister of Defense",
    "Minister of Defence",
    "Minister of Defense",
    "Defence Minister",
    "East Minister of National Defence",
]
law_enf_military_operator = sorted(
    list(set(law_enf_military_operator)), key=lambda x: len(x), reverse=True
)

spiritual = [
    "moderator of the General Assembly of the Presbyterian Church of Aotearoa",
    "Moderator of the General Assembly of the Church of",
    "moderator of the General Assembly",
    "Minister General of the Order of Friars Minor",
]
spiritual = sorted(list(set(spiritual)), key=lambda x: len(x), reverse=True)

social = [
    "national legal director for the Atheists",
]
social = sorted(list(set(social)), key=lambda x: len(x), reverse=True)

crime = [
    "for ordering a contract killing",
    " for killing people in",
    "suspected murderer of prime minister Olof Palme",
]
crime = sorted(list(set(crime)), key=lambda x: len(x), reverse=True)

event_record_other = [
    "subject of international campaign against her execution",
    "protagonist in airplane hijacking",
    "had one of the most studied human brains",
    "subject of brain science study",
]
event_record_other = sorted(
    list(set(event_record_other)), key=lambda x: len(x), reverse=True
)

other_species = []
other_species = sorted(list(set(other_species)), key=lambda x: len(x), reverse=True)

cause_of_death = [
    "leukemia",
    "executed by lethal injection",
    "executed by electric chair",
    "executed by lethal injection",
    "executed in the using pentobarbital",
    "executed by lethal injection in Washington",
    "executed by lethal injection in Maryland",
    "executed by lethal injection in Arkansas",
    "executed by lethal injection in Oklahoma",
    "executed by lethal injection in Virginia",
    "executed by electric chair in Alabama",
    "executed in San Quentin State Prison",
    "execution in by lethal injection",
    "executed by lethal injection in",
    "execution by lethal injection",
    "executed by lethal injection",
    "execution by electric chair",
    "execution by electrocution",
    "executed by electric chair",
    "execution by firing squad",
    "executed by firing squad",
    "executed by the Taliban",
    "execution by beheading",
    "executed by beheading",
    "execution by shooting",
    "execution by hanging",
    "executed in Arkansas",
    "executed in Maryland",
    "executed in Missouri",
    "executed by hanging",
    "executed in Indiana",
    "summary execution",
    "executed in",
    "executed",
    "murdered",
    "fall",
    "plane crash",
    "complications from perforated ulcer surgery",
    "complications from heart surgery",
    "complications from brain surgery",
    "complications from stab wounds",
    "complications of a broken hip",
    "complications from dementia",
    "complications after surgery",
    "AIDS related complications",
    "complications of diabetes",
    "age related complications",
    "complications of dementia",
    "complications of surgery",
    "pulmonary complications",
    "desmoplastic small round cell tumor",
    "complications from a brain tumor",
    "complications from tumor surgery",
    "gastrointestinal stromal tumor",
    "complications of brain tumor",
    "bleeding from spinal tumor",
    "brain tumor complications",
    "malignant brain tumor",
    "benign brain tumors",
    "Krukenberg tumor",
    "malignant tumor",
    "spinal tumor",
    "brain tumor",
    "chest tumor",
    "tumor",
    "complications from amyotrophic lateral sclerosis",
    "complications from amytrophic lateral sclerosis",
    "complications from multiple sclerosis",
    "amyotrophic lateral sclerosis",
    "cerebral arteriosclerosis",
    "primary lateral sclerosis",
    "coronary atherosclerosis",
    "multiple sclerosis",
    "arterial sclerosis",
    "arteriosclerosis",
    "atherosclerosis",
    "multiple myeloma",
    "myeloma",
    "drowning due to alcohol intoxication",
    "drowning as a result of a seizure",
    "apparent drowning",
    "presumed drowned",
    "drowning",
    "drowned",
    "euthanised due to complications from colic surgery",
    "euthanized after complications from colic",
    "euthanasia after contracting laminitis",
    "euthanised following a twisted bowel",
    "euthanized following laminitis",
    "euthanized after race injury",
    "voluntary euthanasia",
    "passive euthanasia",
    "euthanasied",
    "euthanized",
    "euthanised",
    "euthanasia",
    "pulmonary emphysema",
    "advanced emphysema",
    "emphysema",
    "complications from frontotemporal dementia",
    "complications from Lewy body dementia",
    "complications from vascular dementia",
    "complications of Lewy body dementia",
    "traumatic dementia encephalophathy",
    "complications of vascular dementia",
    "complications from dementia",
    "complications of dementia",
    "frontotemporal dementia",
    "Lewy body dementia",
    "vascular dementia",
    "senile dementia",
    "dementia",
    "complications from melanoma",
    "metastatic melanoma",
    "malignant melanoma",
    "ocular melanoma",
    "melanoma",
    "post operative multiple cerebral infarction",
    "complications from intracerebral hemorrhage",
    "complications from a cerebral hemorrhage",
    "cerebral infarction during back surgery",
    "complications from cerebral hemorrhage",
    "cerebral amyloid angiopathy",
    "massive cerebral hemorrhage",
    "ruptured cerebral aneurysm",
    "cerebral arteriosclerosis",
    "intracerebral hemorrhage",
    "post cerebral aneurysm",
    "intracerebral bleeding",
    "cerebral haemorrhage",
    "cerebral hemorrhage",
    "cerebral infarction",
    "cerebral thrombosis",
    "cerebral contusion",
    "cerebral aneurysm",
    "cerebral bleeding",
    "cerebral embolism",
    "cerebral disorder",
    "cerebral malaria",
    "cerebral atrophy",
    "cerebral infarct",
    "cerebral edema",
    "complications from a ruptured brain aneurysm",
    "complications of boxing induced brain damage",
    "complications from ruptured brain aneurysm",
    "complications of traumatic brain injury",
    "complications from a brain haemorrhage",
    "complications following brain surgery",
    "complications from a brain infection",
    "complications from brain haemorrhage",
    "complications from a brain aneurysm",
    "complications from brain hemorrhage",
    "brain injury sustained during match",
    "brain injury sustained during fight",
    "complications from a brain tumour",
    "complications from brain surgery",
    "complications from a brain tumor",
    "brain injuries sustained in bout",
    "complications from brain injury",
    "intracranial brain hemorrhage",
    "complications of brain injury",
    "complications of brain tumor",
    "brain ischemia during game",
    "cystic growth on the brain",
    "brain tumor complications",
    "complications from brain",
    "massive brain hemorrhage",
    "astrocytoma brain tumour",
    "ruptured brain aneurysm",
    "malignant brain tumour",
    "malignant brain tumor",
    "benign brain tumors",
    "brain inflammation",
    "brain haemorrhage",
    "brain hemorrhage",
    "brain infection",
    "brain aneurysm",
    "brain aneurism",
    "brain thrombus",
    "brain injuries",
    "brain seizure",
    "brain hypoxia",
    "brain tumour",
    "brain injury",
    "brain trauma",
    "brain tumor",
    "brain virus",
    "complications from pulmonary embolism",
    "suspected pulmonary embolism",
    "pulmonary embolism",
    "suspected embolism",
    "coronary embolism",
    "cerebral embolism",
    "embolism",
    "pulmonary edema while climbing Annapurna",
    "pulmonary aspiration from drug overdose",
    "complications from pulmonary emphysema",
    "complications from pulmonary fibrosis",
    "complications from pulmonary embolism",
    "complications from pulmonary edema",
    "idiopathic pulmonary hypertension",
    "idiopathic pulmonary fibrosis",
    "suspected pulmonary embolism",
    "pulmonary artery dissection",
    "pulmonary complications",
    "pulmonary calcification",
    "cardiopulmonary arrest",
    "pulmonary hypertension",
    "acute pulmonary oedema",
    "pulmonary thrombosis",
    "pulmonary congestion",
    "pulmonary emphysema",
    "pulmonary infection",
    "pulmonary embolism",
    "pulmonary fibrosis",
    "pulmonary disorder",
    "pulmonary distress",
    "pulmonary edema",
    "pulmonary",
    "complications from surgery",
    "complications of surgery for artificial hip infection",
    "infection following heart valve replacement surgery",
    "euthanised due to complications from colic surgery",
    "complications following abdominal aneurysm surgery",
    "complications from coronary artery bypass surgery",
    "complications of neck surgery for osteoarthritis",
    "complications following lung transplant surgery",
    "peritonitis after undergoing surgery from colic",
    "complications during liver transplant surgery",
    "complications from breast enlargement surgery",
    "complications following gall bladder surgery",
    "complications following heart bypass surgery",
    "complications from gastrointestinal surgery",
    "infection following hip replacement surgery",
    "complications after hip replacement surgery",
    "complications from knee replacement surgery",
    "complications from perforated ulcer surgery",
    "complications of paralysis related surgery",
    "complications following open heart surgery",
    "complications of liver transplant surgery",
    "complications from gastric bypass surgery",
    "complications during gall bladder surgery",
    "complications from hiatal hernia surgery",
    "complications following shoulder surgery",
    "complications of hip replacement surgery",
    "complications relating to heart surgery",
    "cerebral infarction during back surgery",
    "bacterial infection after heart surgery",
    "complications after intestinal surgery",
    "complications from heart valve surgery",
    "complications after open heart surgery",
    "complications following throat surgery",
    "complications from gallbladder surgery",
    "complications following heart surgery",
    "complications from open heart surgery",
    "complications of gall bladder surgery",
    "complication after open heart surgery",
    "complications following brain surgery",
    "complications from intestinal surgery",
    "blood clot from surgery complications",
    "complications during cosmetic surgery",
    "following surgery for aortic aneurysm",
    "complications from bariatric surgery",
    "complications from bile duct surgery",
    "from complications following surgery",
    "complications from abdominal surgery",
    "complications following hip surgery",
    "complications from cosmetic surgery",
    "complications after plastic surgery",
    "complications following leg surgery",
    "complications from elective surgery",
    "complications of intestinal surgery",
    "complications from prostate surgery",
    "complications during heart surgery",
    "complications from stomach surgery",
    "complications after spleen surgery",
    "complications from thyroid surgery",
    "complications from hernia surgery",
    "complications from kidney surgery",
    "complications after heart surgery",
    "complications from bypass surgery",
    "haemorrhage during hernia surgery",
    "complications from aortic surgery",
    "complications from heart surgery",
    "complications from brain surgery",
    "complications from liver surgery",
    "complications from tumor surgery",
    "open heart surgery complications",
    "complications from colon surgery",
    "complications following surgery",
    "complications from back surgery",
    "complications from lung surgery",
    "complications of spinal surgery",
    "complications from knee surgery",
    "complication from heart surgery",
    "complications from hip surgery",
    "complications of heart surgery",
    "during emergency heart surgery",
    "complication following surgery",
    "blood clot after heart surgery",
    "complications of back surgery",
    "complications during surgery",
    "complications of hip surgery",
    "following open heart surgery",
    "complications after surgery",
    "complication during surgery",
    "complications from surgery",
    "complications of surgery",
    "post surgery haemorrhage",
    "following heart surgery",
    "surgery complications",
    "after heart surgery",
    "hip surgery",
]
cause_of_death = sorted(list(set(cause_of_death)), key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [662]:
# Dropping info_3_0 value for entries with redundant value
df.loc[
    [
        index
        for index in df[df["info_3_0"].notna()].index
        if df.loc[index, "info_3_0"] == "executive"
    ],
    "info_3_0",
] = ""

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [663]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

<IPython.core.display.Javascript object>

#### Extracting `known_for` Categories and `cause_of_death` Values from `info_3_0`

In [664]:
%%time

# Column to check
column = 'info_3_0'

# Start dataframe
dataframe = df[df[column].notna()]

# For loop to find cause in column and extract it to cause_of_death
for cause in cause_of_death:
    for index in dataframe.index:
        item = df.loc[index, column]
        if item:
            if cause in item:
                df.loc[index, 'cause_of_death'] = cause
                df.loc[index, column] = item.replace(cause, '').strip()
                
# For loop to find role in column and extract it as category
for category, category_lst in known_for_dict.items():
    for role in category_lst:
        for index in dataframe.index:
                item = df.loc[index, column]
                if item:
                    if role in item:
                        df.loc[index, category] = 1
                        df.loc[index, column] = item.replace(role, '').strip()

# Calculating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking number of cause_of_death values
print(f'There are {df["cause_of_death"].notna().sum()} values in cause_of_death column.\n')

There are 22463 values in cause_of_death column.

CPU times: total: 4min 50s
Wall time: 4min 50s


<IPython.core.display.Javascript object>

#### Checking Updated `num_categories` Value Counts

In [665]:
# Checking Updated num_categories Value Counts
df["num_categories"].value_counts()

1    87550
2    10231
3      221
0       39
Name: num_categories, dtype: int64

<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` and `cause_of_death` for the next iteration.

#### Finding `known_for` Roles and `cause_of_death` in `info_3_0`

In [667]:
# # Obtaining values for column and their counts
# roles_cause_list = df["info_3_0"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [1024]:
# # Code to check each value
# roles_cause_list.pop()

<IPython.core.display.Javascript object>

In [1025]:
# # Create specific_roles_cause_list for above popped value
# specific_roles_cause_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_3_0"].notna()].index
#             if "Governor" in df.loc[index, "info_3_0"]
#             and df.loc[index, "politics_govt_law"] == 1
#         ],
#         "info_3_0",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [1026]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_cause_list, key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [1027]:
# # Example code to quick-check a specific entry
# df[df["info_3_0"] == "Military Governor of Kano State"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category and for `cause_of_death`

In [1028]:
# Creating lists for each category and sorting by decreasing length and removing duplicates

politics_govt_law = [
    "Duchess of Schleswig Holstein",
    "Duchess of Medinaceli",
    "Duchess of York",
    "President of the Aquinnah Wampanoag of Gay Head",
    "deputy governor of the Reserve Bank of",
    "deputy secretary of commerce",
    "deputy secretary of health",
    "deputy press secretary",
    "national deputy",
    "Deputy Prosecutor of the International Criminal Tribunal for",
    "Deputy Chairman of the Federal Reserve Board",
    "former Deputy Speaker of the House of Lords",
    "Deputy Speaker of the National Assembly",
    "Deputy Speaker of the National Congress",
    "Deputy Governor of the Reserve Bank of",
    "Deputy Speaker of the House of Commons",
    "First Deputy Head of the Organization",
    "Deputy Speaker of the House of Lords",
    "Deputy Leader of the House of Lords",
    "First Deputy Premier of the Union",
    "Deputy Leader of the Labour Party",
    "Deputy National Security Advisor",
    "Deputy Mayor of Rio de Janeiro",
    "Deputy Administrator of NASA",
    "Deputy Premier of Queensland",
    "Deputy Premier of New South",
    "Deputy Clerk of the Closet",
    "Deputy Premier of Victoria",
    "Deputy Premier of Western",
    "National Assembly Deputy",
    "Deputy Attorney General",
    "Deputy Mayor of Tallinn",
    "Deputy Premier of South",
    "Deputy Mayor for Policy",
    "Deputy Chief of Staff",
    "Deputy of Congress",
    "first woman Deputy",
    "Deputy Governor",
    "Regional Deputy",
    "Deputy Mayor of",
    "Deputy Premier",
    "People Deputy",
    "Deputy elect",
    "member of the Legislative Assembly of Alberta for Clover Bar Fort Saskatchewan",
    "member of the House of Peoples of the Federal Assembly of Czechoslovakia",
    "member of the Legislative Assembly of Columbia for Burnaby Willingdon",
    "Leader of the Opposition in the Legislative Assembly of New Brunswick",
    "member of the Legislative Assembly of the Northwest Territories",
    "former member of the State Assembly representing Staten Island",
    "member of the Newfoundland House of Assembly for Port de Grave",
    "member of the New South Legislative Assembly for Campbelltown",
    "member of the New South Legislative Assembly for Tenterfield",
    "member of the Queensland Legislative Assembly for Chatsworth",
    "member of the New South Legislative Assembly for Parramatta",
    "member of the Queensland Legislative Assembly for Mooloolah",
    "member of the Queensland Legislative Assembly for Salisbury",
    "member of the Victorian Legislative Assembly for Northcote",
    "member of the New South Legislative Assembly for Pittwater",
    "member of the Victorian Legislative Assembly for Frankston",
    "member of the Wisconsin State Assembly for the th District",
    "member of the New South Legislative Assembly for Maroubra",
    "member of the New South Legislative Assembly for Vaucluse",
    "member of the Queensland Legislative Assembly for Gregory",
    "member of the New South Legislative Assembly for Woronora",
    "member of the New South Legislative Assembly for Gosford",
    "member of the Queensland Legislative Assembly for Ithaca",
    "member of the Western Legislative Assembly for Ballajura",
    "member of the New South Legislative Assembly for Lakemba",
    "first openly gay member of the Illinois General Assembly",
    "member of the New South Legislative Assembly for Yaralla",
    "member of the Victorian Legislative Assembly for Preston",
    "member of the Queensland Legislative Assembly for Keppel",
    "member of the Western Legislative Assembly for Kimberley",
    "Member of the Victorian Legislative Assembly for Balwyn",
    "member of the Legislative Assembly of for Ottawa—Vanier",
    "member of the Northern Territory Legislative Assembly",
    "member of the Capital Territory Legislative Assembly",
    "member of the Legislative Assembly of Kamchatka Krai",
    "member of the Legislative Assembly of Andhra Pradesh",
    "member of the Legislative Assembly of Manitoba since",
    "member of the New South Legislative Assembly from to",
    "member of the Virginia General Assembly for Roanoke",
    "member of the Legislative Assembly of Saskatchewan",
    "Speaker of the Madhya Pradesh Legislative Assembly",
    "first woman elected to the Legislative Assembly of",
    "member of the Legislative Assembly of for Yorkview",
    "Speaker of the Sarawak State Legislative Assembly",
    "member of the Madhya Pradesh Legislative Assembly",
    "Speaker of the Legislative Assembly of Queensland",
    "member of the NT Legislative Assembly for Arafura",
    "member of the Nova Scotia House of Assembly since",
    "member of the Legislative Assembly of for Lincoln",
    "member of the Legislative Assembly of Queensland",
    "member of the BC Legislative Assembly for Skeena",
    "member of the Northern Assembly for North Antrim",
    "member of the South House of Assembly for Fisher",
    "member of the Provincial Assembly of the Punjab",
    "Speaker of the West Bengal Legislative Assembly",
    "Speaker of the Maharashtra Legislative Assembly",
    "member of the Legislative Assembly of Columbia",
    "member of the West Bengal Legislative Assembly",
    "member of the Legislative Assembly of Manitoba",
    "member of the Maharashtra Legislative Assembly",
    "Member of the National Assembly for Mont Royal",
    "Speaker of the Legislative Assembly of Alberta",
    "member of the Uttarakhand Legislative Assembly",
    "member of the Johor State Legislative Assembly",
    "last female member of the Constituent Assembly",
    "member of the Legislative Assembly of Alberta",
    "member of the North Carolina General Assembly",
    "member of the Queensland Legislative Assembly",
    "member of Andhra Pradesh Legislative Assembly",
    "sixth vice president of the National Assembly",
    "member of the New South Legislative Assembly",
    "member of the Victorian Legislative Assembly",
    "member of the Rajasthan Legislative Assembly",
    "member of the Jigawa State House of Assembly",
    "member of the Nova Scotia House of Assembly",
    "Speaker of the Mizoram Legislative Assembly",
    "Speaker of the Ondo State House of Assembly",
    "Speaker of the People Consultative Assembly",
    "member of the Western Legislative Assembly",
    "member of the Provincial Assembly of Sindh",
    "member of the People Consultative Assembly",
    "member of Sabah State Legislative Assembly",
    "Member of the Assembly of the Union of the",
    "nd vice president of the National Assembly",
    "speaker of the Punjab Legislative Assembly",
    "Speaker of the Kerala Legislative Assembly",
    "member of the Kerala Legislative Assembly",
    "member of Legislative Assembly of Alberta",
    "member of the Tokyo Metropolitan Assembly",
    "president of the People National Assembly",
    "member of the Punjab Legislative Assembly",
    "member of the Yukon Legislative Assembly",
    "member of the New General Assembly since",
    "member of the Grand National Assembly of",
    "Deputy Speaker of the National Assembly",
    "member of the Nadu Legislative Assembly",
    "vice president of the National Assembly",
    "member of the Colorado General Assembly",
    "chairman of the Supreme People Assembly",
    "member of the Wisconsin State Assembly",
    "Speaker of the South House of Assembly",
    "Member of the Wisconsin State Assembly",
    "member of Tripura Legislative Assembly",
    "president of the Assembly of Vojvodina",
    "Speaker of the Grand National Assembly",
    "member of the Legislative Assembly of",
    "member of the Assembly of Extremadura",
    "speaker of Assam Legislative Assembly",
    "New General Assembly Minority Leader",
    "member of the Assembly of Vojvodina",
    "member of Illinois General Assembly",
    "member of the Nevada State Assembly",
    "member of the New General Assembly",
    "member of the Constituent Assembly",
    "president of the National Assembly",
    "Member of the National Assembly of",
    "first Speaker of the Niue Assembly",
    "member of Wisconsin State Assembly",
    "member of Legislative Assembly of",
    "Western Legislative Assembly whip",
    "Speaker of the National Assembly",
    "member of the Assembly of Murcia",
    "member of the Wisconsin Assembly",
    "member of the Assembly of Madrid",
    "speaker of the National Assembly",
    "Member of the National Assembly",
    "member of Assembly of Republic",
    "member of the General Assembly",
    "speaker of the People Assembly",
    "member of the Nevada Assembly",
    "member of the States Assembly",
    "Speaker of the State Assembly",
    "member of the State Assembly",
    "Member of the State Assembly",
    "member of Northern Assembly",
    "Wisconsin State Assemblyman",
    "Member of National Assembly",
    "Assam Legislative Assembly",
    "chairman of the Assembly",
    "National Assembly Deputy",
    "member of the Assembly",
    "New State Assemblyman",
    "State Assemblyman",
    "New Assemblyman",
    "State Assembly",
    "laureate of the Nobel Memorial Prize in Economic Sciences",
    "winner of the Nobel Memorial Prize in Economic Sciences",
    "laureate of the Nobel Prize in Economic Sciences",
    "laureate of the Nobel Prize in Economics",
    "Nobel Prize laureate in Economics",
    "Nobel Prize winner for economics",
    "advocate of justifiable homicide for murderers of abortion providers",
    "MEP for Midlands Central",
    "MEP for Yorkshire South",
    "MEP for Hertfordshire",
    "MEP for West Midlands",
    "MEP for Leicester",
    "MEP for Glasgow",
    "MEP for North",
    "appointed MEP",
    "MEP for West",
    "MEP for Mid",
    "MEP",
    "disputed head of the House of Romanov",
    "Deputy Speaker of the House of Lords",
    "subject of film",
    "diplomat",
    "known for provoking his Beverly Hills neighbors by applying garish paint colors",
    "member of Board of Governors of the International Atomic Energy Agency",
    "first female Governor of the Gila River Community",
    "first female candidate for Lieutenant Governor",
    "Governor of East Timor during the occupation",
    "Lieutenant Governor of Prince Edward Island",
    "Lieutenant Governor of the Virgin Islands",
    "Democratic Party nominee for Governor of",
    "Deputy Governor of the Reserve Bank of",
    "former Lieutenant Governor of Colorado",
    "Lieutenant Governor of North Carolina",
    "former Republican Governor of Wyoming",
    "Lieutenant Governor of New Brunswick",
    "candidate for Governor of Tamaulipas",
    "first elected Lieutenant Governor of",
    "last Governor of the Gilbert Islands",
    "Lieutenant Governor of Newfoundland",
    "Lieutenant Governor of Saskatchewan",
    "Lieutenant Governor of South Dakota",
    "Governor of Northern Borders Region",
    "Governor of the Territory of Alaska",
    "Lieutenant Governor of Connecticut",
    "Lieutenant Governor of Nova Scotia",
    "Lieutenant Governor of the Andaman",
    "Governor of Nizhny Novgorod Oblast",
    "Federal Reserve Board of Governors",
    "Vice Governor of Guizhou Province",
    "Governor of Buenos Aires Province",
    "Governor of Sabaragamuwa Province",
    "Governor of Hiroshima Prefecture",
    "Governor of the National Bank of",
    "Governor of the Central Province",
    "Lieutenant Governor of Minnesota",
    "Vice Governor of Yunnan Province",
    "Governor of the Federal District",
    "Governor of the Chickasaw Nation",
    "Lieutenant Governor of Wisconsin",
    "Lieutenant Governor of Tennessee",
    "governor of the Gaza Governorate",
    "Lieutenant Governor of Colorado",
    "Lieutenant Governor of Michigan",
    "Lieutenant Governor of Columbia",
    "Lieutenant Governor of Virginia",
    "Lieutenant Governor of Manitoba",
    "Governor of West Papua Province",
    "Lieutenant Governor of Missouri",
    "Governor of Kano State",
    "Governor of North Central State",
    "Governor of the Reserve Bank of",
    "Governor of Rio Grande do Norte",
    "Governor of the Central Bank of",
    "Governor of the Virgin Islands",
    "Governor of Okinawa Prefecture",
    "Governor of Khyber Pakhtunkhwa",
    "Lieutenant Governor of Montana",
    "Lieutenant Governor of Alabama",
    "Governor of the Cayman Islands",
    "Governor of Västmanland County",
    "Lieutenant Governor of Andaman",
    "Governor General of Equatorial",
    "Governor of East Nusa Tenggara",
    "Lieutenant Governor of Indiana",
    "Lieutenant Governor of Hawaii",
    "Lieutenant Governor of Kansas",
    "Lieutenant Governor of Alaska",
    "Governor of the State Bank of",
    "Governor of Eastern Macedonia",
    "Governor of Mid Western State",
    "Vice Governor of Heilongjiang",
    "Governor of Saint Christopher",
    "Governor of Khorasan Province",
    "former Governor of Pernambuco",
    "Governor of Sichuan province",
    "Governor of Vinnytsia Oblast",
    "Lieutenant Governor of Idaho",
    "Governor of Kronoberg County",
    "Governor of Misamis Oriental",
    "Governor of Tierra del Fuego",
    "Governor of Stockholm County",
    "Governor of Tucumán Province",
    "former Governor of Wisconsin",
    "Lieutenant Governor of since",
    "Governor of Himachal Pradesh",
    "Governor of the Reserve Bank",
    "Lieutenant Governor of Iowa",
    "Governor of Negros Oriental",
    "Governor of Ōita Prefecture",
    "Vermont Lieutenant Governor",
    "Lieutenant Governor of Guam",
    "Governor of Surigao del Sur",
    "County Governor of Nordland",
    "Governor of San Luis Potosí",
    "former Governor of Nebraska",
    "Governor of Møre og Romsdal",
    "Governor of Riyadh Province",
    "Governor of Espírito Santo",
    "Governor of Madhya Pradesh",
    "Governor of North Carolina",
    "Governor of Rio de Janeiro",
    "former Governor of Indiana",
    "Governor of Alaska from to",
    "Governor of Kebbi",
    "Governor of Kavango Region",
    "first Governor of Baja Sur",
    "Governor of Krasnodar Krai",
    "Governor of West Virginia",
    "Governor of Plateau State",
    "Governor of Gongola State",
    "Governor of New Hampshire",
    "Governor of Makkah Region",
    "Governor of Adamawa State",
    "Vice Governor of Shandong",
    "first Lieutenant Governor",
    "Governor of West Flanders",
    "Governor of Uttar Pradesh",
    "Governor of Massachusetts",
    "Governor of Minas Gerais",
    "Governor of Kaduna State",
    "Governor of North Dakota",
    "Governor of Nyeri County",
    "Governor of Chhattisgarh",
    "Governor of South Dakota",
    "Governor of the State of",
    "Governor of Taraba State",
    "Governor of Buenos Aires",
    "Governor for the Bank of",
    "Governor of the Bank of",
    "Governor of Uttarakhand",
    "Governor of Balochistan",
    "Governor of Connecticut",
    "acting Governor General",
    "Governor of Delta State",
    "Governor of Gombe State",
    "Lt Governor of Missouri",
    "Governor of Tula Oblast",
    "former Governor of Guam",
    "Lt Governor of Arkansas",
    "Governor of Apure State",
    "Governor of Lagos State",
    "Governor of Maguindanao",
    "Governor of Kwara State",
    "Lieutenant Governor of",
    "Governor of Washington",
    "Governor of Ondo State",
    "Governor of Kogi State",
    "Governor of Tamaulipas",
    "Governor of Marinduque",
    "Governor of Osun State",
    "th Governor General of",
    "first Governor General",
    "Governor of Gothenburg",
    "Governor of Meghalaya",
    "Governor of Wisconsin",
    "Governor of Oyo State",
    "Governor of Karnataka",
    "Governor of Rajasthan",
    "Governor of Minnesota",
    "Governor of Jharkhand",
    "Governor of São Paulo",
    "Governor of Carinthia",
    "Governor of Michoacán",
    "Governor of Zacatecas",
    "Governor of Maryland",
    "Governor of Nebraska",
    "Governor of Guerrero",
    "Governor of Coahuila",
    "Governor of Maranhão",
    "Governor of Illinois",
    "Governor of Santa Fe",
    "Governor of Delaware",
    "Governor of Missouri",
    "Governor of Michigan",
    "Governor of Arkansas",
    "Governor of Oklahoma",
    "Governor of Rondônia",
    "Governor of Guaviare",
    "Governor of Campeche",
    "Governor of San Juan",
    "Governor of Baja Sur",
    "Governor of Victoria",
    "Governor of Virginia",
    "Governor of Colorado",
    "Governor of Arizona",
    "Governor of Indiana",
    "Lieutenant Governor",
    "Governor of Gujarat",
    "Governor of Montana",
    "Governor of Vermont",
    "Governor of Jalisco",
    "Governor of Morelos",
    "Governor of Alabama",
    "Governor of Roraima",
    "Governor of Haryana",
    "Governor of Jiangsu",
    "Governor of Bayelsa",
    "Governor of Córdoba",
    "Governor of Antwerp",
    "Governor of Antique",
    "Governor of Yucatán",
    "Governor of Shaanxi",
    "Governor General of",
    "Governor of Guárico",
    "Governor of Sichuan",
    "Governor of Caqueta",
    "Governor of Wyoming",
    "Governor of Gotland",
    "Governor of Paraíba",
    "Governor of Omaheke",
    "Governor of Hidalgo",
    "Governor of Sergipe",
    "Governor of Nevada",
    "Governor of Kerala",
    "Governor of Sikkim",
    "Governor of Oregon",
    "Governor of Sokoto",
    "Governor of Kirkuk",
    "Governor of Puebla",
    "Governor of Itapúa",
    "Governor of Chubut",
    "Governor of Punjab",
    "Governor of Colima",
    "Governor of Paraná",
    "Governor of Cavite",
    "Governor of Kansas",
    "Governor of Odisha",
    "Governor of Riyadh",
    "Governor of Assam",
    "Governor of Bihar",
    "Governor of Idaho",
    "Governor of Jambi",
    "Governor of Samar",
    "Governor of State",
    "Governor of Sindh",
    "Governor of Bahia",
    "Governor of Hubei",
    "Governor of Kwara",
    "Governor of Henan",
    "Governor of Piauí",
    "Governor of Mecca",
    "Governor of Macau",
    "Governor of Maine",
    "Governor of South",
    "Governor General",
    "Governor of Sulu",
    "Governor of Bank",
    "Governor of Iowa",
    "Governor of Nadu",
    "Governor of Baja",
    "Governor of Utah",
    "Governor of Pará",
    "Governor of the",
    "Governor of New",
    "Deputy Governor",
    "th Governor of",
]

politics_govt_law = sorted(
    list(set(politics_govt_law)), key=lambda x: len(x), reverse=True
)

arts = [
    "deputy editor of",
    "director of education at the Royal Shakespeare Company",
    "artistic director of the Ballet Theatre of Queensland",
    "managing director of the Broadcasting Corporation",
    "director of the Indianapolis Symphony Orchestra",
    "director of photography for National Geographic",
    "director of the Museo Nacional de Bellas Artes",
    "general director of the Hager Fikir Theatre",
    "director of music at Choir of King College",
    "musical director of Bangarra Dance Theatre",
    "director of the Teatro dell'Opera di Roma",
    "general director of the Royal Opera House",
    "director of the Art Gallery of New South",
    "Emmy Award winning documentary director",
    "music director of the Republican Guard",
    "director of the Iowa Writers' Workshop",
    "director of the Choir of King College",
    'known as a "B" movie action director',
    "director of Staatstheater Stuttgart",
    "director of the Portrait Gallery",
    "director of the art exhibition",
    "Opera Company general director",
    "director of ABC News",
    "first black Broadway director",
    "director of the Tate Gallery",
    "documentary film director",
    "director of Film Festival",
    "artistic director of the",
    "director of Balenciaga",
    "editorial director of",
    "photography director",
    "television director",
    "lighting director",
    "artistic director",
    "and film director",
    "theatre director",
    "theater director",
    "dubbing director",
    "gallery director",
    "stage director",
    "music director",
    "film director",
    "band director",
    "art director",
    "voice of Deputy Dawg",
    "mainly of film music associated with Rainer Werner Fassbinder",
    "one of earliest music groups to record for Sam Phillips",
    "musical director of Bangarra Dance Theatre",
    "director of music at Choir of King College",
    "music director of the Republican Guard",
    "pioneer of bossa nova music style",
    "scored music for more than films",
    "was in many Broadway musicals",
    "pioneer of electronic music",
    "and hip hop music pioneer",
    "classical music critic",
    "spectral music pioneer",
    "musical arranger",
    "musical theatre",
    "music publisher",
    "music educator",
    "music promoter",
    "music arranger",
    "music director",
    "music manager",
    "music critic",
    "music editor",
    "musician",
    "Laurence Harvey Found dead in her bathtub of an",
    "conductor of the Regensburger Domspatzen",
    "chief conductor of the Philharmonic",
    "choir conductor",
    "conductor",
    "first president of ESPN",
    "editor of Journal of Recreational Mathematics",
    "general editor of the Victoria County History",
    "editor in chief of Dictionnaires Le Robert",
    "editor of the Press Association",
    "managing editor of William F Buckley",
    "editor in chief of Random House",
    "editor in chief of East Berlin",
    "editor of the Adventist Review",
    "founding sports editor of the",
    "former editor of Ms Magazine",
    "editor of magazine edition",
    "scientific journal editor",
    "former managing editor of",
    "Oxford Dictionary editor",
    "editor in chief of CSFR",
    "former obituary editor",
    "science journal editor",
    "editorial director of",
    "former editor of the",
    "royal editor of the",
    "editor in chief of",
    "founding editor of",
    "editor of magazine",
    "dictionary editor",
    "newspaper editor",
    "deputy editor of",
    "literary editor",
    "magazine editor",
    "editor in chief",
    "fashion editor",
    "editor of the",
    "music editor",
    "photo editor",
    "news editor",
    "film editor",
    "ITN editor",
    "editor of",
    "editor",
    'appeared as the original "fat boy" in the comedies from to',
    "starred in the s comedy series",
    "founder of political comedy group Capitol Steps",
    "member of comedy group HaGashash HaHiver",
    "wrote for comedians: Clive Anderson",
    "burlesque comedian",
    "comedian",
    "co founder leader of Academy of St Martin in the Fields",
    "winner of an Academy Award for Lifetime Achievement",
    "President of the Academy of Television Arts",
    "co founder of Academy Chicago Publishers",
    "won Academy Award for Best Story for in",
    "founder of the Kerala Cartoon Academy",
    "member of the Academy of Letters",
    "three time Academy Award winner",
    "member of the Academy",
    "Academy Award winner",
    "general director of the Royal Opera House",
    "founder of Tyndale House Publishers",
    "former drummer of Crowded House",
    "editor in chief of Random House",
    "Leader of the House of Commons",
    "member of the House of Lords",
    "artistic director of the Ballet Theatre of Queensland",
    "artistic director of the",
    "environmental artist",
    "prison escape artist",
    "sound effects artist",
    "performance artist",
    "storyboard artist",
    "voice over artist",
    "comic book artist",
    "artistic director",
    "recording artist",
    "landscape artist",
    "graphic artist",
    "cabaret artist",
    "make up artist",
    "concept artist",
    "visual artist",
    "makeup artist",
    "voice artist",
    "glass artist",
    "sound artist",
    "bard artist",
    "bead artist",
    "husband of actress Anjelica Huston",
    "former adult film actress",
    "voice actress",
    "actress",
    "mainly of film music associated with Rainer Werner Fassbinder",
    "first Miss Moneypenny in the James Bond film series",
    "documented the Topaz internment camp in his film",
    "considered the father of the film industry",
    "best known for Godzilla film series",
    "portrayed by Gooding Jr in the film",
    "first female filmmaker from Odisha",
    "leading lady in s B Western films",
    "known for her roles in the films",
    "scored music for more than films",
    "appeared in more than films",
    "played Q in the film series",
    "former adult film actress",
    "documentary film director",
    "inspiration for the film",
    "consultant on the film",
    "documentary film maker",
    "film poster designer",
    "inspiration for film",
    "adult film performer",
    "and film director",
    "horror film host",
    "and film critic",
    "film director",
    "film theorist",
    "film critic",
    "film editor",
    "film maker",
    "filmmaker",
    "and film",
    "duettist with pianist Renée Morisset",
    "concert pianist",
    "jazz pianist",
    "piano player",
    "pianist",
    "known for his moody paintings of",
    "painted cover for",
    "painter",
    "accompanist arranger for Noël Coward",
    "Grammy Award winning arranger",
    "musical arranger",
    "music arranger",
    "jazz arranger",
    "arranger",
    "essayist",
    "sculptor",
]
arts = sorted(list(set(arts)), key=lambda x: len(x), reverse=True)

sports = [
    "co founder of chess boxing",
    "coach",
    "five time winner of the Safari Rally & president of the FIA World Rally Championship commission",
    "president of the National Association of Professional Baseball Leagues",
    "president of the International Baseball Federation",
    "president of the Ladies' Football Association",
    "president of the International Ski Federation",
    "president of K S C Lokeren Oost Vlaanderen",
    "president of the World Squash Federation",
    "president of the Boxing Board of Control",
    "president of the National Hockey League",
    "president of the Philadelphia Phillies",
    "president of the Athletic Association",
    "president of the Handball Federation",
    "president of the Cycling Federation",
    "president of ASM Clermont Auvergne",
    "three time president of the NWA",
    "president of the Ski Federation",
    "president of the Cricket Union",
    "president of the Chicago Bears",
    "president of Galatasaray S K",
    "president of the Nordiques",
    "president of Galatasaray",
    "president of Real Madrid",
    "former Carlton president",
    "president of the USTA",
    "and president of NAIA",
    "president of the USGA",
    "president of the FIA",
    "president of the DFB",
    "president of CAHA",
    "president of UEFA",
    "president of FIBA",
    "and martial artist",
    "paralympian gold medallist",
    "seven time Paralympian",
]
sports = sorted(list(set(sports)), key=lambda x: len(x), reverse=True)

sciences = [
    "inventor of the superjunction power semiconductor device",
    "member of the Academy of Engineering",
    "member of Academy of Engineering",
    "leading research on hospital acquired infections",
    "professor of International Health",
    "clinical professor of psychiatry",
    "professor of health education",
    "professor of meteorology",
]
sciences = sorted(list(set(sciences)), key=lambda x: len(x), reverse=True)

business_farming = []
business_farming = sorted(
    list(set(business_farming)), key=lambda x: len(x), reverse=True
)

academia_humanities = [
    "assistant professor in anthropology at Vanderbilt University",
    "cultural anthropology",
    "President of the Memorial Sloan Kettering Cancer Center",
    "President of the Massachusetts Institute of Technology",
    "President of the State University of at Plattsburgh",
    "President of the Rochester Institute of Technology",
    "President of John Jay College of Criminal Justice",
    "President of Rose Hulman Institute of Technology",
    "President of the University of Wisconsin System",
    "President of the Virginia Polytechnic Institute",
    "President of Xi'an Foreign Languages Institute",
    "President of the University of North Carolina",
    "President of the University of New Hampshire",
    "President of Saint Mary of the Woods College",
    "President of the University of Alaska system",
    "President of the University of at Arlington",
    "President of the University of at Lafayette",
    "President of Fairleigh Dickinson University",
    "President of University of Missouri System",
    "President of the College of William & Mary",
    "President of the University of Connecticut",
    "President of National Tsing Hua University",
    "President of the College of the Holy Cross",
    "President of Nova Southeastern University",
    "President of the University of at El Paso",
    "President of George Washington University",
    "President of the University of at Austin",
    "President of Washington State University",
    "President of Oklahoma Baptist University",
    "President of the University of Minnesota",
    "President of University of San Francisco",
    "President of the University of Scranton",
    "President of the University of Richmond",
    "President of History of Science Society",
    "President of Carnegie Mellon University",
    "President of Tennessee State University",
    "President of The Catholic University of",
    "President of the University of Montana",
    "President of the University at Buffalo",
    "President of the Latin Jewish Congress",
    "President of the University of Wyoming",
    "President of the University of Arizona",
    "President of the University of Central",
    "President of Arizona State University",
    "President of Emporia State University",
    "President of Laredo Community College",
    "President of University of Lethbridge",
    "President of Johns Hopkins University",
    "Western Michigan University President",
    "President of the University of Hawaii",
    "President of University of at Austin",
    "President of the Academy of Sciences",
    "President of Jacksonville University",
    "President of the Library Association",
    "President of Oregon State University",
    "President of the University of Mary",
    "President of the Amon Carter Museum",
    "President of Far Eastern University",
    "President of the Max Planck Society",
    "President of the University of Utah",
    "President of Ball State University",
    "President of Pepperdine University",
    "President of the Institute of Arts",
    "President of Georgetown University",
    "President of Princeton University",
    "President of the State University",
    "President of Creighton University",
    "President of Nevada State College",
    "President of Dalhousie University",
    "President of New State University",
    "President of Stanford University",
    "President of Duquesne University",
    "President of Marshall University",
    "President of Lincoln University",
    "President of St Francis College",
    "President of Hofstra University",
    "President of Indiana University",
    "President of Rutgers University",
    "President of Adelphi University",
    "President of Middlebury College",
    "President of Temple University",
    "President of the University of",
    "President of Auburn University",
    "President of Excelsior College",
    "President of Purdue University",
    "President of Fuzhou University",
    "President of Boston University",
    "President of Asbury University",
    "President of Lamar University",
    "President of Canisius College",
    "President of Teachers College",
    "President of Le Moyne College",
    "President of Rowan University",
    "President of Duke University",
    "President of Elon University",
    "President of Oberlin College",
    "President of Rice University",
    "President of Ursinus College",
    "President of Seneca College",
    "President of Boston College",
    "President of Calvin College",
    "President of Rhodes College",
    "President of Virginia Tech",
    "President of Smith College",
    "President of the Academy",
    "President of The Citadel",
    "President of University",
    "President of Penn",
    "President of MIT",
    "President of MSU",
    "director of the Juneau Memorial Library",
    "Folger Shakespeare Library program director",
    "director general of the Bangla Academy",
    "director of the Odesa Fine Arts Museum",
    "director of the National Gallery of",
    "director of Conservatoire Libanais",
    "director of the Museum of Arts",
    "director of the Ailey School",
    "director of the Louvre",
    "ethnomusicologist",
    "musicologist",
    "translator for her husband Ole Nydahl",
    "literary translator",
    "translator",
    "educator",
    "Director General of the Shilpakala Academy",
    "director general of the Bangla Academy",
    "member of the Language Academy",
    "assistant professor in anthropology at Vanderbilt University",
    "professor at Conservatoire de musique du Québec à Montréal",
    "first professor of history at the Open University",
    "professor of history at West Virginia University",
    "professor of mathematics at Stanford University",
    "emeritus professor at Northeastern University",
    "professor emeritus of electrical engineering",
    "professor of planning at Reading University",
    "professor at the University of Groningen",
    "violin professor at Yale University",
    "professor at University of Sussex",
    "professor of contextual theology",
    "professor of administrative law",
    "professor at Rutgers University",
    "emeritus professor of politics",
    "professor at Duke University",
    "Cornell University professor",
    "professor of ethnic studies",
    "a professor of medicine",
    "professor of literature",
    "professor of philosophy",
    "professor at Law School",
    "world oldest professor",
    "engineering professor",
    "professor at Sorbonne",
    "university professor",
    "literature professor",
    "professor of studies",
    "professor of social",
    "professor emeritus",
    "college professor",
    "Oxford professor",
    "art professor",
    "law professor",
    "BYU professor",
]
academia_humanities = sorted(
    list(set(academia_humanities)), key=lambda x: len(x), reverse=True
)

law_enf_military_operator = [
    "Deputy Secretary of Defense",
    "Deputy Director of the CIA",
    "Deputy Director of the FBI",
    "Sergeant at Arms for Legislative Assembly of Nova Scotia",
    "president of Beşiktaş J K",
    "youngest sailor awarded a Medal of Honor in World War II",
    "formerly the oldest living Medal of Honor recipient",
    "youngest marine to be awarded the Medal of Honor",
    "Medal of Honor winning soldier in World War II",
    "oldest living Medal of Honor recipient",
    "World War II Medal of Honor recipient",
    "disputed claimant for Medal of Honor",
    "Medal of Honor recipient in the War",
    "posthumous Medal of Honor recipient",
    "War recipient of the Medal of Honor",
    "awarded Legislative Medal of Honor",
    "recipient of the Medal of Honor",
    "recipient of Medal of Honor",
    "awarded the Medal of Honor",
    "Medal of Honor recipient",
]
law_enf_military_operator = sorted(
    list(set(law_enf_military_operator)), key=lambda x: len(x), reverse=True
)

spiritual = [
    "co author of",
    "general director of the Evangelical Alliance",
    "Deputy General Secretary of UMCOR",
]
spiritual = sorted(list(set(spiritual)), key=lambda x: len(x), reverse=True)

social = ["philanthropist charity worker", "philanthropist"]
social = sorted(list(set(social)), key=lambda x: len(x), reverse=True)

crime = []
crime = sorted(list(set(crime)), key=lambda x: len(x), reverse=True)

event_record_other = []
event_record_other = sorted(
    list(set(event_record_other)), key=lambda x: len(x), reverse=True
)

other_species = []
other_species = sorted(list(set(other_species)), key=lambda x: len(x), reverse=True)

cause_of_death = [
    "complications related to diabetes",
    "diabetes related complications",
    "complications due to diabetes",
    "complications from diabetes",
    "diabetes complications",
    "diabetes",
    "homicide by neck compression",
    "homicide by prohibited drug",
    "homicide by asphyxiation",
    "homicide by poison arrow",
    "homicide by suffocation",
    "homicide by stabbing",
    "homicide by overdose",
    "vehicular homicide",
    "overdose of Fentanyl painkillers",
    "complications from a drug overdose",
    "seizure following drug overdose",
    "prescription drug overdose",
    "muscle relaxant overdose",
    "suspected drug overdose",
    "apparent drug overdose",
    "Pentobarbital overdose",
    "nitrous oxide overdose",
    "homicide by overdose",
    "overdose of alcohol",
    "suspected overdose",
    "fentanyl overdose",
    "drugs overdose []",
    "sedative overdose",
    "cocaine overdose",
    "codeine overdose",
    "heroin overdose",
    "opiate overdose",
    "opiod overdose",
    "drugs overdose",
    "drug overdose",
    "overdose",
    "chronic lymphocytic leukaemia",
    "acute erythroid leukaemia",
    "acute myeloid leukaemia",
    "leukaemia",
    "complications from a heart condition",
    "complications from a heart procedure",
    "complications from a heart aneurysm",
    "complications from heart problems",
    "pre existing heart condition",
    "heart related complications",
    "undiagnosed heart condition",
    "complications from heart",
    "Congenital heart defect",
    "chronic heart condition",
    "complications of heart",
    "heart irregularities",
    "heart valve stenosis",
    "heart complications",
    "after heart trouble",
    "burst heart valve",
    "heart arrhythmia",
    "heart condition",
    "heart infection",
    "heart problems",
    "enlarged heart",
    "heart disorder",
    "heart ailment",
    "heart problem",
    "heart",
    "infections as a complication from rheumatoid arthritis",
    "complications from staphylococcal infection",
    "complications from urinary tract infection",
    "complications from an intestinal infection",
    "complications of a bacterial infection",
    "complications from a kidney infection",
    "infection from bone marrow transplant",
    "complications from a blood infection",
    "complications from a staph infection",
    "complications from a chest infection",
    "complications from a lung infection",
    "complications of a viral infection",
    "infection from a perforated colon",
    "complications from skin infection",
    "complications of lung infections",
    "complications from an infection",
    "complications of hiv infection",
    "complications of an infection",
    "complications from infection",
    "upper respiratory infection",
    "hospital acquired infection",
    "complications of infection",
    "meningitis staph infection",
    "staphylococcus infection",
    "bacterial lung infection",
    "urinary tract infection",
    "streptococcal infection",
    "post surgical infection",
    "acute kidney infection",
    "respiratory infection",
    "generalized infection",
    "esophageal infection",
    "intestinal infection",
    "bacterial infection",
    "abdominal infection",
    "prostate infection",
    "shingles infection",
    "shoulder infection",
    "urinary infection",
    "stomach infection",
    "bladder infection",
    "kidney infection",
    "chest infection",
    "blood infection",
    "heart infection",
    "staph infection",
    "viral infection",
    "colon infection",
    "lung infection",
    "foot infection",
    "MRSA infection",
    "complications from chronic traumatic encephalopathy",
    "complications from injuries sustained in a beating",
    "complications following yellow fever vaccination",
    "complications from takotsubo cardiomyopathy and",
    "complications from reflex sympathetic dystrophy",
    "complications from frontotemporal degeneration",
    "complications from an injury sustained at home",
    "complications from ruptured abdominal aneurysm",
    "complications from primary progressive aphasia",
    "complications following gallbladder operation",
    "complications from a blood clotting disorder",
    "complications during heart valve replacement",
    "complications from myelodysplastic syndrome",
    "complications from gallbladder inflammation",
    "complications from chronic fatigue syndrome",
    "complications from staphylococcal infection",
    "complications from chronic substance abuse",
    "complications from urinary tract infection",
    "complications from an intestinal infection",
    "complications following a heart transplant",
    "complications from a bone marrow disorder",
    "complications of cyclic vomiting syndrome",
    "complications of adenoid cystic carcinoma",
    "complications following a lung transplant",
    "complications of a bone marrow transplant",
    "complications from infected gall bladder",
    "complications of multiple system atrophy",
    "complications from a digestive disorder",
    "complications from intestinal occlusion",
    "complications from severe burn injuries",
    "complications following a hip operation",
    "complications from rheumatoid arthritis",
    "complications from a kidney transplant",
    "complications from a broken thigh bone",
    "complications of liver transplantation",
    "complications from high blood pressure",
    "complications following leg amputation",
    "complications of a bacterial infection",
    "complications from a kidney infection",
    "complications from a liver transplant",
    "complications from chronic alcoholism",
    "complications from Lewy Body Dementia",
    "complications from an aortic aneurysm",
    "complications of rheumatoid arthritis",
    "complications from Parkinson Disease",
    "complications from a heart condition",
    "complications from a blood infection",
    "complications from polycythemia vera",
    "complications from a staph infection",
    "complications from internal bleeding",
    "complications after colon operations",
    "complications following a broken hip",
    "complications from kidney transplant",
    "complications from a chest infection",
    "complications of a kidney transplant",
    "complications of Shy Drager syndrome",
    "complications from a heart procedure",
    "complications from a lung transplant",
    "complications from a blood disorder",
    "complications from kidney treatment",
    "complications from a heart aneurysm",
    "complications from a ruptured ulcer",
    "complications from perforated ulcer",
    "complications from a lung infection",
    "complications from liver transplant",
    "complications from a lung condition",
    "complications from anorexia nervosa",
    "complications from declining health",
    "complications from West Nile virus",
    "complications of a heart condition",
    "complications of a low blood count",
    "complications of a viral infection",
    "complications from substance abuse",
    "complications from a spinal injury",
    "complications of myasthenia gravis",
    "complications from a drug overdose",
    "complications from bulimia nervosa",
    "complications of Parkinson Disease",
    "complications from cystic fibrosis",
    "complications from hip replacement",
    "complications from digestive virus",
    "complications from necrotic sepsis",
    "complications from a stomach ulcer",
    "complications from kidney ailment",
    "complications of ischemic colitis",
    "complications from arson injuries",
    "complications from bronchiectasis",
    "complications from skin infection",
    "complications from food poisoning",
    "complications from a broken femur",
    "complications of anorexia nervosa",
    "complications from heart problems",
    "complications related to diabetes",
    "complications of a kidney ailment",
    "complications from stomach ulcers",
    "complications following operation",
    "complications from a hip fracture",
    "complications from a head injury",
    "complications from hydrocephalus",
    "complications from a broken neck",
    "complications from crushed torso",
    "complications from kidney stones",
    "complications after hip fracture",
    "complications of lung infections",
    "complications of a lung disorder",
    "complications from a broken hip",
    "complications from hypertension",
    "complications from an infection",
    "complications from septic shock",
    "cardiorespiratory complications",
    "complications from hip fracture",
    "complications from endocarditis",
    "complications from mesothelioma",
    "complications from glioblastoma",
    "complications from pancreatitis",
    "kidney transplant complications",
    "complications following robbery",
    "complications from pneumothorax",
    "complications from liver damage",
    "complications of a lung ailment",
    "complications from appendicitis",
    "complications of diverticulitis",
    "complications of a heart defect",
    "complications from appendectomy",
    "complications of lung condition",
    "complications from a wasp sting",
    "complications from leg sarcoma",
    "complications from amyloidosis",
    "complications from hepatitis C",
    "complications from head trauma",
    "complications from sarcoidosis",
    "complications of stomach ulcer",
    "complications from Hepatitis C",
    "complications from an aneurysm",
    "complications from liposuction",
    "complications from angioplasty",
    "complications from anaphylaxis",
    "complications of hiv infection",
    "complications from burn injury",
    "complications of poliomyelitis",
    "diabetes related complications",
    "complications from meningitis",
    "complications of glioblastoma",
    "complications from childbirth",
    "complications from bronchitis",
    "complications due to diabetes",
    "complications from drug abuse",
    "complications of an infection",
    "complications of quadriplegia",
    "complications related to lung",
    "complications from alcoholism",
    "complications of mesothelioma",
    "complications of HN influenza",
    "complications from influenza",
    "complications from paralysis",
    "complications from a seizure",
    "complications from pregnancy",
    "complications of sarcoidosis",
    "complications from cirrhosis",
    "complications from laminitis",
    "complications from infection",
    "complications of blood clots",
    "complications from diabetes",
    "complications of broken hip",
    "complications from epilepsy",
    "complications from gangrene",
    "complications of childbirth",
    "complications from an ulcer",
    "complications from jaundice",
    "complications in childbirth",
    "complications of alcoholism",
    "heart related complications",
    "complications of influenza",
    "complications of infection",
    "complications of hepatitis",
    "complications from foaling",
    "complications from anaemia",
    "quadriplegic complications",
    "complications from malaria",
    "complications from related",
    "complications from chronic",
    "post foaling complications",
    "respiratory complications",
    "complications from sepsis",
    "complications from asthma",
    "sleep apnea complications",
    "complications of shooting",
    "complications from anemia",
    "complications from kidney",
    "complications from colic",
    "complications from heart",
    "intestinal complications",
    "complications of old age",
    "complications from liver",
    "complications of malaria",
    "complications from lupus",
    "complications from polio",
    "complications from AIDS",
    "complications from COPD",
    "influenza complications",
    "complications of astha",
    "surgical complications",
    "complications of heart",
    "complications of colic",
    "complications of liver",
    "complications from ALS",
    "complications of burns",
    "complications from HIV",
    "diabetes complications",
    "related complications",
    "foaling complications",
    "complications of AIDS",
    "old age complications",
    "stomach complications",
    "complications of lung",
    "kidney complications",
    "heart complications",
    "liver complications",
    "blood complications",
    "complications from",
    "lung complications",
    "AIDS complications",
    "complications of",
    "alcohol related cirrhosis",
    "cirrhosis of the liver",
    "hepatic cirrhosis",
    "liver cirrhosis",
    "cirrhosis",
    "urosepsis",
    "sepsis",
]
cause_of_death = sorted(list(set(cause_of_death)), key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [1029]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

<IPython.core.display.Javascript object>

#### Extracting `known_for` Categories and `cause_of_death` Values from `info_3_0`

In [1030]:
%%time

# Column to check
column = 'info_3_0'

# Start dataframe
dataframe = df[df[column].notna()]

# For loop to find cause in column and extract it to cause_of_death
for cause in cause_of_death:
    for index in dataframe.index:
        item = df.loc[index, column]
        if item:
            if cause in item:
                df.loc[index, 'cause_of_death'] = cause
                df.loc[index, column] = item.replace(cause, '').strip()
                
# For loop to find role in column and extract it as category
for category, category_lst in known_for_dict.items():
    for role in category_lst:
        for index in dataframe.index:
                item = df.loc[index, column]
                if item:
                    if role in item:
                        df.loc[index, category] = 1
                        df.loc[index, column] = item.replace(role, '').strip()

# Calculating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking number of cause_of_death values
print(f'There are {df["cause_of_death"].notna().sum()} values in cause_of_death column.\n')

There are 23393 values in cause_of_death column.

CPU times: total: 5min 19s
Wall time: 5min 19s


<IPython.core.display.Javascript object>

#### Checking Updated `num_categories` Value Counts

In [1031]:
# Checking Updated num_categories Value Counts
df["num_categories"].value_counts()

1    87196
2    10576
3      238
0       31
Name: num_categories, dtype: int64

<IPython.core.display.Javascript object>

#### Observations:
- On that iteration, we observed that some `cause_of_death` values may be continued into the next column, such as for "heart and lung disease".  When we address the subsequent column, we will need to keep that in mind.
- We will proceed to rebuild `known_for_dict` and `cause_of_death` for the next iteration.

In [1032]:
print("dunzo!")

# Sound notification when cell executes
chime.success()

dunzo!


<IPython.core.display.Javascript object>

#### Finding `known_for` Roles and `cause_of_death` in `info_3_0`

In [None]:
# Obtaining values for column and their counts
roles_cause_list = df["info_3_0"].value_counts(ascending=True).index.tolist()

In [None]:
# Code to check each value
roles_cause_list.pop()

In [None]:
# # Create specific_roles_cause_list for above popped value
# specific_roles_cause_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_3_0"].notna()].index
#             if "general" in df.loc[index, "info_3_0"]
#         ],
#         "info_3_0",
#     ]
#     .value_counts()
#     .index.tolist()
# )

In [642]:
# df.loc[
#     [
#         index
#         for index in df[df["info_3_0"].notna()].index
#         if "member of the National Assembly" in df.loc[index, "info_3_0"]
#         and df.loc[index, "politics_govt_law"] == 0
#     ],
#     :,
# ]

<IPython.core.display.Javascript object>

In [None]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_cause_list, key=lambda x: len(x), reverse=True)

In [None]:
# # Example code to quick-check a specific entry
# df[df["info_3_0"] == "outlaw country music singer songwriter"]

#### Creating Lists for Each `known_for` Category and for `cause_of_death`

In [None]:
# Creating lists for each category and sorting by decreasing length and removing duplicates

politics_govt_law = []
politics_govt_law = sorted(list(set(politics_govt_law)), key=lambda x: len(x), reverse=True)  

arts = []
arts = sorted(list(set(arts)), key=lambda x: len(x), reverse=True)  

sports = []
sports = sorted(list(set(sports)), key=lambda x: len(x), reverse=True) 

sciences = []
sciences = sorted(list(set(sciences)), key=lambda x: len(x), reverse=True) 

business_farming = []
business_farming = sorted(list(set(business_farming)), key=lambda x: len(x), reverse=True)  

academia_humanities = []
academia_humanities = sorted(list(set(academia_humanities)), key=lambda x: len(x), reverse=True)  

law_enf_military_operator = []
law_enf_military_operator = sorted(list(set(law_enf_military_operator)), key=lambda x: len(x), reverse=True)  

spiritual = []
spiritual = sorted(list(set(spiritual)), key=lambda x: len(x), reverse=True)  

social = []
social = sorted(list(set(social)), key=lambda x: len(x), reverse=True)  

crime = []
crime = sorted(list(set(crime)), key=lambda x: len(x), reverse=True)  

event_record_other = []
event_record_other = sorted(list(set(event_record_other)), key=lambda x: len(x), reverse=True)  

other_species = []
other_species = sorted(list(set(other_species)), key=lambda x: len(x), reverse=True) 

cause_of_death = []
cause_of_death = sorted(list(set(cause_of_death)), key=lambda x: len(x), reverse=True)

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [None]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

#### Extracting `known_for` Categories and `cause_of_death` Values from `info_3_0`

In [None]:
%%time

# Column to check
column = 'info_3_0'

# Start dataframe
dataframe = df[df[column].notna()]

# For loop to find cause in column and extract it to cause_of_death
for cause in cause_of_death:
    for index in dataframe.index:
        item = df.loc[index, column]
        if item:
            if cause in item:
                df.loc[index, 'cause_of_death'] = cause
                df.loc[index, column] = item.replace(cause, '').strip()
                
# For loop to find role in column and extract it as category
for category, category_lst in known_for_dict.items():
    for role in category_lst:
        for index in dataframe.index:
                item = df.loc[index, column]
                if item:
                    if role in item:
                        df.loc[index, category] = 1
                        df.loc[index, column] = item.replace(role, '').strip()

# Calculating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking number of cause_of_death values
print(f'There are {df["cause_of_death"].notna().sum()} values in cause_of_death column.\n')

#### Checking Updated `num_categories` Value Counts

In [None]:
# Checking Updated num_categories Value Counts
df["num_categories"].value_counts()

#### Observations:
- We will proceed to rebuild `known_for_dict` and `cause_of_death` for the next iteration.

#### Observations:
- It is time to export our dataframe and start a new notebook.

### Exporting Dataset to SQLite Database [wp_life_expect_clean6.db]()

In [None]:
# # Exporting dataframe

# # Saving dataset in a SQLite database
# conn = sql.connect("wp_life_expect_clean6.db")
# df.to_sql("wp_life_expect_clean", conn, index=False)

# # Chime notification when cell executes
# chime.success()

# [Proceed to Data Cleaning Part ]()