# Wikipedia Notable Life Expectancies
# [Notebook  : Data Cleaning Part 6](https://github.com/teresahanak/wikipedia-life-expectancy/blob/main/wp_life_expect_data_clean6_thanak_2022_07_26.ipynb)
### Context

The
### Objective

The
### Data Dictionary
- Feature: Description

### Importing Libraries

In [1]:
# To structure code automatically
%load_ext nb_black

# To import/export sqlite databases
import sqlite3 as sql

# To save/open python objects in pickle file
import pickle

# To help with reading, cleaning, and manipulating data
import pandas as pd
import numpy as np
import re

# To define maximum number of columns to be displayed in a dataframe
pd.set_option("display.max_columns", None)
# To define the maximum number of rows to be displayed in a dataframe
pd.set_option("display.max_rows", 200)

# To supress warnings
# import warnings

# warnings.filterwarnings("ignore")

# To set some visualization attributes
pd.set_option("max_colwidth", 150)

# To play auditory cue when cell has executed, has warning, or has error and set chime theme
import chime

chime.theme("zelda")

<IPython.core.display.Javascript object>

## Data Overview

### [Reading](https://github.com/teresahanak/wikipedia-life-expectancy/blob/main/wp_life_expect_clean5.db), Sampling, and Checking Data Shape

In [2]:
# Reading the dataset
conn = sql.connect("wp_life_expect_clean5.db")
data = pd.read_sql("SELECT * FROM wp_life_expect_clean5", conn)

# Making a working copy
df = data.copy()

# Checking the shape
print(f"There are {df.shape[0]} rows and {df.shape[1]} columns.")

# Checking first 2 rows of the data
df.head(2)

There are 98041 rows and 44 columns.


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,info_3_0,info_3_1,info_3_2,info_4_0,info_4_1,info_4_2,info_5_0,info_5_1,info_5_2,info_6_0,info_6_1,info_7_0,info_8_0,info_8_1,info_9_0,info_10_0,info_11_0,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
0,1,William Chappell,", 86, British dancer, ballet designer and director.",https://en.wikipedia.org/wiki/William_Chappell_(dancer),21,1994,January,,86.0,,United Kingdom of Great Britain and Northern Ireland,,,3.091042,ballet designer,director,,,,,,,,,,,,,,,,0,0,0,0,0,1,0,0,0,0,0,0,1
1,1,Raymond Crotty,", 68, Irish economist, writer, and academic.",https://en.wikipedia.org/wiki/Raymond_Crotty,12,1994,January,,68.0,,Ireland,,,2.564949,writer,,,and academic,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,1,0,0,0,1


<IPython.core.display.Javascript object>

In [3]:
# Checking last 2 rows of the data
df.tail(2)

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,info_3_0,info_3_1,info_3_2,info_4_0,info_4_1,info_4_2,info_5_0,info_5_1,info_5_2,info_6_0,info_6_1,info_7_0,info_8_0,info_8_1,info_9_0,info_10_0,info_11_0,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
98039,9,Aamir Liaquat Hussain,", 50, Pakistani journalist and politician, MNA .",https://en.wikipedia.org/wiki/Aamir_Liaquat_Hussain,99,2022,June,", since",50.0,,Pakistan,,"2002 2007, since 2018",4.60517,MNA,,,,,,,,,,,,,,,,,0,0,0,0,0,1,0,0,1,0,0,0,2
98040,9,Zou Jing,", 86, Chinese engineer, member of the Chinese Academy of Engineering.",https://en.wikipedia.org/wiki/Zou_Jing_(engineer),3,2022,June,,86.0,,"China, People's Republic of",,,1.386294,member of the Academy of Engineering,,,,,,,,,,,,,,,,,1,0,0,0,0,0,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

In [4]:
# Checking a sample of the data
df.sample(5)

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,info_3_0,info_3_1,info_3_2,info_4_0,info_4_1,info_4_2,info_5_0,info_5_1,info_5_2,info_6_0,info_6_1,info_7_0,info_8_0,info_8_1,info_9_0,info_10_0,info_11_0,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
34675,22,Bob Gould,", 74, Australian activist and bookseller.",https://en.wikipedia.org/wiki/Bob_Gould_(activist),15,2011,May,,74.0,,Australia,,,2.772589,,,,,,,,,,,,,,,,,,0,0,0,0,0,1,0,0,1,0,0,0,2
74873,20,Mira Zakai,", 76, Israeli opera singer, complications from a stroke.",https://en.wikipedia.org/wiki/Mira_Zakai,7,2019,May,,76.0,,Israel,,,2.079442,complications from a stroke,,,,,,,,,,,,,,,,,0,0,0,0,0,1,0,0,0,0,0,0,1
66363,2,Mundell Lowe,", 95, American jazz guitarist and composer.",https://en.wikipedia.org/wiki/Mundell_Lowe,6,2017,December,,95.0,,United States of America,,,1.94591,,,,,,,,,,,,,,,,,,0,0,0,0,0,1,0,0,0,0,0,0,1
55769,26,Jerrold Kemp,", 94, American academic.",https://en.wikipedia.org/wiki/Jerrold_Kemp,12,2015,November,,94.0,,United States of America,,,2.564949,,,,,,,,,,,,,,,,,,0,0,0,1,0,0,0,0,0,0,0,0,1
82010,2,Jacques Noyer,", 93, French Roman Catholic prelate, Bishop of Amiens .",https://en.wikipedia.org/wiki/Jacques_Noyer,5,2020,June,,93.0,,France,Italy,1987 2003,1.791759,Bishop of Amiens,,,,,,,,,,,,,,,,,0,0,1,0,0,0,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

### Checking Data Types, Duplicates, and Null Values

In [5]:
# Checking data types and null values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98041 entries, 0 to 98040
Data columns (total 44 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   day                        98041 non-null  object 
 1   name                       98041 non-null  object 
 2   info                       98041 non-null  object 
 3   link                       98041 non-null  object 
 4   num_references             98041 non-null  int64  
 5   year                       98041 non-null  int64  
 6   month                      98041 non-null  object 
 7   info_parenth               36660 non-null  object 
 8   age                        98041 non-null  float64
 9   cause_of_death             17 non-null     object 
 10  place_1                    97888 non-null  object 
 11  place_2                    8115 non-null   object 
 12  info_parenth_copy          36660 non-null  object 
 13  log_num_references         98041 non-null  flo

<IPython.core.display.Javascript object>

#### Observations:
- With our dataset loaded, we can pick up where we left off with extracting known_for values by rebuilding `known_for_dict` and starting the search of `info_3` columns.
- We will need to adjust our approach at this step, in order to also capture `cause_of_death` values.
- Prior to this point we have hard-coded the few `cause_of_death` values encountered, but we expect a much higher proportion of them for the remaining numbered columns.
- We will add a new list `cause_of_death` to collect those values and we will add a separate loop to extract them to the `cause_of_death` column.

### Extracting Remaining `known_for` and `cause_of_death` Values

#### Finding `known_for` Roles and `cause_of_death` in `info_3_0`

In [6]:
# # Obtaining values for column and their counts
# roles_cause_list = df["info_3_0"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [7]:
# # Code to check each value
# roles_cause_list.pop()

<IPython.core.display.Javascript object>

In [8]:
# # Create specific_roles_cause_list for above popped value
# specific_roles_cause_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_3_0"].notna()].index
#             if "shot" in df.loc[index, "info_3_0"]
#         ],
#         "info_3_0",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [9]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_cause_list, key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [10]:
# # Example code to quick-check a specific entry
# df[df["info_3_0"] == "shot Eddie Waitkus"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category and for `cause_of_death`

In [11]:
# Creating lists for each category and sorting by decreasing length and removing duplicates

politics_govt_law = [
    "campaigner for breast cancer drug Herceptin",
    "MPP of the Legislative Assembly of for Beaches—Woodbine",
    "MP for Secretary General of the Council of and",
    "MPP of the Legislature for Waterloo North",
    "MP of the Karnataka Legislative Assembly",
    "MP of the Odisha Legislative Assembly",
    "MP for Medicine Hat—Cardston—Warner",
    "MP for Newcastle upon Tyne Central",
    "MP for Newcastle upon Tyne North",
    "MP for Perth—Wellington—Waterloo",
    "MP for North West Leicestershire",
    "MP for Northumberland—Miramichi",
    "MP for Wolverhampton South East",
    "MP for Newcastle upon Tyne East",
    "MP for South East St Elizabeth",
    "MP for North Central Clarendon",
    "MP for Manchester Wythenshawe",
    "first honorary MPLA President",
    "MP for Movement of the People",
    "MP for Beauharnois—Salaberry",
    "MP for Birmingham Perry Barr",
    "MP for Birmingham Handsworth",
    "MP for Birmingham Hall Green",
    "MP for Scarborough—Agincourt",
    "MP for Northumberland—Durham",
    "MP for Battleford—Kindersley",
    "MP for Birmingham Northfield",
    "MP for Sheffield Brightside",
    "MP for Leicester North West",
    "MP for City of Buenos Aires",
    "Queensland MP for Mackenzie",
    "MP for Amman fifth district",
    "MP for Stoke on Trent South",
    "MP for Leicester South East",
    "MP for Manchester Blackley",
    "MP for Windsor—Walkerville",
    "MP for Carleton Gloucester",
    "MP for Manchester Openshaw",
    "MP for Coventry North East",
    "MP for Croydon North West",
    "MP for East Aberdeenshire",
    "MP for Vancouver Kingsway",
    "MP for Birmingham Yardley",
    "MP for Nottingham Central",
    "MP for Afram Plains North",
    "MP for Bristol North West",
    "MP for Wandsworth Central",
    "MP for West Aberdeenshire",
    "MP for Manchester Ardwick",
    "MP for Manchester Central",
    "Conservative MP for South",
    "MP for Glasgow Queen Park",
    "MP for Weston super Mare",
    "MP for East Renfrewshire",
    "MP for Ambunti Dreikikir",
    "MP for Barrow in Furness",
    "MP for Mission—Coquitlam",
    "federal MP for St George",
    "MP for Belfast Shankill",
    "MPP for Dufferin Simcoe",
    "MPP for Wentworth North",
    "MP for Sheffield Heeley",
    "MP for Glasgow Cathcart",
    "MP for Clwyd North West",
    "co developer of the IMP",
    "MP for Paddington North",
    "MP for Bouches du Rhône",
    "MP for Grenville—Dundas",
    "MP for Bournemouth East",
    "MP for Stockholm County",
    "MP for Southampton West",
    "MP for Glenrothes since",
    "MP for Mid Bedfordshire",
    "MP for Brome—Missisquoi",
    "former MPP for Kingston",
    "MP for South Manchester",
    "MP for Sheffield Hallam",
    "MP for Naogaon District",
    "MP for Hemel Hempstead",
    "Khyber Pakhtunkhwa MPA",
    "MP for South West Nova",
    "MP for Bury St Edmunds",
    "MP for Bishop Auckland",
    "MP for Nottingham West",
    "MP for Ottawa—Carleton",
    "MP for Montgomeryshire",
    "MP for Coimbatore East",
    "MP for Kingsford Smith",
    "MP for The Battlefords",
    "former Labour Party MP",
    "MP for Blackpool South",
    "MP for Leeds—Grenville",
    "MP for Sault Ste Marie",
    "MP for Central Honiara",
    "NSW MP for Burrinjuck",
    "MP for Wellingborough",
    "MP for City of Durham",
    "MP for North Cornwall",
    "MP for Sarnia—Lambton",
    "MPP for Niagara Falls",
    "MP for Hatay Province",
    "MP for Aberdeen South",
    "MP for Knowsley South",
    "MPP of from Cambridge",
    "MP for Stockton North",
    "MP for Winnipeg South",
    "MP for Dalarna County",
    "MP for Brandon—Souris",
    "MP for Glasgow Pollok",
    "MPP for Ottawa Rideau",
    "speaker of MPR DPR in",
    "MP for Uppsala County",
    "MP for Berettyóújfalu",
    "MP for Glasgow Provan",
    "MP for Ikaroa Rāwhiti",
    "MP for Bradford West",
    "MP for Hamilton West",
    "Labour Party list MP",
    "MP for Otago Central",
    "MP for Ottawa—Vanier",
    "MP for Dunedin North",
    "MP for Pas de Calais",
    "MP for Kajiado North",
    "MP for Lewisham West",
    "Labour MP for Newark",
    "MP for Edmonton East",
    "MP for Middlesbrough",
    "MP for Glasgow Govan",
    "MP for North Malaita",
    "MP for Chennai North",
    "MPP for Durham West",
    "MP for South Antrim",
    "MP for Vaipae Tautu",
    "MP for Swansea West",
    "MP for Newport West",
    "MP for North Sydney",
    "NSW MP for Corrimal",
    "MP for Invercargill",
    "MP for Western Hutt",
    "MP for West Lothian",
    "MP for Cardiff West",
    "MP for North Imenti",
    "MPP for Essex South",
    "MP for Rowley Regis",
    "MP for Gelang Patah",
    "MP for Guruve South",
    "MP for Gainsborough",
    "MP for Basingstoke",
    "MP for Cooch Behar",
    "MP for North Shore",
    "MP for Eden Monaro",
    "MP for Oldham West",
    "MP for West Dorset",
    "MP for Temotu Pele",
    "MP for Maharashtra",
    "head of the UOC MP",
    "MP for Capricornia",
    "MP from Saint John",
    "MP for Londonderry",
    "MP for Regina East",
    "MP for West Tyrone",
    "MP for Mount Royal",
    "MP for Harrow West",
    "MP for Grey—Simcoe",
    "MP for Port Arthur",
    "MP for Bexleyheath",
    "MP for Marijampolė",
    "MP for Fredericton",
    "MP for Hull—Aylmer",
    "MP for Wythenshawe",
    "MP for Clackmannan",
    "President of AMPAS",
    "MP for North Devon",
    "MP for Essex South",
    "MP for Banaskantha",
    "MP for Billericay",
    "MP for Guanajuato",
    "MP for Banffshire",
    "MP for Eastbourne",
    "MP for West Derby",
    "MP for Heretaunga",
    "Northern Cape MPL",
    "MP for Repentigny",
    "MP for Bromsgrove",
    "MP for Accrington",
    "MP for Island Bay",
    "MP for Clydesdale",
    "MP for The Wrekin",
    "MP for Vijayawada",
    "MP for Leominster",
    "MP for Kilmarnock",
    "MP for Hull North",
    "MP for Carshalton",
    "MP for Midlothian",
    "MP for York North",
    "MP for Rushcliffe",
    "MPP for Bellwoods",
    "MP for South Down",
    "MP for Mahasamund",
    "MP for Deggendorf",
    "MP for Buckingham",
    "MP for Coimbatore",
    "MP for Nuevo León",
    "MP for Charlevoix",
    "MP for Srikakulam",
    "MP for Pontefract",
    "MP for Winchester",
    "MPP for York East",
    "MP for Brentford",
    "MP for Wairarapa",
    "MP for Lyttelton",
    "MP for Faversham",
    "MP for Warringah",
    "MP for Cambridge",
    "MP for Orpington",
    "MP for Rochester",
    "MP for Frontenac",
    "MP for Hampstead",
    "MP for Szigetvár",
    "MP for Tongariro",
    "MP for Churchill",
    "MP for Easington",
    "MP for Kaohsiung",
    "MP for Brighouse",
    "MP for Greenwich",
    "MP for Tongatapu",
    "MP for Robertson",
    "MP for Waitakere",
    "MP for Pencarrow",
    "MP for Worcester",
    "MP for Stretford",
    "MP for Smethwick",
    "MP for Tottenham",
    "MP for Nagercoil",
    "MP for Nizamabad",
    "MP for Lancaster",
    "MP for Cuddalore",
    "MP for Ashburton",
    "MP for Wakefield",
    "MP for Penistone",
    "MP for Saarlouis",
    "MP for Mt Albert",
    "MP for Hastings",
    "MP for Richmond",
    "MP for Barnsley",
    "MP for Coahuila",
    "MP for La Trobe",
    "MP for Wide Bay",
    "MP for Falmouth",
    "MP for McMillan",
    "MP for Galloway",
    "MP for Värmland",
    "Balochistan MPA",
    "MP for Waterloo",
    "MP for Solihull",
    "MP for Hereford",
    "MP for Greenock",
    "MP for Hyndburn",
    "MP for El Koura",
    "MP for Palliser",
    "MP for Finchley",
    "MP for Tiverton",
    "MP for Sherwood",
    "MP for Bilaspur",
    "MP for Jelutong",
    "MP for Fallujah",
    "MP for Keighley",
    "MP for Chittoor",
    "MPP for Welland",
    "MP for Heywood",
    "MP for Dalarna",
    "MP for Denison",
    "MP for Badulla",
    "MP for Lasalle",
    "MP for Jalisco",
    "MP for Makueni",
    "MP for Bangaon",
    "MP for Western",
    "MP for Entally",
    "MP for Nyakach",
    "MP for Newbury",
    "MP for Taunton",
    "MP for Dum Dum",
    "MP for Feltham",
    "MP for Mitcham",
    "MP for Wallace",
    "MP of Northern",
    "independent MP",
    "MP for Blaydon",
    "MP for Trinity",
    "MP for Consett",
    "MP for Romford",
    "MP for Spadina",
    "MP for Mercier",
    "MP for Sudbury",
    "MP for Ipswich",
    "MP for Oxford",
    "Queensland MP",
    "MP for Jorhat",
    "MP for Ukonga",
    "MP for Hughes",
    "MP for Dungun",
    "MP for Rompin",
    "MPP for South",
    "MP for Dawson",
    "MP for Argyll",
    "three time MP",
    "MP for Halton",
    "MP of Gujarat",
    "MP for Kabete",
    "MP for Belper",
    "MP for Oaxaca",
    "MP for Melton",
    "MP for Boston",
    "MP for Conway",
    "MP for Mannar",
    "MP for Kigoma",
    "MP for Khulna",
    "MP for Araria",
    "MP for Batley",
    "MP for Leyton",
    "MP for Bowman",
    "MP for Kenema",
    "MP for Jarrow",
    "MP for Ngella",
    "MP for Maldon",
    "MP for Gwydir",
    "MP for Butere",
    "MP for Ndhiwa",
    "MP for Brecon",
    "MP for Fraser",
    "MP for Hunter",
    "MP for Heston",
    "MP for Athens",
    "MP for Viborg",
    "MP for Tumkur",
    "MP for Dudley",
    "MP for Kapiti",
    "MP for Howrah",
    "MP for Napier",
    "MP for Henty",
    "MP for Poole",
    "MP for Geita",
    "New South MP",
    "MP for Brant",
    "MP for Ranau",
    "MP for Waipa",
    "MP for Alwar",
    "MP for Acton",
    "MP for Perth",
    "MP for Truro",
    "MP for Royal",
    "MP for Udupi",
    "MP for Wells",
    "MP for Conwy",
    "MP for Elgin",
    "MP for Nketa",
    "MP for Monor",
    "five time MP",
    "MP for Swan",
    "MP for Swat",
    "MP of Sabah",
    "MP for Sibu",
    "Gauteng MPL",
    "MP for Raub",
    "MP for Hutt",
    "MP for Lowe",
    "Liberal MPP",
    "MP for Aska",
    "MP for York",
    "MP for Buem",
    "Punjab MPA",
    "MP for Bow",
    "MP for Ayr",
    "Sindh MPA",
    "List MP",
    "NSW MP",
    "MPP of",
    "MP for",
    "MPP",
    "MPA",
    "MPL",
    "MP",
]
politics_govt_law = sorted(
    list(set(politics_govt_law)), key=lambda x: len(x), reverse=True
)

arts = [
    "shot Andy Warhol Shot Marilyns paintings",
]
arts = sorted(list(set(arts)), key=lambda x: len(x), reverse=True)

sports = [
    "and Olympic shot putter",
    "shot putter",
]
sports = sorted(list(set(sports)), key=lambda x: len(x), reverse=True)

sciences = [
    "known as one of the world leading authorities on cancer research",
    "co developer of ultrasound use in cancer detection",
    "treated herself for breast cancer on Antarctica in",
    "co discoverer of drugs that fight cancer",
    "pioneer in breast cancer treatment",
    "expert in breast cancer treatment",
]
sciences = sorted(list(set(sciences)), key=lambda x: len(x), reverse=True)

business_farming = []
business_farming = sorted(
    list(set(business_farming)), key=lambda x: len(x), reverse=True
)

academia_humanities = []
academia_humanities = sorted(
    list(set(academia_humanities)), key=lambda x: len(x), reverse=True
)

law_enf_military_operator = []
law_enf_military_operator = sorted(
    list(set(law_enf_military_operator)), key=lambda x: len(x), reverse=True
)

spiritual = []
spiritual = sorted(list(set(spiritual)), key=lambda x: len(x), reverse=True)

social = [
    "pediatric cancer advocate",
]
social = sorted(list(set(social)), key=lambda x: len(x), reverse=True)

crime = []
crime = sorted(list(set(crime)), key=lambda x: len(x), reverse=True)

event_record_other = [
    "shot Eddie Waitkus",
]
event_record_other = sorted(
    list(set(event_record_other)), key=lambda x: len(x), reverse=True
)

other_species = []
other_species = sorted(list(set(other_species)), key=lambda x: len(x), reverse=True)

cause_of_death = [
    "stroke following decade long battle with breast cancer",
    "died during treatment for testicular cancer in",
    "kidney failure associated with colon cancer",
    "complications of treatment for lung cancer",
    "pneumonia as a complication of lung cancer",
    "meningitis complicated from breast cancer",
    "complications from kidney cancer surgery",
    "complications of breast cancer treatment",
    "complications of prostate cancer surgery",
    "complications from bowel cancer surgery",
    "complications from gall bladder cancer",
    "pancreatic cancer",
    "heart attack due to pancreatic cancer",
    "complications from bone marrow cancer",
    "pneumonia as a complication of cancer",
    "complication following cancer surgery",
    "complications from esophageal cancer",
    "complications from pancreatic cancer",
    "complications from colorectal cancer",
    "heart attack during cancer treatment",
    "complications from bile duct cancer",
    "complications from prostate cancer",
    "cardiac arrest due to colon cancer",
    "as a complication of breast cancer",
    "complications of colorectal cancer",
    "complications of pancreatic cancer",
    "complications from bladder cancer",
    "complications from stomach cancer",
    "complications from cancer surgery",
    "liver cancer complicated by COVID",
    "complications from throat cancer",
    "complications from breast cancer",
    "complications of prostate cancer",
    "kidney complications from cancer",
    "complications from liver cancer",
    "complications from colon cancer",
    "complications from brain cancer",
    "complications of ovarian cancer",
    "euthanised for abdominal cancer",
    "complications from lung cancer",
    "complications of breast cancer",
    "complications from skin cancer",
    "complications from oral cancer",
    "complications of brain cancer",
    "complications of liver cancer",
    "complications of colon cancer",
    "complications of lung cancer",
    "metastatic pancreatic cancer",
    "metastatic esophageal cancer",
    "throat cancer",
    "complications from cancer",
    "anaplastic thyroid cancer",
    "urothelial bladder cancer",
    "metastasized liver cancer",
    "metastatic breast cancer",
    "medullary thyroid cancer",
    "euthanized due to cancer",
    "cancer related pneumonia",
    "complications of cancer",
    "metastatic colon cancer",
    "gastrointestinal cancer",
    "small cell lung cancer",
    "small intestine cancer",
    "neuroendrocrine cancer",
    "cancer related illness",
    "salivary gland cancer",
    "neuroendocrine cancer",
    "hepatocellular cancer",
    "metastatic eye cancer",
    "cancerous peritonitis",
    "periampullary cancer",
    "spinal fluid cancer",
    "cancer of the spine",
    "renal pelvic cancer",
    "bone marrow cancer",
    "gallbladder cancer",
    "oesophageal cancer",
    "endometrial cancer",
    "male breast cancer",
    "nasopharynx cancer",
    "pancreatic cancer",
    "esophageal cancer",
    "colorectal cancer",
    "intestinal cancer",
    "testicular cancer",
    "peritoneal cancer",
    "pharyngeal cancer",
    "metastatic cancer",
    "mandibular cancer",
    "Leka Zogu; cancer",
    "bile duct cancer",
    "laryngeal cancer",
    "abdominal cancer",
    "lymphatic cancer",
    "esophagus cancer",
    "carcinoid cancer",
    "ampullary cancer",
    "prostate cancer",
    "cervical cancer",
    "appendix cancer",
    "pancreas cancer",
    "lymphoma cancer",
    "thoracic cancer",
    "urethral cancer",
    "stomach cancer",
    "ovarian cancer",
    "bladder cancer",
    "uterine cancer",
    "thyroid cancer",
    "adrenal cancer",
    "gastric cancer",
    "myeloma cancer",
    "vaginal cancer",
    "of lung cancer",
    "throat cancer",
    "kidney cancer",
    "spinal cancer",
    "tongue cancer",
    "tonsil cancer",
    "thymic cancer",
    "rectal cancer",
    "vulvar cancer",
    "thymus cancer",
    "pelvic cancer",
    "liver cancer",
    "brain cancer",
    "colon cancer",
    "bowel cancer",
    "blood cancer",
    "renal cancer",
    "heart cancer",
    "sinus cancer",
    "mouth cancer",
    "spine cancer",
    "Liver cancer",
    "lung cancer",
    "bone cancer",
    "skin cancer",
    "oral cancer",
    "anal cancer",
    "nose cancer",
    "heart attack following a cerebral haemorrhage",
    "heart attack during Olympic marathon trials",
    "suspected heart attack while hillwalking",
    "heart attack caused by anorexia nervosa",
    "cardiac arrest following a heart attack",
    "complications following a heart attack",
    "heart attack following a hunger strike",
    "heart attack brought about by diabetes",
    "heart attack caused by a drug overdose",
    "suffered a heart attack while swimming",
    "complications following a\xa0heart attack",
    "heart attack following spinal surgery",
    "heart attack due to pancreatic cancer",
    "apparent heart attack while teaching",
    "heart failure following heart attack",
    "brain haemorrhage after heart attack",
    "heart attack during cancer treatment",
    "complications from a heart attack",
    "heart attack triggered by asthma",
    "complications from heart attack",
    "complications of a heart attack",
    "heart attack following beating",
    "complications of heart attack",
    "series of small heart attacks",
    "heart attack aboard aircraft",
    "heart attack as a result of",
    "heart attack while jogging",
    "post surgery heart attack",
    "heart attack after race",
    "suspected heart attack",
    "apparent heart attack",
    "probable heart attack",
    "possible heart attack",
    "heart attack",
    "heart failure resulting from hypertrophic cardiomyopathy",
    "heart failure as a complication from cardiac surgery",
    "heart failure after surgery following a knockout",
    "heart failure after a botched suicide attempt",
    'known as "Crazy" Luke Graham; heart failure',
    "complications from congestive heart failure",
    "heart failure following accidental overdose",
    "heart failure caused by anorexia nervosa",
    "heart failure due to pulmonary embolism",
    "heart failure due to cardiac arrhythmia",
    "complications related to heart failure",
    "heart failure following heart attack",
    "pneumonia congestive heart failure",
    "heart failure related to pneumonia",
    "heart failure due to polymyositis",
    "complications from heart failure",
    "complications of heart failure",
    "progeria related heart failure",
    "hypertensive heart failure",
    "heart failure from sepsis",
    "congestive heart failure",
    "suspected heart failure",
    "apparent heart failure",
    "acute heart failure",
    "dheart failure",
    "heart failure",
    "liver failure reportedly complicated by COVID",
    "pulmonary fibrosis complicated by COVID",
    "cardiovascular illness related to COVID",
    "Parkinson disease complicated by COVID",
    "multiple organ failure caused by COVID",
    "kidney problems aggravated by COVID",
    "renal failure complicated by COVID",
    "kidney failure brought on by COVID",
    "sepsis as a complication of COVID",
    "long illness complicated by COVID",
    "liver cancer complicated by COVID",
    "pneumonia complicated by COVID",
    "heart complications from COVID",
    "viral pneumonia from COVID",
    "cardiac arrest from COVID",
    "complications from COVID",
    "COVID related pneumonia",
    "complications of COVID",
    "post COVID pneumonia",
    "pneumonia from COVID",
    "COVID",
    "hospitalized with pneumonia since his evacuation several days after",
    "pneumonia as a complication from a kidney infection",
    "pneumonia with complications from Alzheimer disease",
    "pneumonia as a complication of multiple myeloma",
    "pneumonia induced corticobasal degeneration",
    "pneumonia as a complication of lung cancer",
    "pneumonia as a complication from a stroke",
    "complications of pneumonia from surgery",
    "pneumonia as a complication of a stroke",
    "pneumonia as a complication of cancer",
    "bronchopneumonia following a stroke",
    "heart failure related to pneumonia",
    "pneumonia congestive heart failure",
    "respiratory failure from pneumonia",
    "complications following pneumonia",
    "complications of viral pneumonia",
    "pneumonia complicated by COVID",
    "complications from pneumonia",
    "pneumonia following a stroke",
    "complication from pneumonia",
    "complications of pneumonia",
    "viral pneumonia from COVID",
    "pneumonia related illness",
    "cancer related pneumonia",
    "COVID related pneumonia",
    "AIDS related pneumonia",
    "interstitial pneumonia",
    "aspiration pneumonia",
    "pneumonia from COVID",
    "hypostatic pneumonia",
    "post COVID pneumonia",
    "bronchial pneumonia",
    "bilateral pneumonia",
    "pleural pneumonia",
    "bronchopneumonia",
    "double pneumonia",
    "pneumonia during",
    "viral pneumonia",
    "acute pneumonia",
    "pneumonia",
    "stroke following decade long battle with breast cancer",
    "stroke as a complication of an aortic aneurysm",
    "cardiac arrest as a complication from a stroke",
    "stroke during treatment of pulmonary embolism",
    "stroke related to acute myelogenous leukemia",
    "stroke as a complication from heart surgery",
    "pneumonia as a complication from a stroke",
    "stroke from vertebral artery dissection",
    "pneumonia as a complication of a stroke",
    "complications from a series of strokes",
    "complications following a heat stroke",
    "complications of a stroke suffered in",
    "complications from multiple strokes",
    "bronchopneumonia following a stroke",
    "declining health following stroke",
    "complications following a stroke",
    "complications of massive stroke",
    "complications following strokes",
    "complications from heat stroke",
    "complications from heatstroke",
    "following a series of strokes",
    "cerebral atrophy from stroke",
    "pneumonia following a stroke",
    "complications after a stroke",
    "complications from a stroke",
    "aneurysm following a stroke",
    "complications from strokes",
    "complications of a stroke",
    "complications from stroke",
    "complications of strokes",
    "hemorrhagic brain stroke",
    "complications of stroke",
    "stroke following a fall",
    "consequences of stroke",
    "stroke complications",
    "died of a stroke in",
    "hemorrhagic stroke",
    "series of strokes",
    "multiple strokes",
    "watershed stroke",
    "cerebral stroke",
    "heat stroke",
    "heatstroke",
    "strokes",
    "stroke",
]
cause_of_death = sorted(list(set(cause_of_death)), key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [12]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

<IPython.core.display.Javascript object>

#### Extracting `known_for` Categories and `cause_of_death` Values from `info_3_0`

In [13]:
%%time

# Column to check
column = 'info_3_0'

# Start dataframe
dataframe = df[df[column].notna()]

# For loop to find cause in column and extract it to cause_of_death
for cause in cause_of_death:
    for index in dataframe.index:
        item = df.loc[index, column]
        if item:
            if cause in item:
                df.loc[index, 'cause_of_death'] = cause
                df.loc[index, column] = item.replace(cause, '').strip()
                
# For loop to find role in column and extract it as category
for category, category_lst in known_for_dict.items():
    for role in category_lst:
        for index in dataframe.index:
                item = df.loc[index, column]
                if item:
                    if role in item:
                        df.loc[index, category] = 1
                        df.loc[index, column] = item.replace(role, '').strip()

# Calculating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking remaining number of missing cause_of_death values
print(f'There are {df["cause_of_death"].notna().sum()} values in cause_of_death column.\n')

There are 10084 values in cause_of_death column.

CPU times: total: 3min 31s
Wall time: 3min 31s


<IPython.core.display.Javascript object>

#### Checking Updated `num_categories` Value Counts

In [14]:
# Checking Updated num_categories Value Counts
df["num_categories"].value_counts()

1    88120
2     9682
3      194
0       45
Name: num_categories, dtype: int64

<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` and `cause_of_death` for the next iteration.

#### Finding `known_for` Roles and `cause_of_death` in `info_3_0`

In [15]:
# # Obtaining values for column and their counts
# roles_cause_list = df["info_3_0"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [16]:
# # Code to check each value
# roles_cause_list.pop()

<IPython.core.display.Javascript object>

In [17]:
# # Create specific_roles_cause_list for above popped value
# specific_roles_cause_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_3_0"].notna()].index
#             if "Olymp" in df.loc[index, "info_3_0"]
#         ],
#         "info_3_0",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [18]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_cause_list, key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [19]:
# # Example code to quick-check a specific entry
# df[df["info_3_0"] == "Mr Olympia"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category and for `cause_of_death`

In [20]:
# Creating lists for each category and sorting by decreasing length and removing duplicates

politics_govt_law = [
    "member of the House of Representatives from Massachusetts th congressional district",
    "member of the House of Representatives from South Dakota nd congressional district",
    "member of the House of Representatives from Wisconsin rd congressional district",
    "member of the House of Representatives from Illinois rd congressional district",
    "member of the House of Representatives from Arkansas th congressional district",
    "member of the House of Representatives from Michigan rd congressional district",
    "member of the House of Representatives from Illinois th congressional district",
    "member of the House of Representatives for Wisconsin th congressional district",
    "member of the House of Representatives from Maryland th congressional district",
    "member of the House of Representatives from Indiana th congressional district",
    "Member of the House of Representatives from Indiana th congressional district",
    "member of the House of Representatives for Alabama th congressional district",
    "member of the House of Representatives from Ohio th congressional district",
    "member of the House of Representatives for Idaho nd congressional district",
    "member of the House of Representatives from Utah st congressional district",
    "having served as a Labour House of Representatives of Member of Parliament",
    "member of the House of Representatives from th congressional district",
    "member of the House of Representatives from st congressional district",
    "member of the House of Representatives for North Carolina th district",
    "Member of the House of Representatives from th congressional district",
    "House of Representatives from Wyoming at large congressional district",
    "member of the House of Representatives for th congressional district",
    "member of the House of Representatives for West Virginia rd district",
    "member of the House of Representatives for Vermont at large district",
    "member of the House of Representatives from Tennessee th district",
    "member of the House of Representatives from Virginia th district",
    "member of the House of Representatives from Arkansas rd district",
    "member of the House of Representatives for Minnesota th district",
    "member of the House of Representatives for Minnesota th District",
    "member of the House of Representatives for Maryland nd district",
    "member of the House of Representatives for Missouri th district",
    "Permanent Representative to the United Nations Office at Geneva",
    "member of the House of Representatives for Illinois nd district",
    "member of the House of Representatives from Montana nd district",
    "member of the House of Representatives for Maryland th District",
    "member of the House of Representatives from Hawaii nd district",
    "member of the House of Representatives for Alabama th district",
    "member of the House of Representatives from Nevada nd district",
    "four time member of the New Hampshire House of Representatives",
    "member of the House of Representatives for Hawaii st district",
    "longest serving female member of the House of Representatives",
    "member of the Jakarta Regional People Representative Council",
    "member of the House of Representatives from Iowa rd district",
    "member of the House of Representatives for Ohio th district",
    "Representative from West Virginia th congressional district",
    "member of the House of Representatives from New th district",
    "member of the House of Representatives for the st District",
    "member of the House of Representatives for New th district",
    "member of the House of Representatives from th districts",
    "Representative from the th District of Negros Occidental",
    "member of the House of Representatives from th district",
    "member of the House of Representatives from Michigan th",
    "member of the House of Representatives from Arkansas th",
    "member of the House of Representatives for Wisconsin th",
    "member of the House of Representatives from Illinois nd",
    "Representative from Tennessee rd congressional district",
    "member of the House of Representatives from Connecticut",
    "member of the House of Representatives for th district",
    "member of House of Representatives for the st district",
    "member of the House of Representatives from Arizona nd",
    "former Republican Representative from Illinois from to",
    "former Republican Representative from Michigan from to",
    "member of the House of Representatives for st district",
    "member of the North Carolina House of Representatives",
    "member of the House of Representatives from Minnesota",
    "Member of the North Carolina House of Representatives",
    "former Democratic Representative from Washington from",
    "former Republican Representative from Colorado served",
    "Speaker of the Massachusetts House of Representatives",
    "member of the Massachusetts House of Representatives",
    "member of the New Hampshire House of Representatives",
    "member of the House of Representatives from Colorado",
    "member of the House of Representatives of the Senate",
    "member of the House of Representatives for St George",
    "member of the House of Representatives from Michigan",
    "member of the House of Representatives from Missouri",
    "member of the House of Representatives from Illinois",
    "member of the South Dakota House of Representatives",
    "member of the North Dakota House of Representatives",
    "member of the House of Representatives from Indiana",
    "Member of the South Dakota House of Representatives",
    "former Republican Representative from Maine from to",
    "member of the House of Representatives for Illinois",
    "member of the Connecticut House of Representatives",
    "member of the House of Representatives for Gilmore",
    "member of the House of Representatives from Oregon",
    "member of the House of Representatives for Batanes",
    "member of the House of Representatives for Ohio st",
    "delegate to the House of Representatives from Guam",
    "member of the House of Representatives for Forrest",
    "Permanent Representative to the Economic Community",
    "member of the House of Representatives for Berowra",
    "member of the House of Representatives for Phillip",
    "member of the Washington House of Representatives",
    "member of House of Representatives from Wisconsin",
    "member of the House of Representatives for Cowper",
    "member of the House of Representatives for Sydney",
    "member of the House of Representatives for Oregon",
    "speaker of the Tennessee House of Representatives",
    "Speaker of the Tennessee House of Representatives",
    "member of North Carolina House of Representatives",
    "member of the Minnesota House of Representatives",
    "member of the Tennessee House of Representatives",
    "member of the Missouri Houses of Representatives",
    "member of the House of Representatives from Ohio",
    "member of the House of Representatives from Utah",
    "Speaker of the Missouri House of Representatives",
    "member of the House of Representatives for Bohol",
    "member of the Illinois House of Representatives",
    "member of the Missouri House of Representatives",
    "member of the Michigan House of Representatives",
    "member of the Arkansas House of Representatives",
    "member of the Oklahoma House of Representatives",
    "member of the Colorado House of Representatives",
    "member of the Delaware House of Representatives",
    "Senator of the Indiana House of Representatives",
    "Permanent Representative of Polisario to the UN",
    "member of the House of Representatives for Reid",
    "member of the House of Representatives from New",
    "former liberal Democrat Representative from and",
    "former Republican Representative from Iowa from",
    "member of the House of Representatives for Ohio",
    "last Doorkeeper of the House of Representatives",
    "member of South Dakota House of Representatives",
    "member of the Alabama House of Representatives",
    "member of the Wyoming House of Representatives",
    "member of the Arizona House of Representatives",
    "member of the Indiana House of Representatives",
    "member of the Vermont House of Representatives",
    "member of the Montana House of Representatives",
    "member of the House of Representatives from th",
    "Permanent Representative to the United Nations",
    "Speaker of the Hawaii House of Representatives",
    "senator of the Alaska House of Representatives",
    "Member of the Vermont House of Representatives",
    "member of the House of Representatives from rd",
    "Member of the House of Representatives from th",
    "first Delegate to the House of Representatives",
    "former Democratic Representative from Michigan",
    "Speaker of the Alaska House of Representatives",
    "member of the Alaska House of Representatives",
    "member of the Kansas House of Representatives",
    "member of the Oregon House of Representatives",
    "member of the Hawaii House of Representatives",
    "member of the House of Representatives for th",
    "former member of the House of Representatives",
    "member of the Nevada House of Representatives",
    "Member of the Hawaii House of Representatives",
    "member of the Idaho House of Representatives",
    "member of the Maine House of Representatives",
    "Member of Minnesota House of Representatives",
    "member of the House of Representatives since",
    "Democratic Congressional Representative from",
    "Speaker of the Utah House of Representatives",
    "member of the Ohio House of Representatives",
    "member of the Iowa House of Representatives",
    "member of the Utah House of Representatives",
    "member of the House of Representatives from",
    "Member of the Ohio House of Representatives",
    "member of the People Representative Council",
    "member of Oklahoma House of Representatives",
    "husband of Representative Carolyn B Maloney",
    "member of the New House of Representatives",
    "member of the P R House of Representatives",
    "former Republican Representative from from",
    "member of the House of Representatives for",
    "member of Wyoming House of Representatives",
    "president of the House of Representatives",
    "member of Hawaii House of Representatives",
    "Representative from Minnesota th district",
    "member of the House of Representatives of",
    "Historian of the House of Representatives",
    "member of the Chamber of Representatives",
    "member of the Council of Representatives",
    "member of Maine House of Representatives",
    "Representative from Illinois th district",
    "Delegate to the House of Representatives",
    "first Permanent Representative to the UN",
    "Speaker of the House of Representatives",
    "speaker of the House of Representatives",
    "member of the Houses of Representatives",
    "member of House of Representatives from",
    "member of the House of Representatives",
    "Member of the House of Representatives",
    "Massachusetts House of Representatives",
    "former Democratic Representative from",
    "former Republican Representative from",
    "Speaker of House of Representatives",
    "member of House of Representatives",
    "Representative from North Carolina",
    "Tennessee House of Representatives",
    "Representative from New Hampshire",
    "Permanent Representative to NATO",
    "Representative from North Dakota",
    "Representative from South Dakota",
    "Representative from Connecticut",
    "Representative for Connecticut",
    "Republican Representative from",
    "Representative from Wisconsin",
    "Utah House of Representatives",
    "Representative from Tennessee",
    "Arkansas State Representative",
    "Colorado State Representative",
    "Representative from Illinois",
    "Representative from Virginia",
    "Representative from Michigan",
    "Representative from Missouri",
    "Representative from Maryland",
    "Representative from Indiana",
    "Representative Co Prince of",
    "Representative for Michigan",
    "Representative from Arizona",
    "Representative from Alabama",
    "Representative from Vermont",
    "Kansas State Representative",
    "Representative for Virginia",
    "Representative from Hawaii",
    "Representative from Alaska",
    "Representative from Oregon",
    "Representative from Kansas",
    "Representative from Maine",
    "Representative from Ohio",
    "Representative from Iowa",
    "Representative from New",
    "Representative for Ohio",
    "Representative for Utah",
    "Representative for Iowa",
    "Arkansas Representative",
    "State Representative",
    "Representative elect",
    "Trade Representative",
    "Queen Representative",
    "Representative from",
    "Representative for",
    "Representative",
    "co writer of the Endangered Species Act of",
]
politics_govt_law = sorted(
    list(set(politics_govt_law)), key=lambda x: len(x), reverse=True
)

arts = [
    "first female writer awarded full press credentials at",
    "member of the Nashville Songwriters Hall of Fame",
    "a prolific writer of language text books",
    "widow of science fiction writer",
    "award winning Ontarian writer",
    "and television screenwriter",
    "presidential speechwriter",
    "speech writer",
    "detective story writer",
    "songwriter arranger",
    "nationalized writer",
    "short story writer",
    "esotericism writer",
    "non‑fiction writer",
    "singer songwriter",
    "television writer",
    "songwriter winner",
    "technical writer",
    "cookbook writer",
    "children writer",
    "dialogue writer",
    "fiction writer",
    "and songwriter",
    "travel writer",
    "script writer",
    "comics writer",
    "comedy writer",
    "screenwriter",
    "scriptwriter",
    "music writer",
    "story writer",
    "food writer",
    "songwriter",
    "and writer",
    "art writer",
    "Director General of the Olympia",
]
arts = sorted(list(set(arts)), key=lambda x: len(x), reverse=True)

sports = [
    "only Olympian handball player to represent three countries",
    "won the gold medal in hurdles at the Summer Olympics",
    "winner of seven Olympic gold medals for the Union",
    "president of the International Olympic Committee",
    "bronze medalist in the Summer Olympics marathon",
    "International Olympic Committee whistleblower",
    "first black woman to become Olympic champion",
    "national team member for the Winter Olympics",
    "President of the National Olympic Committee",
    "most medaled athlete at the Summer Olympics",
    "gold medallist in the Winter Olympic Games",
    "winner of the Olympic demonstration event",
    "led push to return tennis to Olympics in",
    "represented in soccer at Summer Olympics",
    "brother of Olympic Judo star Neil Adams",
    "silver medallist at the Summer Olympics",
    "silver medalist at the Summer Olympics",
    "gold medallist at the Summer Olympics",
    "Olympic gold medallist for the Union",
    "lit cauldron at the Summer Olympics",
    "president of Olympique de Marseille",
    "president of the Olympic Committee",
    "represented in six Chess Olympiads",
    "longest living Olympic competitor",
    "Chairman of the Olympic Committee",
    "gold medallist at Summer Olympics",
    "oldest known Olympic medal winner",
    "two time Olympic silver medalist",
    "Olympic gold medal winning boxer",
    "double medallist at the Olympics",
    "Olympics high jump gold medalist",
    "Winter Olympics silver medalist",
    "three time Olympic medal winner",
    "founder of the Special Olympics",
    "four time Olympic gold medalist",
    "Summer Olympics silver medalist",
    "president of Olympic Committee",
    "Olympic double silver medalist",
    "oldest living Olympic champion",
    "first female Olympic champion",
    "seven time Olympic competitor",
    "Youth Olympic silver medalist",
    "Olympic middleweight champion",
    "Olympic bronze medal winner",
    "three time Olympic champion",
    "Olympic silver medal winner",
    "twice Olympic gold medalist",
    "Olympic champion in javelin",
    "Olympic champion under \xa0kg",
    "first Olympic medal winner",
    "multiple Olympic medallist",
    "four time Olympic champion",
    "two time Olympic champion",
    "fourfold Olympic champion",
    "Olympic bronze medallist",
    "Olympic silver medallist",
    "Olympic silver medalist",
    "Olympic bronze medalist",
    "double Olympic champion",
    "triple Olympic champion",
    "Olympic gold medallist",
    "member of Olympic team",
    "Olympic sports shooter",
    "Youth Olympic champion",
    "Olympics gold medalist",
    "Olympic relay champion",
    "Olympic gold medalist",
    "Olympic wrestler for",
    "Olympic silver medal",
    "five time Olympian",
    "Olympic medallist",
    "Olympic champion",
    "Olympic medalist",
    "Olympic Champion",
    "Olympic silver",
    "triple Olympic",
    "Olympic gold",
    "Mr Olympia",
    "Olympian",
]
sports = sorted(list(set(sports)), key=lambda x: len(x), reverse=True)

sciences = [
    "inventor of the implantable cardiac pacemaker",
    "developed the Rho immune globulin vaccine for Rh disease",
    "discoverer of Kawasaki disease",
]
sciences = sorted(list(set(sciences)), key=lambda x: len(x), reverse=True)

business_farming = []
business_farming = sorted(
    list(set(business_farming)), key=lambda x: len(x), reverse=True
)

academia_humanities = []
academia_humanities = sorted(
    list(set(academia_humanities)), key=lambda x: len(x), reverse=True
)

law_enf_military_operator = [
    "parachutist at Summer Olympics opening ceremony",
]
law_enf_military_operator = sorted(
    list(set(law_enf_military_operator)), key=lambda x: len(x), reverse=True
)

spiritual = []
spiritual = sorted(list(set(spiritual)), key=lambda x: len(x), reverse=True)

social = [
    "rescued people from suicide",
]
social = sorted(list(set(social)), key=lambda x: len(x), reverse=True)

crime = [
    "planned Summer Olympics Munich massacre",
]
crime = sorted(list(set(crime)), key=lambda x: len(x), reverse=True)

event_record_other = [
    "disguised as female to compete for y at Summer Olympics",
]
event_record_other = sorted(
    list(set(event_record_other)), key=lambda x: len(x), reverse=True
)

other_species = []
other_species = sorted(list(set(other_species)), key=lambda x: len(x), reverse=True)

cause_of_death = [
    "breast cancer",
    "of cancer",
    "cancer",
    "shot with pepper spray projectile by Boston Police",
    "killed by a head shot together with friend",
    "shot whilst covering the Oaxaca protests",
    "shot during attack on José Ramos Horta",
    "shot down during the Battle of Kyiv",
    "self inflicted gunshot to the head",
    "complications from gunshot wounds",
    "shot in the Rajnandgaon ambush",
    "shot by his Chief of Security",
    "suspected suicide by gunshot",
    "gunshot by Brandon McInerney",
    "shot during domestic dispute",
    "apparent suicide by gunshot",
    "possible suicide by gunshot",
    "shot by the Defense Forces",
    "shot by Special Task Force",
    "shot during prison escape",
    "complications of gunshot",
    "shot by record producer",
    "shot during carjacking",
    "shot by police officer",
    "injuries from gunshot",
    "shot in an army raid",
    "homicide by gunshot",
    "shot during robbery",
    "gunshot to the head",
    "shot by the Army in",
    "suicide by gunshot",
    "accidental gunshot",
    "shot while hunting",
    "shot dead in Basra",
    "shot in East Timor",
    "murder by gunshot",
    "shot in Mogadishu",
    "shot by soldiers",
    "shot by the IRA",
    "shot by robbers",
    "shot by police",
    "gunshot wounds",
    "shot to death",
    "gunshot wound",
    "shot times",
    "shot dead",
    "shot down",
    "gunshot",
    "shot",
    "complications of cerebral hemorrhage from traffic collision",
    "complications from a race collision during Hours of Le Mans",
    "complications from injuries sustained in traffic collision",
    "multiple organ failure following traffic collision",
    "complications from a collision during competition",
    "head injuries sustained in a traffic collision",
    "cardiac arrest due to collision with teammate",
    "head injuries sustained in a race collision",
    "blunt force neck injury from race collision",
    "head injury sustained in traffic collision",
    "injuries sustained in a traffic collision",
    "injuries received in a traffic collision",
    "injuries sustained in traffic collision",
    "cardiac arrest after on field collision",
    "complications from a traffic collision",
    "spinal injuries from traffic collision",
    "injuries sustained in a race collision",
    "complications after traffic collision",
    "complications from traffic collision",
    "motor vehicle collision with a moose",
    "head injuries from traffic collision",
    "injuries from a traffic collision",
    "head injuries from race collision",
    "injuries from traffic collision",
    "injuries sustained in collision",
    "vehicle pedestrian collision",
    "suicide by traffic collision",
    "collision during practice",
    "base jumping collision",
    "motorcycle collision",
    "dirt bike collision",
    "racetrack collision",
    "bobsleigh collision",
    "boat race collision",
    "traffic collision ·",
    "training collision",
    "wingsuit collision",
    "traffic collision",
    "tractor collision",
    "cycling collision",
    "vehicle collision",
    "bicycle collision",
    "balloon collision",
    "plane collision",
    "train collision",
    "stunt collision",
    "race collision",
    "boat collision",
    "ATV collision",
    "car collision",
    "cardiac arrest stemming from decompression sickness",
    "cardiac arrhythmia stemming from atherosclerosis",
    "cardiac arrest due to abdominal aortic aneurysm",
    "cardiac arrest due to collision with teammate",
    "possible cardiac arrest during Dakar Rally",
    "brain hemorrhage following cardiac arrest",
    "cardiac arrest after a spinal cord injury",
    "complications following cardiac surgery",
    "cardiac arrest from respiratory failure",
    "cardiac arrest after on field collision",
    "complications from a cardiac condition",
    "complications following cardiac arrest",
    "cardiac arrest following car accident",
    "complications of a cardiac condition",
    "complications from cardiac surgery",
    "cardiac arrest as a result of COPD",
    "complications from cardiac arrest",
    "complications of cardiac surgery",
    "complications of cardiac arrest",
    "cardiac arrest during surgery",
    "cardiac arrest due to sepsis",
    "cardiac arrest from shooting",
    "pulmonary cardiac arrest",
    "cardiac complications",
    "cardiac arrest due to",
    "cardiac amyloidosis",
    "cardiac dysfunction",
    "cardiac dysrhythmia",
    "cardiac arrhythmia",
    "cardiac arrest and",
    "cardiac arrythmia",
    "cardiac aneurysm",
    "cardiac problems",
    "cardiac failure",
    "cardiac ailment",
    "cardiac illness",
    "cardiac arrest",
    "chronic kidney disease caused by type diabetes",
    "of natural causes after a lengthy illness",
    "respiratory failure caused by brain tumor",
    "discovered cause of sickle cell anemia",
    "liver failure caused by Hepatitis C",
    "brain death caused by knockout",
    "complications caused by AIDS",
    "apparent natural causes",
    "AIDS related causes",
    "natural causes",
    "unknown causes",
    "body found on this date after suicide by carbon monoxide poisoning",
    "possible suicide by overdose of prescription painkillers",
    "and became the West first woman suicide bomber",
    "fall from height ruled a suicide by the police",
    "suspected suicide by carbon monoxide poisoning",
    "suicide by overdose of prescription medication",
    "apparent suicide by carbon monoxide poisoning",
    "suicide by self inflicted blunt force trauma",
    "suspected suicide by fall from a building",
    "suicide by carbon monoxide inhalation",
    "suicide by carbon monoxide poisoning",
    "intercepted suicide bomber at school",
    "assisted suicide by lethal injection",
    "apparent suicide in front of a train",
    "suicide by blunt force head injury",
    "killed by suicide bomb in Northern",
    "suicide by inert gas asphyxiation",
    "suicide by jumping from building",
    "apparent suicide by dehydration",
    "injuries from a suicide attempt",
    "suicide by barbiturate overdose",
    "suicide by self defenestration",
    "suicide by jumping from bridge",
    "suicide by jumping from cliff",
    "suspected suicide by hanging",
    "suspected suicide by gunshot",
    "suicide by alcohol poisoning",
    "suicide by helium inhalation",
    "suicide by traffic collision",
    "suicide by cyanide poisoning",
    "suicide by grenade explosion",
    "apparent suicide by stabbing",
    "apparent suicide by hanging",
    "apparent suicide by gunshot",
    "apparent suicide by jumping",
    "suspected suicide by poison",
    "possible suicide by gunshot",
    "physician assisted suicide",
    "suicide by self immolation",
    "suicide prior to execution",
    "suicide by throat cutting",
    "victim of suicide bombing",
    "suicide by defenestration",
    "suicide by drug overdose",
    "charcoal burning suicide",
    "suicide by strangulation",
    "suicide by wrist cutting",
    "suicide by asphyxiation",
    "suicide by train impact",
    "suicide by hand grenade",
    "suicide by poisoning",
    "suicide by car crash",
    "suicide by drowning",
    "suicide by overdose",
    "suicide bomb attack",
    "suicide by stabbing",
    "suicide bomb victim",
    "suicide by hanging",
    "suicide by gunshot",
    "suicide by jumping",
    "suicide by fasting",
    "officially suicide",
    "suicide by alcohol",
    "suspected suicide",
    "assisted suicide",
    "apparent suicide",
    "suicide by train",
    "suicide by pilot",
    "possible suicide",
    "suicide bombing",
    "suicide by fire",
    "suicide by drug",
    "suicide attack",
    "murder suicide",
    "suicide by",
    "Complications of liver disease",
    "hypertensive atherosclerotic cardiovascular disease",
    "complications of a chronic neurological disease",
    "complications from Charcot Marie Tooth disease",
    "complications from peripheral vascular disease",
    "chronic kidney disease caused by type diabetes",
    "complications of Charcot Marie Tooth disease",
    "complications from polycystic kidney disease",
    "complications related to Alzheimer disease",
    "complications from a neuromuscular disease",
    "complications from cardiovascular disease",
    "lung complications from Alzheimer disease",
    "complications from motor neurone disease",
    "active euthanasia for pulmonary disease",
    "complications related to kidney disease",
    "from complications of Parkinson disease",
    "atherosclerotic cardiovascular disease",
    "complications from Parkinson’s disease",
    "complications from respiratory disease",
    "euthanization following a lung disease",
    "chronic obstructive pulmonary disease",
    "liver disease complicated by diabetes",
    "complications from Parkinsons disease",
    "complications from Alzheimer disease",
    "complications from Parkinson disease",
    "euthanized following adrenal disease",
    "complications from Lewy body disease",
    "f complications of Alzheimer disease",
    "complications from pulmonary disease",
    "complications of respiratory disease",
    "complications of a vascular disease",
    "complications of Parkinson disease",
    "complications of Alzheimer disease",
    "complications due to heart disease",
    "complications of Lewy body disease",
    "complications from kidney disease",
    "complication of Parkinson disease",
    "complications from heart disease",
    "complications from liver disease",
    "pulmonary veno occlusive disease",
    "combination of Parkinson disease",
    "complications from lung disease",
    "complications of kidney disease",
    "gastroesophageal reflux disease",
    "complications of heart disease",
    "arteriosclerotic heart disease",
    "complications of Crohn disease",
    "complications of liver disease",
    "cardiovascular renal disease",
    "hypertensive heart disease",
    "hypertrophic heart disease",
    "degenerative brain disease",
    "Creutzfeldt Jakob disease",
    "interstitial lung disease",
    "neurodegenerative disease",
    "cerebral vascular disease",
    "inflammatory lung disease",
    "acute respiratory disease",
    "chronic pulmonary disease",
    "degenerative lung disease",
    "congenital heart disease",
    "gastrointestinal disease",
    "cerebrovascular disease",
    "coronary artery disease",
    "cardiopulmonary disease",
    "ischaemic heart disease",
    "Erdheim Chester disease",
    "discovered Lyme disease",
    "suspected heart disease",
    "cardiovascular disease",
    "chronic kidney disease",
    "coronary heart disease",
    "ischemic heart disease",
    "motor neurone disease",
    "legionnaires' disease",
    "chronic liver disease",
    "meningococcal disease",
    "motor neuron disease",
    "neurological disease",
    "degenerative disease",
    "Lou Gehrig’s disease",
    "respiratory disease",
    "ebola virus disease",
    "Parkinson’s disease",
    "Ebola virus disease",
    "undisclosed disease",
    "Legionnaire disease",
    "sickle cell disease",
    "Lou Gehrig disease",
    "Huntington disease",
    "infectious disease",
    "autoimmune disease",
    "intestinal disease",
    "Alzheimer disease",
    "Parkinson disease",
    "pulmonary disease",
    "alzheimer disease",
    "vascular disease",
    "coronary disease",
    "Addison disease",
    "kidney disease",
    "muscle disease",
    "heart disease",
    "liver disease",
    "blood disease",
    "renal disease",
    "Crohn disease",
    "nerve disease",
    "brain disease",
    "lung disease",
    "Pick disease",
]
cause_of_death = sorted(list(set(cause_of_death)), key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [21]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

<IPython.core.display.Javascript object>

#### Extracting `known_for` Categories and `cause_of_death` Values from `info_3_0`

In [22]:
%%time

# Column to check
column = 'info_3_0'

# Start dataframe
dataframe = df[df[column].notna()]

# For loop to find cause in column and extract it to cause_of_death
for cause in cause_of_death:
    for index in dataframe.index:
        item = df.loc[index, column]
        if item:
            if cause in item:
                df.loc[index, 'cause_of_death'] = cause
                df.loc[index, column] = item.replace(cause, '').strip()
                
# For loop to find role in column and extract it as category
for category, category_lst in known_for_dict.items():
    for role in category_lst:
        for index in dataframe.index:
                item = df.loc[index, column]
                if item:
                    if role in item:
                        df.loc[index, category] = 1
                        df.loc[index, column] = item.replace(role, '').strip()

# Calculating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking number of cause_of_death values
print(f'There are {df["cause_of_death"].notna().sum()} values in cause_of_death column.\n')

There are 17342 values in cause_of_death column.

CPU times: total: 3min 24s
Wall time: 3min 24s


<IPython.core.display.Javascript object>

#### Checking Updated `num_categories` Value Counts

In [23]:
# Checking Updated num_categories Value Counts
df["num_categories"].value_counts()

1    88099
2     9704
3      195
0       43
Name: num_categories, dtype: int64

<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` and `cause_of_death` for the next iteration.

#### Finding `known_for` Roles and `cause_of_death` in `info_3_0`

In [None]:
# Obtaining values for column and their counts
roles_cause_list = df["info_3_0"].value_counts(ascending=True).index.tolist()

In [None]:
# Code to check each value
roles_cause_list.pop()

In [None]:
# # Create specific_roles_cause_list for above popped value
# specific_roles_cause_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_3_0"].notna()].index
#             if "general" in df.loc[index, "info_3_0"]
#         ],
#         "info_3_0",
#     ]
#     .value_counts()
#     .index.tolist()
# )

In [None]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_cause_list, key=lambda x: len(x), reverse=True)

In [None]:
# # Example code to quick-check a specific entry
# df[df["info_3_0"] == "outlaw country music singer songwriter"]

#### Creating Lists for Each `known_for` Category and for `cause_of_death`

In [None]:
# Creating lists for each category and sorting by decreasing length and removing duplicates

politics_govt_law = []
politics_govt_law = sorted(list(set(politics_govt_law)), key=lambda x: len(x), reverse=True)  

arts = []
arts = sorted(list(set(arts)), key=lambda x: len(x), reverse=True)  

sports = []
sports = sorted(list(set(sports)), key=lambda x: len(x), reverse=True) 

sciences = []
sciences = sorted(list(set(sciences)), key=lambda x: len(x), reverse=True) 

business_farming = []
business_farming = sorted(list(set(business_farming)), key=lambda x: len(x), reverse=True)  

academia_humanities = []
academia_humanities = sorted(list(set(academia_humanities)), key=lambda x: len(x), reverse=True)  

law_enf_military_operator = []
law_enf_military_operator = sorted(list(set(law_enf_military_operator)), key=lambda x: len(x), reverse=True)  

spiritual = []
spiritual = sorted(list(set(spiritual)), key=lambda x: len(x), reverse=True)  

social = []
social = sorted(list(set(social)), key=lambda x: len(x), reverse=True)  

crime = []
crime = sorted(list(set(crime)), key=lambda x: len(x), reverse=True)  

event_record_other = []
event_record_other = sorted(list(set(event_record_other)), key=lambda x: len(x), reverse=True)  

other_species = []
other_species = sorted(list(set(other_species)), key=lambda x: len(x), reverse=True) 

cause_of_death = []
cause_of_death = sorted(list(set(cause_of_death)), key=lambda x: len(x), reverse=True)

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [None]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

#### Extracting `known_for` Categories and `cause_of_death` Values from `info_3_0`

In [None]:
%%time

# Column to check
column = 'info_3_0'

# Start dataframe
dataframe = df[df[column].notna()]

# For loop to find cause in column and extract it to cause_of_death
for cause in cause_of_death:
    for index in dataframe.index:
        item = df.loc[index, column]
        if item:
            if cause in item:
                df.loc[index, 'cause_of_death'] = cause
                df.loc[index, column] = item.replace(cause, '').strip()
                
# For loop to find role in column and extract it as category
for category, category_lst in known_for_dict.items():
    for role in category_lst:
        for index in dataframe.index:
                item = df.loc[index, column]
                if item:
                    if role in item:
                        df.loc[index, category] = 1
                        df.loc[index, column] = item.replace(role, '').strip()

# Calculating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking number of cause_of_death values
print(f'There are {df["cause_of_death"].notna().sum()} values in cause_of_death column.\n')

#### Checking Updated `num_categories` Value Counts

In [None]:
# Checking Updated num_categories Value Counts
df["num_categories"].value_counts()

#### Observations:
- We will proceed to rebuild `known_for_dict` and `cause_of_death` for the next iteration.

In [None]:
print("dunzo!")

# Sound notification when cell executes
chime.success()

#### Finding `known_for` Roles and `cause_of_death` in `info_3_0`

In [None]:
# Obtaining values for column and their counts
roles_cause_list = df["info_3_0"].value_counts(ascending=True).index.tolist()

In [None]:
# Code to check each value
roles_cause_list.pop()

In [None]:
# # Create specific_roles_cause_list for above popped value
# specific_roles_cause_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_3_0"].notna()].index
#             if "general" in df.loc[index, "info_3_0"]
#         ],
#         "info_3_0",
#     ]
#     .value_counts()
#     .index.tolist()
# )

In [None]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_cause_list, key=lambda x: len(x), reverse=True)

In [None]:
# # Example code to quick-check a specific entry
# df[df["info_3_0"] == "outlaw country music singer songwriter"]

#### Creating Lists for Each `known_for` Category and for `cause_of_death`

In [None]:
# Creating lists for each category and sorting by decreasing length and removing duplicates

politics_govt_law = []
politics_govt_law = sorted(list(set(politics_govt_law)), key=lambda x: len(x), reverse=True)  

arts = []
arts = sorted(list(set(arts)), key=lambda x: len(x), reverse=True)  

sports = []
sports = sorted(list(set(sports)), key=lambda x: len(x), reverse=True) 

sciences = []
sciences = sorted(list(set(sciences)), key=lambda x: len(x), reverse=True) 

business_farming = []
business_farming = sorted(list(set(business_farming)), key=lambda x: len(x), reverse=True)  

academia_humanities = []
academia_humanities = sorted(list(set(academia_humanities)), key=lambda x: len(x), reverse=True)  

law_enf_military_operator = []
law_enf_military_operator = sorted(list(set(law_enf_military_operator)), key=lambda x: len(x), reverse=True)  

spiritual = []
spiritual = sorted(list(set(spiritual)), key=lambda x: len(x), reverse=True)  

social = []
social = sorted(list(set(social)), key=lambda x: len(x), reverse=True)  

crime = []
crime = sorted(list(set(crime)), key=lambda x: len(x), reverse=True)  

event_record_other = []
event_record_other = sorted(list(set(event_record_other)), key=lambda x: len(x), reverse=True)  

other_species = []
other_species = sorted(list(set(other_species)), key=lambda x: len(x), reverse=True) 

cause_of_death = []
cause_of_death = sorted(list(set(cause_of_death)), key=lambda x: len(x), reverse=True)

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [None]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

#### Extracting `known_for` Categories and `cause_of_death` Values from `info_3_0`

In [None]:
%%time

# Column to check
column = 'info_3_0'

# Start dataframe
dataframe = df[df[column].notna()]

# For loop to find cause in column and extract it to cause_of_death
for cause in cause_of_death:
    for index in dataframe.index:
        item = df.loc[index, column]
        if item:
            if cause in item:
                df.loc[index, 'cause_of_death'] = cause
                df.loc[index, column] = item.replace(cause, '').strip()
                
# For loop to find role in column and extract it as category
for category, category_lst in known_for_dict.items():
    for role in category_lst:
        for index in dataframe.index:
                item = df.loc[index, column]
                if item:
                    if role in item:
                        df.loc[index, category] = 1
                        df.loc[index, column] = item.replace(role, '').strip()

# Calculating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking number of cause_of_death values
print(f'There are {df["cause_of_death"].notna().sum()} values in cause_of_death column.\n')

#### Checking Updated `num_categories` Value Counts

In [None]:
# Checking Updated num_categories Value Counts
df["num_categories"].value_counts()

#### Observations:
- We will proceed to rebuild `known_for_dict` and `cause_of_death` for the next iteration.

#### Observations:
- It is time to export our dataframe and start a new notebook.

### Exporting Dataset to SQLite Database [wp_life_expect_clean6.db]()

In [None]:
# # Exporting dataframe

# # Saving dataset in a SQLite database
# conn = sql.connect("wp_life_expect_clean6.db")
# df.to_sql("wp_life_expect_clean", conn, index=False)

# # Chime notification when cell executes
# chime.success()

# [Proceed to Data Cleaning Part ]()