# Wikipedia Notable Life Expectancies
# [Notebook  : Data Cleaning Part 7](https://github.com/teresahanak/wikipedia-life-expectancy/blob/main/wp_life_expect_data_clean7_thanak_2022_07_26.ipynb)
### Context

The
### Objective

The
### Data Dictionary
- Feature: Description

### Importing Libraries

In [1]:
# To structure code automatically
%load_ext nb_black

# To import/export sqlite databases
import sqlite3 as sql

# To save/open python objects in pickle file
import pickle

# To help with reading, cleaning, and manipulating data
import pandas as pd
import numpy as np
import re

# To define maximum number of columns to be displayed in a dataframe (None = no limit)
pd.set_option("display.max_columns", None)
# To define the maximum number of rows to be displayed in a dataframe
pd.set_option("display.max_rows", 200)

# To suppress warnings (left disabled so warnings stay visible)
# import warnings

# warnings.filterwarnings("ignore")

# To limit the displayed width of each column's text to 150 characters
# ("max_colwidth" is resolved by pandas to the "display.max_colwidth" option)
pd.set_option("max_colwidth", 150)

# To play auditory cue when cell has executed, has warning, or has error and set chime theme
import chime

chime.theme("zelda")

<IPython.core.display.Javascript object>

## Data Overview

### [Reading](https://github.com/teresahanak/wikipedia-life-expectancy/blob/main/wp_life_expect_clean6.db), Sampling, and Checking Data Shape

In [2]:
# Loading the table saved by the previous cleaning notebook from its SQLite file
# NOTE(review): conn is left open here; confirm whether a later cell reuses it
# before adding a conn.close()
conn = sql.connect("wp_life_expect_clean6.db")
data = pd.read_sql("SELECT * FROM wp_life_expect_clean6", conn)

# Keeping the loaded frame untouched and doing all cleaning on a copy
df = data.copy()

# Reporting the size of the working copy
n_rows, n_cols = df.shape
print(f"There are {n_rows} rows and {n_cols} columns.")

# Displaying the first 2 rows
df.head(2)

There are 98040 rows and 43 columns.


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,info_3_1,info_3_2,info_4_0,info_4_1,info_4_2,info_5_0,info_5_1,info_5_2,info_6_0,info_6_1,info_7_0,info_8_0,info_8_1,info_9_0,info_10_0,info_11_0,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
0,1,William Chappell,", 86, British dancer, ballet designer and director.",https://en.wikipedia.org/wiki/William_Chappell_(dancer),21,1994,January,,86.0,,United Kingdom of Great Britain and Northern Ireland,,,3.091042,director,,,,,,,,,,,,,,,,0,0,0,0,0,1,0,0,0,0,0,0,1
1,1,Raymond Crotty,", 68, Irish economist, writer, and academic.",https://en.wikipedia.org/wiki/Raymond_Crotty,12,1994,January,,68.0,,Ireland,,,2.564949,,,and academic,,,,,,,,,,,,,,0,0,0,0,0,1,0,0,1,0,0,0,2


<IPython.core.display.Javascript object>

In [3]:
# Checking the last 2 rows of the working copy
df.tail(2)

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,info_3_1,info_3_2,info_4_0,info_4_1,info_4_2,info_5_0,info_5_1,info_5_2,info_6_0,info_6_1,info_7_0,info_8_0,info_8_1,info_9_0,info_10_0,info_11_0,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
98038,9,Aamir Liaquat Hussain,", 50, Pakistani journalist and politician, MNA .",https://en.wikipedia.org/wiki/Aamir_Liaquat_Hussain,99,2022,June,", since",50.0,,Pakistan,,"2002 2007, since 2018",4.60517,,,,,,,,,,,,,,,,,0,0,0,0,0,1,0,0,1,0,0,0,2
98039,9,Zou Jing,", 86, Chinese engineer, member of the Chinese Academy of Engineering.",https://en.wikipedia.org/wiki/Zou_Jing_(engineer),3,2022,June,,86.0,,"China, People's Republic of",,,1.386294,,,,,,,,,,,,,,,,,1,0,0,0,0,0,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

In [4]:
# Checking a random sample of 5 rows
# (no random_state is passed, so the rows shown differ on every run)
df.sample(5)

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,info_3_1,info_3_2,info_4_0,info_4_1,info_4_2,info_5_0,info_5_1,info_5_2,info_6_0,info_6_1,info_7_0,info_8_0,info_8_1,info_9_0,info_10_0,info_11_0,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
89993,14,Kanaka Murthy,", 78, Indian sculptor, COVID-19.",https://en.wikipedia.org/wiki/Kanaka_Murthy,13,2021,May,,78.0,COVID,India,,,2.639057,,,,,,,,,,,,,,,,,0,0,0,0,0,1,0,0,0,0,0,0,1
10179,28,Eric Bryant,", 63, British cricket player.",https://en.wikipedia.org/wiki/Eric_Bryant_(cricketer),5,1999,November,,63.0,,United Kingdom of Great Britain and Northern Ireland,,,1.791759,,,,,,,,,,,,,,,,,0,0,0,0,0,0,1,0,0,0,0,0,1
78981,28,Mohammad Munaf,", 84, Pakistani cricketer .",https://en.wikipedia.org/wiki/Mohammad_Munaf_(cricketer),8,2020,January,national team,84.0,,Pakistan,,national team,2.197225,,,,,,,,,,,,,,,,,0,0,0,0,0,0,1,0,0,0,0,0,1
21099,9,Erik Elmsäter,", 86, Swedish athlete, first Swede to compete in both Summer and Winter Olympics.",https://en.wikipedia.org/wiki/Erik_Elms%C3%A4ter,4,2006,March,,86.0,,Sweden,,,1.609438,Winter Olympics,,,,,,,,,,,,,,,,0,0,0,0,0,0,1,0,0,0,0,0,1
80945,16,William Pulgram,", 99, Austrian-born American architect.",https://en.wikipedia.org/wiki/William_Pulgram,21,2020,April,,99.0,,Austria,United States of America,,3.091042,,,,,,,,,,,,,,,,,0,0,0,0,0,1,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

### Checking Data Types, Duplicates, and Null Values

In [5]:
# Checking each column's data type and non-null count
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98040 entries, 0 to 98039
Data columns (total 43 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   day                        98040 non-null  object 
 1   name                       98040 non-null  object 
 2   info                       98040 non-null  object 
 3   link                       98040 non-null  object 
 4   num_references             98040 non-null  int64  
 5   year                       98040 non-null  int64  
 6   month                      98040 non-null  object 
 7   info_parenth               36660 non-null  object 
 8   age                        98040 non-null  float64
 9   cause_of_death             25122 non-null  object 
 10  place_1                    97887 non-null  object 
 11  place_2                    8116 non-null   object 
 12  info_parenth_copy          36660 non-null  object 
 13  log_num_references         98040 non-null  flo

<IPython.core.display.Javascript object>

#### Observations:
- With our dataset loaded, we can pick up where we left off with extracting `known_for` and `cause_of_death` values.
- To avoid overwriting an existing `cause_of_death` value when a cause flows into 2 columns, such as "heart and lung problems", we will update the code to combine the values.

### Extracting Remaining `known_for` and `cause_of_death` Values Continued

#### Finding `known_for` Roles and `cause_of_death` in `info_3_1`

In [6]:
# # Obtaining values for column and their counts
# roles_cause_list = df["info_3_1"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [7]:
# # Code to check each value
# roles_cause_list.pop()

<IPython.core.display.Javascript object>

In [8]:
# # Create specific_roles_cause_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_3_1"].notna()].index
#             if "diabetes" in df.loc[index, "info_3_1"]
#             #             and df.loc[index, "politics_govt_law"] == 0
#             #             and df.loc[index, "arts"] == 1
#             #             and df.loc[index, "sciences"] == 0
#             #             and df.loc[index, "spiritual"] == 0
#             #             and df.loc[index, "law_enf_military_operator"] == 0
#             #             and df.loc[index, "sports"] == 0
#             #             and df.loc[index, "event_record_other"] == 0
#             #             and df.loc[index, "crime"] == 0
#             #             and df.loc[index, "academia_humanities"] == 0
#             #             and df.loc[index, "business_farming"] == 0
#             #                         and df.loc[index, "other_species"] == 0
#             #             and df.loc[index, "num_categories"] == 0
#         ],
#         "info_3_1",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [9]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_list, key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [10]:
# # Example code to quick-check a specific entry
# df[df["info_3_1"] == "diabetes"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category and for `cause_of_death`

In [11]:
# Creating lists for each category, removing duplicates, and sorting by decreasing length.
# The extraction cell tries phrases in list order with a substring test, so longer
# phrases must come first or a short phrase (e.g. "pneumonia") would be cut out of a
# longer one (e.g. "complications of pneumonia") before the longer one is tried.
# Duplicates are removed with dict.fromkeys (keeps first-seen order) instead of set():
# set order changes between kernel sessions, which made the order of equal-length
# phrases, and so potentially the extracted result, differ from run to run. sorted()
# is stable, so equal-length phrases now always keep the order written below.

politics_govt_law = [
    "an National Congress politician",
    "politician Sir Abe Bailey",
    "Conservative politician",
    "neo fascist politician",
    "provincial politician",
    "former politician",
    "politician",
    "author of the Declaration of Independence of",
    "editorial director of Antiwar com",
    "Housing",
    "housing",
    "medical education",
    "activist against animal testing",
    "activist for victims of Nazism",
    "transgender rights activist",
    "anti apartheid activist",
    "anti Communist activist",
    "lesbian rights activist",
    "environmental activist",
    "animal rights activist",
    "Black lesbian activist",
    "human rights activist",
    "civil rights activist",
    "women rights activist",
    "LGTBI rights activist",
    "Lingayatism activist",
    "health care activist",
    "LGBT rights activist",
    "gay rights activist",
    "political activist",
    "anti war activist",
    "feminist activist",
    "cannabis activist",
    "internet activist",
    "cultural activist",
    "Garifuna activist",
    "consumer activist",
    "literary activist",
    "Marxist activist",
    "social activist",
    "public activist",
    "Native activist",
    "gender activist",
    "rights activist",
    "LGBTQ activist",
    "peace activist",
    "AIDS activist",
    "diplomat",
    "outspoken critic of President Vladimir Putin",
    "government critic",
    "social critic",
]
politics_govt_law = sorted(dict.fromkeys(politics_govt_law), key=len, reverse=True)

arts = [
    "songwriter in the country music genre",
    "writer of children books",
    "funk singer songwriter",
    "science fiction writer",
    "writer of literature",
    "television writer",
    "singer songwriter",
    "screenplay writer",
    "nonfiction writer",
    "children writer",
    "fantasy writer",
    "fiction writer",
    "travel writer",
    "comics writer",
    "screenwriter",
    "scriptwriter",
    "songwriter",
    "was the author of two books on Parliament",
    "author of more than music instruction books",
    "popular science author",
    "book author",
    "children book author",
    "relationship author",
    "best selling author",
    "short story author",
    "children author",
    "cookbook author",
    "textbook author",
    "actor land National Artist",
    "big screen movie actor",
    "television actor",
    "theatre actor",
    "voice actor",
    "child actor",
    "stage actor",
    "film actor",
    "actor",
    "independent producer",
    "television producer",
    "theatrical producer",
    "R&B record producer",
    "theatre producer",
    "record producer",
    "music producer",
    "blues producer",
    "movie producer",
    "film producer",
    "BBC producer",
    "TV producer",
    "television producer",
    "video game journalist",
    "literary journalist",
    "music journalist",
    "art journalist",
    "television composer",
    "film score composer",
    "music composer",
    "samba composer",
    "film composer",
    "executive director of the Producers Guild of",
    "artistic director of the National Ballet of",
    "former deputy managing director of",
    "documentary film director",
    "pornographic director",
    "television director",
    "artistic director",
    "theatre director",
    "dubbing director",
    "theater director",
    "musical director",
    "stage director",
    "music director",
    "opera director",
    "film director",
    "wife of singing cowboy Roy Rogers",
    "singer for The Lovin' Spoonful",
    "funk singer songwriter",
    "musical comedy singer",
    "singer songwriter",
    "rockabilly singer",
    "jíbaro singer",
    "bolero singer",
    "reggae singer",
    "opera singer",
    "funk singer",
    "pop singer",
    "singer",
    "Urdu poet",
    "photography historian",
    "multi instrumental session musician",
    "musician on early Tamla sessions",
    "jazz musician",
    "soul musician",
    "R&B musician",
    "a musician",
    "musician",
    "playwright",
    "theatre activist",
    "son of novelist Ernest Hemingway",
    "novelist",
    "novels",
    "broadcaster on public media",
    "radio broadcaster",
    "broadcaster",
    "podcaster",
    "literary magazine editor",
    "chief editorialist of",
    "contributing editor",
    "editor in chief of",
    "thesaurus editor",
    "newspaper editor",
    "magazine editor",
    "first editor of",
    "graphic editor",
    "story editor",
    "editor of",
    "editor",
    "film critic for The Times",
    "literary critic",
    "theater critic",
    "theatre critic",
    "music critic",
    "film critic",
    "art critic",
]
arts = sorted(dict.fromkeys(arts), key=len, reverse=True)

sports = [
    "rules writer",
    "automobile journalist",
    "radio sports journalist",
    "athletic director",
]
sports = sorted(dict.fromkeys(sports), key=len, reverse=True)

sciences = [
    "rose authority",
    "flight director in the Mission Control Center at Johnson Space Center",
    "the first director of Institute for Biological Research",
    "managing director of ONGC",
    "reprocessing",
]
sciences = sorted(dict.fromkeys(sciences), key=len, reverse=True)

business_farming = [
    "agricultural producer",
    "executive director of the Mori Building Company",
    "managing director of AmBank",
    "Housing Association",
]
business_farming = sorted(dict.fromkeys(business_farming), key=len, reverse=True)

academia_humanities = [
    "authority on",
    "director of the Curtis Institute of Music",
    "director of the Head Start Program",
    "director of Georgetown Day School",
    "director of the Planetarium",
    "museum director",
    "academic; Chancellor of the University of Dublin",
    "an academician of the Academy of Sciences",
    "academic administrator",
    "academician",
    "academic",
    "poetry teacher",
    "translator of ancient epics",
    "translator of literature",
    "comic book translator",
    "translator of",
    "translator",
    "literary historian",
    "language historian",
    "ethno historian",
    "urban historian",
    "art historian",
    "historian of medieval and",
    "historic preservationist",
    "ecclesiastical historian",
    "military historian",
    "culinary historian",
    "science historian",
    "naval historian",
    "film historian",
    "music educator",
    "art educator",
    "educationist",
    "education",
    "educator",
]
academia_humanities = sorted(dict.fromkeys(academia_humanities), key=len, reverse=True)

law_enf_military_operator = []
law_enf_military_operator = sorted(
    dict.fromkeys(law_enf_military_operator), key=len, reverse=True
)

spiritual = [
    "prominent Latter day Saint author",
    "church music composer",
]
spiritual = sorted(dict.fromkeys(spiritual), key=len, reverse=True)

social = ["philanthropist"]
social = sorted(dict.fromkeys(social), key=len, reverse=True)

crime = []
crime = sorted(dict.fromkeys(crime), key=len, reverse=True)

event_record_other = []
event_record_other = sorted(dict.fromkeys(event_record_other), key=len, reverse=True)

other_species = []
other_species = sorted(dict.fromkeys(other_species), key=len, reverse=True)

cause_of_death = [
    "complications of pneumonia",
    "pneumonia",
    "congestive heart failure",
    "multiple organ failure",
    "chronic renal failure",
    "respiratory failure",
    "pulmonary failure",
    "cardiac failure",
    "kidney failure",
    "heart failure",
    "renal failure",
    "liver failure",
    "organ failure",
    "lung failure",
    "kidney complications",
    "kidney problems",
    "kidney ailments",
    "kidney disease",
    "kidney cancer",
    "complications from COVID",
    "COVID",
    "complications of diabetes",
    "diabetes",
]
cause_of_death = sorted(dict.fromkeys(cause_of_death), key=len, reverse=True)

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [12]:
# Gathering the per-category phrase lists under their category column names.
# The extraction cell walks this dictionary in insertion order, so the order
# below is the order in which categories are searched.
known_for_dict = dict(
    social=social,
    spiritual=spiritual,
    academia_humanities=academia_humanities,
    business_farming=business_farming,
    sciences=sciences,
    politics_govt_law=politics_govt_law,
    law_enf_military_operator=law_enf_military_operator,
    crime=crime,
    event_record_other=event_record_other,
    other_species=other_species,
    arts=arts,
    sports=sports,
)

<IPython.core.display.Javascript object>

#### Extracting `known_for` Categories and `cause_of_death` Values from `info_3_1`

In [13]:
%%time

# Column to check
column = 'info_3_1'

# Start dataframe: only the rows that still hold a value in the column
dataframe = df[df[column].notna()]


def _extract_causes_and_roles(frame, rows, col, causes, roles_by_category):
    """Move known cause-of-death and role phrases out of one free-text column.

    Each row label in ``rows`` is visited once. Every phrase from ``causes``
    found in the cell text is added to that row's ``cause_of_death`` (joined
    with "/" so an existing value is extended rather than overwritten) and cut
    out of the text. Then every role phrase found sets its category column to 1
    and is cut out as well. Phrases are tried in the order given, so the
    longest-first ordering of the lists is respected.

    Rows do not influence one another, so handling all phrases for one row
    before moving to the next gives the same result as looping over every row
    once per phrase, with far fewer scalar ``.loc`` look-ups.

    Parameters
    ----------
    frame : pandas.DataFrame
        Dataframe to update in place.
    rows : iterable
        Index labels of the rows to process.
    col : str
        Name of the text column being searched.
    causes : list of str
        Cause-of-death phrases, in the order they should be tried.
    roles_by_category : dict
        Maps a category column name to its list of role phrases.

    Returns
    -------
    None
    """
    for index in rows:
        original = frame.loc[index, col]
        item = original

        # Causes are handled before roles, as in the earlier iterations
        found = []
        for cause in causes:
            if item and cause in item:
                found.append(cause)
                item = item.replace(cause, '').strip()

        if found:
            existing = frame.loc[index, 'cause_of_death']
            # Only a non-empty string is an existing value. None and NaN both mean
            # "missing"; NaN is truthy, so a plain truth test would try NaN + str.
            if isinstance(existing, str) and existing:
                found.insert(0, existing)
            frame.loc[index, 'cause_of_death'] = '/'.join(found)

        for category, category_lst in roles_by_category.items():
            for role in category_lst:
                if item and role in item:
                    frame.loc[index, category] = 1
                    item = item.replace(role, '').strip()

        # Write the remaining text back only when something was removed
        if item != original:
            frame.loc[index, col] = item


# Finding causes and roles in column and extracting them
_extract_causes_and_roles(df, dataframe.index, column, cause_of_death, known_for_dict)

# Calculating num_categories (explicit list of column names rather than a dict view)
df["num_categories"] = df[list(known_for_dict)].sum(axis=1)

# Checking number of cause_of_death values
print(f'There are {df["cause_of_death"].notna().sum()} values in cause_of_death column.\n')

There are 25123 values in cause_of_death column.

CPU times: total: 22 s
Wall time: 22.7 s


<IPython.core.display.Javascript object>

#### Checking Updated `num_categories` Value Counts

In [14]:
# Counting how many entries have each num_categories value after the update
df["num_categories"].value_counts()

1    84980
2    12537
3      510
0        8
4        5
Name: num_categories, dtype: int64

<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` and `cause_of_death` for the next iteration.

#### Finding `known_for` Roles and `cause_of_death` in `info_3_1`

In [15]:
# # Obtaining values for column and their counts
# roles_cause_list = df["info_3_1"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [16]:
# # Code to check each value
# roles_cause_list.pop()

<IPython.core.display.Javascript object>

In [17]:
# # Create specific_roles_cause_list for above popped value
# specific_roles_cause_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_3_1"].notna()].index
#             if "House" in df.loc[index, "info_3_1"]
#             #             and df.loc[index, "politics_govt_law"] == 1
#             #             and df.loc[index, "arts"] == 0
#             #             and df.loc[index, "sciences"] == 0
#             #             and df.loc[index, "spiritual"] == 0
#             #             and df.loc[index, "law_enf_military_operator"] == 0
#             #             and df.loc[index, "sports"] == 1
#             #             and df.loc[index, "event_record_other"] == 0
#             #             and df.loc[index, "crime"] == 0
#             #             and df.loc[index, "academia_humanities"] == 0
#             #             and df.loc[index, "business_farming"] == 0
#             #                         and df.loc[index, "other_species"] == 0
#             #             and df.loc[index, "num_categories"] == 0
#         ],
#         "info_3_1",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [18]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_cause_list, key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [19]:
# # Example code to quick-check a specific entry
# df[df["info_3_1"] == "Crime Commissioner"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category and for `cause_of_death`

In [20]:
# Creating lists for each category and sorting by decreasing length and removing duplicates

politics_govt_law = [
    "activist",
    "lawyer",
    "colonial administrator",
    "life peer",
    "peer",
    "convicted dissident",
    "political dissident",
    "Castro dissident",
    "dissident",
    "Technology Policy",
    "Labrador MHA for Conception Bay East",
    "Labrador MHA",
    "conservationist",
    "Labrador House of Assembly member for St George",
    "Labrador House of Assembly for Gander",
    "Labrador House of Assembly",
    "Communications",
    "United Nations civil servant",
    "public servant",
    "civil servant",
    "attorney",
    "Finance",
    "environmentalist",
    "Majority Leader of the NY Senate",
    "deputy president of the Senate",
    "member of the State Senate",
    "Senator",
    "Senate",
    "senior judge of the District Court for the Central District of",
    "High Court judge",
    "th congressional districts",
    "nd congressional districts",
    "congressman",
    "feminist",
    "Trademark Office",
    "Foreign Trade",
    "Trade",
    "Budget",
    "ambassador to",
    "ambassador",
    "founder of the Saba Labour Party",
    "Labour Party donor",
    "Labour",
    "Health Service Commissioner",
    "Islamic Affairs; Member of the House of Representatives",
    "four time member of the House of Representatives",
    "Labrador House of Assembly member for St George",
    "Speaker of the Maine House of Representatives",
    "th Speaker of the House of Representatives",
    "Labrador House of Assembly for Gander",
    "member of the House of Lords",
    "Christel House International",
    "Labrador House of Assembly",
]
politics_govt_law = sorted(
    list(set(politics_govt_law)), key=lambda x: len(x), reverse=True
)

arts = [
    "writer",
    "author of",
    "author",
    "producer",
    "journalist",
    "composer",
    "director",
    "poet",
    "critic",
    "newspaper publisher",
    "music publisher",
    "book publisher",
    "publisher",
    "arranger of music for television",
    "music arranger",
    "arranger",
    "essayist",
    "television presenter",
    "radio presenter",
    "TV presenter",
    "presenter",
    "Nine Network television commentator",
    "arranger of music for television",
    "reality television cast member",
    "television hostess on cuisine",
    "television quiz contestant",
    "television talk show host",
    "reality television judge",
    "television personality",
    "CBS television network",
    "television presenter",
    "television announcer",
    "television executive",
    "television actress",
    "television host",
    "television",
    "photographer",
    "manager of actress Brooke Shields",
    "mother of actress Candice Bergen",
    "Oscar winning film actress",
    '"B" actress in the s',
    "television actress",
    "stage actress",
    "voice actress",
    "film actress",
    "actress",
    "multimedia fine artist",
    "record jacket artist",
    "installation artist",
    "performance artist",
    "comic strip artist",
    "recording artist",
    "make up artist",
    "digital artist",
    "plastic artist",
    "variety artist",
    "graphic artist",
    "visual artist",
    "comics artist",
    "escape artist",
    "light artist",
    "cyber artist",
    "mime artist",
    "sculptor",
    "guitarist",
    "dancer",
    "botanical illustrator",
    "book illustrator",
    "illustrator",
    "paintings of World War II",
    "painter",
    "orchestra conductor",
    "conductor",
    "Rolling Stones manager",
    "country music manager",
    "manager of Five Star",
    "cultural manager",
    "theatre manager",
    "stage manager",
    "jazz pianist",
    "pianist",
    "Oscar winning film actress",
    "conspiracy film maker",
    "underwater filmmaker",
    "film set designer",
    "sound film star",
    "film actress",
    "film censor",
    "film music",
    "film maker",
    "filmmaker",
    "film",
    "lyricist",
    "syndicated columnist",
    "columnist for the",
    "columnist",
    "administrator of the Old Globe Theatre",
    "musical administrator",
    "music administrator",
    "arts administrator",
    "acting coach",
    "vocal coach",
    "leader of comedic group The Drifters",
    "stand up comedian",
    "comedian",
    "last surviving husband",
    "Sonora Ponceña bands",
    "bandleader",
    "agent of TV entertainer Michael Barrymore",
    "comic entertainer",
    "entertainer",
    "radio personality",
    "radio presenter",
    "radio announcer",
    "radio host",
    "radio DJ",
    "radio show host",
    "art collector",
    "collector",
    "fashion model",
    "model",
    "record industry executive",
    "broadcasting executive",
    "record label executive",
    "studio executive",
    "chief executive",
    "news executive",
    "reality television judge",
    "competition judge",
    "socialite",
    "blogger",
    "choreographer",
    "performance artist",
    "cabaret performer",
    "stunt performer",
    "comic performer",
    "performer",
    "autobiographer",
    "biographer",
    "dramaturge",
    "dramatist",
    "reporter",
]
arts = sorted(list(set(arts)), key=lambda x: len(x), reverse=True)

sports = [
    "martial artist",
    "t'ai chi teacher",
    "Capirossi team manager",
    "booking manager",
    "general manager",
    'last survivor of the Philadelphia Phillies\' "Whiz Kids" that won the National League championship',
    "athletics administrator",
    "games administrator",
    "former coach of golfer Tiger Woods",
    "basketball coach",
    "assistant coach",
    "football coach",
    "bandy player",
    "scout",
    "sports executive",
    "vice president of the International Olympic Committee",
    "two bronze medals at the Summer Olympics",
    "Olympic silver medallist",
    "Olympic figure skater",
    "silver medal Olympiad",
    "Olympic champion",
    "National Olympic",
    "Olympic sprinter",
    "Olympic medalist",
    "Olympic official",
    "Olympic athlete",
    "Winter Olympics",
    "Olympic gymnast",
    "Olympic fencer",
    "Olympic sailor",
    "Olympian",
    "Olympics",
    "MMA fighter",
    "speedway entrepreneur",
    "polar explorer",
    "explorer",
]
sports = sorted(list(set(sports)), key=lambda x: len(x), reverse=True)

sciences = [
    "Technology Beijing",
    "Technology Prize",
    "radio amateur",
    "various radio developments",
    "radio astronomy",
    "Director of the MIT Laboratory for Computer Science from to",
    "Biomedical Sciences",
    "sociologist",
    "mathematician",
]
sciences = sorted(list(set(sciences)), key=lambda x: len(x), reverse=True)

business_farming = [
    "business executive",
    "small business",
    "businesswoman",
    "businessman",
    "hospital administrator",
    "business executive",
    "retail executive",
    "entrepreneur",
]
business_farming = sorted(
    list(set(business_farming)), key=lambda x: len(x), reverse=True
)

academia_humanities = [
    "historian",
    "philosophy professor",
    "philosopher",
    "professor at George Washington University",
    "constitutional law professor",
    "professor of literature",
    "professor of medicine",
    "university professor",
    "philosophy professor",
    "professor of music",
    "college professor",
    "professor",
    "scholar of medieval literature",
    "Shakespeare scholar",
    "literary scholar",
    "scholar",
    "university administrator",
    "ethnomusicologist",
    "musicologist",
    "art lecturer",
    "lecturer",
    "museum curator",
    "curator",
    "founder of the Boston Museum of Science",
    "member of the Academy of Sciences",
    "the Academy of Social Sciences",
    "The World Academy of Sciences",
    "theatrical pedagogue",
    "pedagogue",
]
academia_humanities = sorted(
    list(set(academia_humanities)), key=lambda x: len(x), reverse=True
)

law_enf_military_operator = [
    "son of a professor who became a rebel leader",
    "resistance fighter during World War II",
    "guerrilla fighter",
    "freedom fighter",
    "fighter ace",
    "architect of the Air Force space",
    "Crime Commissioner",
]
law_enf_military_operator = sorted(
    list(set(law_enf_military_operator)), key=lambda x: len(x), reverse=True
)

spiritual = ["islamic theologist", "theologian"]
spiritual = sorted(list(set(spiritual)), key=lambda x: len(x), reverse=True)

social = [
    "charitable organization executive",
]
social = sorted(list(set(social)), key=lambda x: len(x), reverse=True)

crime = ["convicted murderer", "murderer"]
crime = sorted(list(set(crime)), key=lambda x: len(x), reverse=True)

event_record_other = ["concentration camp survivor", "Holocaust survivor"]
event_record_other = sorted(
    list(set(event_record_other)), key=lambda x: len(x), reverse=True
)

other_species = ["sire"]
other_species = sorted(list(set(other_species)), key=lambda x: len(x), reverse=True)

# Cause-of-death phrases searched for (as substrings) by the extraction loop on this pass
cause_of_death = [
    "prostate cancer",
    "influenza following skin cancer",
    "complications from cancer",
    "bone marrow cancer",
    "colorectal cancer",
    "lymph node cancer",
    "pancreatic cancer",
    "peritoneal cancer",
    "prostate cancer",
    "tracheal cancer",
    "bladder cancer",
    "stomach cancer",
    "throat cancer",
    "breast cancer",
    "spinal cancer",
    "liver cancer",
    "colon cancer",
    "brain cancer",
    "spine cancer",
    "bowel cancer",
    "lung cancer",
    "bone cancer",
    "neck cancer",
    "cancer",
    "chronic obstructive pulmonary disease",
    "complications from Alzheimer disease",
    "complications of Alzheimer disease",
    "coronary artery disease",
    "cardiovascular disease",
    "valvular heart disease",
    "pulmonary diseases",
    "Parkinson disease",
    "Alzheimer disease",
    "heart disease",
    "liver disease",
    "lung disease",
    "complications from a stroke",
    "multiple strokes",
    "a stroke",
    "stroke",
    "emphysema",
    "Possible heart attack",
    "valvular heart disease",
    "heart complications",
    "heart condition",
    "heart problems",
    "heart disease",
    "heart attack",
    "heart failur",
    "Lewy body dementia",
    "dementia",
    "complications of multiple sclerosis",
    "amyotrophic lateral sclerosis",
    "multiple sclerosis",
    "barbiturate overdose",
    "accidental overdose",
    "morphine overdose",
    "alcohol overdose",
    "heroin overdose",
    "drug overdose",
    "cardiac complications",
    "cardiac ailment",
    "cardiac arrest",
]
# Deduplicate and order longest-first so a longer phrase is matched before any
# shorter phrase it contains. Equal-length phrases are ordered alphabetically so
# the match order (and the order of '/'-joined causes) does not depend on set
# iteration order, which can change between kernel restarts.
cause_of_death = sorted(set(cause_of_death), key=lambda cause: (-len(cause), cause))

<IPython.core.display.Javascript object>

In [21]:
# Dropping info_3_1 value for entries with duplicate category value:
# one boolean mask covers every exact-match leftover that is to be blanked
df.loc[
    df["info_3_1"].isin(["manager", "administrator", "Labrador"]), "info_3_1"
] = ""


<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [22]:
# Map each known_for category column name to its role list for this pass.
# Keyword order is preserved, and it is the order in which the extraction loop
# walks the categories.
known_for_dict = dict(
    social=social,
    spiritual=spiritual,
    academia_humanities=academia_humanities,
    business_farming=business_farming,
    sciences=sciences,
    politics_govt_law=politics_govt_law,
    law_enf_military_operator=law_enf_military_operator,
    crime=crime,
    event_record_other=event_record_other,
    other_species=other_species,
    arts=arts,
    sports=sports,
)

<IPython.core.display.Javascript object>

#### Extracting `known_for` Categories and `cause_of_death` Values from `info_3_1`

In [23]:
%%time

# Column to check
column = 'info_3_1'

# Rows whose value in the column is not missing
dataframe = df[df[column].notna()]

# Pull each listed cause out of the column text and record it in cause_of_death
for cause in cause_of_death:
    for index in dataframe.index:
        item = df.loc[index, column]
        # Nothing to do for blank text or text that does not mention this cause
        if not item or cause not in item:
            continue
        recorded = df.loc[index, 'cause_of_death']
        # Append to an existing cause with '/', otherwise start the value
        df.loc[index, 'cause_of_death'] = (recorded + '/' + cause) if recorded else cause
        df.loc[index, column] = item.replace(cause, '').strip()

# Flag the category of each listed role found, then strip the role from the text
for category, category_lst in known_for_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item and role in item:
                df.loc[index, category] = 1
                df.loc[index, column] = item.replace(role, '').strip()

# num_categories is the row-wise sum of the category flag columns
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Report how many entries now have a cause_of_death value
print(f'There are {df["cause_of_death"].notna().sum()} values in cause_of_death column.\n')

There are 25129 values in cause_of_death column.

CPU times: total: 27.1 s
Wall time: 27.3 s


<IPython.core.display.Javascript object>

#### Checking Updated `num_categories` Value Counts

In [24]:
# Checking updated num_categories value counts: how many entries carry each
# number of known_for category flags after this extraction pass
df["num_categories"].value_counts()

1    84760
2    12578
3      688
0        8
4        6
Name: num_categories, dtype: int64

<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` and `cause_of_death` for the next iteration.

#### Finding `known_for` Roles and `cause_of_death` in `info_3_1`

In [25]:
# # Obtaining values for column and their counts
# roles_cause_list = df["info_3_1"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [26]:
# # Code to check each value
# roles_cause_list.pop()

<IPython.core.display.Javascript object>

In [27]:
# # Create specific_roles_cause_list for above popped value
# specific_roles_cause_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_3_1"].notna()].index
#             if "osis" in df.loc[index, "info_3_1"]
#             #             and df.loc[index, "politics_govt_law"] == 0
#             #             and df.loc[index, "arts"] == 1
#             #             and df.loc[index, "sciences"] == 0
#             #             and df.loc[index, "spiritual"] == 0
#             #             and df.loc[index, "law_enf_military_operator"] == 0
#             #             and df.loc[index, "sports"] == 1
#             #             and df.loc[index, "event_record_other"] == 0
#             #             and df.loc[index, "crime"] == 0
#             #             and df.loc[index, "academia_humanities"] == 0
#             #             and df.loc[index, "business_farming"] == 0
#             #                         and df.loc[index, "other_species"] == 0
#             #             and df.loc[index, "num_categories"] == 0
#         ],
#         "info_3_1",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [28]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_cause_list, key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [29]:
# # Example code to quick-check a specific entry
# df[df["info_3_1"] == "all"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category and for `cause_of_death`

In [30]:
# Creating lists for each category and sorting by decreasing length and removing duplicates

# Strings searched for (as substrings) by the extraction loop on this pass;
# a match sets the politics_govt_law flag
politics_govt_law = [
    "social commentator",
    "judge",
    "barrister",
    "Chiswick",
    "Industry",
    "railroad worker",
    "Kennedy Administration official",
    "elected official in Toledo",
    "government official",
    "public official",
    "CCPPC official",
    "behavioral economist",
    "political economist",
    "economist",
    "political scientist",
    "Natural Resources",
    "anti communist zealot who helped set the stage for McCarthyism",
    "anti communist",
    "communist",
    "three time president of the senate",
    "senator",
    "Tirunelveli",
    "Social Security",
    "Nicobar Islands",
    "Caicos Islands",
    "the Islands",
    "conspiracy theorist",
    "social theorist",
    "Forestry",
    "th districts",
    "Cultural Affairs",
    "Kennedy Administration official",
    "Drug Administration reviewer",
    "Administration Commission",
    "Drug Administration",
    "political prisoner",
    "Tourism",
    "Associate Justice of the Supreme Court of the",
    "Justice of the Michigan Supreme Court",
    "Justice",
    "Energy",
]
# Deduplicate and order longest-first so a longer phrase is matched before any
# shorter phrase it contains. Equal-length entries are ordered alphabetically so
# the match order does not depend on set iteration order, which can change
# between kernel restarts.
politics_govt_law = sorted(set(politics_govt_law), key=lambda role: (-len(role), role))

# Strings searched for (as substrings) by the extraction loop on this pass;
# a match sets the arts flag
arts = [
    "artist",
    "printmaker",
    "dog trainer",
    "public speaker",
    "memoirist",
    "a promoter of culture",
    "hip hop promoter",
    "publicist",
    "humorist",
    "cabaret impresario",
    "impresario",
    "architect",
    "vocalist for Robin Trower",
    "vocalist",
    "Emmy winning documentarian",
    "documentarian",
    "talent agent",
    "theater production designer",
    "pinball machine designer",
    "album cover designer",
    "typeface designer",
    "interior designer",
    "fashion designer",
    "costume designer",
    "garden designer",
    "stage designer",
    "font designer",
    "set designer",
    "Roll Hall of Fame",
    "foreign correspondent",
    "correspondent",
    "stuntman",
    "recorded with Linda Ronstadt",
    "multitrack tape recording",
    "recording studio owner",
    "record label founder",
    "record label owner",
    "matchbox",
    "chairman of San Diego Comic Con International",
    "reality TV personality",
    "Internet personality",
    "theatre personality",
    "TV personality",
    "flag",
    "harmonica player",
    "bukkehorn player",
    "banjo player",
    "organ player",
    "bass player",
    "concertmaster",
    "newsreader",
    "magazine cartoonist",
    "cartoonist",
]
# Deduplicate and order longest-first so a longer phrase is matched before any
# shorter phrase it contains. Equal-length entries are ordered alphabetically so
# the match order does not depend on set iteration order, which can change
# between kernel restarts.
arts = sorted(set(arts), key=lambda role: (-len(role), role))

# Strings searched for (as substrings) by the extraction loop on this pass;
# a match sets the sports flag
sports = [
    "coach",
    "executive",
    "sports official",
    "professional wrestler",
    "endurance runner",
    "runner",
    "member of the Pro Football Hall of Fame",
    "a member of the Baseball Hall of Fame",
    "Sports Hall of Fame",
    "FIBA Hall of Fame",
    "holder of nine world records",
    "world record holder for",
    "world record holder",
    "world champion kickboxer",
    "boxing referee",
    "boxer",
    "former Clemson football player",
    "table tennis player",
    "rugby union player",
    "ice hockey player",
    "baseball player",
    "handball player",
    "football player",
    "cricket player",
    "bridge player",
    "polo player",
]
# Deduplicate and order longest-first so a longer phrase is matched before any
# shorter phrase it contains. Equal-length entries are ordered alphabetically so
# the match order does not depend on set iteration order, which can change
# between kernel restarts.
sports = sorted(set(sports), key=lambda role: (-len(role), role))

# Strings searched for (as substrings) by the extraction loop on this pass;
# a match sets the sciences flag
sciences = [
    "physician",
    "psychiatrist",
    "computer scientist",
    "computer engineer",
    "behavioral physiologist",
    "micropaleontologist",
    "plant pathologist",
    "parapsychologist",
    "pharmacologist",
    "epileptologist",
    "copepodologist",
    "epistemologist",
    "meteorologist",
    "criminologist",
    "methodologist",
    "ornithologist",
    "haematologist",
    "entomologist",
    "neurologist",
    "cosmologist",
    "pathologist",
    "serologist",
    "sexologist",
    "ufologist",
    "ecologist",
    "clinical psychologist",
    "petrologist",
    "neuroscientist",
    "neurologist",
    "designer of GPS",
    "Higgs boson theorist",
]
# Deduplicate and order longest-first so a longer phrase is matched before any
# shorter phrase it contains. Equal-length entries are ordered alphabetically so
# the match order does not depend on set iteration order, which can change
# between kernel restarts.
sciences = sorted(set(sciences), key=lambda role: (-len(role), role))

# Strings searched for (as substrings) by the extraction loop on this pass;
# a match sets the business_farming flag
business_farming = [
    "inventor of the fast food system",
    "investment banker",
    "banker",
    'Apple as the "computer for the rest of us"',
    "restaurateur",
    "farmer",
    "multi millionaire industrialist",
    "industrialist",
    "Director of the Institute of Business Administration",
]
# Deduplicate and order longest-first so a longer phrase is matched before any
# shorter phrase it contains. Equal-length entries are ordered alphabetically so
# the match order does not depend on set iteration order, which can change
# between kernel restarts.
business_farming = sorted(
    set(business_farming), key=lambda role: (-len(role), role)
)

# Strings searched for (as substrings) by the extraction loop on this pass;
# a match sets the academia_humanities flag
academia_humanities = [
    "teacher",
    "inventor of Cued Speech",
    "Sciences of",
    "anthropologist",
    "anthropologist",
    "archaeologist",
    "paremiologist",
    "japanologist",
    "philologist",
    "museologist",
    "indologist",
    "sinologist",
    "evolutionary theorist",
    "intellectual",
    "linguist",
]
# Deduplicate and order longest-first so a longer phrase is matched before any
# shorter phrase it contains. Equal-length entries are ordered alphabetically so
# the match order does not depend on set iteration order, which can change
# between kernel restarts.
academia_humanities = sorted(
    set(academia_humanities), key=lambda role: (-len(role), role)
)

# Strings searched for (as substrings) by the extraction loop on this pass;
# a match sets the law_enf_military_operator flag
law_enf_military_operator = [
    "cryptologist",
    "intelligence officer",
    "military officer",
    "police officer",
    "army officer",
    "Army officer",
    "veteran of World War II",
    "veteran of World War I",
    "World War II veteran",
    "World War I veteran",
    "oldest WWII veteran",
    "war veteran",
    "Atmospheric Administration Commissioned Officer Corps",
    "Colditz prisoner",
    "prisoner of war",
]
# Deduplicate and order longest-first so a longer phrase is matched before any
# shorter phrase it contains. Equal-length entries are ordered alphabetically
# (uppercase sorts before lowercase) so the match order does not depend on set
# iteration order, which can change between kernel restarts.
law_enf_military_operator = sorted(
    set(law_enf_military_operator), key=lambda role: (-len(role), role)
)

# Strings searched for (as substrings) by the extraction loop on this pass;
# a match sets the spiritual flag
spiritual = [
    "motivational speaker",
    "gospel",
    "Anglican priest",
    "Wiccan priest",
    "Jesuit priest",
    "priestess",
    "priest",
    "archbishop of the Anglican Church of Papua New",
    "the Cayman Islands",
    "Brecon",
]
# Deduplicate and order longest-first so a longer phrase is matched before any
# shorter phrase it contains. Equal-length entries are ordered alphabetically so
# the match order does not depend on set iteration order, which can change
# between kernel restarts.
spiritual = sorted(set(spiritual), key=lambda role: (-len(role), role))

# Social-category role strings for this pass, deduplicated and ordered longest-first
social = sorted(
    set(
        [
            "humanitarian",
            "social worker",
            "project designer in the Boy Scouts of",
        ]
    ),
    key=len,
    reverse=True,
)

# Strings searched for (as substrings) by the extraction loop on this pass;
# a match sets the crime flag
crime = [
    "convicted fraudster",
    "convicted terrorist",
    "convicted criminal",
    "convicted perjurer",
    "convicted rapist",
    "extortionist",
]
# Deduplicate and order longest-first. Equal-length entries are ordered
# alphabetically so the match order does not depend on set iteration order,
# which can change between kernel restarts.
crime = sorted(set(crime), key=lambda role: (-len(role), role))

# Event/record/other role strings for this pass, deduplicated and ordered longest-first
event_record_other = sorted(set(["documentary subject"]), key=len, reverse=True)

# No other-species strings on this pass; kept as an (empty) sorted list so the
# name is still defined for known_for_dict
other_species = sorted(set([]), key=len, reverse=True)

# Cause-of-death phrases searched for (as substrings) by the extraction loop on this pass
cause_of_death = [
    "hypertension",
    "acute myeloid leukemia",
    "leukemia",
    "stabbed",
    "run motorcycle accident",
    "run car accident",
    "run car crash",
    "run accident",
    "murdered by the Lukashenko regime",
    "murdered",
    "liver related problems",
    "respiratory problems",
    "COPD",
    "shot",
    "pancreatitis",
    "encephalitis",
    "hepatitis B",
    "hepatitis C",
    "peritonitis",
    "meningitis",
    "hepatitis",
    "sepsis",
    "lung ailments",
    "lung ailment",
    "lung transplant surgery",
    "lung infections",
    "lung condition",
    "urinary tract infection",
    "brain infection",
    "chest infection",
    "lung infections",
    "liver infection",
    "tuberculosis",
    "cirrhosis",
]
# Deduplicate and order longest-first so a longer phrase is matched before any
# shorter phrase it contains. Equal-length phrases are ordered alphabetically so
# the match order (and the order of '/'-joined causes) does not depend on set
# iteration order, which can change between kernel restarts.
cause_of_death = sorted(set(cause_of_death), key=lambda cause: (-len(cause), cause))

<IPython.core.display.Javascript object>

In [31]:
# Dropping info_3_1 value for entries with duplicate category value.
# Exact-match leftovers that are to be blanked
is_leftover = df["info_3_1"].isin(["Technology", "the", "promoter", "s", "all"])

# Any non-missing value that contains "Science" (plain substring, not a regex)
mentions_science = df["info_3_1"].str.contains("Science", regex=False, na=False)

df.loc[is_leftover | mentions_science, "info_3_1"] = ""


<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [32]:
# Map each known_for category column name to its role list for this pass.
# Keyword order is preserved, and it is the order in which the extraction loop
# walks the categories.
known_for_dict = dict(
    social=social,
    spiritual=spiritual,
    academia_humanities=academia_humanities,
    business_farming=business_farming,
    sciences=sciences,
    politics_govt_law=politics_govt_law,
    law_enf_military_operator=law_enf_military_operator,
    crime=crime,
    event_record_other=event_record_other,
    other_species=other_species,
    arts=arts,
    sports=sports,
)

<IPython.core.display.Javascript object>

#### Extracting `known_for` Categories and `cause_of_death` Values from `info_3_1`

In [33]:
%%time

# Column to check
column = 'info_3_1'

# Rows whose value in the column is not missing
dataframe = df[df[column].notna()]

# Pull each listed cause out of the column text and record it in cause_of_death
for cause in cause_of_death:
    for index in dataframe.index:
        item = df.loc[index, column]
        # Nothing to do for blank text or text that does not mention this cause
        if not item or cause not in item:
            continue
        recorded = df.loc[index, 'cause_of_death']
        # Append to an existing cause with '/', otherwise start the value
        df.loc[index, 'cause_of_death'] = (recorded + '/' + cause) if recorded else cause
        df.loc[index, column] = item.replace(cause, '').strip()

# Flag the category of each listed role found, then strip the role from the text
for category, category_lst in known_for_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item and role in item:
                df.loc[index, category] = 1
                df.loc[index, column] = item.replace(role, '').strip()

# num_categories is the row-wise sum of the category flag columns
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Report how many entries now have a cause_of_death value
print(f'There are {df["cause_of_death"].notna().sum()} values in cause_of_death column.\n')

There are 25134 values in cause_of_death column.

CPU times: total: 20.4 s
Wall time: 20.9 s


<IPython.core.display.Javascript object>

#### Checking Updated `num_categories` Value Counts

In [34]:
# Checking updated num_categories value counts: how many entries carry each
# number of known_for category flags after this extraction pass
df["num_categories"].value_counts()

1    84685
2    12607
3      734
0        8
4        6
Name: num_categories, dtype: int64

<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` and `cause_of_death` for the next iteration.

#### Finding `known_for` Roles and `cause_of_death` in `info_3_1`

In [35]:
# # Obtaining values for column and their counts
# roles_cause_list = df["info_3_1"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [36]:
# # Code to check each value
# roles_cause_list.pop()

<IPython.core.display.Javascript object>

In [37]:
# # Create specific_roles_cause_list for above popped value
# specific_roles_cause_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_3_1"].notna()].index
#             if "cardinal" in df.loc[index, "info_3_1"]
#             #             and df.loc[index, "politics_govt_law"] == 0
#             #             and df.loc[index, "arts"] == 0
#             #             and df.loc[index, "sciences"] == 0
#             #             and df.loc[index, "spiritual"] == 0
#             #             and df.loc[index, "law_enf_military_operator"] == 0
#             #             and df.loc[index, "sports"] == 0
#             #             and df.loc[index, "event_record_other"] == 0
#             #             and df.loc[index, "crime"] == 0
#             #             and df.loc[index, "academia_humanities"] == 0
#             #             and df.loc[index, "business_farming"] == 0
#             #             and df.loc[index, "other_species"] == 0
#             #             and df.loc[index, "num_categories"] == 0
#         ],
#         "info_3_1",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [38]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_cause_list, key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [39]:
# # Example code to quick-check a specific entry
# df[df["info_3_1"] == "cardinal"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category and for `cause_of_death`

In [40]:
# Creating lists for each category and sorting by decreasing length and removing duplicates

# Strings searched for (as substrings) by the extraction loop on this pass;
# a match sets the politics_govt_law flag
politics_govt_law = [
    "political corruption was the subject of the Flood Tribunal",
    "leader of main political opposition party",
    "geopolitical energy specialist",
    "political consultant",
    "political theorist",
    "political analyst",
    "political figure",
    "political",
    "missile development",
    "development",
    "delegate to the National People Congress",
    "delegate to the Arab League",
    "three time MP",
    "MP",
    "Planning",
    "pacifist",
    "Agriculture Organization",
    "presidential primary candidate",
    "presidential candidate",
    "presidential adviser",
    "Vice Minister of the Ministry of Water Resources",
    "Executive Vice Mayor of Beijing",
    "Vice Presidential nominee",
    "Vice Minister of the Ministry of Water Resources",
    "Prime Minister of the Donetsk People Republic",
    "acting Minister of Internal Affairs of",
    "Consumer Protection Minister",
    "Minister for East Affairs",
    "Islamic Affairs Minister",
    "nine term Prime Minister",
    "Federalism Minister of a",
    "Minister of Employment",
    "Deputy Prime Minister",
    "Prime Minister of the",
    "Minister of Borders",
    "Prime Minister",
    "Health",
    "founder of Drachmi Democratic Movement Five Stars",
    "founder of the National Regeneration Movement",
    "founder of the Council for a Beautiful",
    "co founder of Jatiya Samajtantrik Dal",
    "founder of the Progress Party",
    "co founder of the Mass Party",
    "founder of Occupy Pedophilia",
    "founder of Common Cause",
    "co founder of the PSDB",
    "co founder of the Gapminder Foundation",
    "natural resources",
    "Administration",
    "minister of agriculture",
    "Agriculture Organization",
    "defector",
    "first child of civil rights leader Martin Luther King Jr",
    "leader of main political opposition party",
    "widow of fascist leader Oswald Mosley",
    "leader of the Mothers of Srebrenica",
    "leader of the Independent Party",
    "prominent Pashtun leader",
    "revolutionary leader",
    "secessionist leader",
    "civil rights leader",
    "traditional leader",
    "peasant leader",
    "tribal leader",
    "labor leader",
    "reformed Ku Klux Klan leader",
    "community leader",
    "Consumer Affairs",
    "Clerk of the Crown in Chancery",
    "Nobel Peace Prize winner",
    "Landtag of North Rhine Westphalia",
    "chairman of the Communist Party of",
    "chairman of Memorial",
    "chairman of the Ocean Affairs Council",
    "the United Nations Development Program",
    "Urban Development",
    "Rural Development",
    "Development",
    "Toronto city councillor",
    "the General Secretary of the National Union of Railwaymen",
]
# Deduplicate and order longest-first so a longer phrase is matched before any
# shorter phrase it contains. Equal-length entries are ordered alphabetically so
# the match order does not depend on set iteration order, which can change
# between kernel restarts.
politics_govt_law = sorted(set(politics_govt_law), key=lambda role: (-len(role), role))

# Strings searched for (as substrings) by the extraction loop on this pass;
# a match sets the arts flag
arts = [
    "commentator",
    "a pioneer of electronic music",
    "early music specialist",
    "popular music",
    "music",
    "drama",
    "subject of the Oscar winning animated short",
    "animator",
    "storyteller",
    "cinematographer",
    "bibliographer",
    "mastering engineer",
    "satirist",
    "CBS Vice President",
    "game show panelist",
    "panelist",
    "narrator of the PBS series",
    "narrator",
    "co founder of the Chicago Surrealist Group",
    "co founder of The Mamas & the Papas",
    "co founder of Disneyland Records",
    "founder of Food Network",
    "co founder of magazine",
    "founder of fratricide",
    "founder of Apotex",
    "camera operator",
    "silk screen printer",
    "screenplays",
    "leader of the Philharmonia Orchestra",
    "Walk of Fame",
    "Pullitzer Prize winner",
    "Pulitzer Prize winner",
    "pageant winner",
    "Grammy winner",
    "Tony winner",
    "Prince Claus Award winner",
    "clavichordist",
    "Gracie Allen",
    "chairman of Sanlih E Television",
    "Marie Curie",
]
# Deduplicate and order longest-first so a longer phrase is matched before any
# shorter phrase it contains. Equal-length entries are ordered alphabetically so
# the match order does not depend on set iteration order, which can change
# between kernel restarts.
arts = sorted(set(arts), key=lambda role: (-len(role), role))

# Strings searched for (as substrings) by the extraction loop on this pass;
# a match sets the sports flag
sports = [
    "official",
    "trainer",
    "medical delegate",
    "bodybuilder",
    "co founder of the Marathon",
    "founder of Gresini Racing",
    "Commonwealth Games gold medallist",
    "championships bronze medalist",
    "Commonwealth silver medalist",
    "bronze medallist",
    "silver medallist",
    "bronze medalist",
    "silver medalist",
    "expedition leader",
    "raced Triple Crown champion",
    "two time Boat Race winner",
    "chairman of the All Jumping Course",
    "chairman of Polo",
    "World Boxing Council light heavyweight champion",
    "WBC world light middleweight champion",
    "WBC Bantamweight Champion",
    "WBC heavyweight champion",
]
# Deduplicate and order longest-first so a longer phrase is matched before any
# shorter phrase it contains. Equal-length entries are ordered alphabetically so
# the match order does not depend on set iteration order, which can change
# between kernel restarts.
sports = sorted(set(sports), key=lambda role: (-len(role), role))

# Strings searched for (as substrings) by the extraction loop on this pass;
# a match sets the sciences flag
sciences = [
    "inventor",
    "medical researcher",
    "conspiracy theory researcher",
    "designer",
    "co founder of the CSI",
    "founder of the NASL",
    "proponent of natural childbirth",
    "naturalist",
    "the physical properties of materials",
    "physics Nobel prize winner",
    "geophysicist",
    "physicist",
    "physics Nobel prize winner",
]
# Deduplicate and order longest-first so a longer phrase is matched before any
# shorter phrase it contains. Equal-length entries are ordered alphabetically so
# the match order does not depend on set iteration order, which can change
# between kernel restarts.
sciences = sorted(set(sciences), key=lambda role: (-len(role), role))

# Strings searched for (as substrings) by the extraction loop on this pass;
# a match sets the business_farming flag
business_farming = [
    "hotelier",
    "co founder of INSEAD",
    "founder of Berkley",
    "co founder of Pat King of Steaks cheesesteak emporium",
    "founder of R J Corman Railroad Group",
    "co founder of Bing Lee superstores",
    "founder of the Hyundai Group",
    "co founder of Wanxiang",
    "co founder of Garmin",
    "founder of Biovail",
    "chairman of Airways from to",
    "chairman of Imasco",
    "chairman of NEC",
    "chairman of off price clothier SYMS",
    "chairman of The Leela Palaces",
    "chairman of Intel Corporation",
    "chairman of The Swatch Group",
    "chairman of Trident Seafoods",
    "chairman of Sandals Resorts",
    "chairman of Biltmore Farms",
    "chairman of Banco Safra",
    "chairman of Chen Hsong",
    "chairman of Alltech",
    "chairman of IDG",
    "Santa Fe Railway",
]
# Deduplicate and order longest-first so a longer phrase is matched before any
# shorter phrase it contains. Equal-length entries are ordered alphabetically so
# the match order does not depend on set iteration order, which can change
# between kernel restarts.
business_farming = sorted(
    set(business_farming), key=lambda role: (-len(role), role)
)

# Strings searched for (as substrings) by the extraction loop on this pass;
# a match sets the academia_humanities flag
academia_humanities = [
    "literary theorist",
    "theorist",
    "lexicographer",
    "epigraphist",
    "geographer",
    "Vice Chancellor of University of Oxford",
    "founder of Project Gutenberg",
    "librarian",
    "folklorist",
    "a champion of culture",
    "head of the Tamarind Institute at the University of New",
    "Vice Chancellor of University of Oxford",
    "chancellor of Sathyabama University",
    "president of Brandeis University",
    "Rector of the Central University",
    "president of Emory University",
    "Warden of Durham University",
    "Wichita State University",
    "the University of Iowa",
    "Temple University",
    "State University",
    "Akal University",
]
# Deduplicate and order longest-first so a longer phrase is matched before any
# shorter phrase it contains. Equal-length entries are ordered alphabetically so
# the match order does not depend on set iteration order, which can change
# between kernel restarts.
academia_humanities = sorted(
    set(academia_humanities), key=lambda role: (-len(role), role)
)

# Strings searched for (as substrings) by the extraction loop on this pass;
# a match sets the law_enf_military_operator flag
law_enf_military_operator = [
    "cryptographer",
    "Minister of National Defence",
    "Defense Minister of South",
    "Defense Minister",
    "founder of militant organization Fatah",
    "founder of the Experimental Aircraft Association",
    "founder of the Police Force",
    "leader of the Provisional Republican Army",
    "leader of Forças Populares de Abril",
    "the second highest leader of Hamas",
    "former leader of PCAA",
    "leader of Tehreek e Nafaz e Shariat e Mohammadi",
    "son of a  who became a rebel leader",
    "leader of the PFLP GC",
    "paramilitary leader",
]
# Deduplicate and order longest-first so a longer phrase is matched before any
# shorter phrase it contains. Equal-length entries are ordered alphabetically so
# the match order does not depend on set iteration order, which can change
# between kernel restarts.
law_enf_military_operator = sorted(
    set(law_enf_military_operator), key=lambda role: (-len(role), role)
)

# Strings searched for (as substrings) by the extraction loop on this pass;
# a match sets the spiritual flag
spiritual = [
    "Galloway",
    "religious leader",
    "sect leader",
    "Catholic religious leader",
    "ecumenical leader",
    "chairman of the Bishops' Conference",
    "Lismore",
    "self help consultant",
    "self help",
    "cardinal",
]
# Deduplicate and order longest-first so a longer phrase is matched before any
# shorter phrase it contains. Equal-length entries are ordered alphabetically so
# the match order does not depend on set iteration order, which can change
# between kernel restarts.
spiritual = sorted(set(spiritual), key=lambda role: (-len(role), role))

# Social-category role strings for this pass, deduplicated and ordered longest-first
social = sorted(
    set(
        [
            "founder of Neville Fernando Teaching Hospital",
            "founder of the Bandhua Mukti Morcha",
            "commune founder",
            "a leader in the pro  movement",
        ]
    ),
    key=len,
    reverse=True,
)

# Strings searched for (as substrings) by the extraction loop on this pass;
# a match sets the crime flag
crime = ["war criminal", "criminal", "cannibal"]
# Deduplicate and order longest-first so "war criminal" is matched before
# "criminal". The equal-length entries are ordered alphabetically so the match
# order does not depend on set iteration order, which can change between
# kernel restarts.
crime = sorted(set(crime), key=lambda role: (-len(role), role))

# No event/record/other strings on this pass; kept as an (empty) sorted list so
# the name is still defined for known_for_dict
event_record_other = sorted(set([]), key=len, reverse=True)

# Strings searched for (as substrings) by the extraction loop on this pass;
# a match sets the other_species flag
other_species = [
    "the Preakness Stakes",
    "Blue Grass Stakes",
    "Preakness Stakes",
    "Champagne Stakes",
]
# Deduplicate and order longest-first so "the Preakness Stakes" is matched
# before "Preakness Stakes". Equal-length entries are ordered alphabetically so
# the match order does not depend on set iteration order, which can change
# between kernel restarts.
other_species = sorted(set(other_species), key=lambda role: (-len(role), role))

# Cause-of-death phrases for this pass, deduplicated and ordered longest-first
cause_of_death = sorted(
    set(
        [
            "run",
            "injuries sustained in a fall",
            "fall",
            "subsequently found dead",
            "is presumed dead",
        ]
    ),
    key=len,
    reverse=True,
)

<IPython.core.display.Javascript object>

In [41]:
# Dropping info_3_1 value for entry with redundant category value:
# the entry is selected by its Wikipedia link with a boolean mask
df.loc[
    df["link"] == "https://en.wikipedia.org/wiki/Jacques_Carelman", "info_3_1"
] = ""

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [42]:
# Map each known_for category column name to its role list for this pass.
# Keyword order is preserved, and it is the order in which the extraction loop
# walks the categories.
known_for_dict = dict(
    social=social,
    spiritual=spiritual,
    academia_humanities=academia_humanities,
    business_farming=business_farming,
    sciences=sciences,
    politics_govt_law=politics_govt_law,
    law_enf_military_operator=law_enf_military_operator,
    crime=crime,
    event_record_other=event_record_other,
    other_species=other_species,
    arts=arts,
    sports=sports,
)

<IPython.core.display.Javascript object>

#### Extracting `known_for` Categories and `cause_of_death` Values from `info_3_1`

In [43]:
%%time

# Column to check
column = 'info_3_1'

# Rows whose value in the column is not missing
dataframe = df[df[column].notna()]

# Pull each listed cause out of the column text and record it in cause_of_death
for cause in cause_of_death:
    for index in dataframe.index:
        item = df.loc[index, column]
        # Nothing to do for blank text or text that does not mention this cause
        if not item or cause not in item:
            continue
        recorded = df.loc[index, 'cause_of_death']
        # Append to an existing cause with '/', otherwise start the value
        df.loc[index, 'cause_of_death'] = (recorded + '/' + cause) if recorded else cause
        df.loc[index, column] = item.replace(cause, '').strip()

# Flag the category of each listed role found, then strip the role from the text
for category, category_lst in known_for_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item and role in item:
                df.loc[index, category] = 1
                df.loc[index, column] = item.replace(role, '').strip()

# num_categories is the row-wise sum of the category flag columns
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Report how many entries now have a cause_of_death value
print(f'There are {df["cause_of_death"].notna().sum()} values in cause_of_death column.\n')

There are 25134 values in cause_of_death column.

CPU times: total: 18.7 s
Wall time: 19 s


<IPython.core.display.Javascript object>

#### Checking Updated `num_categories` Value Counts

In [44]:
# Checking updated num_categories value counts: how many entries carry each
# number of known_for category flags after this extraction pass
df["num_categories"].value_counts()

1    84646
2    12624
3      755
0        8
4        7
Name: num_categories, dtype: int64

<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` and `cause_of_death` for the next iteration.

#### Finding `known_for` Roles and `cause_of_death` in `info_3_1`

In [45]:
# # Obtaining values for column and their counts
# roles_cause_list = df["info_3_1"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [46]:
# # Code to check each value
# roles_cause_list.pop()

<IPython.core.display.Javascript object>

In [47]:
# # Create specific_roles_cause_list for above popped value
# specific_roles_cause_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_3_1"].notna()].index
#             if "mixed doubles" in df.loc[index, "info_3_1"]
#             #             and df.loc[index, "politics_govt_law"] == 0
#             #             and df.loc[index, "arts"] == 0
#             #             and df.loc[index, "sciences"] == 0
#             #             and df.loc[index, "spiritual"] == 0
#             #             and df.loc[index, "law_enf_military_operator"] == 0
#             #             and df.loc[index, "sports"] == 0
#             #             and df.loc[index, "event_record_other"] == 0
#             #             and df.loc[index, "crime"] == 0
#             #             and df.loc[index, "academia_humanities"] == 0
#             #             and df.loc[index, "business_farming"] == 0
#             #                         and df.loc[index, "other_species"] == 0
#             #             and df.loc[index, "num_categories"] == 0
#         ],
#         "info_3_1",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [48]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_cause_list, key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [49]:
# # Example code to quick-check a specific entry
# df[df["info_3_1"] == "mixed doubles"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category and for `cause_of_death`

In [50]:
# Creating lists for each category and sorting by decreasing length and removing duplicates
# Each list is de-duplicated with set() and ordered longest-first, so that the extraction
# cell removes a longer phrase (e.g. "blues saxophonist") before a shorter phrase that it
# contains (e.g. "saxophonist").  Ties in length are broken alphabetically so the order is
# identical on every kernel run (set iteration order for strings is not stable across runs).
# NOTE(review): the extraction cell matches entries as plain substrings, so very short
# entries such as "AM" or "neo" also match inside longer words -- confirm that is intended.

politics_govt_law = [
    "Vice President",
    "first president of the Republic of Northern",
    "president of the Chamber of Deputies",
    "president of the Democratic Union",
    "son of president Hafez al Assad",
    "an advisor to several presidents",
    "Gerakan Party founding president",
    "president of Johnny & Associates",
    "president of Research Council",
    "founding member of Amnesty International",
    "member of the Assembly of Experts",
    "Legislative Assembly member",
    "member of Parliament",
    "member of the CAE",
    "cabinet member",
    "military junta member",
    "lesbian policy",
    "employment",
    "conscription",
    "Pro President of City State",
    "President of Biafra",
    "Guidance Department of the Workers' Party of Korea",
    "last Socialist Party of mayor of a major city",
    "Communist Party Chief of Henan province",
    "the Earth Party",
    "Freedoms Party",
    "Life Party",
    "neo",
    "Mid Kent",
    "government co minister in the internal settlement government of",
    "minister in the Whitlam government",
    "advisor to two prime ministers",
    "twice minister of the interior",
    "twice deputy prime minister",
    "former foreign minister",
    "minister of Education",
    "deputy prime minister",
    "foreign minister of",
    "government minister",
    "cabinet minister",
    "Central Fife",
    "expert of affairs",
    "founding member of Amnesty International",
    "AM",
    "last Socialist Party of mayor of a major city",
    "former mayor of Bay City",
    "press secretary for Ronald Reagan",
    "under secretary of the Congregation for the Evangelization of Peoples",
    "secretary in charge of nuclear weapons",
    "personal secretary of Saddam Hussein",
    "conscientious objector",
    "first president of the Republic of Northern",
    "jurist",
    "Leader Of Islami Shashontantra Andolan",
    "supervisor of elections",
    "Commerce",
    "Guidance Department of the Workers' Party of Korea",
    "last Nawab of Pataudi",
    "Apostolic Nuncio to the",
]
politics_govt_law = sorted(set(politics_govt_law), key=lambda role: (-len(role), role))

arts = [
    "acting",
    "Frances Dee",
    "member of the Rock Steady Crew",
    "Billie Burke",
    "popularizer of the Chicken Dance",
    "Life President of the Sydney International Piano Competition",
    "draughtswoman",
    "CEO of Warner Bros Records",
    "CEO of Ruth Eckerd Hall",
    "CEO of EMI Worldwide",
    "CEO of Off White",
    "CEO of Fox News",
    "CEO of NBC",
    "Tony Orlando",
    "John Coltrane",
    "creator of Sudoku",
    "chef who worked as Michael Barry",
    "master chef",
    "chef",
    "blues saxophonist",
    "saxophonist",
    "Helene Weigel",
    "nightclub owner",
    "gallery owner",
    "club owner",
    "librettist",
    "Larbey",
    "caricaturist",
    "WABC TV anchorman",
    "news anchor",
    "Warner Bros shorts from his tenure at",
    "the Daleks",
    'David Letterman sidekick known as Larry "Bud" Melman',
    "dancing partner of Buddy Ebsen",
    'Bows"',
    "Wood Quay",
    "illustrated",
    "arts patron",
    "horticulturalist",
    "horticulturist",
    "proponent of Odissi dance",
    "dance",
]
arts = sorted(set(arts), key=lambda role: (-len(role), role))

sports = [
    "player",
    "president of the Lancashire County Cricket Club",
    "President of Cricket",
    "ARU president",
    "member of the IOC",
    "former WRU President",
    "Timberwolves",
    "owner breeder of thoroughbred race horses",
    "former owner of Baseball Astros",
    "owner of the Toronto Blue Jays",
    "owner of the Buffalo Sabres",
    "owner of San Antonio Spurs",
    "football franchise owner",
    "sports franchise owner",
    "football team owner",
    "sports team owner",
    "racehorse owner",
    "horse owner",
    "team owner",
    "Grand Prix race car driver",
    "race car driver",
    "helped found Wrestling",
    "rugby league",
    "Detroit Tigers between and",
    "cricket for",
    "cricketer",
    "Leigh",
    "cc World Championships",
    "golfer",
    "named Lake Vostok",
    "mixed doubles",
]
sports = sorted(set(sports), key=lambda role: (-len(role), role))

sciences = [
    "researcher",
    "engineer",
    "president of NASSCOM",
    "discoverer of LSD",
    "Yngling class",
    "Nobel prize laureate",
    "Nobel laureate",
    "Ted Hughes",
    "Production Space Center",
    "principal investigator for STIS on the Hubble Space Telescope",
    "the Marshall Space Flight Center",
    "Life breakfast cereals",
    "moths",
    "processes",
    "the term AI",
    "the Unix operating system",
]
sciences = sorted(set(sciences), key=lambda role: (-len(role), role))

business_farming = [
    "retailer",
    "President of Jerónimo Martins",
    "CEO of MVM Group",
    "former CEO of Wegmans Food Markets",
    "CEO of Scholastic Corporation",
    "CEO of Marriott International",
    "CEO of the Ford Motor Company",
    "CEO of Pacific Lumber Company",
    "CEO of International Group",
    "CEO of the Ford Foundation",
    "CEO of Braun Corporation",
    "CEO of Drummond Company",
    "CEO of Alaska Airlines",
    "CEO of Ashland Oil Inc",
    "CEO of General Motors",
    "CEO of ZeniMax Media",
    "CEO of Merrill Lynch",
    "CEO of Bear Stearns",
    "CEO of ITC Limited",
    "CEO of Jaguar Cars",
    "CEO of Paul Stuart",
    "CEO of Roadhouse",
    "CEO of Starbucks",
    "CEO of Trafigura",
    "CEO of Cintas",
    "CEO of Qualys",
    "CEO of Amtrak",
    "CEO of Vitol",
    "CEO of Atari",
    "CEO of Sony",
    "CEO of AOL",
    "CEO of IBM",
    "pioneer of low cost airlines",
    "owner of Parma Calcio",
    "owner of Tut By",
    "Anna Nicole Smith stepson",
]
business_farming = sorted(set(business_farming), key=lambda role: (-len(role), role))

academia_humanities = [
    "preservationist",
    "president of the Union Nationale Inter universitaire",
    "president of multiple colleges",
    "the genetic classification of languages",
    "Chancellor of UC Berkeley",
    "chair of the Wordsworth Trust",
    "contributor to the Auschwitz Protocol",
    "Women Studies pioneer",
    "supporter of Esperanto",
    "archivist",
]
academia_humanities = sorted(
    set(academia_humanities), key=lambda role: (-len(role), role)
)

law_enf_military_operator = [
    "resistance member during World War II",
    "member of the Joint Chiefs of Staff",
    "Order of recipient",
    "knight of the Military Order of William",
    "noted PoW",
    "defence minister",
    "WRAF",
    "pilot of the world smallest jet",
    "defence secretary of",
    "Aviation",
    "Cornwall Constabulary",
]
law_enf_military_operator = sorted(
    set(law_enf_military_operator), key=lambda role: (-len(role), role)
)

spiritual = [
    "two term president of the Southern Baptist Convention from to",
    "Humanistic Judaism movement",
    "Grand Master of the Sovereign Military Order of",
    "Baptist minister",
    "fundamentalist Christian minister",
    "baptist minister",
    "the Gulf",
]
spiritual = sorted(set(spiritual), key=lambda role: (-len(role), role))

social = [
    "founding member of World Wide Fund for Nature",
]
social = sorted(set(social), key=lambda role: (-len(role), role))

crime = []
crime = sorted(set(crime), key=lambda role: (-len(role), role))

event_record_other = ["world second oldest living person"]
event_record_other = sorted(
    set(event_record_other), key=lambda role: (-len(role), role)
)

other_species = ["Hurricane Katrina", "Preakness winner"]
other_species = sorted(set(other_species), key=lambda role: (-len(role), role))

cause_of_death = [
    "stomach aneurysm",
    "lymphoma",
    "melanoma",
    "coma",
    "anti depressants",
    "stomach aneurysm",
    "rupture of the pancreas",
    "injuries from a car accident",
    "neck injuries",
    "killed at a midnight Christmas Mass",
    "killed during the Second Intifada",
    "killed in",
    "scleroderma",
    "septic shock",
    "complications from Alzheimer",
    "drug related complications",
    "respiratory complications",
    "complications from",
    "hypothermia",
    "torture",
    "benzodiazepine intoxication",
    "alcohol intoxication",
    "drug intoxication",
    "blood clots",
    "pulmonary hemorrhage",
    "brain hemorrhage",
    "ran over",
]
cause_of_death = sorted(set(cause_of_death), key=lambda cause: (-len(cause), cause))

<IPython.core.display.Javascript object>

In [51]:
# Dropping info_3_1 value for entries with redundant category value
# Any entry whose text contains one of these keywords is blanked out entirely.
# The match is a literal substring test (regex=False); missing values never match.
for keyword in ("culture", "founder"):
    has_keyword = df["info_3_1"].str.contains(keyword, regex=False, na=False)
    df.loc[has_keyword, "info_3_1"] = ""

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [52]:
# Gathering the per-category role lists under their category column names
# (insertion order is the order in which the extraction cell walks the categories)
known_for_dict = dict(
    social=social,
    spiritual=spiritual,
    academia_humanities=academia_humanities,
    business_farming=business_farming,
    sciences=sciences,
    politics_govt_law=politics_govt_law,
    law_enf_military_operator=law_enf_military_operator,
    crime=crime,
    event_record_other=event_record_other,
    other_species=other_species,
    arts=arts,
    sports=sports,
)

<IPython.core.display.Javascript object>

#### Extracting `known_for` Categories and `cause_of_death` Values from `info_3_1`

In [53]:
%%time

# Column to check
column = 'info_3_1'

# Start dataframe
dataframe = df[df[column].notna()]

# For loop to find cause in column and extract it to cause_of_death
for cause in cause_of_death:
    for index in dataframe.index:
        item = df.loc[index, column]
        if item:
            if cause in item:
                if df.loc[index, 'cause_of_death']:
                    df.loc[index, 'cause_of_death'] = df.loc[index, 'cause_of_death'] + '/' + cause
                    df.loc[index, column] = item.replace(cause, '').strip()
                else:
                    df.loc[index, 'cause_of_death'] = cause
                    df.loc[index, column] = item.replace(cause, '').strip()
                
                
# For loop to find role in column and extract it as category
for category, category_lst in known_for_dict.items():
    for role in category_lst:
        for index in dataframe.index:
                item = df.loc[index, column]
                if item:
                    if role in item:
                        df.loc[index, category] = 1
                        df.loc[index, column] = item.replace(role, '').strip()

# Calculating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking number of cause_of_death values
print(f'There are {df["cause_of_death"].notna().sum()} values in cause_of_death column.\n')

There are 25136 values in cause_of_death column.

CPU times: total: 21.2 s
Wall time: 21.6 s


<IPython.core.display.Javascript object>

#### Checking Updated `num_categories` Value Counts

In [54]:
# Tallying how many entries carry each num_categories value after the update
df.loc[:, "num_categories"].value_counts()

1    84625
2    12624
3      776
0        8
4        7
Name: num_categories, dtype: int64

<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` and `cause_of_death` for the next iteration.

#### Finding `known_for` Roles and `cause_of_death` in `info_3_1`

In [811]:
# # Obtaining values for column and their counts
# roles_cause_list = df["info_3_1"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [810]:
# # Code to check each value
# value = roles_cause_list.pop()
# value

<IPython.core.display.Javascript object>

In [809]:
# # Create specific_roles_cause_list for above popped value
# specific_roles_cause_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_3_1"].notna()].index
#             if value in df.loc[index, "info_3_1"]
#         ],
#         "info_3_1",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [806]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_cause_list, key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [805]:
# # Example code to quick-check a specific entry
# df[df["info_3_1"] == value]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category and for `cause_of_death`

In [812]:
# Creating lists for each category and sorting by decreasing length and removing duplicates
# Each list is de-duplicated with set() and ordered longest-first, so that the extraction
# cell removes a longer phrase (e.g. "concert violinist") before a shorter phrase that it
# contains (e.g. "violinist").  Ties in length are broken alphabetically so the order is
# identical on every kernel run (set iteration order for strings is not stable across runs).
# NOTE(review): the extraction cell matches entries as plain substrings, so very short
# entries such as "VJ", "Life" or "Fili" also match inside longer words -- confirm that
# is intended.

politics_govt_law = [
    "Golders Green",
    "Zita of Bourbon Parma",
    "Public Policy Center",
    "Chairman of Millbrook First Nation",
    "Radnorshire",
    "Pacific Affairs",
    "Press Secretary to John Fitzgerald Kennedy",
    "key Watergate figure",
    "Social Welfare",
    "eventual nuclear disarmament advocate",
    "women rights advocate",
    "women rights",
    "Clinton",
    "former Secretary of State",
    "governor general of",
    "anti prohibitionist",
    "senior counsel",
    "Commander in Chief of the",
    "Marine Resources",
    "Information",
    "human rights campaigner",
    "Isleworth",
    "Refugee Board of",
    "Rajasthan",
    "mother of Benazir Bhutto",
    "Customs",
    "Welfare",
    "Heston",
    "Castleford",
    "Inverclyde",
    "Territories",
    "Prevention",
    "Traditional Affairs",
    "Eastern Stirlingshire",
    "Fife",
    "revolutionist of the independence movement",
    "Black Rod",
    "Recreation",
    "Tipton",
    "three time governor of",
    "governor of",
    "dictator",
    "Lord Advocate",
    "Treasury Solicitor",
    "Governor General of",
    "Hebei",
    "SACP",
]
politics_govt_law = sorted(set(politics_govt_law), key=lambda role: (-len(role), role))

arts = [
    "showman",
    "ballet master",
    "Twitter",
    "miniseries",
    "amateur illusionist",
    "nightclub proprietor",
    "Guitar Company",
    "Bad Writing Contest",
    '"Fergus Lamont"',
    "Mayor Phineas T Bluster",
    "ethnic community advocate",
    "Blake Prize",
    "smooth countrypolitan stylist of the s",
    "Sun Records",
    "Split Enz",
    "concert violinist",
    "violinist",
    "Eddie Albert",
    "former wife of Keith Moon",
    "partner of Sir Noël Coward",
    "widow of Cyril Fletcher",
    "draughtsman",
    "maestro",
    "speaker",
    "postage stamps",
    "D expert",
    "announcer",
    "The Rolling Stones to the",
    "art connoisseur",
    "planner",
    "embroiderer",
    "Cinema",
    "special effects supervisor",
    "George Burns",
    "tenor",
    "public relations expert",
    "multi instrumentalist",
    "instrumentalist",
    "frontman of",
    "soprano",
    "father of Prince",
]
arts = sorted(set(arts), key=lambda role: (-len(role), role))

sports = [
    "international master",
    "Washington Capitals",
    "Lions",
    "VJ",
    "breeder",
    "Tour de",
    "Northern",
    "Dallas Cowboys",
    "national team",
    "Sports",
    "Sport",
    "commissioner",
    "speedway",
    "referee",
    "brother of PM Justin Trudeau",
]
sports = sorted(set(sports), key=lambda role: (-len(role), role))

sciences = [
    "Telecommunications",
    "Tang drink mix",
    "doctor",
    "was at the forefront of work on Read Only Memory",
    "pioneer work on Stellar nucleosynthesis",
    "rehabilitation of disabled children",
    "hypnotist",
    "parasitology",
    "probability theory",
    "nutritionist",
    "research scientist",
]
sciences = sorted(set(sciences), key=lambda role: (-len(role), role))

business_farming = [
    "stock broker",
    "Pan Am",
    "Ovation",
    "Chairman of the Board at General Motors Corporation",
    "Chairman of the Chicago Board Options Exchange",
    "Chairman of Ford Motor Company",
    "Chairman of City National Bank",
    "Chairman of Johnson & Johnson",
    "Chairman of the Sowind Group",
    "Chairman of Target",
    "Chairman of Mobil",
    "Segway",
    "Pebble Beach",
    "CEO of the",
    "David Attenborough",
    "Chairman of Ford Motor Company",
    "financier",
]
business_farming = sorted(set(business_farming), key=lambda role: (-len(role), role))

academia_humanities = [
    "Hebrew literature",
    "Chairman of the Befaqul Madarisil Arabia",
    "first dictionary",
    "Life",
    "Director of the National Air",
    "preservation of endangered aboriginal languages",
    "the Academy of Arts",
]
academia_humanities = sorted(
    set(academia_humanities), key=lambda role: (-len(role), role)
)

law_enf_military_operator = [
    "spymaster",
    "commander of the Selous Scouts",
    "Times war",
    "Tactics",
    "War flight nurse",
    "Purple Heart",
    "flying ace",
    "Analysis Wing",
    "Army Materiel Command",
    "recovery effort after the September attacks",
    "typed Oskar Schindler list",
    "Hero of the Federation",
    "Medal of Honor recipient",
    "WWII Air Force pilot",
]
law_enf_military_operator = sorted(
    set(law_enf_military_operator), key=lambda role: (-len(role), role)
)

spiritual = [
    "New Testament",
    'Liberation" Catholic youth movement',
    "Archdeacon of Portsmouth",
    "Fili",
    "astrologer",
    "Archbishop of Maringá",
    "Archbishop of Toronto",
    "archbishop",
    "bishop",
    "Veliki Preslav",
    "Titular Bishop of Flenucleta",
    "Chakma raja",
    "reformer",
    "Edmonton",
    "muhaddith",
]
spiritual = sorted(set(spiritual), key=lambda role: (-len(role), role))

social = ["matron of Glasgow Victoria Infirmary"]
social = sorted(set(social), key=lambda role: (-len(role), role))

crime = ["illegal oil broker", "gangster"]
crime = sorted(set(crime), key=lambda role: (-len(role), role))

event_record_other = [
    "involved in Raymond Caruana murder",
    "oldest person ever born in",
    "oldest ever citizen",
    "inspiration for the movie character shootout",
]
event_record_other = sorted(
    set(event_record_other), key=lambda role: (-len(role), role)
)

other_species = [
    "Christmas Hurdle",
    "terrier mix",
    "mascot for the Trenton Thunder",
    "Breeders' Cup Turf",
    "broodmare",
    "Prix Vermeille",
]
other_species = sorted(set(other_species), key=lambda role: (-len(role), role))

cause_of_death = [
    "Stevens Johnson syndrome",
    "or cerebral edema",
    "pulmonary edema",
    "drowned",
    "blunt force trauma",
    "lacerations",
    "morphine",
    "coronary artery plaque",
    "Valium",
    "a broken hip",
    "oxycodone",
    "myelodysplastic syndromes",
]
cause_of_death = sorted(set(cause_of_death), key=lambda cause: (-len(cause), cause))

<IPython.core.display.Javascript object>

In [813]:
# Dropping info_3_1 value for entries with duplicate category value
# Any entry whose text contains "creator" is blanked out entirely.
# The match is a literal substring test (regex=False); missing values never match.
df.loc[df["info_3_1"].str.contains("creator", regex=False, na=False), "info_3_1"] = ""

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [814]:
# Gathering the per-category role lists under their category column names
# (insertion order is the order in which the extraction cell walks the categories)
known_for_dict = dict(
    social=social,
    spiritual=spiritual,
    academia_humanities=academia_humanities,
    business_farming=business_farming,
    sciences=sciences,
    politics_govt_law=politics_govt_law,
    law_enf_military_operator=law_enf_military_operator,
    crime=crime,
    event_record_other=event_record_other,
    other_species=other_species,
    arts=arts,
    sports=sports,
)

<IPython.core.display.Javascript object>

#### Extracting `known_for` Categories and `cause_of_death` Values from `info_3_1`

In [815]:
%%time

# Column to check
column = 'info_3_1'

# Start dataframe
dataframe = df[df[column].notna()]

# For loop to find cause in column and extract it to cause_of_death
for cause in cause_of_death:
    for index in dataframe.index:
        item = df.loc[index, column]
        if item:
            if cause in item:
                if df.loc[index, 'cause_of_death']:
                    df.loc[index, 'cause_of_death'] = df.loc[index, 'cause_of_death'] + '/' + cause
                    df.loc[index, column] = item.replace(cause, '').strip()
                else:
                    df.loc[index, 'cause_of_death'] = cause
                    df.loc[index, column] = item.replace(cause, '').strip()
                
                
# For loop to find role in column and extract it as category
for category, category_lst in known_for_dict.items():
    for role in category_lst:
        for index in dataframe.index:
                item = df.loc[index, column]
                if item:
                    if role in item:
                        df.loc[index, category] = 1
                        df.loc[index, column] = item.replace(role, '').strip()

# Calculating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking number of cause_of_death values
print(f'There are {df["cause_of_death"].notna().sum()} values in cause_of_death column.\n')

There are 25136 values in cause_of_death column.

CPU times: total: 22.2 s
Wall time: 23.8 s


<IPython.core.display.Javascript object>

#### Checking Updated `num_categories` Value Counts

In [816]:
# Tallying how many entries carry each num_categories value after the update
df.loc[:, "num_categories"].value_counts()

1    84607
2    12630
3      788
0        8
4        7
Name: num_categories, dtype: int64

<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` and `cause_of_death` for the next iteration.

#### Finding `known_for` Roles and `cause_of_death` in `info_3_1`

In [818]:
# # Obtaining values for column and their counts
# roles_cause_list = df["info_3_1"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [1669]:
# # Code to check each value
# value = roles_cause_list.pop()
# value

<IPython.core.display.Javascript object>

In [1670]:
# # Create specific_roles_cause_list for above popped value
# specific_roles_cause_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_3_1"].notna()].index
#             if value in df.loc[index, "info_3_1"]
#         ],
#         "info_3_1",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [1671]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_cause_list, key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [1672]:
# # Example code to quick-check a specific entry
# df[df["info_3_1"] == value]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category and for `cause_of_death`

In [1674]:
# Creating lists for each category and sorting by decreasing length and removing duplicates
# Each list is de-duplicated with set() and ordered longest-first, so that the extraction
# cell removes a longer phrase (e.g. "theatre instructor") before a shorter phrase that it
# contains (e.g. "instructor").  Ties in length are broken alphabetically so the order is
# identical on every kernel run (set iteration order for strings is not stable across runs).
# NOTE(review): the extraction cell matches entries as plain substrings, so very short
# entries such as "All", "Food" or "fats" also match inside longer words -- confirm that
# is intended.

politics_govt_law = [
    "Promotion of Employment",
    "Employment",
    "Sale East",
    "Levies",
    "United Nations",
    "ZANLA",
    "OPEC",
    "Chief Judge of the Supreme Court",
    "Wadiyar dynasty",
    "South Tyrone",
    "Marxist",
    "Services",
    "President of",
    "sister of Robert Mugabe",
    "MSP for Strathkelvin",
    "Middleton",
    "public health expert",
    "Hillsborough",
    "Dublin West",
    "Poplar",
    "legalisation campaigner",
    "campaigner",
    "Arthabaska",
    "Loudoun",
    "Thrace",
    "trans nations builder",
    "Chatham",
    "Exchange Commission",
    "Royton",
    "Bohus",
    "Food",
    "Natural Gas",
    "Nixon Enemy",
    "Redditch",
    "Public Service Reform",
    "NTUC",
    "civil rights litigator",
    "aristocrat",
    "zionist",
    "Minister",
    "Native rights",
    "Commercial Workers Int'l",
    "segregationist",
    "legislator",
    "animal rights advocate",
    "civil rights advocat",
    "Solicitor General of the",
    "Elena Ceaușescu",
    "courtier",
    "pretender to the throne",
    "labor organizer",
    "banking regulator",
    "State Treasurer for West Virginia",
    "economic affairs",
    "Urban Habitat",
    "gay rights campaigner",
]
politics_govt_law = sorted(set(politics_govt_law), key=lambda role: (-len(role), role))

arts = [
    "piper",
    "Dean",
    "other game shows",
    "mother of Daniel Day Lewis",
    "father of Nicolas Cage",
    "piano",
    "Premio Campiello recipient",
    "humourist",
    "stage practitioner",
    "CEO of",
    "muralist",
    "beauty queen",
    "clown",
    "Roses",
    "won nine Grammy Awards",
    "Cologne",
    "the Road Runner",
    "antiquities dealer",
    "orator",
    "auctioneer",
    "TV scripts",
    "radio",
    "baritone",
    "Eva Gabor",
    "stamp auctioneer",
    "advocate of the arts",
    "art supporter",
    "Sci Fi movies",
    "Charlie Parker",
    "furniture maker",
    "Apollo Theater",
    "De Facto",
    "companion of Gore Vidal",
    "drummer",
    "harpsichordist",
    "non fiction on Native s",
    '" Mogul"',
    "Ella Fitzgerald",
    "magazine covers",
    "produced Top albums",
    "six time Academy Awards nominee",
]
arts = sorted(set(arts), key=lambda role: (-len(role), role))

sports = [
    "surfer",
    "Giro d'Italia in",
    "waterskier",
    "sailmaker",
    "alpinist",
    "long jumper",
    "father of Hugh Laurie",
    "athlete",
    "javelin thrower",
    "mountaineer",
    "ski orienteer",
    "the Philadelphia Phillies",
    "Mike Tyson",
    "world champion",
    "climber",
]
sports = sorted(set(sports), key=lambda role: (-len(role), role))

sciences = [
    "chemist",
    "wear fabrics",
    "assigned use of @ sign",
    "motorcycle builder",
    "paranormalist",
    "the Executive Director of the Gerontology Research Group",
    "the folding ping pong table",
    "the Nanchang J",
    "pediatrician",
    "statistics",
    "Carboniferous periods",
    "patent holder",
    'coined the term "transistor"',
    "design scientist",
    "color vision",
    "chloroplast biology",
    "quantum optics",
    "fats",
    "the likelihood principle",
    "solar plasma spectroscopy",
    "fractals",
]
sciences = sorted(set(sciences), key=lambda role: (-len(role), role))

business_farming = [
    "Esprit",
    "Goodyear",
    "billionaire",
    "Popsicles",
    "restaurant proprietor",
    "corporate raider",
    "viticulturist",
]
business_farming = sorted(set(business_farming), key=lambda role: (-len(role), role))

academia_humanities = [
    "semiotician",
    "theatre instructor",
    "instructor",
    "language",
    "a specialist in the Berber languages",
    "Heritage Center",
    "reading advocate",
    "cultural analysis",
    "Syriacist",
    "an expert on Pascal",
]
academia_humanities = sorted(
    set(academia_humanities), key=lambda role: (-len(role), role)
)

law_enf_military_operator = [
    "Atmospheric ed Officer Corps",
    "G I Joe",
    "Chief of the Navy General Staff",
    "Border Patrol",
    "missions",
    "Defence Staff Chief",
    "mercenary",
    "Army Ranger",
    "linked to al Qaeda",
    "war hero",
    "aquanaut",
    "CIA operative",
    "Al Itihaad al Islamiya",
    "test pilot",
    "militant for the independence of",
    "Federal Bureau of Investigation informer",
    "World War II OSS operative",
    "Management at the U.S. Department of Defense",
    "a confessed spy for the KGB",
    "Corregidor",
    "last known person to see Anne Franke",
    "World War II war",
]
law_enf_military_operator = sorted(
    set(law_enf_military_operator), key=lambda role: (-len(role), role)
)

spiritual = [
    "Furness",
    "Malankara Metropolitan",
    "The Isles",
    "All",
    "Eastern",
    "Ossory",
    "Biblical revisionist",
    "Byblos",
    "Bishop of Nitra",
    "creationist",
    "father of Christian Reconstructionism",
    "Islamic",
    "rabbi",
]
spiritual = sorted(set(spiritual), key=lambda role: (-len(role), role))

social = ["the Odebrecht Foundation", "Chicago", "Family Centres"]
social = sorted(set(social), key=lambda role: (-len(role), role))

crime = [
    "loan crisis",
    "firearms possession",
    "crimes against humanity during the Civil War",
    "rapist",
]
crime = sorted(set(crime), key=lambda role: (-len(role), role))

event_record_other = [
    "supercentenarian",
    "renounced the office of",
    "father of Balthazar Getty",
    "envoy to the Republic",
    "yoga adept",
]
event_record_other = sorted(
    set(event_record_other), key=lambda role: (-len(role), role)
)

other_species = ["Racehorse of the Year"]
other_species = sorted(set(other_species), key=lambda role: (-len(role), role))

cause_of_death = [
    "exhaustion",
    "hoof inflammation",
    "helmet to shield others from a grenade explosion",
    "methyl alcohol poisoning",
    "acute respiratory distress syndrome",
    "craniocerebral trauma",
    "asthma",
]
cause_of_death = sorted(set(cause_of_death), key=lambda cause: (-len(cause), cause))

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [1675]:
# Gathering the per-category role lists under their category column names
# (insertion order is the order in which the extraction cell walks the categories)
known_for_dict = dict(
    social=social,
    spiritual=spiritual,
    academia_humanities=academia_humanities,
    business_farming=business_farming,
    sciences=sciences,
    politics_govt_law=politics_govt_law,
    law_enf_military_operator=law_enf_military_operator,
    crime=crime,
    event_record_other=event_record_other,
    other_species=other_species,
    arts=arts,
    sports=sports,
)

<IPython.core.display.Javascript object>

#### Extracting `known_for` Categories and `cause_of_death` Values from `info_3_1`

In [1676]:
%%time

# Column to check
column = 'info_3_1'

# Start dataframe
dataframe = df[df[column].notna()]

# For loop to find cause in column and extract it to cause_of_death
for cause in cause_of_death:
    for index in dataframe.index:
        item = df.loc[index, column]
        if item:
            if cause in item:
                if df.loc[index, 'cause_of_death']:
                    df.loc[index, 'cause_of_death'] = df.loc[index, 'cause_of_death'] + '/' + cause
                    df.loc[index, column] = item.replace(cause, '').strip()
                else:
                    df.loc[index, 'cause_of_death'] = cause
                    df.loc[index, column] = item.replace(cause, '').strip()
                
                
# For loop to find role in column and extract it as category
for category, category_lst in known_for_dict.items():
    for role in category_lst:
        for index in dataframe.index:
                item = df.loc[index, column]
                if item:
                    if role in item:
                        df.loc[index, category] = 1
                        df.loc[index, column] = item.replace(role, '').strip()

# Calculating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking number of cause_of_death values
print(f'There are {df["cause_of_death"].notna().sum()} values in cause_of_death column.\n')

There are 25136 values in cause_of_death column.

CPU times: total: 17.5 s
Wall time: 18.1 s


<IPython.core.display.Javascript object>

#### Checking Updated `num_categories` Value Counts

In [1677]:
# Number of entries for each num_categories value after the extraction pass
df.loc[:, "num_categories"].value_counts()

1    84577
2    12639
3      808
0        8
4        8
Name: num_categories, dtype: int64

<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` and `cause_of_death` for the next iteration.

#### Finding `known_for` Roles and `cause_of_death` in `info_3_1`

In [2604]:
# # Obtaining values for column and their counts
# roles_cause_list = df["info_3_1"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [2606]:
# # Code to check each value
# value = roles_cause_list.pop()
# value

<IPython.core.display.Javascript object>

In [2607]:
# # Create specific_roles_cause_list for above popped value
# specific_roles_cause_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_3_1"].notna()].index
#             if value in df.loc[index, "info_3_1"]
#         ],
#         "info_3_1",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [2608]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_cause_list, key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [2609]:
# # Example code to quick-check a specific entry
# df[df["info_3_1"] == value]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category and for `cause_of_death`

In [2610]:
# Search terms per category and for cause_of_death.
# Each list is de-duplicated and ordered longest-first after it is defined.

politics_govt_law = [
    "Fine Arts",
    "MLA for Haidergarh",
    "Skegness",
    "lobbyist",
    "Home Affairs",
    "nd Districts",
    "Receiver General of Massachusetts",
    "New",
    "Industrial Research",
    "International Security Affairs",
    "Wisconsin State Assembly",
    "government advisor",
    "Korea",
    "Mines",
    "Legal Affairs Commission",
    "IAWG",
    "Nicobar Island",
    "Patagonia",
    "Livestock",
    "Nevis",
    "counsel",
    "Wallington",
    "Family Affairs",
    "Camborne",
    "Constitutional Convention",
    "Industrial Research",
    "Research",
    "Aged Care",
    "Spenborough",
    "Spen",
    "Arab tolerance",
    "East Perthshire",
    "social insurance",
    "information",
    "sports",
    "Bihar MLC",
    "shipping",
    "energy",
    "mineral resources",
    "treasury solicitor",
    "emigrants",
    "Social Council",
    "human services",
    "public affairs",
    "industry",
    "family welfare",
    "monopolies",
    "co prince of",
    "treasurer of Queensland",
    "the International Center on Nonviolent Conflict",
    "whistleblower",
    "three term TD",
    "environment protection",
    "urbanism",
    "Sarawak",
    "tourism",
    "science",
    "foreign affairs of the PDNC",
    "social security",
    "plenipotentiary of",
    "signatory of the Act of the Re Establishment of the State of",
    "commerce",
    "Chippewa Indians",
    "literacy",
    "coiner of country name",
    "Haryana High Court",
    "Mass Education",
    "Drugs",
    "Acting Surgeon General of the",
    "Democracy",
    "Co Prince of",
    "Leader of the Opposition",
    "chief ideologist of Khmer Rouge",
    "Treasurer of UKIP",
    "Social Affairs",
    "Deputy Speaker of Parliament",
    "animal rights",
    "Political Rights Association",
    "Transportation",
]
politics_govt_law = sorted(set(politics_govt_law), key=len, reverse=True)

arts = [
    "mixer",
    "organist",
    "flautist",
    "society figure",
    "gossip celebrity",
    "trombonist",
    "personal assistant to",
    "art",
    "Colossus Records",
    "Imagineer",
    "cook",
    "gardener",
    "talk show host",
    "clarinetist",
    "book binder",
    "talkback host",
    "accordionist",
    "televangelist",
]
arts = sorted(set(arts), key=len, reverse=True)

sports = [
    "sport fisherman",
    "Commonwealth Games champion",
    "sportsman",
    "larger than life persona",
    "Thoroughbred racehorse",
    "Colorado Avalanche",
    "football",
    "World Hockey Association",
    "Rushden & Diamonds F C",
    "jetpack pilot",
    "Atlanta Braves",
    "racehorse",
    "Open women doubles champion",
    "the Washington Wizards",
    "lawn bowler",
]
sports = sorted(set(sports), key=len, reverse=True)

sciences = [
    "asteroids",
    "the Arctic",
    "soil science",
    "HQ Trivia",
    "LBO",
    "the Draper Prize",
    "pharmacist",
    "builder",
    "electron spin resonance spectroscopy",
    "organ transplantation",
    "chemotherapy",
    "medical scientist",
    "scientist",
    "discoverer of REM sleep",
    "statistician",
    "medical scientist",
    "medical",
    "Academy of Engineering",
    "Yaogan satellites",
    "first flight simulator",
    "Computational Chemists",
    "developer of the image scanner",
    "engine",
]
sciences = sorted(set(sciences), key=len, reverse=True)

business_farming = [
    "HNA Group",
    "Semapa",
    "the CITIC Group",
    "marketer",
    "Yulon",
    "Roland Corporation",
    "grazier",
    "SGV & Company",
    "real estate developer",
    "chairman of Primark",
    "EasyJet",
    "Sun International",
]
business_farming = sorted(set(business_farming), key=len, reverse=True)

academia_humanities = [
    "design at the Museum of Modern Art",
    "Education Center",
    "CQUT",
    "orientalist",
    "celebrated the evolution of type",
    "Albert Museum",
    "president",
    "Tape Archive",
    "archdruid",
    "dean of the Padma Seshadri Bala Bhavan",
    "Design",
    "Surgeons",
    "study",
]
academia_humanities = sorted(set(academia_humanities), key=len, reverse=True)

law_enf_military_operator = [
    "Artillery",
    "recovery following",
    "adviser to the National Security Council",
    "Springhill mining disasters",
    "CICPC investigator",
    "PIRA",
    "the No Fighter Squadrons",
    "spy",
    "paramilitary",
    "aviator",
    "Guangzhou Military Region Air Forces",
    "Marshal of the Union",
    "tank commander",
    "Security Service",
    "Air Defence Forces Army",
    "Artillery",
    "chief of Haro lockdown",
]
law_enf_military_operator = sorted(
    set(law_enf_military_operator), key=len, reverse=True
)

spiritual = [
    "theistic evolutionist",
    "Edinburgh",
    "Wells",
    "Mashonaland",
    "pagan revivalist",
    "Jews",
    "Kilmacduagh",
    "Pihopa o Aotearoa",
    "Opus Dei",
    "preacher",
    "esotericist",
    "Forzol",
    "Fort de",
    "Brighton",
    "Herzegovina",
    "Lorraine",
    "Slutsk",
    "Prizren",
    "Chukotka",
    "St Vincent",
    "Kuruman",
    "Raphoe",
    "Minneapolis",
    "Ipswich",
    "Suzdal",
]
spiritual = sorted(set(spiritual), key=len, reverse=True)

social = []
social = sorted(set(social), key=len, reverse=True)

crime = ["informant", "human rights violator", "embezzlement", "kidnapper"]
crime = sorted(set(crime), key=len, reverse=True)

event_record_other = [
    "last living verified person born in the th century",
    "kidnap victim",
    "parental rights case",
    "imprisoned by",
    "murder",
    "the Siege of Sarajevo",
]
event_record_other = sorted(set(event_record_other), key=len, reverse=True)

other_species = ["Ascot Gold Cup", "Derby winner", "hatched in captivity"]
other_species = sorted(set(other_species), key=len, reverse=True)

cause_of_death = [
    "ALS",
    "esophageal dysphagia",
    "swine flu",
    "brain trauma",
    "respiratory stress syndrome",
    "chronic traumatic encephalopathy",
    "strangled",
    "brutality",
    "traffic collision",
]
cause_of_death = sorted(set(cause_of_death), key=len, reverse=True)

<IPython.core.display.Javascript object>

In [2611]:
# Hard-coding cause_of_death for the entry identified by its Wikipedia link.
# The boolean mask targets exactly the rows whose link matches, rather than
# looking up index labels first and then selecting by label.
df.loc[
    df["link"] == "https://en.wikipedia.org/wiki/Regis_Korchinski-Paquet",
    "cause_of_death",
] = "possible police interference"

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [2612]:
# Map each known_for category column to its list of role search terms.
# Key order is kept exactly as before, because categories are processed in dict order.
known_for_dict = dict(
    social=social,
    spiritual=spiritual,
    academia_humanities=academia_humanities,
    business_farming=business_farming,
    sciences=sciences,
    politics_govt_law=politics_govt_law,
    law_enf_military_operator=law_enf_military_operator,
    crime=crime,
    event_record_other=event_record_other,
    other_species=other_species,
    arts=arts,
    sports=sports,
)

<IPython.core.display.Javascript object>

#### Extracting `known_for` Categories and `cause_of_death` Values from `info_3_1`

In [2613]:
%%time

# Column to check
column = 'info_3_1'

# Start dataframe
dataframe = df[df[column].notna()]

# For loop to find cause in column and extract it to cause_of_death
for cause in cause_of_death:
    for index in dataframe.index:
        item = df.loc[index, column]
        if item:
            if cause in item:
                if df.loc[index, 'cause_of_death']:
                    df.loc[index, 'cause_of_death'] = df.loc[index, 'cause_of_death'] + '/' + cause
                    df.loc[index, column] = item.replace(cause, '').strip()
                else:
                    df.loc[index, 'cause_of_death'] = cause
                    df.loc[index, column] = item.replace(cause, '').strip()
                
                
# For loop to find role in column and extract it as category
for category, category_lst in known_for_dict.items():
    for role in category_lst:
        for index in dataframe.index:
                item = df.loc[index, column]
                if item:
                    if role in item:
                        df.loc[index, category] = 1
                        df.loc[index, column] = item.replace(role, '').strip()

# Calculating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking number of cause_of_death values
print(f'There are {df["cause_of_death"].notna().sum()} values in cause_of_death column.\n')

There are 25137 values in cause_of_death column.

CPU times: total: 21.9 s
Wall time: 22.9 s


<IPython.core.display.Javascript object>

#### Checking Updated `num_categories` Value Counts

In [2614]:
# Number of entries for each num_categories value after the extraction pass
df.loc[:, "num_categories"].value_counts()

1    84567
2    12636
3      820
4        9
0        8
Name: num_categories, dtype: int64

<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` and `cause_of_death` for the next iteration.

#### Finding `known_for` Roles and `cause_of_death` in `info_3_1`

In [811]:
# Distinct info_3_1 values ordered from least to most frequent
roles_cause_list = df["info_3_1"].value_counts(ascending=True).index.to_list()

<IPython.core.display.Javascript object>

In [810]:
# Take the last remaining value off the list and display it
value = roles_cause_list.pop(-1)
value

<IPython.core.display.Javascript object>

In [809]:
# Distinct info_3_1 values that contain the popped value, most frequent first.
# regex=False keeps this a literal substring test; na=False excludes missing entries.
specific_roles_cause_list = (
    df.loc[df["info_3_1"].str.contains(value, regex=False, na=False), "info_3_1"]
    .value_counts()
    .index.tolist()
)

<IPython.core.display.Javascript object>

In [806]:
# Longest entries first, for copying into the lists below and screening values
sorted(specific_roles_cause_list, key=len, reverse=True)

<IPython.core.display.Javascript object>

In [805]:
# Quick check of the rows whose info_3_1 is exactly the popped value
df.loc[df["info_3_1"].eq(value)]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category and for `cause_of_death`

In [None]:
# Empty search-term lists for the next iteration.
# Each list is de-duplicated and ordered longest-first after it is defined.

politics_govt_law = []
politics_govt_law = sorted(set(politics_govt_law), key=len, reverse=True)

arts = []
arts = sorted(set(arts), key=len, reverse=True)

sports = []
sports = sorted(set(sports), key=len, reverse=True)

sciences = []
sciences = sorted(set(sciences), key=len, reverse=True)

business_farming = []
business_farming = sorted(set(business_farming), key=len, reverse=True)

academia_humanities = []
academia_humanities = sorted(set(academia_humanities), key=len, reverse=True)

law_enf_military_operator = []
law_enf_military_operator = sorted(
    set(law_enf_military_operator), key=len, reverse=True
)

spiritual = []
spiritual = sorted(set(spiritual), key=len, reverse=True)

social = []
social = sorted(set(social), key=len, reverse=True)

crime = []
crime = sorted(set(crime), key=len, reverse=True)

event_record_other = []
event_record_other = sorted(set(event_record_other), key=len, reverse=True)

other_species = []
other_species = sorted(set(other_species), key=len, reverse=True)

cause_of_death = []
cause_of_death = sorted(set(cause_of_death), key=len, reverse=True)

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [None]:
# Map each known_for category column to its list of role search terms.
# Key order is kept exactly as before, because categories are processed in dict order.
known_for_dict = dict(
    social=social,
    spiritual=spiritual,
    academia_humanities=academia_humanities,
    business_farming=business_farming,
    sciences=sciences,
    politics_govt_law=politics_govt_law,
    law_enf_military_operator=law_enf_military_operator,
    crime=crime,
    event_record_other=event_record_other,
    other_species=other_species,
    arts=arts,
    sports=sports,
)

#### Extracting `known_for` Categories and `cause_of_death` Values from `info_3_1`

In [None]:
%%time

# Column to check
column = 'info_3_1'

# Start dataframe
dataframe = df[df[column].notna()]

# For loop to find cause in column and extract it to cause_of_death
for cause in cause_of_death:
    for index in dataframe.index:
        item = df.loc[index, column]
        if item:
            if cause in item:
                if df.loc[index, 'cause_of_death']:
                    df.loc[index, 'cause_of_death'] = df.loc[index, 'cause_of_death'] + '/' + cause
                    df.loc[index, column] = item.replace(cause, '').strip()
                else:
                    df.loc[index, 'cause_of_death'] = cause
                    df.loc[index, column] = item.replace(cause, '').strip()
                
                
# For loop to find role in column and extract it as category
for category, category_lst in known_for_dict.items():
    for role in category_lst:
        for index in dataframe.index:
                item = df.loc[index, column]
                if item:
                    if role in item:
                        df.loc[index, category] = 1
                        df.loc[index, column] = item.replace(role, '').strip()

# Calculating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking number of cause_of_death values
print(f'There are {df["cause_of_death"].notna().sum()} values in cause_of_death column.\n')

#### Checking Updated `num_categories` Value Counts

In [None]:
# Number of entries for each num_categories value after the extraction pass
df.loc[:, "num_categories"].value_counts()

#### Observations:
- We will proceed to rebuild `known_for_dict` and `cause_of_death` for the next iteration.

In [1678]:
# Printed confirmation that this cell has run
print("dunzo!")

# Sound notification when cell executes
chime.success()

dunzo!


<IPython.core.display.Javascript object>

#### Finding `known_for` Roles and `cause_of_death` in `info_3_1`

In [811]:
# Distinct info_3_1 values ordered from least to most frequent
roles_cause_list = df["info_3_1"].value_counts(ascending=True).index.to_list()

<IPython.core.display.Javascript object>

In [810]:
# Take the last remaining value off the list and display it
value = roles_cause_list.pop(-1)
value

<IPython.core.display.Javascript object>

In [809]:
# Distinct info_3_1 values that contain the popped value, most frequent first.
# regex=False keeps this a literal substring test; na=False excludes missing entries.
specific_roles_cause_list = (
    df.loc[df["info_3_1"].str.contains(value, regex=False, na=False), "info_3_1"]
    .value_counts()
    .index.tolist()
)

<IPython.core.display.Javascript object>

In [806]:
# Longest entries first, for copying into the lists below and screening values
sorted(specific_roles_cause_list, key=len, reverse=True)

<IPython.core.display.Javascript object>

In [805]:
# Quick check of the rows whose info_3_1 is exactly the popped value
df.loc[df["info_3_1"].eq(value)]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category and for `cause_of_death`

In [None]:
# Empty search-term lists for the next iteration.
# Each list is de-duplicated and ordered longest-first after it is defined.

politics_govt_law = []
politics_govt_law = sorted(set(politics_govt_law), key=len, reverse=True)

arts = []
arts = sorted(set(arts), key=len, reverse=True)

sports = []
sports = sorted(set(sports), key=len, reverse=True)

sciences = []
sciences = sorted(set(sciences), key=len, reverse=True)

business_farming = []
business_farming = sorted(set(business_farming), key=len, reverse=True)

academia_humanities = []
academia_humanities = sorted(set(academia_humanities), key=len, reverse=True)

law_enf_military_operator = []
law_enf_military_operator = sorted(
    set(law_enf_military_operator), key=len, reverse=True
)

spiritual = []
spiritual = sorted(set(spiritual), key=len, reverse=True)

social = []
social = sorted(set(social), key=len, reverse=True)

crime = []
crime = sorted(set(crime), key=len, reverse=True)

event_record_other = []
event_record_other = sorted(set(event_record_other), key=len, reverse=True)

other_species = []
other_species = sorted(set(other_species), key=len, reverse=True)

cause_of_death = []
cause_of_death = sorted(set(cause_of_death), key=len, reverse=True)

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [None]:
# Map each known_for category column to its list of role search terms.
# Key order is kept exactly as before, because categories are processed in dict order.
known_for_dict = dict(
    social=social,
    spiritual=spiritual,
    academia_humanities=academia_humanities,
    business_farming=business_farming,
    sciences=sciences,
    politics_govt_law=politics_govt_law,
    law_enf_military_operator=law_enf_military_operator,
    crime=crime,
    event_record_other=event_record_other,
    other_species=other_species,
    arts=arts,
    sports=sports,
)

#### Extracting `known_for` Categories and `cause_of_death` Values from `info_3_1`

In [None]:
%%time

# Column to check
column = 'info_3_1'

# Start dataframe
dataframe = df[df[column].notna()]

# For loop to find cause in column and extract it to cause_of_death
for cause in cause_of_death:
    for index in dataframe.index:
        item = df.loc[index, column]
        if item:
            if cause in item:
                if df.loc[index, 'cause_of_death']:
                    df.loc[index, 'cause_of_death'] = df.loc[index, 'cause_of_death'] + '/' + cause
                    df.loc[index, column] = item.replace(cause, '').strip()
                else:
                    df.loc[index, 'cause_of_death'] = cause
                    df.loc[index, column] = item.replace(cause, '').strip()
                
                
# For loop to find role in column and extract it as category
for category, category_lst in known_for_dict.items():
    for role in category_lst:
        for index in dataframe.index:
                item = df.loc[index, column]
                if item:
                    if role in item:
                        df.loc[index, category] = 1
                        df.loc[index, column] = item.replace(role, '').strip()

# Calculating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking number of cause_of_death values
print(f'There are {df["cause_of_death"].notna().sum()} values in cause_of_death column.\n')

#### Checking Updated `num_categories` Value Counts

In [None]:
# Number of entries for each num_categories value after the extraction pass
df.loc[:, "num_categories"].value_counts()

#### Observations:
- We will proceed to rebuild `known_for_dict` and `cause_of_death` for the next iteration.

#### Verifying that Values in info_3_0 Are Exhausted

In [None]:
# # Verifying that `info_3_0` is exhausted
# df["info_3_0"].value_counts()

#### Dropping info_3_0

In [None]:
# # Dropping info_3_0
# df.drop("info_3_0", axis=1, inplace=True)

# # Checking sample
# df.sample()

#### Observations:
- Our search of column info_3_0 is finished and we have dropped that column.
- We will now save our dataset and pick back up in a new notebook.

### Exporting Dataset to SQLite Database [wp_life_expect_clean7.db]()

In [None]:
# # Exporting dataframe

# # Saving dataset in a SQLite database
# conn = sql.connect("wp_life_expect_clean7.db")
# df.to_sql("wp_life_expect_clean7", conn, index=False)

# # Chime notification when cell executes
# chime.success()

# [Proceed to Data Cleaning Part 8]()