# Wikipedia Notable Life Expectancies
# [Notebook 14: Data Cleaning Part 13](https://github.com/teresahanak/wikipedia-life-expectancy/blob/main/wp_life_expect_data_clean13_thanak_2022_08_07.ipynb)
### Context

The
### Objective

The
### Data Dictionary
- Feature: Description

### Importing Libraries

In [1]:
# To structure code automatically
%load_ext nb_black

# To import/export sqlite databases
import sqlite3 as sql

# To save/open python objects in pickle file
import pickle

# To help with reading, cleaning, and manipulating data
import pandas as pd
import numpy as np
import re

# To define maximum number of columns to be displayed in a dataframe
pd.set_option("display.max_columns", None)
# To define the maximum number of rows to be displayed in a dataframe
pd.set_option("display.max_rows", 200)

# To suppress warnings
# import warnings

# warnings.filterwarnings("ignore")

# To set some visualization attributes
pd.set_option("max_colwidth", 150)

# To play auditory cue when cell has executed, has warning, or has error and set chime theme
import chime

chime.theme("zelda")

<IPython.core.display.Javascript object>

## Data Overview

### Reading, Sampling, and Checking Data Shape

In [2]:
# Reading the wp_life_expect_clean12 table from its SQLite database file
conn = sql.connect("wp_life_expect_clean12.db")
data = pd.read_sql("SELECT * FROM wp_life_expect_clean12", conn)

# Working on a copy so that `data` stays exactly as it was read
df = data.copy()

# Reporting the dimensions of the working copy
n_rows, n_cols = df.shape
print(f"There are {n_rows} rows and {n_cols} columns.")

# Previewing the first 2 rows
df.head(2)

There are 98056 rows and 38 columns.


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
0,1,William Chappell,", 86, British dancer, ballet designer and director.",https://en.wikipedia.org/wiki/William_Chappell_(dancer),21,1994,January,,,,ballet designer and director,,,,,,,,,86.0,,United Kingdom of Great Britain and Northern Ireland,,,3.091042,0,0,0,0,0,1,0,0,0,0,0,0,1
1,1,Raymond Crotty,", 68, Irish economist, writer, and academic.",https://en.wikipedia.org/wiki/Raymond_Crotty,12,1994,January,,,,writer,and academic,,,,,,,,68.0,,Ireland,,,2.564949,0,0,0,0,0,0,0,0,1,0,0,0,1


<IPython.core.display.Javascript object>

In [3]:
# Displaying the final 2 rows of the working copy
df.tail(2)

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
98054,9,Aamir Liaquat Hussain,", 50, Pakistani journalist and politician, MNA .",https://en.wikipedia.org/wiki/Aamir_Liaquat_Hussain,99,2022,June,", since",,,MNA,,,,,,,,,50.0,,Pakistan,,"2002 2007, since 2018",4.60517,0,0,0,0,0,1,0,0,1,0,0,0,2
98055,9,Zou Jing,", 86, Chinese engineer, member of the Chinese Academy of Engineering.",https://en.wikipedia.org/wiki/Zou_Jing_(engineer),3,2022,June,,,,member of the Academy of Engineering,,,,,,,,,86.0,,"China, People's Republic of",,,1.386294,1,0,0,0,0,0,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

In [4]:
# Displaying 5 randomly chosen rows (no random_state is passed, so the rows differ between runs)
df.sample(5)

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
88111,2,Kari Rasmussen,", 88, Norwegian actress and singer.",https://en.wikipedia.org/wiki/Kari_Rasmussen,3,2021,March,,,,,,,,,,,,,88.0,,Norway,,,1.386294,0,0,0,0,0,1,0,0,0,0,0,0,1
78843,21,Hank Burnine,", 87, American football player .",https://en.wikipedia.org/wiki/Hank_Burnine,5,2020,January,Philadelphia Eagles,,,,,,,,,,,,87.0,,United States of America,,Philadelphia Eagles,1.791759,0,0,0,0,0,0,1,0,0,0,0,0,1
38997,24,Claude Sumner,", 92, Canadian philosopher.",https://en.wikipedia.org/wiki/Claude_Sumner,3,2012,June,,,,,,,,,,,,,92.0,,Canada,,,1.386294,0,0,0,1,0,0,0,0,0,0,0,0,1
36999,16,Juan Carlos,", 66, Spanish footballer.","https://en.wikipedia.org/wiki/Juan_Carlos_(footballer,_born_1945)",6,2012,January,,,,,,,,,,,,,66.0,,Spain,,,1.94591,0,0,0,0,0,0,1,0,0,0,0,0,1
58384,19,Donald Snelgrove,", 91, British Anglican clergyman, Bishop of Hull .",https://en.wikipedia.org/wiki/Donald_Snelgrove,7,2016,May,,,,Bishop of Hull,,,,,,,,,91.0,,United Kingdom of Great Britain and Northern Ireland,,1981 1994,2.079442,0,0,1,0,0,0,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

### Checking Data Types, Duplicates, and Null Values

In [5]:
# Printing each column's non-null count and dtype
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98056 entries, 0 to 98055
Data columns (total 38 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   day                        98056 non-null  object 
 1   name                       98056 non-null  object 
 2   info                       98056 non-null  object 
 3   link                       98056 non-null  object 
 4   num_references             98056 non-null  int64  
 5   year                       98056 non-null  int64  
 6   month                      98056 non-null  object 
 7   info_parenth               36661 non-null  object 
 8   info_1                     23 non-null     object 
 9   info_2                     98024 non-null  object 
 10  info_3                     48896 non-null  object 
 11  info_4                     10264 non-null  object 
 12  info_5                     1265 non-null   object 
 13  info_6                     181 non-null    obj

<IPython.core.display.Javascript object>

#### Observations:
- With our dataset loaded, we can pick up where we left off with extracting known_for values by rebuilding `known_for_dict`.

### Extracting `known_for` Continued

#### Finding `known_for` Roles in `info_2`

In [6]:
# Listing the distinct info_2 values from least to most frequent, so that
# popping from the end of the list yields the most frequent value first
info_2_counts = df["info_2"].value_counts(ascending=True)
roles_list = info_2_counts.index.tolist()

<IPython.core.display.Javascript object>

In [165]:
# Removing and displaying the last value of roles_list for review.
# The list is mutated, so every re-run of this cell yields the next value.
roles_list.pop()

'nazi'

<IPython.core.display.Javascript object>

In [166]:
# Create specific_roles_list for above popped value: the distinct info_2 values
# that contain the popped text, ordered by descending frequency.
# A single literal (non-regex) substring mask replaces the per-row lookups;
# missing info_2 values are treated as non-matching.
specific_roles_list = (
    df.loc[df["info_2"].str.contains("nazi", regex=False, na=False), "info_2"]
    .value_counts()
    .index.tolist()
)

<IPython.core.display.Javascript object>

In [167]:
# Showing the candidate roles longest-first, the order in which they are copied
# into the category lists below
sorted(specific_roles_list, key=len, reverse=True)

['nazi eugenicist', 'nazi']

<IPython.core.display.Javascript object>

In [169]:
# Example quick-check: showing every row whose info_2 is exactly "nazi"
df.loc[df["info_2"].eq("nazi")]

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
5177,21,Hans Egon Holthusen,", 83, German nazi, writer and academic.",https://en.wikipedia.org/wiki/Hans_Egon_Holthusen,4,1997,January,,,nazi,writer and academic,,,,,,,,,83.0,,Germany,,,1.609438,0,0,0,0,0,0,0,0,0,0,0,0,0
7609,15,Gunter d'Alquen,", 87, German nazi correspondent.",https://en.wikipedia.org/wiki/Gunter_d%27Alquen,6,1998,May,,,nazi,,,,,,,,,,87.0,,Germany,,,1.94591,0,0,0,0,0,1,0,0,0,0,0,0,1
11847,28,Heinrich Schmidt,", 88, German nazi physician.",https://en.wikipedia.org/wiki/Heinrich_Schmidt_(SS_doctor),5,2000,November,,,nazi,,,,,,,,,,88.0,,Germany,,,1.791759,1,0,0,0,0,0,0,0,0,0,0,0,1
12948,18,Karl Friedrich Titho,", 90, German nazi criminal.",https://en.wikipedia.org/wiki/Karl_Friedrich_Titho,5,2001,June,,,nazi,,,,,,,,,,90.0,,Germany,,,1.791759,0,0,0,0,0,0,0,0,0,1,0,0,1


<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [731]:
# Creating lists for each category
# Each list holds the role phrases that flag its category.  The extraction loop
# below tests every phrase with a substring check, in list order, and removes it
# from info_2 on a match, so longer phrases precede the shorter ones they contain.
politics_govt_law = [
    "advocate for raising awareness of epidermolysis bullosa",
    "advocate for the homeless and mentally ill",
    "CDC spokesperson and anti tobacco advocate",
    "advocate for women and transgender rights",
    "atheism and reproductive rights advocate",
    "advocate for the rights of prostitutes",
    "advocate for women and migrant workers",
    "cystic fibrosis assistance advocate",
    "councillor and community advocate",
    "breast cancer awareness advocate",
    "advocate of freedom of the press",
    "migrant workers' rights advocate",
    "and anti death penalty advocate",
    "and mental health care advocate",
    "disabled people rights advocate",
    "advocate for disability rights",
    "and indigenous rights advocate",
    "nuclear arms control advocate",
    "advocate for homeless rights",
    "transgender rights advocate",
    "consumer and women advocate",
    "endangered species advocate",
    "disability rights advocate",
    "prisoners' rights advocate",
    "indigenous rights advocate",
    "renewable energy advocate",
    "Rohingya rights advocate",
    "workers' rights advocate",
    "Tolowa cultural advocate",
    "consumer rights advocate",
    "women equality advocate",
    "social justice advocate",
    "cancer patient advocate",
    # NOTE(review): `social` contains "literacy advocate" and is searched before
    # this list in known_for_dict, so the next entry appears unreachable -- confirm
    "women literacy advocate",
    "migrant worker advocate",
    "patient rights advocate",
    "and Men rights advocate",
    "animal rights advocate",
    "mental health advocate",
    "tribal rights advocate",
    "independence advocate",
    "advocate for the deaf",
    "anti smoking advocate",
    "euthanasia advocate",
    "insurance advocate",
    "advocate for Jews",
    "cycling advocate",
    "health advocate",
    "rights advocate",
    "autism advocate",
    "women advocate",
    "peace advocate",
    "LGBT advocate",
    "advocate",
    "chairwoman of the Nottawaseppi Huron Band of Potawatomi since",
    "first woman Treasurer of the",
    "trans woman pioneer",
    "Warumungu woman",
    "stateswoman",
    "acting Chief Justice of the Supreme Court of",  # before arts (which lists "acting")
    "nazi eugenicist",
    "nazi",
]

# Role phrases flagged as `arts`; within each family (e.g. "... maker",
# "acting ...") the longer phrases come before the shorter ones they contain
arts = [
    "traditional waka builder",
    "theatrical set builder",
    "shoemaker and bootmaker",
    "traditional cheesemaker",
    "guitar equipment maker",
    "waterfowl decoy maker",
    "and furniture maker",
    "MC and track maker",
    "paper doll maker",
    "organ maker and",
    "woodwind maker",
    "recorder maker",
    "bootmaker and",
    "bagpipe maker",
    "guitar maker",
    "cabinetmaker",
    "cheesemaker",
    "print maker",
    "knife maker",
    "movie maker",
    "screenmaker",
    "watchmaker",
    "kite maker",
    "glassmaker",
    "moviemaker",
    "sign maker",
    "shoemaker",
    "toy maker",
    "dollmaker",
    "trans woman and performer",
    "anchorwoman",
    "and acting coach",
    "acting coach",
    "acting",
]

# Role phrases flagged as `sports`
sports = [
    "builder and team owner",
    "founder of Lola Cars",
    "sharpshooter",
    "racewalker",
    "Hall of Fame track and field coach",
    "track and field coach and",
    "track and field coach",
]
# Role phrases flagged as `sciences`
sciences = [
    "Hall of Fame NASCAR engine",
    "race car builder and",
    "yacht and boat builder",
    "framebuilder",
    "frame maker",
    "macrobiotic diet advocate",  # before politics_govt_law (which lists bare "advocate")
    "privacy",
    "primatologist",
    "optometrist",
    "lichenologist",
]

# Role phrases flagged as `business_farming`
business_farming = [
    "Chicago area home builder",
    "watch manufacturer",
    "importer",
    "wine maker",
    "gunmaker",
    "carmaker",
    "vegetarianism advocate",  # before politics_govt_law (which lists bare "advocate")
    "co founder of Nike",
]
# Role phrases flagged as `academia_humanities`
academia_humanities = ["polymath"]
# Role phrases flagged as `law_enf_military_operator`; the "... commander" and
# "field marshal" families run longest-first down to the bare term
law_enf_military_operator = [
    "chief bombmaker of Hamas",
    "chief executioner of Virginia",
    "only woman to serve in the Foreign Legion",
    "servicewoman during WWII",
    "former comfort woman",
    "servicewoman",
    'battalion commander known as "the executioner of Fort Zeelandia"',
    "former commander in chief of the Strategic Air Command",
    "former commander of the Corps of Gendarmerie of City",
    "warlord and commander of Federal Security Service",
    "divisional commander in the Waffen SS during WWII",
    "commander of the Waffen SS during Worls War II",
    "commander of the SAS during WWII",
    "commander of the Liberation Army",
    "commander in the Royal Air Force",
    "Revolutionary Guard commander",
    "former labor camp commander",
    "separatist field commander",
    "Provisional IRA commander",
    "DPR separatist commander",
    "Army brigade commander",
    "rebel field commander",
    "mujahideen commander",
    "air force commander",
    "WWII tank commander",
    "Navy SEAL commander",
    "al Qaeda commander",
    "guerilla commander",
    "militia commander",
    "Taliban commander",
    "U boat commander",
    "rebel commander",
    "field commander",
    "South commander",
    "army commander",
    "Lehi commander",
    "ISIL commander",
    "PIRA commander",
    "FARC commander",
    "IRA commander",
    "SAS commander",
    "commander",
    "field marshal and chief of Army Staff",
    "Army field marshal",
    "field marshal",
    "firefighter",
    "Māori navigator and",
]
# Role phrases flagged as `spiritual`, `social`, and `crime`
spiritual = ["occultist  practitioner", "occultist", "occult", "spiritualist"]
social = [
    "youth movement founder",
    "literacy advocate",  # before politics_govt_law (which lists bare "advocate")
]
crime = [
    "concentration camp",
]
# Role phrases flagged as `event_record_other`
event_record_other = [
    "homemaker",
    "domestic abuse survivor",
    "Guinness World Record holder for heaviest woman",
    "woman whose adoption led to ethnic riots in in",
    "brain dead woman kept alive to give birth",
    "woman believed to be world oldest mother",
    "woman with rare slow aging condition",
    "heaviest woman in the world",
    "tallest woman in the world",
    "transgender woman",
    "tattooed woman",
    "mauled woman",
]
# No `other_species` phrases in this iteration
other_species = []

<IPython.core.display.Javascript object>

In [None]:
# # Example code to quickly sort list in correct descending length search order to copy to dictionary
# temp = sorted(list(set(law_enf_military_operator)), key=lambda x: len(x), reverse=True)
# temp

In [66]:
# Hard-coding info_2 for individual entries, identified by their link, so that
# they are categorized as intended
info_2_overrides = {
    # to categorize as business_farming
    "https://en.wikipedia.org/wiki/Harry_Henshel": "watch manufacturer",
    # to categorize as crime
    "https://en.wikipedia.org/wiki/Venero_Mangano": "mafia",
}
for link, value in info_2_overrides.items():
    index = df[df["link"] == link].index
    df.loc[index, "info_2"] = value

<IPython.core.display.Javascript object>

In [171]:
# Removing the entry whose link targets an event page rather than an
# individual's page, then renumbering the index from 0
event_link = "https://en.wikipedia.org/wiki/Death_of_Paolo_Gislimberti"
index = df.index[df["link"] == event_link]
df.drop(index=index, inplace=True)
df.reset_index(drop=True, inplace=True)

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [172]:
# Combining separate lists into one dictionary
# Key order is the search order: the extraction loop walks the categories in
# this order and strips each matched role from info_2, so a category listed
# earlier claims a phrase before any later category is checked (this is what the
# "before ..." notes on individual list entries rely on).
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "sports": sports,
    "politics_govt_law": politics_govt_law,
    "arts": arts,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [173]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['num_categories']!=0].sample(2)

CPU times: total: 1min 34s
Wall time: 1min 34s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
50810,23,Pat Quinn,", 71, Canadian ice hockey coach and executive .",https://en.wikipedia.org/wiki/Pat_Quinn_(ice_hockey),39,2014,November,"Philadelphia Flyers, Toronto Maple Leafs, Vancouver Canucks",,,,,,,,,,,,71.0,,Canada,,"Philadelphia Flyers, Toronto Maple Leafs, Vancouver Canucks",3.688879,0,0,0,0,0,0,1,0,0,0,0,0,1
93183,5,Zoran Stanković,", 66, Serbian military officer and politician, minister of defence , COVID-19.",https://en.wikipedia.org/wiki/Zoran_Stankovi%C4%87,4,2021,October,and health,,,minister of defence,COVID,,,,,,,,66.0,,Serbia,,2005 2007 and health 2011 2012,1.609438,0,0,0,0,0,0,0,1,1,0,0,0,2


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [174]:
# Counting the entries that still have no known_for category
n_uncategorized = (df["num_categories"] == 0).sum()
print(f"There are {n_uncategorized} entries without any known_for category.")

There are 2537 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [414]:
# # Obtaining values for column and their counts
# roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [413]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [412]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_2"].notna()].index
#             if "ace" in df.loc[index, "info_2"]
#         ],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [411]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_list, key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [415]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "race"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [409]:
# Creating lists for each category
# Each list holds the role phrases that flag its category.  The extraction loop
# below tests every phrase with a substring check, in list order, and removes it
# from info_2 on a match, so longer phrases precede the shorter ones they contain.
politics_govt_law = [
    "politologist",
    "banking regulator",
    "econo",
    "elder",
    "social reformer",
    "and member of n State Duma",
    "claimant to headship of the House of Bourbon Two Sicilies",
    "claimant to the headship of the House of Romanov",
    "labor mediator",
    "plaintiff in landmark 'palimony' lawsuit",
    "international law expert",
    "born constitutional law",
    "constitutional law",
    "international law",
    "Labour Member of Parliament for Dunfermline and West Fife",
    "Minister of Labour",
    "Labour",
    "public",
    "Economics",
    "co founder of Greenpeace",
    "peace",
]

# Role phrases flagged as `arts`; the "... performer", "... flutist" and
# "news..." families each run from the longer phrases down to the bare term
arts = [
    "antique dealer and ologist",
    "horologist",
    "variety performer with his brother Bob as half of Bob and Alf Pearson",
    "hustler and nightclub performer",
    "San Francisco street performer",
    "circus performer and owner",
    "Villu Paatu performer",
    "vaudeville performer",
    "burlesque performer",
    "gamelan performer",
    "circus performer",
    "kabuki performer",
    "Kunqu performer",
    "drag performer",
    "performer",
    "website pioneer of ethnic media in",
    "media proprietor",
    "Cheyenne flutist",
    "concert flutist",
    "flutist",
    "newspaper proprietor",
    "newsreel narrator",
    "newspaper founder",
    "newspaper website",
    "newspaper pioneer",
    "newspaper owner",
    "news cameraman",
    "and newsreader",
    "newspaperman",
    "newsreader",
    "newspaper",
]
# Role phrases flagged as `sports`
sports = [
    "figure skating competitor",
    "and figure skating coach",
    "figure skating trainer",
    "figure skating coach",
    "ice skating trainer",
    "speed skating coach",
    "race caller",
]
# Role phrases flagged as `sciences`; compound "...ologist and ..." phrases come
# before the single terms they contain
sciences = [
    "jet engine pioneer",
    "conchologist and malacologist",
    "taxonomist and carcinologist",
    "pulmonologist and internist",
    "cytologist and phycologist",
    "paediatric rheumatologist",
    "echinodermologist and",
    # NOTE(review): business_farming lists bare "vintner" and is searched before
    # sciences in known_for_dict, so the next entry appears unreachable -- confirm
    "vintner and",
    "pomologist and apple",
    "dendrochronologist",
    "seed technologist",
    "food technologist",
    "nanotechnologist",
    "biogerontologist",
    "anesthesiologist",
    "geomorphologist",
    "biotechnologist",
    "helminthologist",
    "phenomenologist",
    "parasitologist",
    "rheumatologist",
    "ophthamologist",
    "gerontologist",
    "pulmonologist",
    "carcinologist",
    "orchidologist",
    "methodologist",
    "suicidologist",
    "malacologist",
    "nematologist",
    "hepatologist",
    "technologist",
    # NOTE(review): "polemologist" is also in academia_humanities, which is
    # searched before sciences, so this copy appears unreachable -- confirm
    "polemologist",
    "dendrologist",
    "speleologist",
    "limnologist",
    "hydrologist",
    "cosmologist",
    "phycologist",
    "metrologist",
    "leprologist",
    "andrologist",
    "petrologist",
    "audiologist",
    "sexologist",
    "ethologist",
    "oenologist",
    "pedologist",
    "metrician",
    "obstetrician",
    "futurist",
    "software developer",
    "opinion pollster",
    "pollster",
    "physical therapist",
    "family therapist",
    "physiotherapist",
    "sex therapist",
    "therapist",
    "spacesuit technician",
    "space",
]

# Role phrases flagged as `business_farming`
business_farming = [
    "bookmaker",
    "vineyard owner and vintner",
    "Napa Valley vintner",
    "vintner",
    "duty free retailer",
    "clothing retailer",
    "luxury retailer",
    "retailer",
    "venture capitalist and",
    "venture capitalist",
    "shipowner",
    "pharmaceutical company founder",
    "pharmaceutical",
]
# Role phrases flagged as `academia_humanities`; matching is case-sensitive, so
# capitalized and lowercase spellings (e.g. "Assyriologist"/"assyriologist")
# are listed separately
academia_humanities = [
    "museum specialist",
    "Slavist and Balkanologist",
    "Kremlinologist",
    "vexillologist",
    "theatrologist",
    "Assyriologist",
    "Hittitologist",
    "assyriologist",
    "hittitologist",
    "Tibetologist",
    "Albanologist",
    "egyptologist",
    "Japanologist",
    "papyrologist",
    "polemologist",
    "Buddhologist",
    "sumerologist",
    "tibetologist",
    "lexicologist",
    "Iranologist",
    "museologist",
    "patrologist",
    "mythologist",
    "semiologist",
    "Turkologist",
    "anthologist",
    "indologist",
    "logologist",
    "genealogist and heraldist",
    "heraldist",
    "Arabist",
    "mediaevalist",
    "Wikimedian",
    "phonetician",
]
# Role phrases flagged as `law_enf_military_operator`; the "Vice Admiral",
# "member of the Resistance" and "... ace" families run longest-first
law_enf_military_operator = [
    "cryptologist",
    "operative",
    "Ulster loyalist and UVF member",
    "Ulster loyalist",
    "loyalist",
    "warlord",
    "retired Navy Vice Admiral",
    "Navy Vice Admiral",
    "Vice Admiral",
    "brother in law of Osama bin Laden",
    "law enforcement official",
    "outlaw biker",
    "lawman",
    "member of the Resistance during WW II",
    "member of the Resistance during WWII",
    "a member of the Resistance",
    "member of the Resistance",
    "anti Castro mercenary",
    "mercenary",
    "bodyguard",
    "last surviving World War I fighter ace",
    "fighter ace during World War ||",
    "Air Force major and flying ace",
    "flying ace during World War I",
    "Army Air Forces fighter ace",
    "Army Air Force fighter ace",
    "Army Air Forces flying ace",
    "flying ace of the War",
    "Air Force flying ace",
    "WWII fighter ace",
    "WWII flying ace",
    "War flying ace",
    "Air Force ace",
    "fighter ace",
    "flying ace",
]
# Role phrases flagged as `spiritual`
spiritual = [
    "demonologist",
    "spiritual guardian of Mount Merapi",
    "faith healer",
    "spiritual guru",
    "Grand Ayatollah",
]
# No `social` or `crime` phrases in this iteration
social = []
crime = []
# Role phrases flagged as `event_record_other`
event_record_other = [
    "woman",
    "claimant to the title of world oldest person",
    "unverified claimant for world oldest person",
    "alien abduction claimant",
    "longevity claimant",
    "girl whose severe swimming pool injury led to tougher laws",
    "lawn mower traveler",
    "airplane passenger fatally shot by Air Marshals after allegedly claiming he had placed a bomb aboard",
    "face transplant recipient",
]
# Role phrases flagged as `other_species`; matching is case-sensitive, hence
# both "Thoroughbred" and "thoroughbred" spellings
other_species = [
    "Hindu sacred bull",
    "oldest living polar bear",
    "grizzly bear",
    "polar bear",
    "tortoise claimant for world oldest animal",
    "Thoroughbred race horse and sire",
    "Thoroughbred race horse",
    "thoroughbred race horse",
    "race horse",
]

<IPython.core.display.Javascript object>

In [None]:
# # Example code to quickly sort list in correct descending length search order to copy to dictionary
# temp = sorted(list(set(law_enf_military_operator)), key=lambda x: len(x), reverse=True)
# temp

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [416]:
# Combining separate lists into one dictionary
# Key order is the search order: the extraction loop walks the categories in
# this order and strips each matched role from info_2, so a category listed
# earlier claims a phrase before any later category is checked.
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [417]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['num_categories']!=0].sample(2)

CPU times: total: 2min 32s
Wall time: 2min 32s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
89387,20,M. Narasimham,", 93, Indian economist, governor of the Reserve Bank of India .",https://en.wikipedia.org/wiki/M._Narasimham,9,2021,April,,,,governor of the Reserve Bank of,,,,,,,,,93.0,,India,,1977.0,2.302585,0,0,0,0,0,0,0,0,1,0,0,0,1
29784,19,David Nokes,", 61, British scholar.",https://en.wikipedia.org/wiki/David_Nokes,3,2009,November,,,,,,,,,,,,,61.0,,United Kingdom of Great Britain and Northern Ireland,,,1.386294,0,0,0,1,0,0,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [418]:
# Counting the entries that still have no known_for category
n_uncategorized = (df["num_categories"] == 0).sum()
print(f"There are {n_uncategorized} entries without any known_for category.")

There are 2225 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [811]:
# # Obtaining values for column and their counts
# roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [810]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [809]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_2"].notna()].index
#             if "car dealer" in df.loc[index, "info_2"]
#         ],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [808]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_list, key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [807]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "milliner"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [806]:
# Creating lists for each category
# Each list holds the role phrases that flag its category.  Presumably these
# feed the same substring-match-and-strip loop used in the earlier iterations
# (not visible here), which is why longer phrases precede the shorter ones
# they contain.
politics_govt_law = [
    "regulator",
    "law",
    "constitutional",
    "hacker",
    "former press secretary for Richard Nixon during the Watergate Scandal",
    "secretary and mistress of Reichsführer SS Heinrich Himmler",
    "secretary who took Adolf Hitler last will and testament",
    "national security adviser and former foreign secretary",
    "Special Assistant and secretary to John F Kennedy",
    "private secretary to Queen Elizabeth II",
    "first secretary of the Leningrad obkom",
    "secretary of foreign affairs",
    "press secretary",
    "heir to the Sarawakan throne",
    "colonial official",
    "colonial governor",
    "anti colonialist",
    "colonial",
    "Trotskyist and one of the founders of the Revolutionary Socialist League",
    "Trotskyist",
    "international relations",
    "Foreign Service Officer and ambassador to and",
    "ambassador to the United Nations",
    "United Nations ambassador",
    "former ambassador to",
    "Arabian ambassador",
    "ambassador to the",
    "and ambassador",
    "ambassador to",
    "ambassador",
    "detainee in Guantanamo Bay Detention Camp",
    "Arabian Guantanamo Bay detainee",
    "former Guantanamo Bay detainee",
    "ex detainee",
    "detainee",
    "Liberal Democrat Member of Parliament",
    "Conservative Member of Parliament",
    "Member of Parliament",
]

# Role phrases for the `arts` list; within each family ("fashion ...",
# "ballet ...", "... weaver", "... impresario", "... star") the longer phrases
# come before the shorter ones they contain
arts = [
    "maker",
    "organ builder",
    "yurt builder",
    "glass engraver",
    "media",
    "opera",
    "fashion house owner",
    "fashion promoter",
    "fashion stylist",
    "fashion pioneer",
    "fashion",
    "ballet master and",
    "ballet master",
    "and ballet",
    "ballet",
    "Māori muralist",
    "muralist",
    "pioneer in visual effects and computer animation",
    "virtuoso harmonica player",
    "harmonica player",
    "SeaWorld trainer",
    "animal trainer",
    "litterateur",
    "pornographer",
    "R&B",
    "textile weaver and dyer",
    "Māori master weaver",
    "tapestry weaver",
    "Tlingit weaver",
    "carpet weaver",
    "basketweaver",
    "weaver",
    "punk impresario and club owner",
    "theatrical impresario",
    "nightclub impresario",
    "nightlife impresario",
    "cabaret impresario",
    "impresario",
    "wildlife cameraman",
    "Reuters cameraman",
    "cameraman",
    "sarod player",
    "hairstylist",
    "classic Broadway star and father of Bonnie Raitt",
    "action movie star",
    "reality TV star",
    "movie star",
    "co star of",
    "porn star",
    "bassoonist",
    "screenwriting guru",
    "screen",
    "milliner",
]
# Role phrases that flag the `sports` category when found as a substring of `info_2`.
# Order matters: the extraction loop tests the phrases in list order and strips each
# match from the text, so a longer phrase must come before any shorter phrase it
# contains (e.g. "National Football League player" before "Football League player").
# Each phrase is listed once: a repeated phrase can never match again after its
# first occurrence has been stripped from the text.
sports = [
    "strength coach",
    "Muay fighter",
    "MMA fighter",
    "Muay master and trainer",
    "harness racing trainer",
    "Thoroughbred trainer",
    "greyhound trainer",
    "fitness trainer",
    "fencing trainer",
    "rowing coxswain",
    "rowing champion",
    "former National Football League player",
    "ex National Football League player",
    "National Football League player",
    "ski mogul",
    "former Football League player and NHL referee",
    "Football League player",
    "professional basketball referee",
    "basketball referee and coach",
    "basketball referee",
    "Hall of Fame softball player",
    "softball player",
    "motor racing team owner and constructor",
    "drag racing and hot rod pioneer",
    "motor racing team owner",
    "auto racing team owner",
    "car racing team owner",
    "auto racing pioneer",
    "car racing promoter",
    "racing team owner",
    "racing crew chief",
    "mountain biker",
    "mountain guide",
    "basketball star and innovator",
    "Hall of Fame rodeo cowboy",
    "rodeo cowboy",
    "rodeo rider",
    "rodeo",
]
# Role phrases that flag the `sciences` category when found as a substring of `info_2`.
# Phrases are tested in list order and each match is stripped from the text, so a
# longer phrase precedes any shorter phrase it contains ("quantum physics" before "physics").
sciences = [
    "car builder",
    "and enologist",
    "anatomist",
    "Internet software pioneer and computer prodigy",
    "computer expert",
    "quantum physics",
    "physics",
    "elephant expert",
    "CERN secretary",
    "herbalist",
    "Scientist and Medical Researcher",
    "mathematical",
    "game developer",
    "clinician",
    "Nurse Corps",
    "hospital corpsman",
    "genealogist",
]

# Role phrases that flag the `business_farming` category when found as a substring of `info_2`.
# Phrases are tested in list order and each match is stripped from the text, so a
# longer phrase precedes any shorter phrase it contains ("whisky distiller" before "distiller").
business_farming = [
    "oilman",
    "real estate tycoon",
    "timber tycoon",
    "tire tycoon",
    "cattle baron",
    "real estate mogul",
    "automotive dealer",
    "insurance broker",
    "clothier and",
    "clothier",
    "whisky distiller",
    "drink distiller",
    "distiller",
    "car dealer",
]
# Role phrases that flag the `academia_humanities` category when found as a substring of `info_2`.
# "bibliographer" is listed here rather than under law_enf_military_operator, where it
# had been filed among the Navy/intelligence phrases.
academia_humanities = [
    "heraldry",
    "Esperantist",
    "Germanist",
    "Hellenist",
    "ethnographer",
    "bibliographer",
]
# Role phrases that flag the `law_enf_military_operator` category when found as a substring of `info_2`.
# Order matters: the extraction loop tests the phrases in list order and strips each
# match from the text, so a longer phrase must come before any shorter phrase it
# contains (e.g. "Navy SEAL sniper" before "Navy SEAL" before "Navy").
law_enf_military_operator = [
    "East border guard",
    "Waffen SS  guard",
    "security guard",
    "guard",
    "security detective",
    "founder of Al Qaeda",
    "Al Qaeda",
    "Field Marshal",
    "Tzotzil Zapatista rebel",
    "rebel",
    "al Qaeda fighter in Chechnya",
    "oil well fire fighter",
    "independence fighter",
    "WW Special Forces saboteur and trainer of Violette Szabó",
    "resistance trainer",
    "ISIS trainer",
    "sergeant and recipient of the Victoria Cross",
    "war hero and recipient of the Victoria Cross",
    "Gurkha and recipient of the Victoria Cross",
    "Pashtun recipient of the Victoria Cross",
    "recipient of the Victoria Cross",
    "Navy chief cryptologic technician",
    "Navy Commander in Chief",
    "Secretary of the Navy",
    "Director of the Navy",
    "Navy Master Diver",
    "Navy SEAL sniper",
    "head of the Navy",
    "Navy captain",
    "Navy SEAL",
    "Navy",
    "intelligence official who was Deputy Director of the Central Intelligence Agency",
    "intelligence official in the Central Intelligence Agency",
    "former head of East y secret intelligence service",
    "army Green Beret and intelligence sergeant major",
    "intelligence chief of the National Authority",
    "Chief of intelligence of South",
    "head of intelligence services",
    "imagery intelligence analyst",
    "intelligence specialist",
    "intelligence official",
    "intelligence analyst",
    "intelligence chief",
    "Air Force navigator",
    "flight navigator",
    "navigator",
    "marshal of the air force",
    "Air Force marshal",
    "air vice marshal",
    "vice marshal",
    "marshal",
]
# Role phrases that flag the `spiritual` category when found as a substring of `info_2`.
# Phrases are tested in list order and each match is stripped from the text, so a
# longer phrase precedes any shorter phrase it contains ("Hindu reformer" before "Hindu").
spiritual = [
    "Hindu reformer",
    "Hindu guru",
    "Hinduism",
    "Hindu",
    "fortune teller made famous in Bruce Springsteen",
    "fortune teller",
    "ecclesiastical",
    "grand ayatollah",
    "Shi'i ayatollah",
    "ayatollah",
    "and a founding father of the New Age movement",
    "yogi and guru",
    "yogi",
    "Chabad Lubavitch Rabbi and secretary to Menachem Schneerson",
    "Lakota medicine man",
    "Twelver Marja'",
    "Marja'",
    "New Testament",
]
# No `social` role phrases in this iteration; the empty list keeps the key present in known_for_dict
social = []
# Role phrases that flag the `crime` category when found as a substring of `info_2`;
# the longer phrases come first so "crime figure" only catches what is left over
crime = [
    "failed suicide bomber",
    "organized crime figure in Detroit",
    "organized crime figure",
    "organised crime figure",
    "crime figure",
]
# Role phrases that flag the `event_record_other` category
event_record_other = ["homeless man"]
# Role phrases that flag the `other_species` category when found as a substring of `info_2`.
# Phrases are tested in list order and each match is stripped from the text, so a
# longer phrase precedes any shorter phrase it contains ("Humboldt penguin" before "penguin").
other_species = [
    "Humboldt penguin",
    "penguin",
    "foaled thoroughbred horse",
    "thoroughbred horse",
    "Standardbred harness racing stallion",
    "breeding stallion and racing trotter",
    "barrel racing horse",
    "racing thoroughbred",
    "dressage horse and sire",
    "dressage horse",
    "canine star of sit com Eddie",
    "Great Dane",
]

<IPython.core.display.Javascript object>

In [None]:
# # Example code to quickly sort list in correct descending length search order to copy to dictionary
# temp = sorted(list(set(law_enf_military_operator)), key=lambda x: len(x), reverse=True)
# temp

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [812]:
# Mapping each category column name to its list of role phrases.
# Insertion order is the order in which the extraction loop searches the categories.
known_for_dict = dict(
    social=social,
    spiritual=spiritual,
    academia_humanities=academia_humanities,
    business_farming=business_farming,
    sciences=sciences,
    politics_govt_law=politics_govt_law,
    law_enf_military_operator=law_enf_military_operator,
    crime=crime,
    event_record_other=event_record_other,
    other_species=other_species,
    arts=arts,
    sports=sports,
)

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [None]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = "info_2"

# Dataframe: only the rows that have a value in the column
# (previously the boolean notna() mask itself was assigned, so its index covered every row)
dataframe = df[df[column].notna()]

# Finding each role in the column and extracting it as a category.
# Roles are applied strictly in dictionary/list order, and every match is removed from
# the text before the next role is tested, so later (shorter) roles only see the remainder.
for category, category_lst in search_dict.items():
    for role in category_lst:
        # Current text of the non-missing rows (re-read each time: earlier roles edit it)
        text = df.loc[dataframe.index, column]

        # Literal substring test -- role phrases are plain text, not regex patterns
        has_role = text.str.contains(role, regex=False, na=False)
        matched = has_role[has_role].index
        if matched.empty:
            continue

        df.loc[matched, category] = 1
        df.loc[matched, column] = (
            text[matched].str.replace(role, "", regex=False).str.strip()
        )

# Updating num_categories
df["num_categories"] = df[list(known_for_dict)].sum(axis=1)

# Checking a sample of rows
df[df["num_categories"] != 0].sample(2)

#### Checking the Number of Rows without a First Category

In [None]:
# Counting the rows whose num_categories is still zero
print(
    f'There are {df["num_categories"].eq(0).sum()} entries without any known_for category.'
)

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

In [419]:
# Printing a completion marker
print("dunzo!")

# Sound notification when cell executes
chime.success()

dunzo!


<IPython.core.display.Javascript object>

#### Finding `known_for` Roles in `info_2`

In [None]:
# Obtaining the distinct values remaining in the column, ordered from least to most
# frequent (only the values are kept, not their counts), so that pop() in the next
# cell returns the most frequent remaining value
roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

In [None]:
# Code to check each value: removes and displays the last item of roles_list.
# Every run of this cell consumes one more value from the list.
roles_list.pop()

In [None]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_2"].notna()].index
#             if "general" in df.loc[index, "info_2"]
#         ],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

In [None]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_list, key=lambda x: len(x), reverse=True)

In [None]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "outlaw country music singer songwriter"]

#### Creating Lists for Each `known_for` Category

In [None]:
# Resetting every category list to empty -- the template that is filled in with role
# phrases when known_for_dict is rebuilt for the next iteration
politics_govt_law = []
arts = []
sports = []
sciences = []
business_farming = []
academia_humanities = []
law_enf_military_operator = []
spiritual = []
social = []
crime = []
event_record_other = []
other_species = []

In [None]:
# # Example code to quickly sort list in correct descending length search order to copy to dictionary
# temp = sorted(list(set(law_enf_military_operator)), key=lambda x: len(x), reverse=True)
# temp

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [None]:
# Mapping each category column name to its list of role phrases.
# Insertion order is the order in which the extraction loop searches the categories.
known_for_dict = dict(
    social=social,
    spiritual=spiritual,
    academia_humanities=academia_humanities,
    business_farming=business_farming,
    sciences=sciences,
    politics_govt_law=politics_govt_law,
    law_enf_military_operator=law_enf_military_operator,
    crime=crime,
    event_record_other=event_record_other,
    other_species=other_species,
    arts=arts,
    sports=sports,
)

#### Extracting Category from `info_2`

In [None]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = "info_2"

# Dataframe: only the rows that have a value in the column
# (previously the boolean notna() mask itself was assigned, so its index covered every row)
dataframe = df[df[column].notna()]

# Finding each role in the column and extracting it as a category.
# Roles are applied strictly in dictionary/list order, and every match is removed from
# the text before the next role is tested, so later (shorter) roles only see the remainder.
for category, category_lst in search_dict.items():
    for role in category_lst:
        # Current text of the non-missing rows (re-read each time: earlier roles edit it)
        text = df.loc[dataframe.index, column]

        # Literal substring test -- role phrases are plain text, not regex patterns
        has_role = text.str.contains(role, regex=False, na=False)
        matched = has_role[has_role].index
        if matched.empty:
            continue

        df.loc[matched, category] = 1
        df.loc[matched, column] = (
            text[matched].str.replace(role, "", regex=False).str.strip()
        )

# Updating num_categories
df["num_categories"] = df[list(known_for_dict)].sum(axis=1)

# Checking a sample of rows
df[df["num_categories"] != 0].sample(2)

#### Checking the Number of Rows without a First Category

In [None]:
# Counting the rows whose num_categories is still zero
print(
    f'There are {df["num_categories"].eq(0).sum()} entries without any known_for category.'
)

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Observations:
- It is time to export our dataframe and start a new notebook.

### Exporting Dataset to SQLite Database [wp_life_expect_clean13.db]()

In [None]:
# # Exporting dataframe

# # Saving dataset in a SQLite database
# conn = sql.connect("wp_life_expect_clean13.db")
# df.to_sql("wp_life_expect_clean13", conn, index=False)

# # Chime notification when cell executes
# chime.success()

# [Proceed to Data Cleaning Part 14]()