# Wikipedia Notable Life Expectancies

# [Notebook 5 of : Data Cleaning](https://github.com/teresahanak/wikipedia-life-expectancy/blob/main/wp_life_expect_data_clean4_thanak_2022_06_23.ipynb)

## Context

The


## Objective

The

### Data Dictionary

- Feature: Description

## Importing Necessary Libraries

In [1]:
# To structure code automatically
%load_ext nb_black

# To import/export sqlite databases
import sqlite3 as sql

# To save/open python objects in pickle file
import pickle

# To help with reading, cleaning, and manipulating data
import pandas as pd
import numpy as np
import re

# To define maximum number of columns to be displayed in a dataframe
pd.set_option("display.max_columns", None)
# To define the maximum number of rows to be displayed in a dataframe
pd.set_option("display.max_rows", 200)

# To supress warnings
# import warnings

# warnings.filterwarnings("ignore")

# To set some visualization attributes
pd.set_option("max_colwidth", 150)

# To play auditory cue when cell has executed, has warning, or has error and set chime theme
import chime

chime.theme("zelda")

<IPython.core.display.Javascript object>

## Data Overview

### Reading, Sampling, and Checking Data Shape

In [2]:
# Reading the dataset
conn = sql.connect("wp_life_expect_clean3.db")
data = pd.read_sql("SELECT * FROM wp_life_expect_clean3", conn)

# Making a working copy
df = data.copy()

# Checking the shape
print(f"There are {df.shape[0]} rows and {df.shape[1]} columns.")

# Checking first 2 rows of the data
df.head(2)

There are 132652 rows and 23 columns.


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2
0,1,William Chappell,", 86, British dancer, ballet designer and director.",https://en.wikipedia.org/wiki/William_Chappell_(dancer),21,1994,January,,,dancer,ballet designer director,,,,,,,,,86.0,,United Kingdom of Great Britain and Northern Ireland,
1,1,Raymond Crotty,", 68, Irish economist, writer, and academic.",https://en.wikipedia.org/wiki/Raymond_Crotty,12,1994,January,,,economist,writer,and academic,,,,,,,,68.0,,Ireland,


<IPython.core.display.Javascript object>

In [3]:
# Checking last 2 rows of the data
df.tail(2)

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2
132650,9,Oleg Moliboga,", 69, Russian volleyball player, Olympic champion and coach.",https://en.wikipedia.org/wiki/Oleg_Moliboga,2,2022,June,1980.0,,volleyball player,Olympic champion coach,,,,,,,,,69.0,,Russia,
132651,9,Zou Jing,", 86, Chinese engineer, member of the Chinese Academy of Engineering.",https://en.wikipedia.org/wiki/Zou_Jing_(engineer),3,2022,June,,,engineer,member of the Chinese Academy of Engineering,,,,,,,,,86.0,,"China, People's Republic of",


<IPython.core.display.Javascript object>

In [4]:
# Checking a sample of the data
df.sample(5)

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2
25280,3,Joe Carr,", 82, Irish golfer.",https://en.wikipedia.org/wiki/Joe_Carr,22,2004,June,,,golfer,,,,,,,,,,82.0,,Ireland,
128159,17,Frank Mula,", 71, American television writer and producer .",https://en.wikipedia.org/wiki/Frank_Mula,4,2021,December,"2000, 2001, , Emmy winner ,",,television writer producer,,,,,,,,,,71.0,,United States of America,
116855,1,Todd Gibson,", 83, American racing driver.",https://en.wikipedia.org/wiki/Todd_Gibson,2,2020,December,,,racing driver,,,,,,,,,,83.0,,United States of America,
14016,9,Helen Rollason,", 43, British sports journalist and television presenter, colorectal cancer.",https://en.wikipedia.org/wiki/Helen_Rollason,51,1999,August,,,sports journalist television presenter,colorectal cancer,,,,,,,,,43.0,,United Kingdom of Great Britain and Northern Ireland,
117878,2,Neelamperoor Madhusoodanan Nair,", 84, Indian poet.",https://en.wikipedia.org/wiki/Neelamperoor_Madhusoodanan_Nair,2,2021,January,,,poet,,,,,,,,,,84.0,,India,


<IPython.core.display.Javascript object>

### Checking Data Types, Duplicates, and Null Values

In [5]:
# Checking data types and null values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 132652 entries, 0 to 132651
Data columns (total 23 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   day             132652 non-null  object 
 1   name            132652 non-null  object 
 2   info            132652 non-null  object 
 3   link            132652 non-null  object 
 4   num_references  132652 non-null  object 
 5   year            132652 non-null  int64  
 6   month           132652 non-null  object 
 7   info_parenth    49830 non-null   object 
 8   info_1          35 non-null      object 
 9   info_2          132604 non-null  object 
 10  info_3          62571 non-null   object 
 11  info_4          12605 non-null   object 
 12  info_5          1497 non-null    object 
 13  info_6          216 non-null     object 
 14  info_7          31 non-null      object 
 15  info_8          6 non-null       object 
 16  info_9          1 non-null       object 
 17  info_10   

<IPython.core.display.Javascript object>

#### Loading `nation_map` from Pickle File to Dictionary nation_map

In [6]:
# Load the nation_map
with open("nation_map.pkl", "rb") as f:
    nation_map = pickle.load(f)

<IPython.core.display.Javascript object>

#### Loading `other_species` list from other_species.csv

In [7]:
# Loading other_species list
other_species_df = pd.read_csv("other_species.csv")
other_species = other_species_df["species"].tolist()
other_species = list(
    set(other_species)
)  # To avoid searching duplicates as we have been adding back to same csv file

<IPython.core.display.Javascript object>

#### Observations:
- With our dataframe, `nation_map`, and `other_species` list loaded, we can proceed to extracting the other features.
- First, we will clean up the divided `info` columns by removing any remaining digits and nationality and country values.
- We will use the same functions from previous notebooks.

#### Function to Save Indices of Rows Matching Regular Expressions Pattern to a List and Print Number of Rows with Match 

In [8]:
# Define a function that takes dataframe, column name, and re pattern as arguments and returns list of indices
# for which column value matches re pattern
def rows_with_pattern(dataframe, column, pattern):
    """
    Takes input of dataframe, column name, and re pattern 
    and returns list of indices for rows that contain match
    for pattern anywhere within value for given column.
    
    dataframe: dataframe
    column: column name
    pattern: re pattern
    """
    index_list = []

    for i in dataframe.index:
        item = dataframe.loc[i, column]
        match = re.search(pattern, item)
        if match:
            index_list.append(i)
    print(
        f"There are {len(index_list)} rows with matching pattern in column '{column}'."
    )
    return index_list

<IPython.core.display.Javascript object>

#### Function to Use rows_with_pattern Function for Multiple Regular Expression Patterns

In [9]:
# Define a function that calls rows_with_pattern function for multiple re patterns
# returning a single list of indices for all rows with any pattern match


def multiple_patterns(dataframe, column, patterns):
    """
    Takes input dataframe, column, and list of re patterns and returns single list 
    of indices for rows in which a match for any pattern is found with re.search
    
    dataframe: dataframe
    column: column name
    patterns: list of re patterns
    """
    rows_combined = []

    # For loop to check each pattern
    for pattern in patterns:

        # List and number of rows matching each pattern
        print(pattern)
        rows_to_check = rows_with_pattern(dataframe, column, pattern)
        print("")

        # Add list for each pattern to combined list
        rows_combined += rows_to_check

    return rows_combined

<IPython.core.display.Javascript object>

### Removing Remaining Digits and Nationality/Country Values from Divided `info` Columns

#### List of Columns to Treat

In [10]:
# List of columns to treat
cols_lst = [
    "info_1",
    "info_2",
    "info_3",
    "info_4",
    "info_5",
    "info_6",
    "info_7",
    "info_8",
    "info_9",
    "info_10",
    "info_11",
    "info_parenth",
]

<IPython.core.display.Javascript object>

#### Removing Digits

In [11]:
# Regular expression for parenthesis and its contents
pattern = r"\d"

# For loop to find indices of rows that have pattern
rows_combined = []
for column in cols_lst:
    dataframe = df[df[column].notna()]
    rows_to_check = rows_with_pattern(dataframe, column, pattern)
    rows_combined += rows_to_check

# Checking a sample of rows
df.loc[rows_combined, :].sample(2)

There are 0 rows with matching pattern in column 'info_1'.
There are 442 rows with matching pattern in column 'info_2'.
There are 2252 rows with matching pattern in column 'info_3'.
There are 1060 rows with matching pattern in column 'info_4'.
There are 69 rows with matching pattern in column 'info_5'.
There are 5 rows with matching pattern in column 'info_6'.
There are 0 rows with matching pattern in column 'info_7'.
There are 0 rows with matching pattern in column 'info_8'.
There are 0 rows with matching pattern in column 'info_9'.
There are 0 rows with matching pattern in column 'info_10'.
There are 0 rows with matching pattern in column 'info_11'.
There are 24403 rows with matching pattern in column 'info_parenth'.


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2
111795,13,Patrick Simon,", 64, French politician, Mayor of Villers-Bretonneux , advocate of Australia–France relations, COVID-19.",https://en.wikipedia.org/wiki/Patrick_Simon_(politician),6,2020,May,since 2008,,politician,Mayor of Villers Bretonneux,advocate of Australia France relations,COVID 19,,,,,,,64.0,,France,
44463,7,J. Bruce Llewellyn,", 82, American businessman and activist, a founder of 100 Black Men of America, renal failure.",https://en.wikipedia.org/wiki/J._Bruce_Llewellyn,6,2010,April,,,businessman activist,a founder of 100 Black Men of America,renal failure,,,,,,,,82.0,,United States of America,


<IPython.core.display.Javascript object>

In [12]:
# For loop to extract digits
for column in cols_lst:
    for index in set(rows_combined):
        item = df.loc[index, column]
        if item:
            match = re.search(pattern, item)
            if match:
                df.loc[index, column] = re.sub(pattern, "", item)

# Rechecking number and example rows after treatment
# For loop to find indices of rows that have pattern
recheck_rows = []
for column in cols_lst:
    dataframe = df[df[column].notna()]
    rows_to_check = rows_with_pattern(dataframe, column, pattern)
    recheck_rows += rows_to_check

There are 0 rows with matching pattern in column 'info_1'.
There are 0 rows with matching pattern in column 'info_2'.
There are 0 rows with matching pattern in column 'info_3'.
There are 0 rows with matching pattern in column 'info_4'.
There are 0 rows with matching pattern in column 'info_5'.
There are 0 rows with matching pattern in column 'info_6'.
There are 0 rows with matching pattern in column 'info_7'.
There are 0 rows with matching pattern in column 'info_8'.
There are 0 rows with matching pattern in column 'info_9'.
There are 0 rows with matching pattern in column 'info_10'.
There are 0 rows with matching pattern in column 'info_11'.
There are 0 rows with matching pattern in column 'info_parenth'.


<IPython.core.display.Javascript object>

#### Removing Any Remaining Matches with  `nation_map` Keys and Values

In [13]:
%%time

# For loop to extract remaining information matching items in nation_map
for column in cols_lst:
    dataframe = df[df[column].notna()]
    for nationality, country in nation_map.items():
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if nationality + ' ' in item or country + ' ' in item or item.endswith(nationality) or item.endswith(country):
                    df.loc[index, column] = item.replace(nationality, "").strip().replace(country,'').strip()

CPU times: total: 16min 17s
Wall time: 16min 18s


<IPython.core.display.Javascript object>

#### Observations:
- After that bit of tidying, we can proceed to extracting `known_for_1` values.
- The bulk of these values should be in `info_2`, according to the Wikipedia defined fields, so we will start there.

## Extracting `known_for` Data
Our goal will be to have some broader categories into which the specific values will fit.  `known_for` is a diverse feature, in that an individual may be known for a long-term role or roles, a specific event, a relationship with another person who is famous, etc.  So, to some extent we will see what we find and adapt as we go.

Also, we will abandon searching left to right as an individual may fit more than one category, and in no particular order.  For example, Ronald Reagan, is entered as "American actor and politician".  He is most known as the 40th president of The United States, so if we prioritized the first value, he would fit only into the category containing actor.  At the same time, it may have been his acting career that led to his political career.  Both arenas are relevant, so we will aim to capture all categories for an individual.  Later, when there are duplicate categories for an indivual, we can remove the redundant values.

We will take the following approach:
1. create and check a list of unique values in `info_2` that have a minimum number repeated, sufficient to create sets for each category, but not so exhaustive to be time prohibitive to manually enter.
2. using the pop() method, add each role to it's associated category's set, below.
3. combine the sets for each category into one dictionary.
4. search for the values in the dictionary and extract the category key value to a new column `known_for_1`, `known_for_2`, etc.

In [14]:
# Obtaining values for column and their counts
col_values = df["info_2"].value_counts()

# Creating a list for values that occur more than set number of time
roles_list = [index for index in col_values.index if col_values[index] > 30]

# Checking length of list
print(f"We will examine the top {len(roles_list)} unique values in info_2.")

We will examine the top 447 unique values in info_2.


<IPython.core.display.Javascript object>

In [15]:
# # Using pop to check list items and add to associated dictionary below
# roles_list.pop()

<IPython.core.display.Javascript object>

In [100]:
# Creating lists for each category
politics_govt_law = [
    "politician",
    "economist",
    "attorney",
    "trade unionist",
    "unionist",
    "aristocrat",
    "diplomat",
    "lawyer",
    "activist",
    "civil rights",
    "federal",
    "judge",
    "political",
    "politics",
    "royal",
    "civil servant",
    "jurist",
    "judge",
    "conservationist",
    "government official",
    "government",
    "barrister",
    "militant",
    "environmentalist",
    "public servant",
    "King",
    "Queen",
    "Princess",
    "Prince",
    "President",
    "Prime Minister",
    "leader",
    "Nazi",
    "Administration",
    "Ambassador",
    "ambassador",
]

arts = [
    "actor",
    "dancer",
    "choreographer",
    "model",
    "television",
    "jazz",
    "singer",
    "composer",
    "conductor",
    "journalist",
    "writer",
    "saxophonist",
    "film director",
    "comedian",
    "photojournalist",
    "poet",
    "actress",
    "film",
    "editor",
    "drummer",
    "producer",
    "songwriter",
    "publisher",
    "author",
    "violinist",
    "rapper",
    "musician",
    "animator",
    "filmmaker",
    "pianist",
    "historian",
    "comic",
    "screenwriter",
    "fashion",
    "designer",
    "guitarist",
    "voice",
    "opera",
    "cinematographer",
    "playwright",
    "sculptor",
    "novelist",
    "photographer",
    "architect",
    "painter",
    "artist",
    "disc jockey",
    "dj",
    "DJ",
    "MC",
    "bridge player",
    "tenor",
    "trombonist",
    "filmmaker",
    "ballerina",
    "bassist",
    "film critic",
    "critic",
    "personality",
    "organist",
    "operatic",
    "lyricist",
    "translator",
    "visual artist",
    "soprano",
    "cellist",
    "broadcaster",
    "chef",
    "literary critic",
    "ballet",
    "illustrator",
    "theatre director",
    "trumpeter",
    "presenter",
    "sportscaster",
    "cartoonist",
    "sportswriter",
    "choral",
    "music",
    "arts",
    "dance",
]
sports = [
    "football",
    "footballer",
    "Olympic",
    "skier",
    "hockey",
    "soccer",
    "cricket",
    "soccer",
    "sprinter",
    "equestrian",
    "gymnast",
    "fencer",
    "chess",
    "wrestler",
    "swimmer",
    "basketball",
    "hurler",
    "sailor",
    "rower",
    "rugby",
    "athlete",
    "golfer",
    "boxer",
    "tennis",
    "cyclist",
    "racing",
    "driver",
    "cricketer",
    "baseball",
    "speedway rider",
    "speedway",
    "rider",
    "badminton",
    "sport shooter",
    "runner",
    "umpire",
    "judoka",
    "volleyball",
    "track and field",
    "track",
    "bobsledder",
    "canoer",
    "bodybuilder",
    "skater",
    "curler",
    "Olympic diver",
    "martial artist",
    "racer",
    "handball",
    "ski jumper",
    "racehorse trainer",
    "racecar driver",
    "hurdler",
    "polo",
    "Olympic shooter",
    "weightlifter",
    "Baseball",
    "mountaineer",
    "jockey",
    "Olympic sports shooter",
    "referee",
    "general manager",
    "sports",
    "sport",
    "athletics",
    "athletic",
]
sciences = [
    "engineer",
    "physicist",
    "geologist",
    "psychiatrist",
    "botanist",
    "biologist",
    "anthropologist",
    "astronomer",
    "biochemist",
    "scientist",
    "computer",
    "programmer",
    "archaeologist",
    "psychologist",
    "sociologist",
    "physician",
    "chemist",
    "physicist",
    "mathematician",
    "cosmonaut",
    "pediatrician",
    "astronaut",
    "entomologist",
    "cardiologist",
    "doctor",
    "nurse",
    "immunologist",
    "meteorologist",
    "medical researcher",
    "ornithologist",
    "neuroscientist",
    "microbiologist",
    "zoologist",
    "geographer",
    "inventor",
    "geneticist",
    "surgeon",
    "astrophysicist",
    "statistician",
    "sciences",
    "science",
    "mathematics",
    "math",
    "physics",
    "chemistry",
    "biology",
    "epidemiology",
]

business = [
    "executive",
    "businessman",
    "banker",
    "entrepreneur",
    "real estate developer",
    "restaurateur",
    "businesswoman",
    "sports administrator",
    "business",
    "banking",
    "bank",
]
academia_humanities = [
    "scholar",
    "linguist",
    "educator",
    "philosopher",
    "academic",
    "military historian" "historian",
    "educationalist",
    "philologist",
    "librarian",
    "industrialist",
    "professor",
    "musicologist",
    "academia",
    "education",
    "college",
    "university",
    "humanities",
]
law_enf_military_operator = [
    "officer",
    "army",
    "Army",
    "police",
    "admiral",
    "soldier",
    "Air Force",
    "intelligence",
    "major",
    "lieutenant",
    "admiral",
    "fighter pilot",
    "pilot",
    "naval",
    "Navy",
    "aviator",
    "general",
    "CIA",
    "FBI",
    "law enforcement",
    "military",
    "police",
    "Marines",
    "marine",
    "Coast Guard",
    "IRA",
    "CIA",
]
spiritual = [
    "rabbi",
    "Catholic",
    "priest",
    "Anglican",
    "cardinal",
    "theologian",
    "prelate",
    "Orthodox",
    "Episcopal",
    "bishop",
    "Jesuit",
    "hierarch",
    "Islamic",
    "religious leader",
    "religious",
    "religion",
]
social = ["philanthropist", "socialite", "philanthropy"]
crime = [
    "serial killer",
    "murderer",
    "convicted",
    "terrorist",
    "mobster",
    "criminal",
    "suspect",
    "crime",
    "guilty",
]
event_record_other = [
    "Holocaust survivor",
    "victim",
    "survivor",
    "supercentenarian",
    "oldest person",
    "centarian",
    "oldest",
]
other_species.append("Tree")

<IPython.core.display.Javascript object>

#### Observations:
- We have a good start on `known_for_1` values for which to search.  Some other roles that have been observed previously we have added to the list also.
- Note that roles such as sportswriter and sports broadcaster, though associated with sports, are also included in arts, to align with the underlying nature of the work itself.
- Let us combine them into one dictionary, taking care to put arts last to avoid missing values for "martial artist" and to put spiritual before politics_govt_law so that "leader" in politics_govt_law comes after "religious leader" in relgion.  Likewise "general manager" in sports will come before "general" in law_enf_military_operator and "military historian" in academia_humanities will come before "military" in "law_enf_military_operator".
- We will also include an other_species category here, again putting it last so that trainer and breeder in sports, come before racehorse in other_species.
- Then, we can proceed to extract the category to a new column, `known_for_1`.

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Sets of Values

In [101]:
# Combining separate lists as sets into one dictionary
known_for_dict = {
    "event_record_other": set(event_record_other),
    "crime": set(crime),
    "social": set(social),
    "spiritual": set(spiritual),
    "academia_humanities": set(academia_humanities),
    "business": set(business),
    "sciences": set(sciences),
    "sports": set(sports),
    "law_enf_military_operator": set(law_enf_military_operator),
    "politics_govt_law": set(politics_govt_law),
    "arts": set(arts),
    "other_species": set(other_species),
}

<IPython.core.display.Javascript object>

#### Extracting Category to `known_for_1` Column from `info_1`

In [18]:
# Initializing known_for_1 column
df["known_for_1"] = ""

<IPython.core.display.Javascript object>

In [19]:
%%time

# Column to check
column = 'info_1'

# Extract to column
extract_to = 'known_for_1'

# For loop to find role in column and extract it as category to extract_to column
for category, category_set in known_for_dict.items():
    for role in category_set:
        dataframe = df[(df[column].notna()) & (df[extract_to]=='')]
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, extract_to] = category
                    df.loc[index, column] = item.replace(role, '').strip()

# Checking number of values found and a sample of rows
print(f'There are {len(df[df[extract_to]!=""])} values in extract_to column.')
df[df[extract_to]!=''].sample(2)

There are 29 values in extract_to column.
CPU times: total: 1.73 s
Wall time: 1.74 s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,known_for_1
55280,2,Oliver,", common chimpanzee 55, Congolese-born noted for his upright stature and humanlike traits.",https://en.wikipedia.org/wiki/Oliver_(chimpanzee),14,2012,June,,common,noted for his upright stature humanlike traits,,,,,,,,,,55.0,,"Congo, Republic of the",,other_species
99880,13,André Queillé,", 87 French Olympic boxer .",https://en.wikipedia.org/wiki/Andr%C3%A9_Queill%C3%A9,2,2018,December,,Olympic,,,,,,,,,,,87.0,,France,,sports


<IPython.core.display.Javascript object>

#### Observations:
- Once again, the `info_1` column has provided a small sample on which to test our code, which appears to be working.
- We can move on to extracting additional `known_for` values in `info_1` to `known_for_2`.
- Sir Robin Brook is a good example of an individual who would have 3 categories with our approach--business, business, and sports.  So, we will have enough `known_for` columns to extract all values for all entries.  Removing these values has the added benefit of simplifying the next search for `cause_of_death`.

#### Extracting Category to `known_for_2` Column from `info_1`

In [20]:
# Initializing known_for_2 column
df["known_for_2"] = ""

<IPython.core.display.Javascript object>

In [21]:
%%time

# Column to check
column = 'info_1'

# Extract to column
extract_to = 'known_for_2'

# For loop to find role in column and extract it as category to extract_to column
for category, category_set in known_for_dict.items():
    for role in category_set:
        dataframe = df[(df[column].notna()) & (df['known_for_1']!= '') & (df[extract_to]=='')]
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, extract_to] = category
                    df.loc[index, column] = item.replace(role, '').strip()

# Checking number of values found and a sample of rows
print(f'There are {len(df[df[extract_to]!=""])} values in extract_to column.')
df[df[extract_to]!=''].sample(2)

There are 11 values in extract_to column.
CPU times: total: 2.89 s
Wall time: 2.89 s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,known_for_1,known_for_2
104434,12,Franz Eisl,", 98. Austrian Olympic sailor .",https://en.wikipedia.org/wiki/Franz_Eisl,1,2019,July,",",,,,,,,,,,,,98.0,,Austria,,sports,sports
99212,5,Kenneth Roy,", 73. Scottish broadcaster and writer.",https://en.wikipedia.org/wiki/Kenneth_Roy,2,2018,November,,,,,,,,,,,,,73.0,,Scotland,,arts,arts


<IPython.core.display.Javascript object>

#### Extracting Category to `known_for_3` Column from `info_1`

In [22]:
# Initializing known_for_2 column
df["known_for_3"] = ""

<IPython.core.display.Javascript object>

In [23]:
%%time

# Column to check
column = 'info_1'

# Extract to column
extract_to = 'known_for_3'

# For loop to find role in column and extract it as category to extract_to column
for category, category_set in known_for_dict.items():
    for role in category_set:
        dataframe = df[(df[column].notna()) & (df['known_for_2']!= '') & (df[extract_to]=='')]
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, extract_to] = category
                    df.loc[index, column] = item.replace(role, '').strip()

# Checking number of values found and a sample of rows
print(f'There are {len(df[df[extract_to]!=""])} values in extract_to column.')

There are 0 values in extract_to column.
CPU times: total: 2.91 s
Wall time: 2.91 s


<IPython.core.display.Javascript object>

In [24]:
# Checking remaining unique values in info_1
df["info_1"].value_counts()

                    19
early                2
gridiron  player     1
man                  1
social               1
of                   1
Jules Engel          1
aka                  1
Jr                   1
professional         1
automotive           1
materials            1
common               1
player               1
coach                1
of the Year          1
Name: info_1, dtype: int64

<IPython.core.display.Javascript object>

#### Observations:
- We have extracted all of the `known_for` information present in `info_1`.
- It is time to proceed with extracting the same from `info_2`, the column that should contain the bulk of this feature's values.

#### Extracting Category to `known_for_1` Column from `info_2`

In [25]:
%%time

# Column to check
column = 'info_2'

# Extract to column
extract_to = 'known_for_1'

# For loop to find role in column and extract it as category to extract_to column
for category, category_set in known_for_dict.items():
    for role in category_set:
        dataframe = df[(df[column].notna()) & (df[extract_to]=='')]
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, extract_to] = category
                    df.loc[index, column] = item.replace(role, '').strip()

# Checking number of values found and a sample of rows
print(f'There are {len(df[df[extract_to]!=""])} values in extract_to column.')
df[df[extract_to]!=''].sample(2)

There are 123411 values in extract_to column.
CPU times: total: 2min 51s
Wall time: 2min 51s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,known_for_1,known_for_2,known_for_3
13868,13,Donald D. Engen,", 75, United States Navy vice admiral, Administrator of the FAA and Director of the National Air and Space Museum.",https://en.wikipedia.org/wiki/Donald_D._Engen,9,1999,July,,,Navy vice,Administrator of the FAA Director of the National Air Space Museum,,,,,,,,,75.0,,United States of America,,law_enf_military_operator,,
31552,4,Frank Arthur Calder,", 91, Canadian aboriginal politician.",https://en.wikipedia.org/wiki/Frank_Arthur_Calder,3,2006,November,,,aboriginal,,,,,,,,,,91.0,,Canada,,politics_govt_law,,


<IPython.core.display.Javascript object>

#### Extracting Category to `known_for_2` Column from `info_2`

In [26]:
%%time

# Column to check
column = 'info_2'

# Extract to column
extract_to = 'known_for_2'

# For loop to find role in column and extract it as category to extract_to column
for category, category_set in known_for_dict.items():
    for role in category_set:
        dataframe = df[(df[column].notna()) & (df['known_for_1']!= '') & (df[extract_to]=='')]
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, extract_to] = category
                    df.loc[index, column] = item.replace(role, '').strip()

# Checking number of values found and a sample of rows
print(f'There are {len(df[df[extract_to]!=""])} values in extract_to column.')
df[df[extract_to]!=''].sample(2)

There are 34546 values in extract_to column.
CPU times: total: 3min 45s
Wall time: 3min 45s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,known_for_1,known_for_2,known_for_3
128880,11,Louis Dupré,", 96, Belgian religious philosopher.",https://en.wikipedia.org/wiki/Louis_Dupr%C3%A9_(philosopher),4,2022,January,,,,,,,,,,,,,96.0,,Belgium,,spiritual,academia_humanities,
9066,14,Eugene Goossen,", 76, American art critic and art historian, pneumonia.",https://en.wikipedia.org/wiki/Eugene_Goossen,5,1997,July,,,art art,pneumonia,,,,,,,,,76.0,,United States of America,,arts,arts,


<IPython.core.display.Javascript object>

#### Extracting Category to `known_for_3` Column from `info_2`

In [27]:
%%time

# Column to check
column = 'info_2'

# Extract to column
extract_to = 'known_for_3'

# For loop to find role in column and extract it as category to extract_to column
for category, category_set in known_for_dict.items():
    for role in category_set:
        dataframe = df[(df[column].notna()) & (df['known_for_2']!= '') & (df[extract_to]=='')]
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, extract_to] = category
                    df.loc[index, column] = item.replace(role, '').strip()

# Checking number of values found and a sample of rows
print(f'There are {len(df[df[extract_to]!=""])} values in extract_to column.')
df[df[extract_to]!=''].sample(2)

There are 3310 values in extract_to column.
CPU times: total: 1min 20s
Wall time: 1min 20s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,known_for_1,known_for_2,known_for_3
113478,17,John Lewis,", 80, American civil rights leader and politician, member of the U.S. House of Representatives , pancreatic cancer.",https://en.wikipedia.org/wiki/John_Lewis,288,2020,July,"since , Presidential Medal of Freedom recipient",,,member of the House of Representatives,pancreatic cancer,,,,,,,,80.0,,United States of America,,politics_govt_law,politics_govt_law,politics_govt_law
114673,31,Édouard Karemera,", 69, Rwandan politician and convicted war criminal.",https://en.wikipedia.org/wiki/%C3%89douard_Karemera,6,2020,August,,,war,,,,,,,,,,69.0,,Rwanda,,crime,crime,politics_govt_law


<IPython.core.display.Javascript object>

#### Extracting Category to `known_for_4` Column from `info_2`

In [28]:
# Initializing known_for_4 column
df["known_for_4"] = ""

<IPython.core.display.Javascript object>

In [29]:
%%time

# Column to check
column = 'info_2'

# Extract to column
extract_to = 'known_for_4'

# For loop to find role in column and extract it as category to extract_to column
for category, category_set in known_for_dict.items():
    for role in category_set:
        dataframe = df[(df[column].notna()) & (df['known_for_3']!= '') & (df[extract_to]=='')]
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, extract_to] = category
                    df.loc[index, column] = item.replace(role, '').strip()

# Checking number of values found and a sample of rows
print(f'There are {len(df[df[extract_to]!=""])} values in extract_to column.')
df[df[extract_to]!=''].sample(2)

There are 160 values in extract_to column.
CPU times: total: 16.6 s
Wall time: 16.6 s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,known_for_1,known_for_2,known_for_3,known_for_4
18224,18,Ivan Neill,", 88, British Anglican priest and Army officer.",https://en.wikipedia.org/wiki/Ivan_Neill_(priest),12,2001,June,,,,,,,,,,,,,88.0,,United Kingdom of Great Britain and Northern Ireland,,spiritual,spiritual,law_enf_military_operator,law_enf_military_operator
99861,12,Pavle Strugar,", 85, Montenegrin military officer and convicted war criminal.",https://en.wikipedia.org/wiki/Pavle_Strugar,6,2018,December,People Army,,war,,,,,,,,,,85.0,,Montenegro,Serbia,crime,crime,law_enf_military_operator,law_enf_military_operator


<IPython.core.display.Javascript object>

#### Extracting Category to `known_for_5` Column from `info_2`

In [30]:
# Initializing known_for_5 column
df["known_for_5"] = ""

<IPython.core.display.Javascript object>

In [31]:
%%time

# Column to check
column = 'info_2'

# Extract to column
extract_to = 'known_for_5'

# For loop to find role in column and extract it as category to extract_to column
for category, category_set in known_for_dict.items():
    for role in category_set:
        dataframe = df[(df[column].notna()) & (df['known_for_3']!= '') & (df[extract_to]=='')]
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, extract_to] = category
                    df.loc[index, column] = item.replace(role, '').strip()

# Checking number of values found and a sample of rows
print(f'There are {len(df[df[extract_to]!=""])} values in extract_to column.')
df[df[extract_to]!='']

There are 4 values in extract_to column.
CPU times: total: 16.6 s
Wall time: 16.6 s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,known_for_1,known_for_2,known_for_3,known_for_4,known_for_5
3721,26,Edgar Williams,", 82, British Army military intelligence officer and historian.",https://en.wikipedia.org/wiki/Edgar_Williams,29,1995,June,,,,,,,,,,,,,82.0,,United Kingdom of Great Britain and Northern Ireland,,law_enf_military_operator,law_enf_military_operator,law_enf_military_operator,law_enf_military_operator,arts
28188,12,Charlie Norman,", 84, Swedish jazz pianist and film music writer.",https://en.wikipedia.org/wiki/Charlie_Norman,14,2005,August,,,,,,,,,,,,,84.0,,Sweden,,arts,arts,arts,arts,arts
30095,3,Howard Thomas Markey,", 85, American federal judge and U.S. Air Force major general, first chief judge of the United States Court of Appeals for the Federal Circuit.",https://en.wikipedia.org/wiki/Howard_Thomas_Markey,3,2006,May,,,,first chief judge of the Court of Appeals for the Federal Circuit,,,,,,,,,85.0,,United States of America,,law_enf_military_operator,law_enf_military_operator,law_enf_military_operator,politics_govt_law,politics_govt_law
57402,11,Beano Cook,", 81, American college football historian and television sports analyst .",https://en.wikipedia.org/wiki/Beano_Cook,9,2012,October,ESPN,,analyst,,,,,,,,,,81.0,,United States of America,,academia_humanities,sports,sports,arts,arts


<IPython.core.display.Javascript object>

#### Extracting Category to `known_for_6` Column from `info_2`

In [32]:
# Initializing known_for_6 column
df["known_for_6"] = ""

<IPython.core.display.Javascript object>

In [33]:
%%time

# Column to check
column = 'info_2'

# Extract to column
extract_to = 'known_for_6'

# For loop to find role in column and extract it as category to extract_to column
for category, category_set in known_for_dict.items():
    for role in category_set:
        dataframe = df[(df[column].notna()) & (df['known_for_3']!= '') & (df[extract_to]=='')]
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, extract_to] = category
                    df.loc[index, column] = item.replace(role, '').strip()

# Checking number of values found and a sample of rows
print(f'There are {len(df[df[extract_to]!=""])} values in extract_to column.')

There are 0 values in extract_to column.
CPU times: total: 14.4 s
Wall time: 14.4 s


<IPython.core.display.Javascript object>

In [34]:
# Checking the number of missing values in known_for_1
print(
    f'There are {len(df[df["known_for_1"] == ""])} missing values in known_for_1 column.'
)

There are 9241 missing values in known_for_1 column.


<IPython.core.display.Javascript object>

#### Observations:
- We have exhuasted our search of `info_2` using the current version of `known_for_dict` and have under 10,000 remaining missing values in `known_for_1`.
- Let us examine some of the remaining unique values in `info_2` and ammend our lists and dictionary.

#### Checking Remaining `info_2` Values

In [35]:
# Obtaining values for column and their counts
col_values = df[df["known_for_1"] == ""]["info_2"].value_counts()

# Creating a list for values that occur more than set number of time
roles_list = [index for index in col_values.index if col_values[index] > 2]

# Checking length of list
print(f"We will examine the top {len(roles_list)} unique values in info_2.")

We will examine the top 557 unique values in info_2.


<IPython.core.display.Javascript object>

In [36]:
# # Using pop to check list items and add to associated dictionary below
# roles_list.pop()

<IPython.core.display.Javascript object>

#### Updating Category Lists for `known_for_dict`

In [102]:
# Appending category lists
sciences = sciences + [
    "volcanologist",
    "gerontologist",
    "pollster",
    "genealogist",
    "software developer",
    "video game developer",
    "anaesthetist",
    "geomorphologist",
    "carcinologist",
    "weatherman",
    "aerodynamicist",
    "limnologist",
    "control theorist",
    "plant pathologist",
    "pathologist",
    "medical practitioner",
    "optometrist",
    "neuroendocrinologist",
    "endocrinologist",
    "anesthesiologist",
    "obstetrician",
    "zookeeper",
    "game developer",
    "forester",
    "embryologist",
    "urologist",
    "arachnologist",
    "lichenologist",
    "anatomist",
    "mineralogist",
    "gastroenterologist",
    "sexologist",
    "bacteriologist",
    "gynecologist",
    "horticulturalist",
    "seismologist",
    "parasitologist",
    "neurophysiologist",
    "primatologist",
    "hydrologist",
    "indologist",
    "ethologist",
    "herbalist",
    "econometrician",
    "cryptographer",
    "toxicologist",
    "haematologist",
    "hematologist",
    "plant ecologist",
    "ecologist",
    "ufologist",
    "crystallographer",
    "gynaecologist",
    "climatologist",
    "glaciologist",
    "demographer",
    "dentist",
    "archeologist",
    "ichthyologist",
    "nephrologist",
    "dermatologist",
    "veterinarian",
    "physiologist",
    "horticulturist",
    "cancer researcher",
    "urban planner",
    "nutritionist",
    "pharmac",
    "oncologist",
    "metallurgist",
    "herpetologist",
    "ophthalmologist",
    "palaeontologist",
    "oceanographer",
    "agronomist",
    "paediatrician",
    "mycologist",
    "naturalist",
    "criminologist",
    "epidemiologist",
    "psychotherapist",
    "neurologist",
    "paleontologist",
    "virologist",
    "psychoanalyst",
    "wildlife",
    "biotech",
    "technology",
]
politics_govt_law = politics_govt_law + [
    "justice",
    "anarchist",
    "secretary",
    "partisan",
    "resistance",
    "Resistance",
    "foreign policy",
    "chieftain",
    "communist",
    "Trotskyist",
    "herald",
    "human rights",
    "campaigner",
    "prince",
    "insurgent",
    "detainee",
    "Resistance",
    "revolutionary",
    "elder",
    "Governor",
    "governor" "General",
    "Vice",
    "Admiral",
    "peer",
    "landowner",
    "union",
    "sultan",
    "Sultan",
    "Senator",
    "Representative",
    "loyalist",
    "Supreme Court",
    "Justice",
    "Chief Justice",
    "Conservative",
    "conservative",
    "Liberal",
    "liberal",
    "MP",
    "parliamentarian",
    "pariliament",
    "Parliament",
    "colonial",
    "mayor",
    "Mayor",
    "ruler",
    "republican",
    "Republican",
    "Democrat",
    "democrat",
    "bureaucrat",
    "conspiracy theorist",
    "jihadist",
    "whistleblower",
    "prime minister",
    "countess",
    "District",
    "Judge",
    "foreign minister",
    "Foreign Minister",
    "Islamist",
    "peeress",
    "legislator",
    "first lady",
    "First Lady",
    "courtier",
    "senior",
    "monarch",
    "statesman",
    "lobbyist",
    "solicitor",
    "senator",
    "representative",
    "nationalist",
    "protester",
    "noble",
    "prosecutor",
    "magistrate",
    "public official",
    "feminist",
    "dissident",
    "candidate",
    "congress",
    "administrator",
    "president",
    "Politician",
]
law_enf_military_operator = law_enf_military_operator + [
    "veteran",
    "forester",
    "Navajo code talker",
    "security",
    "fighter",
    "paramilitary",
    "guerrilla",
    "fighter ace",
    "flying ace",
    "firefighter",
    "Medal of Honor",
    "secret agent",
    "codebreaker",
    "Special Operations",
    "warlord",
    "Victoria Cross",
    "mercenary",
    "World War II",
    "colonel",
    "Marine",
    "Secret Service",
    "commander",
    "Air Chief",
    "Marshal",
    "marshal",
    "aviation",
    "airman",
    "spy",
    "emir",
]
sports = sports + [
    "sport",
    "jumper",
    "athletic",
    "shot putter",
    "Olympian",
    "fencing",
    "bandy",
    "Banty",
    "rodeo",
    "rowing",
    "lacrosse",
    "yoga",
    "futsal",
    "heavyweight",
    "Heavyweight",
    "balloonist",
    "racewalker",
    "hurling",
    "biker",
    "scuba",
    "master of the horse",
    "shogi",
    "Football",
    "softball",
    "free diver",
    "greyhound trainer",
    "goalkeeper",
    "mountain",
    "boxing",
    "hunter",
    "angler",
    "aikidoka",
    "aikido",
    "cave diver",
    "alpinist",
    "powerlifter",
    "karate",
    "rowing",
    "coxswain",
    "skater",
    "skating",
    "Go player",
    "orienteer",
    "orienteer",
    "ten pin",
    "karateka",
    "wrestling",
    "announcer",
    "golf",
    "netball",
    "poker",
    "slalom",
    "canoe",
    "pool player",
    "NFL",
    "CFL",
    "CFL",
    "bowl",
    "pole vault",
    "strongman",
    "yachtsman",
    "snowboard",
    "skateboard",
    "archer",
    "climber",
    "swim",
    "squash",
    "climber",
    "shot put",
    "luger",
    "walker",
    "adventurer",
    "diver",
    "surfer",
    "explorer",
    "bullfighter",
    "sprint",
    "pitcher",
    "snooker",
    "rejoneador",
    "matador",
]

academia_humanities = academia_humanities + [
    "Esperantist",
    "phonetician",
    "vexillologist",
    "Byzantinist",
    "logician",
    "Turkologist",
    "bioethicist",
    "Mayanist",
    "Hellenist",
    "crossword compiler",
    "cruciverbalist",
    "Hispanist",
    "Arabist",
    "semiotician",
    "Assyriologist",
    "literary theorist",
    "schoolmaster",
    "schoolteacher",
    "intellectual",
    "organizational theorist",
    "information theorist",
    "orientalist",
    "medievalist",
    "classicist",
    "archivist",
    "museum",
    "numismatist",
    "ethnologist",
    "lexicographer",
    "folklorist",
    "philatelist",
    "sinologist",
    "teacher",
    "Egyptologist",
    "Japanologist",
    "Iranologist",
    "Indologist",
]
business = business + [
    "retailer",
    "grocer",
    "auctioneer",
    "baker",
    "car dealer",
    "clothier",
    "food manufacturer",
    "manufacturer",
    "real estate",
    "shipowner",
    "company director",
    "distiller",
    "financial",
    "financial",
    "finance",
    "media owner",
    "printer",
    "management consultant",
    "investment manager",
    "vintner",
    "brewer",
    "jeweller",
    "shipping magnate",
    "nightclub owner",
    "bookseller",
    "billionaire",
    "stockbroker",
    "farmer",
    "hotel",
    "accountant",
    "property developer",
    "investor",
    "financier",
    "winemaker",
]
crime = crime + [
    "murder suspect",
    "suspect",
    "concentration camp guard",
    "drug dealer",
    "drug lord",
    "convict",
    "drug trafficker",
    "spree killer",
    "gangster",
    "mafia",
    "mob",
    "sex offender",
]
spiritual = spiritual + [
    "Presbyterian",
    "spiritual",
    "Zen",
    "Buddhist",
    "monk",
    "ayatollah",
    "Ayatollah",
    "psychic",
    "yogi",
    "Marja",
    "Trappist",
    "Christian",
    "missionary",
    "Benedictine",
    "nun",
    "faith",
    "healer",
    "Methodist",
    "archdeacon",
    "Baptist",
    "cleric",
    "televangelist",
    "clergy",
    "astrolog",
    "evangelist",
    "minister",
    "pastor",
    "lama",
    "imam",
]
arts = arts + [
    "milliner",
    "memoirist",
    "columnist",
    "bluegrass",
    "fiddler",
    "perfumer",
    "performer",
    "acting",
    "organ builder",
    "art patron",
    "TV",
    "reporter",
    "Pulitzer Prize",
    "script",
    "santoor",
    "mandolin",
    "oenologist",
    "radio",
    "host",
    "horn player",
    "cameraman",
    "tuba",
    "surfboard shaper",
    "impresario",
    "weaver",
    "oud player",
    "blues",
    "reporter",
    "animal trainer",
    "harmonica",
    "guitar",
    "movie",
    "woodworker",
    "R&B",
    "antique",
    "craftsman",
    "double bass",
    "keyboard",
    "drag queen",
    "trumpet",
    "hairstylist",
    "etiquette",
    "accordion",
    "radio",
    "mural",
    "Calypso",
    "calypso",
    "bassoon",
    "animation",
    "correspondent",
    "taekwondo",
    "potter",
    "studio",
    "illusionist",
    "magici",
    "circus",
    "documentar",
    "YouTube",
    "satirist",
    "beauty pageant",
    "baritone",
    "impressionist",
    "performer",
    "stunt",
    "hairdresser",
    "theatre",
    "announcer",
    "flutist",
    "flute",
    "clown",
    "harp",
    "bass player",
    "blog",
    "vlog",
    "show",
    "ventriloquist",
    "typographer",
    "calligrapher",
    "band manager",
    "tabla",
    "storyteller",
    "arranger",
    "news",
    "curator",
    "violist",
    "printmaker",
    "oboist",
    "sound",
    "beauty queen",
    "literary agent",
    "contralto",
    "ceramicist",
    "vocal",
    "ceramist",
    "banjo",
    "publicist",
    "flautist",
    "harpsichord",
    "decorator",
    "talent",
    "accordionist",
    "casting",
    "stage director",
    "theater",
    "humorist",
    "essayist",
    "biographer",
    "art collector",
    "puppeteer",
    "art dealer",
    "drama",
    "art director",
    "entertainer",
    "percussion",
    "clarinet",
    "director",
    "stage",
    "bandoneon",
    "choir",
    "Choir",
    "porn",
    "star",
    "sarod",
    "instrument",
    "saxophon",
    "dialect coach",
]
social = social + [
    "heir",
    "volunteer",
    "public figure",
    "humanitarian",
    "social worker",
]
event_record_other = event_record_other + [
    "homeless",
    "student",
    "teenager",
    "fan of",
    "worker",
    "child",
    "boy",
    "girl",
    "employee",
    "longevity claimant",
    "record holder",
    "heaviest",
    "tallest",
    "shortest",
    "oldest",
    "youngest",
    "last",
    "first",
    "centenarian",
    "heaviest",
    "smallest",
    "muse",
]
other_species = other_species + [
    "elephant",
    "Great Dane",
    "greyhound",
    "thoroughbred",
]

<IPython.core.display.Javascript object>

#### Updating `known_for_dict` Dictionary of Category Keys and Specific Role Sets of Values

In [103]:
# Combining separate lists as sets into one dictionary
known_for_dict = {
    "event_record_other": set(event_record_other),
    "crime": set(crime),
    "social": set(social),
    "academia_humanities": set(academia_humanities),
    "business": set(business),
    "sciences": set(sciences),
    "sports": set(sports),
    "law_enf_military_operator": set(law_enf_military_operator),
    "politics_govt_law": set(politics_govt_law),
    "arts": set(arts),
    "spiritual": set(spiritual),
    "other_species": set(other_species),
}

<IPython.core.display.Javascript object>

#### Observations:
- Now we will repeat extracting `known_for` values from `info_2` with the updated dictionary.

#### Extracting Category to `known_for_1` Column from `info_2` with Updated `known_for_dict`

In [39]:
%%time

# Column to check
column = 'info_2'

# Extract to column
extract_to = 'known_for_1'

# For loop to find role in column and extract it as category to extract_to column
for category, category_set in known_for_dict.items():
    for role in category_set:
        dataframe = df[(df[column].notna()) & (df[extract_to]=='')]
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, extract_to] = category
                    df.loc[index, column] = item.replace(role, '').strip()

# Checking number of values found and a sample of rows
print(f'There are {len(df[df[extract_to]!=""])} values in extract_to column.')
df[df[extract_to]!=''].sample(2)

There are 130411 values in extract_to column.
CPU times: total: 55.8 s
Wall time: 56 s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,known_for_1,known_for_2,known_for_3,known_for_4,known_for_5,known_for_6
74832,9,Tut Taylor,", 91, American bluegrass musician .",https://en.wikipedia.org/wiki/Tut_Taylor,5,2015,April,,,bluegrass ian,,,,,,,,,,91.0,,United States of America,,arts,,,,,
37955,4,Peter Vansittart,", 88, British writer.",https://en.wikipedia.org/wiki/Peter_Vansittart,6,2008,October,,,,,,,,,,,,,88.0,,United Kingdom of Great Britain and Northern Ireland,,arts,,,,,


<IPython.core.display.Javascript object>

#### Extracting Category to `known_for_2` Column from `info_2` with Updated `known_for_dict`

In [40]:
%%time

# Column to check
column = 'info_2'

# Extract to column
extract_to = 'known_for_2'

# For loop to find role in column and extract it as category to extract_to column
for category, category_set in known_for_dict.items():
    for role in category_set:
        dataframe = df[(df[column].notna()) & (df['known_for_1']!= '') & (df[extract_to]=='')]
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, extract_to] = category
                    df.loc[index, column] = item.replace(role, '').strip()

# Checking number of values found and a sample of rows
print(f'There are {len(df[df[extract_to]!=""])} values in extract_to column.')
df[df[extract_to]!=''].sample(2)

There are 43861 values in extract_to column.
CPU times: total: 8min 2s
Wall time: 8min 2s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,known_for_1,known_for_2,known_for_3,known_for_4,known_for_5,known_for_6
53048,25,Mary Semans,", 91, American heiress and philanthropist.",https://en.wikipedia.org/wiki/Mary_Semans,22,2012,January,,,ess,,,,,,,,,,91.0,,United States of America,,social,social,,,,
111990,21,Lew Byong-hyun,", 95, South Korean military officer and diplomat, Ambassador to the United States .",https://en.wikipedia.org/wiki/Lew_Byong-hyun,13,2020,May,,,,Ambassador to the,,,,,,,,,95.0,,South Korea,,law_enf_military_operator,law_enf_military_operator,politics_govt_law,,,


<IPython.core.display.Javascript object>

#### Extracting Category to `known_for_3` Column from `info_2` with Updated `known_for_dict`

In [41]:
%%time

# Column to check
column = 'info_2'

# Extract to column
extract_to = 'known_for_3'

# For loop to find role in column and extract it as category to extract_to column
for category, category_set in known_for_dict.items():
    for role in category_set:
        dataframe = df[(df[column].notna()) & (df['known_for_2']!= '') & (df[extract_to]=='')]
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, extract_to] = category
                    df.loc[index, column] = item.replace(role, '').strip()

# Checking number of values found and a sample of rows
print(f'There are {len(df[df[extract_to]!=""])} values in extract_to column.')
df[df[extract_to]!=''].sample(2)

There are 6537 values in extract_to column.
CPU times: total: 3min 51s
Wall time: 3min 51s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,known_for_1,known_for_2,known_for_3,known_for_4,known_for_5,known_for_6
7917,5,Robert Elem,", 69, American blues guitarist and singer.",https://en.wikipedia.org/wiki/Robert_%22Big_Mojo%22_Elem,13,1997,February,,,,,,,,,,,,,69.0,,United States of America,,arts,arts,arts,,,
100336,4,Wiet Van Broeckhoven,", 69, Belgian radio presenter and writer.",https://en.wikipedia.org/wiki/Wiet_Van_Broeckhoven,5,2019,January,,,,,,,,,,,,,69.0,,Belgium,,arts,arts,arts,,,


<IPython.core.display.Javascript object>

#### Extracting Category to `known_for_4` Column from `info_2` with Updated `known_for_dict`

In [42]:
%%time

# Column to check
column = 'info_2'

# Extract to column
extract_to = 'known_for_4'

# For loop to find role in column and extract it as category to extract_to column
for category, category_set in known_for_dict.items():
    for role in category_set:
        dataframe = df[(df[column].notna()) & (df['known_for_2']!= '') & (df[extract_to]=='')]
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, extract_to] = category
                    df.loc[index, column] = item.replace(role, '').strip()

# Checking number of values found and a sample of rows
print(f'There are {len(df[df[extract_to]!=""])} values in extract_to column.')
df[df[extract_to]!=''].sample(2)

There are 597 values in extract_to column.
CPU times: total: 4min 56s
Wall time: 4min 56s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,known_for_1,known_for_2,known_for_3,known_for_4,known_for_5,known_for_6
13144,2,David Ackles,", 62, American singer-songwriter and child actor, lung cancer.",https://en.wikipedia.org/wiki/David_Ackles,17,1999,March,,,,lung cancer,,,,,,,,,62.0,,United States of America,,arts,arts,arts,event_other,,
18661,18,Jack Elliott,", 74, American film and television music composer, conductor and arranger .",https://en.wikipedia.org/wiki/Jack_Elliott_(composer),5,2001,August,", , , ,",,,conductor arranger,,,,,,,,,74.0,,United States of America,,arts,arts,arts,arts,,


<IPython.core.display.Javascript object>

#### Extracting Category to `known_for_5` Column from `info_2` with Updated `known_for_dict`

In [43]:
%%time

# Column to check
column = 'info_2'

# Extract to column
extract_to = 'known_for_5'

# For loop to find role in column and extract it as category to extract_to column
for category, category_set in known_for_dict.items():
    for role in category_set:
        dataframe = df[(df[column].notna()) & (df['known_for_2']!= '') & (df[extract_to]=='')]
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, extract_to] = category
                    df.loc[index, column] = item.replace(role, '').strip()

# Checking number of values found and a sample of rows
print(f'There are {len(df[df[extract_to]!=""])} values in extract_to column.')
df[df[extract_to]!=''].sample(2)

There are 37 values in extract_to column.
CPU times: total: 4min 10s
Wall time: 4min 10s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,known_for_1,known_for_2,known_for_3,known_for_4,known_for_5,known_for_6
101105,4,Ward Thomas,", 95, British television executive and World War II fighter pilot.",https://en.wikipedia.org/wiki/Ward_Thomas_(television_executive),26,2019,February,,,,,,,,,,,,,95.0,,United Kingdom of Great Britain and Northern Ireland,,business,law_enf_military_operator,arts,law_enf_military_operator,law_enf_military_operator,
105752,18,Chuck Dauphin,", 45, American sports radio broadcaster and country music journalist, complications from diabetes.",https://en.wikipedia.org/wiki/Chuck_Dauphin,4,2019,September,,,country,complications from diabetes,,,,,,,,,45.0,,United States of America,,sports,arts,arts,arts,arts,


<IPython.core.display.Javascript object>

#### Extracting Category to `known_for_6` Column from `info_2` with Updated `known_for_dict`

In [44]:
%%time

# Column to check
column = 'info_2'

# Extract to column
extract_to = 'known_for_6'

# For loop to find role in column and extract it as category to extract_to column
for category, category_set in known_for_dict.items():
    for role in category_set:
        dataframe = df[(df[column].notna()) & (df['known_for_2']!= '') & (df[extract_to]=='')]
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, extract_to] = category
                    df.loc[index, column] = item.replace(role, '').strip()

# Checking number of values found and a sample of rows
print(f'There are {len(df[df[extract_to]!=""])} values in extract_to column.')
df[df[extract_to]!=''].sample(2)

There are 2 values in extract_to column.
CPU times: total: 4min 31s
Wall time: 4min 31s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,known_for_1,known_for_2,known_for_3,known_for_4,known_for_5,known_for_6
24749,26,Jack Sperling,", 81, American jazz drummer who performed in big bands and as a studio musician for movies and television.",https://en.wikipedia.org/wiki/Jack_Sperling,2,2004,February,,,who performed in big bands as a ian for s,,,,,,,,,,81.0,,United States of America,,arts,arts,arts,arts,arts,arts
6088,18,Chet Forte,", 60, American television director and sports radio talk show host, heart attack.",https://en.wikipedia.org/wiki/Chet_Forte,12,1996,May,,,talk,heart attack,,,,,,,,,60.0,,United States of America,,sports,arts,arts,arts,arts,arts


<IPython.core.display.Javascript object>

#### Extracting Category to `known_for_7` Column from `info_2` with Updated `known_for_dict`

In [45]:
# Initializing known_for_7 column
df["known_for_7"] = ""

<IPython.core.display.Javascript object>

In [49]:
%%time

# Column to check
column = 'info_2'

# Extract to column
extract_to = 'known_for_7'

# For loop to find role in column and extract it as category to extract_to column
for category, category_set in known_for_dict.items():
    for role in category_set:
        dataframe = df[(df[column].notna()) & (df['known_for_2']!= '') & (df[extract_to]=='')]
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, extract_to] = category
                    df.loc[index, column] = item.replace(role, '').strip()

# Checking number of values found and a sample of rows
print(f'There are {len(df[df[extract_to]!=""])} values in extract_to column.')

There are 0 values in extract_to column.
CPU times: total: 4min 13s
Wall time: 4min 13s


<IPython.core.display.Javascript object>

#### Checking Remaining Missing Values in `known_for_1`

In [51]:
# Checking remaining missing values in known_for_1
print(
    f'There are {len(df[df["known_for_1"] == ""])} remaining missing values in column.'
)

There are 2241 remaining missing values in column.


<IPython.core.display.Javascript object>

#### Observations:
- Before we look to hard-coding more values into `known_for_dict`, let us proceed to search the other columns for missing values.

#### Extracting `known_for_1` from Other `info_` Columns

In [123]:
# List of columns to check
cols_list = [
    "info_3",
    "info_4",
    "info_5",
    "info_6",
    "info_7",
    "info_8",
    "info_9",
    "info_10",
    "info_11",
    "info_parenth",
]

# Column to extract to
extract_to = "known_for_1"

# For loop to extract value to extract_to column
for column in cols_list:
    for category, category_set in known_for_dict.items():
        for role in category_set:
            dataframe = df[(df[extract_to] == "") & (df[column].notna())]
            for index in dataframe.index:
                item = df.loc[index, column]
                if item:
                    if role in item:
                        df.loc[index, extract_to] = category
                        df.loc[index, column] = item.replace(role, "").strip()

# Checking number of values found and a sample of rows
print(f'There are {len(df[df[extract_to]!=""])} values in extract_to column.')

There are 130909 values in extract_to column.


<IPython.core.display.Javascript object>

#### Checking Remaining Missing Values in `known_for_1`

In [125]:
# Checking remaining missing values in known_for_1
print(
    f'There are {len(df[df["known_for_1"] == ""])} remaining missing values in column.'
)

There are 1743 remaining missing values in column.


<IPython.core.display.Javascript object>

#### Observations:
- That iteration found ~ 500 missing values.
- Next we will do some hard-coding again.

#### Updating `known_for_dict`

In [126]:
# List of remaining values in info_2
list_to_check = df[df["known_for_1"] == ""]["info_2"].value_counts().index.to_list()

<IPython.core.display.Javascript object>

In [603]:
# Using pop to update known_for_dict
list_to_check.pop()

'helminthologist'

<IPython.core.display.Javascript object>

In [606]:
arts = arts + [
    "yidaki",
    "special effects",
    "oudist",
    "ceramics",
    "Playmate",
    '"snake king"',
    "magazine",
    "rock",
    "creator",
    "bookbinder",
    "pipe organ",
    "phone phreak",
    "Theatre",
    "cast member",
    "Editor",
    "special effects",
    "Amityville Horror",
    "Broadway",
    "Poet",
    "glass",
    "Wings",
    "Fruitcake Lady",
    "BBC",
    "dulcimer",
    "Literature",
    "archeology",
    "Opera",
    "wigmaker",
    "media",
    "cheesemaker",
    "vibraphonist",
    "drum",
    "celebrity",
    "publicity",
    "toastmaster",
    "CNN",
    "pageant queen",
    "pop queen",
    "pageant",
    "ITN News at Ten",
    "Bayreuth Festival",
    "Animation",
    "Studio",
    "puppet",
    "Gallery",
    "Drum",
    "singing",
    "didgeridoo",
    "decorative flag",
    "NBC",
    "CBS",
    "bureau chief",
    "recorder",
    "instrument",
    "conga",
    "fiddle",
]
business = business + [
    "website",
    "E & J Gallo Winery",
    "Motel",
    "Häagen Dazs",
    "Corporation",
    "Earthlink",
    "retail",
    "builder",
    "Restaurant",
    "restaurant" "tycoon",
    "Air Tahiti Nui",
    "LexisNexis",
    "media magnate",
    "Capricorn Records",
    "Michelin",
    "salesman",
    "General Motors",
    "millionaire",
    "clothing magnate",
    "toymaker",
    "freenode",
    "CHC Helicopter",
    "Pullman Company",
    "Microsoft",
    "labor arbitrator",
    "Takefuji",
    "Ford",
    "Sullivan Bluth" "Benihana",
    "Jaguar Land Rover",
    "Industries",
    "proprietor",
    "Hardee",
    "Movado Group",
    "Manufacturers Hanover",
    "Arcade Publishing",
    "Hertz",
    "magnate",
    "Voice",
    "Monster",
    "ARCO",
    "Stagecoach Group",
    "Portmeirion Pottery",
    "MGM",
    "Chemoil",
    "UAW",
    "Media Monitors",
    "Taillevent",
    "builder",
    "Fatburger",
    "theme park",
]
politics_govt_law = politics_govt_law + [
    "Minister of",
    "Police",
    "Deputy Minister",
    "Secretary",
    "Speaker",
    "Legislat",
    "State",
    "National Council",
    "Law Lord",
    "Courts",
    "Clerk",
    "Attorney",
    "parliament",
    "Labor Party",
    "House of Commons",
    "Kuomintang",
    "Minister for Sport",
    "sheriff",
    "Party",
    "Black Panther",
    "Black Consciousness",
    "Sinn Féin",
    "Public Works Minister",
    "National Farmers' Federation",
    "World Health Organization",
    "WHO",
    "negotiator for People Republic of",
    "Emir",
    "Civil Rights",
    "county",
    "campaign",
    "NAACP",
    "Premier",
    "Foreign Affairs",
    "General Court",
    "General Assembly",
    "fascist",
    "VHP",
    "Minister for Shipping" "Military",
    "Tuʻi Pelehake",
    "city council",
    "Assembly",
    "Māori queen",
    "Chief Minister",
    "Federal",
    "Chief of the AmaKhuze Tribe",
    "commissioner",
    "consul",
    "Lord Lieutenant",
    "Kyabazinga of Busoga",
    "qadi",
    "Fascist",
    "emir",
    "governor",
    "City Council",
    "High Commissioner",
    "Privacy Commissioner",
    "Cabinet Minister",
    "Teachta Dála",
    "negotiator",
    "law lord",
    "constitutionalist",
    "Provisional",
    "Attorney General",
    "headship",
    "Democra",
    "Office of Management",
    "Popular Front",
    "Social Security",
    "Grand Master of the Order of",
    "rights",
]
sciences = sciences + [
    "co developer",
    "periodontist",
    "disk drive",
    "agriculturalist",
    "CAD",
    "therapy",
    "periodontist",
    "flavorist",
    "disease expert",
    "Mac OS X Server expert",
    "programming",
    "head of research",
    "cartographer",
    "public health",
    "weather",
    "nursing",
    "anaesthetics",
    "bee expert",
    "Physiology",
    "Medicine",
    "transplantologist",
    "Clean Language",
    "phrenolog",
    "Counseling",
    "counseling",
]
academia_humanities = academia_humanities + [
    "preservationist",
    "Landmark Trust",
    "name expert",
    "literacy",
    "founder of Tennessee Temple University",
    "rector of Kelvinside Academy",
    "Studies",
    "Sovietologist",
    "Military Vehicle Technology Foundation",
    "mediaevalist",
    "Lenin Mausoleum",
    "dean",
    "Professor",
    "professor",
    "founder of the  Holocaust Memorial Museum",
]
event_record_other = event_record_other + [
    "recordholder",
    "widow of",
    "brother in law",
    "George Medal",
    "graduate",
    "Son of",
    "granddaughter of",
    "son in law",
    "billiards",
    "lottery winner",
    "mother of",
    "descendant of",
    "National Grandparents Day",
    "son of",
    "Mother of",
    "hermit",
    "Student",
    "who found the  TWA plane crash that killed Knute Rockne",
    " fan ",
    "member of the Vanderbilt family",
]
sports = sports + [
    "contract bridge",
    "NHL",
    "goal",
    "surfboard",
    "Rugby",
    "Rodeo",
    "NASCAR",
    "MLB",
    "Assistant Secretary of Manchester United",
    "Eagles",
    "middleweight",
    "NBA",
    "Wimbledon",
    "Giro d'Italia",
    "derby",
    "Stanley Cup",
    "paralympian",
    "Lucha Libre",
    "Celtics",
    "lineman",
    "linebacker",
    "former owner of the Philadelphia ers who signed Julius Erving",
    "curling",
    "race caller",
    "Yoga",
    "cross country",
    "running",
    "Red Sox",
    "hammer throw",
    "checkers",
    "Marathon",
    "marathon",
    "runner",
    "competitive eater",
    "World Rally",
    "judo",
    "surf",
    "powerlift",
    "ice player",
    "pistol shoot",
]
law_enf_military_operator = law_enf_military_operator + [
    "Secret Intelligence",
    "Watergate scandal principal",
    "Constable",
    "constable",
    "Minister for Defence",
    "Commodore",
    "RAF",
    "Shin Bet",
    "Veteran",
    "servicewoman",
    "al Qaeda",
    "small arms expert",
    "Garioch Pursuivant",
    "KGB",
    "Defence Force",
    "Defense Force",
    "bombardier",
    "FARC",
    "accident investigator",
    "investigator",
    "rebel",
]
spiritual = spiritual + [
    "Findhorn Foundation",
    "LDS",
    "Hasid",
    "Rabbi",
    "evangel",
    "Church",
    "Dean of York",
    "Bishop",
    "demonolog",
    "bhikkhu",
    "abbot",
    "christian",
    "motivational speaker",
    "fortune teller",
    "islam",
    "Ganden Tripa",
    "preacher",
    "Major",
    "Buddh",
    "Meditation" "Deacon",
    "Wicca",
    "wicca",
    "Unitarian",
]
social = social + [
    "Children Commissioner for",
    "Emmaüs",
    "peacemaker",
    "Twin Oaks",
    "Habitat for Humanity",
    "AIDS denialist",
    "Children World",
    "charity",
]
crime = crime + [
    "accused",
    "hijacked a plane",
    "member of the Birmingham Six",
    "assassin",
    "Mafia boss",
    "gang member",
    "kidnapper",
    "pretender",
    "member of the Detroit Partnership",
]
other_species = other_species + [
    "panda",
    "rhinoceros",
    "bull",
    "terrier",
    "Mouser",
    "tiger",
    "parrot",
    "giraffe",
    "orangutan",
]

<IPython.core.display.Javascript object>

#### Updating `known_for_dict` Dictionary of Category Keys and Specific Role Sets of Values

In [103]:
# Combining separate lists as sets into one dictionary
known_for_dict = {
    "event_record_other": set(event_record_other),
    "crime": set(crime),
    "social": set(social),
    "academia_humanities": set(academia_humanities),
    "business": set(business),
    "sciences": set(sciences),
    "sports": set(sports),
    "law_enf_military_operator": set(law_enf_military_operator),
    "politics_govt_law": set(politics_govt_law),
    "arts": set(arts),
    "spiritual": set(spiritual),
    "other_species": set(other_species),
}

<IPython.core.display.Javascript object>

#### Extracting `known_for_1` from All `info_` Columns

In [123]:
# List of columns to check
cols_list = [
    'info_1',
    'info_2,
    "info_3",
    "info_4",
    "info_5",
    "info_6",
    "info_7",
    "info_8",
    "info_9",
    "info_10",
    "info_11",
    "info_parenth",
]

# Column to extract to
extract_to = "known_for_1"

# For loop to extract value to extract_to column
for column in cols_list:
    for category, category_set in known_for_dict.items():
        for role in category_set:
            dataframe = df[(df[extract_to] == "") & (df[column].notna())]
            for index in dataframe.index:
                item = df.loc[index, column]
                if item:
                    if role in item:
                        df.loc[index, extract_to] = category
                        df.loc[index, column] = item.replace(role, "").strip()

# Checking number of values found and a sample of rows
print(f'There are {len(df[df[extract_to]!=""])} values in extract_to column.')

There are 130909 values in extract_to column.


<IPython.core.display.Javascript object>

In [124]:
print("dunzo!")
chime.success()

dunzo!


<IPython.core.display.Javascript object>

In [602]:
df[df["info_2"] == "multi discipline pistol shooter"]

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,known_for_1,known_for_2,known_for_3,known_for_4,known_for_5,known_for_6,known_for_7
14720,12,Huelet Benner,", 82, American multi-discipline pistol shooter.",https://en.wikipedia.org/wiki/Huelet_Benner,3,1999,December,,,multi discipline pistol shooter,,,,,,,,,,82.0,,United States of America,,,,,,,,


<IPython.core.display.Javascript object>

In [179]:
social

['philanthropist',
 'socialite',
 'philanthropy',
 'heir',
 'volunteer',
 'public figure',
 'humanitarian',
 'social worker']

<IPython.core.display.Javascript object>

In [459]:
[
    index
    for index in df[df["known_for_1"] == ""].index
    if "queen" in df.loc[index, "info"]
]

[725, 12963, 19148, 24428, 27744, 30880, 46767]

<IPython.core.display.Javascript object>

In [469]:
df.loc[46767, :]

day                                                                                       5
name                                                                        Mary Leona Gage
info              , 71, American pageant queen, stripped of Miss USA  title, heart failure.
link                                          https://en.wikipedia.org/wiki/Mary_Leona_Gage
num_references                                                                            5
year                                                                                   2010
month                                                                               October
info_parenth                                                                               
info_1                                                                                 None
info_2                                                                        pageant queen
info_3                                                             stripped of M

<IPython.core.display.Javascript object>

In [342]:
"archeologist" in sciences

True

<IPython.core.display.Javascript object>

In [414]:
"preacher" in spiritual

False

<IPython.core.display.Javascript object>

In [None]:
index for index in 

In [470]:
"pagaent" in arts

False

<IPython.core.display.Javascript object>

In [582]:
"pedagogue" in academia_humanities

False

<IPython.core.display.Javascript object>