# Wikipedia Notable Life Expectancies

# [Notebook 5 of : Data Cleaning](https://github.com/teresahanak/wikipedia-life-expectancy/blob/main/wp_life_expect_data_clean4_thanak_2022_06_23.ipynb)

## Context

The


## Objective

The

### Data Dictionary

- Feature: Description

## Importing Necessary Libraries

In [1]:
# To structure code automatically
%load_ext nb_black

# To import/export sqlite databases
import sqlite3 as sql

# To save/open python objects in pickle file
import pickle

# To help with reading, cleaning, and manipulating data
import pandas as pd
import numpy as np
import re

# To define maximum number of columns to be displayed in a dataframe
pd.set_option("display.max_columns", None)
# To define the maximum number of rows to be displayed in a dataframe
pd.set_option("display.max_rows", 200)

# To supress warnings
# import warnings

# warnings.filterwarnings("ignore")

# To set some visualization attributes
pd.set_option("max_colwidth", 150)

# To play auditory cue when cell has executed, has warning, or has error and set chime theme
import chime

chime.theme("zelda")

<IPython.core.display.Javascript object>

## Data Overview

### Reading, Sampling, and Checking Data Shape

In [2]:
# Reading the dataset
conn = sql.connect("wp_life_expect_clean3.db")
data = pd.read_sql("SELECT * FROM wp_life_expect_clean3", conn)

# Making a working copy
df = data.copy()

# Checking the shape
print(f"There are {df.shape[0]} rows and {df.shape[1]} columns.")

# Checking first 2 rows of the data
df.head(2)

There are 132652 rows and 23 columns.


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2
0,1,William Chappell,", 86, British dancer, ballet designer and director.",https://en.wikipedia.org/wiki/William_Chappell_(dancer),21,1994,January,,,dancer,ballet designer director,,,,,,,,,86.0,,United Kingdom of Great Britain and Northern Ireland,
1,1,Raymond Crotty,", 68, Irish economist, writer, and academic.",https://en.wikipedia.org/wiki/Raymond_Crotty,12,1994,January,,,economist,writer,and academic,,,,,,,,68.0,,Ireland,


<IPython.core.display.Javascript object>

In [3]:
# Checking last 2 rows of the data
df.tail(2)

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2
132650,9,Oleg Moliboga,", 69, Russian volleyball player, Olympic champion and coach.",https://en.wikipedia.org/wiki/Oleg_Moliboga,2,2022,June,1980.0,,volleyball player,Olympic champion coach,,,,,,,,,69.0,,Russia,
132651,9,Zou Jing,", 86, Chinese engineer, member of the Chinese Academy of Engineering.",https://en.wikipedia.org/wiki/Zou_Jing_(engineer),3,2022,June,,,engineer,member of the Chinese Academy of Engineering,,,,,,,,,86.0,,"China, People's Republic of",


<IPython.core.display.Javascript object>

In [4]:
# Checking a sample of the data
df.sample(5)

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2
61263,23,Jim Mackonochie,", 52, British Royal Navy officer and video game developer, liver cancer.",https://en.wikipedia.org/wiki/Jim_Mackonochie,3,2013,April,,,Royal Navy officer video game developer,liver cancer,,,,,,,,,52.0,,United Kingdom of Great Britain and Northern Ireland,United Kingdom of Great Britain and Northern Ireland
108912,12,Nikitas Venizelos,", 89, Greek businessman .",https://en.wikipedia.org/wiki/Nikitas_Venizelos,3,2020,February,"Venizelos SA politician, MP 1974 1981, 1993 1996 Deputy Speaker 1993 1996",,businessman,,,,,,,,,,89.0,,Greece,
120636,22,Sagar Sarhadi,", 87, Indian screenwriter and film producer, heart disease.",https://en.wikipedia.org/wiki/Sagar_Sarhadi,8,2021,March,", ,",,screenwriter film producer,heart disease,,,,,,,,,87.0,,India,
48315,8,Elaine Crowley,", 83, Irish author.",https://en.wikipedia.org/wiki/Elaine_Crowley_(author),6,2011,February,,,author,,,,,,,,,,83.0,,Ireland,
17482,9,Vincent Alo,", 96, American mobster.",https://en.wikipedia.org/wiki/Vincent_Alo,2,2001,March,,,mobster,,,,,,,,,,96.0,,United States of America,


<IPython.core.display.Javascript object>

### Checking Data Types, Duplicates, and Null Values

In [5]:
# Checking data types and null values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 132652 entries, 0 to 132651
Data columns (total 23 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   day             132652 non-null  object 
 1   name            132652 non-null  object 
 2   info            132652 non-null  object 
 3   link            132652 non-null  object 
 4   num_references  132652 non-null  object 
 5   year            132652 non-null  int64  
 6   month           132652 non-null  object 
 7   info_parenth    49830 non-null   object 
 8   info_1          35 non-null      object 
 9   info_2          132604 non-null  object 
 10  info_3          62571 non-null   object 
 11  info_4          12605 non-null   object 
 12  info_5          1497 non-null    object 
 13  info_6          216 non-null     object 
 14  info_7          31 non-null      object 
 15  info_8          6 non-null       object 
 16  info_9          1 non-null       object 
 17  info_10   

<IPython.core.display.Javascript object>

#### Loading `nation_map` from Pickle File to Dictionary nation_map

In [6]:
# Load the nation_map
with open("nation_map.pkl", "rb") as f:
    nation_map = pickle.load(f)

<IPython.core.display.Javascript object>

#### Loading `other_species` list from other_species.csv

In [7]:
# Loading other_species list
other_species_df = pd.read_csv("other_species.csv")
other_species = other_species_df["species"].tolist()
other_species = list(
    set(other_species)
)  # To avoid searching duplicates as we have been adding back to same csv file

<IPython.core.display.Javascript object>

#### Observations:
- With our dataframe, `nation_map`, and `other_species` list loaded, we can proceed to extracting the other features.
- First, we will clean up the `info_` columns by removing any remaining digits and nationality and country values.
- We will use the same functions from previous notebooks.

#### Function to Save Indices of Rows Matching Regular Expressions Pattern to a List and Print Number of Rows with Match 

In [8]:
# Define a function that takes dataframe, column name, and re pattern as arguments and returns list of indices
# for which column value matches re pattern
def rows_with_pattern(dataframe, column, pattern):
    """
    Takes input of dataframe, column name, and re pattern 
    and returns list of indices for rows that contain match
    for pattern anywhere within value for given column.
    
    dataframe: dataframe
    column: column name
    pattern: re pattern
    """
    index_list = []

    for i in dataframe.index:
        item = dataframe.loc[i, column]
        match = re.search(pattern, item)
        if match:
            index_list.append(i)
    print(
        f"There are {len(index_list)} rows with matching pattern in column '{column}'."
    )
    return index_list

<IPython.core.display.Javascript object>

#### Function to Use rows_with_pattern Function for Multiple Regular Expression Patterns

In [9]:
# Define a function that calls rows_with_pattern function for multiple re patterns
# returning a single list of indices for all rows with any pattern match


def multiple_patterns(dataframe, column, patterns):
    """
    Takes input dataframe, column, and list of re patterns and returns single list 
    of indices for rows in which a match for any pattern is found with re.search
    
    dataframe: dataframe
    column: column name
    patterns: list of re patterns
    """
    rows_combined = []

    # For loop to check each pattern
    for pattern in patterns:

        # List and number of rows matching each pattern
        print(pattern)
        rows_to_check = rows_with_pattern(dataframe, column, pattern)
        print("")

        # Add list for each pattern to combined list
        rows_combined += rows_to_check

    return rows_combined

<IPython.core.display.Javascript object>

### Removing Remaining Digits and Nationality/Country Values from `info_` Columns

#### List of Columns to Treat

In [10]:
# List of columns to treat
cols_lst = [
    "info_1",
    "info_2",
    "info_3",
    "info_4",
    "info_5",
    "info_6",
    "info_7",
    "info_8",
    "info_9",
    "info_10",
    "info_11",
    "info_parenth",
]

<IPython.core.display.Javascript object>

#### Removing Digits

In [11]:
# Regular expression for parenthesis and its contents
pattern = r"\d"

# For loop to find indices of rows that have pattern
rows_combined = []
for column in cols_lst:
    dataframe = df[df[column].notna()]
    rows_to_check = rows_with_pattern(dataframe, column, pattern)
    rows_combined += rows_to_check

# Checking a sample of rows
df.loc[rows_combined, :].sample(2)

There are 0 rows with matching pattern in column 'info_1'.
There are 442 rows with matching pattern in column 'info_2'.
There are 2252 rows with matching pattern in column 'info_3'.
There are 1060 rows with matching pattern in column 'info_4'.
There are 69 rows with matching pattern in column 'info_5'.
There are 5 rows with matching pattern in column 'info_6'.
There are 0 rows with matching pattern in column 'info_7'.
There are 0 rows with matching pattern in column 'info_8'.
There are 0 rows with matching pattern in column 'info_9'.
There are 0 rows with matching pattern in column 'info_10'.
There are 0 rows with matching pattern in column 'info_11'.
There are 24403 rows with matching pattern in column 'info_parenth'.


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2
53903,12,Augustin Misago,", 69, Rwandan Roman Catholic prelate, Bishop of Gikongoro .",https://en.wikipedia.org/wiki/Augustin_Misago,3,2012,March,since 1992,,Roman Catholic prelate,Bishop of Gikongoro,,,,,,,,,69.0,,Rwanda,Italy
51251,29,Vera Popkova,", 68, Russian athlete, Olympic bronze medalist .",https://en.wikipedia.org/wiki/Vera_Popkova,9,2011,September,1968,,athlete,Olympic bronze medalist,,,,,,,,,68.0,,Russia,


<IPython.core.display.Javascript object>

In [12]:
# For loop to extract digits
for column in cols_lst:
    for index in set(rows_combined):
        item = df.loc[index, column]
        if item:
            match = re.search(pattern, item)
            if match:
                df.loc[index, column] = re.sub(pattern, "", item)

# Rechecking number and example rows after treatment
# For loop to find indices of rows that have pattern
recheck_rows = []
for column in cols_lst:
    dataframe = df[df[column].notna()]
    rows_to_check = rows_with_pattern(dataframe, column, pattern)
    recheck_rows += rows_to_check

There are 0 rows with matching pattern in column 'info_1'.
There are 0 rows with matching pattern in column 'info_2'.
There are 0 rows with matching pattern in column 'info_3'.
There are 0 rows with matching pattern in column 'info_4'.
There are 0 rows with matching pattern in column 'info_5'.
There are 0 rows with matching pattern in column 'info_6'.
There are 0 rows with matching pattern in column 'info_7'.
There are 0 rows with matching pattern in column 'info_8'.
There are 0 rows with matching pattern in column 'info_9'.
There are 0 rows with matching pattern in column 'info_10'.
There are 0 rows with matching pattern in column 'info_11'.
There are 0 rows with matching pattern in column 'info_parenth'.


<IPython.core.display.Javascript object>

#### Removing Any Remaining Matches with  `nation_map` Keys and Values

In [13]:
%%time

# For loop to extract remaining information matching items in nation_map
for column in cols_lst:
    dataframe = df[df[column].notna()]
    for nationality, country in nation_map.items():
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if nationality + ' ' in item or country + ' ' in item or item.endswith(nationality) or item.endswith(country):
                    df.loc[index, column] = item.replace(nationality, "").strip().replace(country,'').strip()

CPU times: total: 14min 19s
Wall time: 14min 19s


<IPython.core.display.Javascript object>

#### Observations:
- After that bit of tidying, we can proceed to extracting `known_for_1` values.
- The bulk of these values should be in `info_2`, according to the Wikipedia defined fields, so we will start there.

## Extracting `known_for` Data
Our goal will be to have some broader categories into which the specific values will fit.  `known_for` is a diverse feature, in that an individual may be known for a long-term role or roles, a specific event, a relationship with another person who is famous, etc.  So, to some extent we will see what we find and adapt as we go.

Also, we will abandon searching left to right as an individual may fit more than one category, and in no particular order.  For example, Ronald Reagan, is entered as "American actor and politician".  He is most known as the 40th president of The United States, so if we prioritized the first value, he would fit only into the category containing actor.  At the same time, it may have been his acting career that led to his political career.  Both arenas are relevant, so we will aim to capture all categories for an individual.  Later, when there are duplicate categories for an indivual, we can remove the redundant values.

We will take the following approach:
1. create and check a list of unique values in `info_2` that have a minimum number repeated, sufficient to create sets for each category, but not so exhaustive to be time prohibitive to manually enter.
2. using the pop() method, add each role to it's associated category's set, below.
3. combine the sets for each category into one dictionary.
4. search for the values in the dictionary and extract the category key value to a new column `known_for_1`, `known_for_2`, etc.

In [14]:
# Obtaining values for column and their counts
col_values = df["info_2"].value_counts()

# Creating a list for values that occur more than set number of time
roles_list = [index for index in col_values.index if col_values[index] > 30]

# Checking length of list
print(f"We will examine the top {len(roles_list)} unique values in info_2.")

We will examine the top 447 unique values in info_2.


<IPython.core.display.Javascript object>

In [15]:
# # Using pop to check list items and add to associated dictionary below
# roles_list.pop()

<IPython.core.display.Javascript object>

In [16]:
# Creating lists for each category
politics_govt_law = [
    "politic",
    "attorney",
    "Attorney" "unionist",
    "aristocrat",
    "diplomat",
    "lawyer",
    "activis",
    "rights",
    "federal",
    "judge",
    "royal",
    "civil",
    "jurist",
    "juror",
    "jury",
    "judge",
    "conserv",
    "govern",
    "barrister",
    "environmental",
    "King",
    "Queen",
    "Prince",
    "President",
    "Prime Minister",
    "leader",
    "Nazi",
    "Administrat",
    "Ambassador",
    "ambassador",
    "econom",
    "Econom",
]

arts = [
    "actor",
    "actress",
    "acting",
    "artist",
    "dance",
    "choreograph",
    "model",
    "televis",
    "jazz",
    "sing",
    "compose",
    "conduct",
    "journal",
    "write",
    "saxophon",
    "film",
    "comed",
    "poet",
    "edit",
    "drum",
    "produce",
    "song",
    "publish",
    "author",
    "violin",
    "rapper",
    "music",
    "animat",
    "pian",
    "comic",
    "fashion",
    "design",
    "guitar",
    "voice",
    "opera",
    "cinema",
    "playwright",
    "sculp",
    "novel",
    "photo",
    "architect",
    "paint",
    "disc jockey",
    "dj",
    "DJ",
    "MC",
    "bridge",
    "tenor",
    "trombon",
    "balle",
    "bass",
    "critic",
    "personality",
    "organist",
    "pipe organ",
    "lyric",
    "translat",
    "soprano",
    "cell",
    "broadcast",
    "chef",
    "literary",
    "ballet",
    "illustrat",
    "theatre",
    "theater",
    "trumpet",
    "present",
    "sportscast",
    "cartoon",
    "sportswrite",
    "choral",
    "Choral",
]
sports = [
    "footbal",
    "Olympic",
    "ski",
    "hockey",
    "soccer",
    "cricket",
    "sprint",
    "equestrian",
    "gymnast",
    "fenc",
    "chess",
    "wrestl",
    "swim",
    "basketball",
    "hurl",
    "sail",
    "row",
    "rugby",
    "Rugby" "athlet",
    "golf",
    "boxing",
    "boxer" "tennis",
    "cycli",
    "racing",
    "driv",
    "baseball",
    "rider",
    "speedway",
    "badminton",
    "sport shoot",
    "runner",
    "running" "umpire",
    "judoka",
    "volleyball",
    "track",
    "field",
    "bobsled",
    "canoe",
    "bodybuild",
    "skate",
    "curl",
    "Olympic div",
    "martial artist",
    "racer",
    "handball",
    "jumper",
    "racehorse trainer",
    "racecar driver",
    "hurdle",
    "polo",
    "Olympic shooter",
    "weightlift",
    "Baseball",
    "mountain",
    "jockey",
    "Olympic sports shooter",
    "referee",
    "general manager",
    "sport",
    "athlet",
    "ball",
    "NFL",
    "NHL",
    "MLB",
]
sciences = [
    "engineer",
    "physic",
    "geolog",
    "psychiatr",
    "botan",
    "biolog",
    "anthrop",
    "astron",
    "chemist",
    "scientist",
    "compute",
    "programm",
    "archaeolog",
    "psycholog",
    "sociolog",
    "math",
    "cosmonaut",
    "pediatric",
    "astron",
    "entomolog",
    "cardiolog",
    "doctor",
    "nurs",
    "immunolog",
    "meteorolog",
    "medical researcher",
    "medical",
    "medicine" "ornithol",
    "scientist",
    "zoolog",
    "geogr",
    "inventor",
    "genetic",
    "surgeon",
    "statistic",
    "science",
    "epidemiolog",
]

business = [
    "executive",
    "business",
    "bank",
    "entrepren",
    "real estate developer",
    "real estate",
    "restaurateur",
    "sports administrat",
    "industr",
]
academia_humanities = [
    "schol",
    "lingu",
    "educat",
    "philosoph",
    "academi",
    "military historian",
    "histor",
    "philolog",
    "librar",
    "profess",
    "Profess",
    "musicologist",
    "college",
    "university",
    "humanit",
    "professor",
    "Professor",
]
law_enf_military_operator = [
    "officer",
    "army",
    "Army",
    "police",
    "Police",
    "admiral",
    "soldier",
    "Air Force",
    "intelligence",
    "major",
    "Major",
    "lieutenant",
    "admiral",
    "pilot",
    "naval",
    "navy" "Navy",
    "aviat",
    "general",
    "CIA",
    "FBI",
    "law enforce",
    "milit",
    "Marine",
    "marine",
    "Coast Guard",
]
spiritual = [
    "rabbi",
    "Rabbi",
    "Catholic",
    "priest",
    "Anglican",
    "cardinal",
    "Deacon",
    "deacond",
    "theolog",
    "prelate",
    "Orthodox",
    "Episcopal",
    "bishop",
    "Bishop",
    "Jesuit",
    "hierarch",
    "Islam",
    "islam",
    "religious leader",
    "relig",
    "Church",
    "church",
    "Christ",
    "christ",
]
social = ["philanthrop", "socialite", "social work"]
crime = [
    "serial killer",
    "murderer",
    "convicted",
    "mobster",
    "criminal",
    "suspect",
    "guilty",
    "inmate",
]
event_record_other = ["victim", "survivor", "centenarian", "centarian", "oldest"]
other_species.append("Tree")

<IPython.core.display.Javascript object>

#### Observations:
- We have a good start on `known_for_1` values for which to search.  Some other roles that have been observed previously we have added to the list also.
- Note that roles such as sportswriter and sports broadcaster, though associated with sports, are also included in arts, to align with the underlying nature of the work itself.
- Let us combine them into one dictionary, taking care to put arts last to avoid missing values for "martial artist" and to put spiritual before politics_govt_law so that "leader" in politics_govt_law comes after "religious leader" in relgion.  Likewise "general manager" in sports will come before "general" in law_enf_military_operator and "military historian" in academia_humanities will come before "military" in "law_enf_military_operator".
- We will also include an other_species category here, again putting it last so that trainer and breeder in sports, come before racehorse in other_species.
- Then, we can proceed to extract the category to a new column, `known_for_1`.

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Sets of Values

In [17]:
# Combining separate lists as sets into one dictionary
known_for_dict = {
    "event_record_other": set(event_record_other),
    "crime": set(crime),
    "social": set(social),
    "spiritual": set(spiritual),
    "academia_humanities": set(academia_humanities),
    "business": set(business),
    "sciences": set(sciences),
    "sports": set(sports),
    "law_enf_military_operator": set(law_enf_military_operator),
    "politics_govt_law": set(politics_govt_law),
    "arts": set(arts),
}

<IPython.core.display.Javascript object>

#### Extracting Category to `known_for_1` Column from `info_1`

In [18]:
# Initializing known_for_1 column
df["known_for_1"] = ""

<IPython.core.display.Javascript object>

In [19]:
%%time

# Column to check
column = 'info_1'

# Extract to column
extract_to = 'known_for_1'

# For loop to find role in column and extract it as category to extract_to column
for category, category_set in known_for_dict.items():
    for role in category_set:
        dataframe = df[(df[column].notna()) & (df[extract_to]=='')]
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, extract_to] = category
                    df.loc[index, column] = item.replace(role, '').strip()

# Checking number of values found and a sample of rows
print(f'There are {len(df[df[extract_to]!=""])} values in extract_to column.')
df[df[extract_to]!=''].sample(2)

There are 27 values in extract_to column.
CPU times: total: 1.53 s
Wall time: 1.53 s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,known_for_1
12317,25,Sir Robin Brook,", 90 British businessman, banker and Olympic fencer.",https://en.wikipedia.org/wiki/Robin_Brook,3,1998,October,,man,banker Olympic fencer,,,,,,,,,,90.0,,United Kingdom of Great Britain and Northern Ireland,,business
6832,17,Spiro Agnew,", American politician, 77, 39th Vice President of the United States, leukemia.",https://en.wikipedia.org/wiki/Spiro_Agnew,207,1996,September,,ian,,th Vice President of the,leukemia,,,,,,,,77.0,,United States of America,,politics_govt_law


<IPython.core.display.Javascript object>

#### Observations:
- Once again, the `info_1` column has provided a small sample on which to test our code, which appears to be working.
- We can move on to extracting additional `known_for` values in `info_1` to `known_for_2`.
- Sir Robin Brook is a good example of an individual who would have 3 categories with our approach--business, business, and sports.  So, we will have enough `known_for` columns to extract all values for all entries.  Removing these values has the added benefit of simplifying the next search for `cause_of_death`.

#### Extracting Category to `known_for_2` Column from `info_1`

In [20]:
# Initializing known_for_2 column
df["known_for_2"] = ""

<IPython.core.display.Javascript object>

In [21]:
%%time

# Column to check
column = 'info_1'

# Extract to column
extract_to = 'known_for_2'

# For loop to find role in column and extract it as category to extract_to column
for category, category_set in known_for_dict.items():
    for role in category_set:
        dataframe = df[(df[column].notna()) & (df['known_for_1']!= '') & (df[extract_to]=='')]
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, extract_to] = category
                    df.loc[index, column] = item.replace(role, '').strip()

# Checking number of values found and a sample of rows
print(f'There are {len(df[df[extract_to]!=""])} values in extract_to column.')
df[df[extract_to]!=''].sample(2)

There are 11 values in extract_to column.
CPU times: total: 2.69 s
Wall time: 2.68 s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,known_for_1,known_for_2
59485,24,Kristján Jóhannsson,"83, Icelandic Olympic athlete.",https://en.wikipedia.org/wiki/Kristj%C3%A1n_J%C3%B3hannsson_(athlete),2,2013,January,,e,,,,,,,,,,,83.0,,Iceland,,sports,sports
104434,12,Franz Eisl,", 98. Austrian Olympic sailor .",https://en.wikipedia.org/wiki/Franz_Eisl,1,2019,July,",",or,,,,,,,,,,,98.0,,Austria,,sports,sports


<IPython.core.display.Javascript object>

#### Extracting Category to `known_for_3` Column from `info_1`

In [22]:
# Initializing known_for_2 column
df["known_for_3"] = ""

<IPython.core.display.Javascript object>

In [23]:
%%time

# Column to check
column = 'info_1'

# Extract to column
extract_to = 'known_for_3'

# For loop to find role in column and extract it as category to extract_to column
for category, category_set in known_for_dict.items():
    for role in category_set:
        dataframe = df[(df[column].notna()) & (df['known_for_2']!= '') & (df[extract_to]=='')]
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, extract_to] = category
                    df.loc[index, column] = item.replace(role, '').strip()

# Checking number of values found and a sample of rows
print(f'There are {len(df[df[extract_to]!=""])} values in extract_to column.')

There are 0 values in extract_to column.
CPU times: total: 2.59 s
Wall time: 2.6 s


<IPython.core.display.Javascript object>

In [24]:
# Checking remaining unique values in info_1
df["info_1"].value_counts()

                        6
er                      3
ian                     2
or                      2
early                   2
automotive              1
basket coach            1
basket player           1
boxer                   1
mer                     1
er r                    1
e                       1
common chimpanzee       1
materials               1
Jr                      1
ional er                1
aka                     1
Jules Engel             1
s ist                   1
of                      1
social ist              1
man                     1
r                       1
gridiron foot player    1
Tree of the Year        1
Name: info_1, dtype: int64

<IPython.core.display.Javascript object>

#### Observations:
- We have extracted all of the `known_for` information present in `info_1`.
- It is time to proceed with extracting the same from the remaining `info_` columns.

#### Extracting Category to `known_for_1` Column from Remaining `info_` Columns

In [25]:
%%time

# Columns to check
cols_list = [
    'info_2',
    'info_3',
    'info_4',
    'info_5',
    'info_6',
    'info_7',
    'info_8',
    'info_9',
    'info_10',
    'info_11',
    'info_parenth'
]

# Extract to column
extract_to = 'known_for_1'

# For loop to find role in column and extract it as category to extract_to column
for column in cols_list:
    for category, category_set in known_for_dict.items():
        for role in category_set:
            dataframe = df[(df[column].notna()) & (df[extract_to]=='')]
            for index in dataframe.index:
                item = df.loc[index, column]
                if item:
                    if role in item:
                        df.loc[index, extract_to] = category
                        df.loc[index, column] = item.replace(role, '').strip()

# Checking number of values found and a sample of rows
print(f'There are {len(df[df[extract_to]!=""])} values in extract_to column.')
df[df[extract_to]!=''].sample(2)

There are 124187 values in extract_to column.
CPU times: total: 2min 55s
Wall time: 2min 55s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,known_for_1,known_for_2,known_for_3
26240,18,Cy Coleman,", 76, American composer of Broadway musicals, heart attack.",https://en.wikipedia.org/wiki/Cy_Coleman,20,2004,November,,,composer of Broadway als,heart attack,,,,,,,,,76.0,,United States of America,,arts,,
113169,5,Bob Reade,", 87, American college football player .",https://en.wikipedia.org/wiki/Bob_Reade,3,2020,July,"Cornell College coach Geneseo High School, Augustana College",,football player,,,,,,,,,,87.0,,United States of America,,academia_humanities,,


<IPython.core.display.Javascript object>

#### Extracting Category to `known_for_2` Column from Remaining `info_` Columns

In [26]:
%%time

# Columns to check
cols_list = [
    'info_2',
    'info_3',
    'info_4',
    'info_5',
    'info_6',
    'info_7',
    'info_8',
    'info_9',
    'info_10',
    'info_11',
    'info_parenth'
]

# Extract to column
extract_to = 'known_for_2'

# For loop to find role in column and extract it as category to extract_to column
for column in cols_list:
    for category, category_set in known_for_dict.items():
        for role in category_set:
            dataframe = df[(df[column].notna()) & (df['known_for_1']!= '') & (df[extract_to]=='')]
            for index in dataframe.index:
                item = df.loc[index, column]
                if item:
                    if role in item:
                        df.loc[index, extract_to] = category
                        df.loc[index, column] = item.replace(role, '').strip()

# Checking number of values found and a sample of rows
print(f'There are {len(df[df[extract_to]!=""])} values in extract_to column.')
df[df[extract_to]!=''].sample(2)

There are 51108 values in extract_to column.
CPU times: total: 5min 40s
Wall time: 5min 40s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,known_for_1,known_for_2,known_for_3
91307,25,Bobby Knutt,", 71, British actor and comedian , heart attack.",https://en.wikipedia.org/wiki/Bobby_Knutt,3,2017,September,", ,",,ian,heart attack,,,,,,,,,71.0,,United Kingdom of Great Britain and Northern Ireland,,arts,arts,
36345,27,Hal Stein,", 79, American jazz musician.",https://en.wikipedia.org/wiki/Hal_Stein,0,2008,April,,,ian,,,,,,,,,,79.0,,United States of America,,arts,arts,


<IPython.core.display.Javascript object>

#### Extracting Category to `known_for_3` Column from Remaining `info_` Columns

In [27]:
%%time

# Columns to check
cols_list = [
    'info_2',
    'info_3',
    'info_4',
    'info_5',
    'info_6',
    'info_7',
    'info_8',
    'info_9',
    'info_10',
    'info_11',
    'info_parenth'
]

# Extract to column
extract_to = 'known_for_3'

# For loop to find role in column and extract it as category to extract_to column
for column in cols_list:
    for category, category_set in known_for_dict.items():
        for role in category_set:
            dataframe = df[(df[column].notna()) & (df['known_for_1']!= '') & (df[extract_to]=='')]
            for index in dataframe.index:
                item = df.loc[index, column]
                if item:
                    if role in item:
                        df.loc[index, extract_to] = category
                        df.loc[index, column] = item.replace(role, '').strip()

# Checking number of values found and a sample of rows
print(f'There are {len(df[df[extract_to]!=""])} values in extract_to column.')
df[df[extract_to]!=''].sample(2)

There are 14418 values in extract_to column.
CPU times: total: 6min 53s
Wall time: 6min 54s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,known_for_1,known_for_2,known_for_3
117007,6,Klaus Ofczarek,", 81, Austrian actor and opera singer.",https://en.wikipedia.org/wiki/Klaus_Ofczarek,2,2020,December,",",,er,,,,,,,,,,81.0,,Austria,,arts,arts,arts
123793,7,Keshav Dutt,", 95, Indian field hockey player, Olympic champion .",https://en.wikipedia.org/wiki/Keshav_Dutt,18,2021,July,",",,player,champion,,,,,,,,,95.0,,India,,sports,sports,sports


<IPython.core.display.Javascript object>

#### Extracting Category to `known_for_4` Column from Remaining `info_` Columns

In [28]:
# Initializing known_for_4 column
df["known_for_4"] = ""

<IPython.core.display.Javascript object>

In [29]:
%%time

# Columns to check
cols_list = [
    'info_2',
    'info_3',
    'info_4',
    'info_5',
    'info_6',
    'info_7',
    'info_8',
    'info_9',
    'info_10',
    'info_11',
    'info_parenth'
]

# Extract to column
extract_to = 'known_for_4'

# For loop to find role in column and extract it as category to extract_to column
for column in cols_list:
    for category, category_set in known_for_dict.items():
        for role in category_set:
            dataframe = df[(df[column].notna()) & (df['known_for_1']!= '') & (df[extract_to]=='')]
            for index in dataframe.index:
                item = df.loc[index, column]
                if item:
                    if role in item:
                        df.loc[index, extract_to] = category
                        df.loc[index, column] = item.replace(role, '').strip()

# Checking number of values found and a sample of rows
print(f'There are {len(df[df[extract_to]!=""])} values in extract_to column.')
df[df[extract_to]!=''].sample(2)

There are 2859 values in extract_to column.
CPU times: total: 7min 30s
Wall time: 7min 30s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,known_for_1,known_for_2,known_for_3,known_for_4
60617,22,Vladimír Čech,", 61, Czech actor, television presenter and politician, colorectal cancer and pneumonia.",https://en.wikipedia.org/wiki/Vladim%C3%ADr_%C4%8Cech,6,2013,March,,,,ion er ian,colorectal cancer pneumonia,,,,,,,,61.0,,Czech Republic,,arts,politics_govt_law,arts,arts
83598,6,Joani Blank,", 79, American entrepreneur , Butterfly vibrator inventor, author and feminist sex educator.",https://en.wikipedia.org/wiki/Joani_Blank,14,2016,August,Good Vibrations,,eur,Butterfly vibrator,feminist sex or,,,,,,,,79.0,,United States of America,,business,sciences,academia_humanities,arts


<IPython.core.display.Javascript object>

#### Extracting Category to `known_for_5` Column from Remaining `info_` Columns

In [30]:
# Initializing known_for_5 column
df["known_for_5"] = ""

<IPython.core.display.Javascript object>

In [31]:
%%time

# Columns to check
cols_list = [
    'info_2',
    'info_3',
    'info_4',
    'info_5',
    'info_6',
    'info_7',
    'info_8',
    'info_9',
    'info_10',
    'info_11',
    'info_parenth'
]

# Extract to column
extract_to = 'known_for_5'

# For loop to find role in column and extract it as category to extract_to column
for column in cols_list:
    for category, category_set in known_for_dict.items():
        for role in category_set:
            dataframe = df[(df[column].notna()) & (df['known_for_1']!= '') & (df[extract_to]=='')]
            for index in dataframe.index:
                item = df.loc[index, column]
                if item:
                    if role in item:
                        df.loc[index, extract_to] = category
                        df.loc[index, column] = item.replace(role, '').strip()

# Checking number of values found and a sample of rows
print(f'There are {len(df[df[extract_to]!=""])} values in extract_to column.')
df[df[extract_to]!=''].sample(2)

There are 510 values in extract_to column.
CPU times: total: 7min 30s
Wall time: 7min 30s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,known_for_1,known_for_2,known_for_3,known_for_4,known_for_5
24034,31,Robert Guenette,", 68, American film/television producer, screenwriter, and film/television director, brain tumor.",https://en.wikipedia.org/wiki/Robert_Guenette,5,2003,October,,,ion r,screenr,and film ion director,brain tumor,,,,,,,68.0,,United States of America,,arts,arts,arts,arts,arts
119180,6,Afonso Fioreze,", 78, Brazilian Roman Catholic prelate, coadjutor bishop of Luziânia.",https://en.wikipedia.org/wiki/Afonso_Fioreze,2,2021,February,,,,coautor of Luziânia,,,,,,,,,78.0,,Brazil,Italy,spiritual,spiritual,spiritual,arts,spiritual


<IPython.core.display.Javascript object>

#### Extracting Category to `known_for_6` Column from Remaining `info_` Columns

In [32]:
# Initializing known_for_6 column
df["known_for_6"] = ""

<IPython.core.display.Javascript object>

In [33]:
%%time

# Columns to check
cols_list = [
    'info_2',
    'info_3',
    'info_4',
    'info_5',
    'info_6',
    'info_7',
    'info_8',
    'info_9',
    'info_10',
    'info_11',
    'info_parenth'
]  

# Extract to column
extract_to = 'known_for_6'

# For loop to find role in column and extract it as category to extract_to column
for column in cols_list:
    for category, category_set in known_for_dict.items():
        for role in category_set:
            dataframe = df[(df[column].notna()) & (df['known_for_1']!= '') & (df[extract_to]=='')]
            for index in dataframe.index:
                item = df.loc[index, column]
                if item:
                    if role in item:
                        df.loc[index, extract_to] = category
                        df.loc[index, column] = item.replace(role, '').strip()

# Checking number of values found and a sample of rows
print(f'There are {len(df[df[extract_to]!=""])} values in extract_to column.')
df[df[extract_to]!=''].sample(2)

There are 67 values in extract_to column.
CPU times: total: 7min 21s
Wall time: 7min 21s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,known_for_1,known_for_2,known_for_3,known_for_4,known_for_5,known_for_6
20303,26,Hugh Davis Graham,", 65, American historian, sociologist, civil rights scholar and author.",https://en.wikipedia.org/wiki/Hugh_Davis_Graham,2,2002,March,,,ian,ist,ar,,,,,,,,65.0,,United States of America,,academia_humanities,sciences,academia_humanities,politics_govt_law,politics_govt_law,arts
131693,23,Johnnie Jones,", 102, American soldier, civil rights lawyer and politician, member of the Louisiana House of Representatives .",https://en.wikipedia.org/wiki/Johnnie_Jones_(lawyer),22,2022,April,,,,ian,member of the House of Reatives,,,,,,,,102.0,,United States of America,,law_enf_military_operator,politics_govt_law,politics_govt_law,politics_govt_law,politics_govt_law,arts


<IPython.core.display.Javascript object>

#### Extracting Category to `known_for_7` Column from Remaining `info_` Columns

In [34]:
# Initializing known_for_7 column
df["known_for_7"] = ""

<IPython.core.display.Javascript object>

In [35]:
%%time

# Columns to check
cols_list = [
    'info_2',
    'info_3',
    'info_4',
    'info_5',
    'info_6',
    'info_7',
    'info_8',
    'info_9',
    'info_10',
    'info_11',
    'info_parenth'
]

# Extract to column
extract_to = 'known_for_7'

# For loop to find role in column and extract it as category to extract_to column
for column in cols_list:
    for category, category_set in known_for_dict.items():
        for role in category_set:
            dataframe = df[(df[column].notna()) & (df['known_for_1']!= '') & (df[extract_to]=='')]
            for index in dataframe.index:
                item = df.loc[index, column]
                if item:
                    if role in item:
                        df.loc[index, extract_to] = category
                        df.loc[index, column] = item.replace(role, '').strip()

# Checking number of values found and a sample of rows
print(f'There are {len(df[df[extract_to]!=""])} values in extract_to column.')
df[df[extract_to]!=''].sample(2)

There are 9 values in extract_to column.
CPU times: total: 7min 21s
Wall time: 7min 22s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,known_for_1,known_for_2,known_for_3,known_for_4,known_for_5,known_for_6,known_for_7
45878,23,Feodosiy Petsyna,", 60, Ukrainian Orthodox prelate, archbishop of Drohobych and Sambir in Ukrainian Orthodox Church of the Kyivan Patriarchate , diabetes mellitus.",https://en.wikipedia.org/wiki/Feodosiy_Petsyna,0,2010,July,Autocephalous since,,,arch of Drohobych Sambir in of the Kyivan Patriarchate,diabetes mellitus,,,,,,,,60.0,,Ukraine,,spiritual,spiritual,spiritual,spiritual,spiritual,spiritual,spiritual
21233,16,Edith Addams,", 95, Belgian-American Olympic fencer , costume designer for theater, ballet and film and a theatrical producer.",https://en.wikipedia.org/wiki/Edith_Addams,2,2002,August,,,er,costume er for,et a theatrical r,,,,,,,,95.0,,Belgium,United States of America,sports,sports,arts,arts,sports,arts,arts


<IPython.core.display.Javascript object>

#### Extracting Category to `known_for_8` Column from Remaining `info_` Columns

In [36]:
# Initializing known_for_8 column
df["known_for_8"] = ""

<IPython.core.display.Javascript object>

In [37]:
%%time

# Columns to check
cols_list = [
    'info_2',
    'info_3',
    'info_4',
    'info_5',
    'info_6',
    'info_7',
    'info_8',
    'info_9',
    'info_10',
    'info_11',
    'info_parenth'
]

# Extract to column
extract_to = 'known_for_8'

# For loop to find role in column and extract it as category to extract_to column
for column in cols_list:
    for category, category_set in known_for_dict.items():
        for role in category_set:
            dataframe = df[(df[column].notna()) & (df['known_for_1']!= '') & (df[extract_to]=='')]
            for index in dataframe.index:
                item = df.loc[index, column]
                if item:
                    if role in item:
                        df.loc[index, extract_to] = category
                        df.loc[index, column] = item.replace(role, '').strip()

# Checking number of values found and a sample of rows
print(f'There are {len(df[df[extract_to]!=""])} values in extract_to column.')
df[df[extract_to]!='']

There are 1 values in extract_to column.
CPU times: total: 7min 19s
Wall time: 7min 19s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,known_for_1,known_for_2,known_for_3,known_for_4,known_for_5,known_for_6,known_for_7,known_for_8
24461,4,Jeff Nuttall,", 70, English poet, publisher, actor, painter, sculptor, jazz trumpeter, social commentator and author.",https://en.wikipedia.org/wiki/Jeff_Nuttall,4,2004,January,,,,er,,er,tor,er,social commentator,,,,70.0,,United Kingdom of Great Britain and Northern Ireland,,arts,arts,arts,arts,arts,arts,arts,arts


<IPython.core.display.Javascript object>

#### Extracting Category to `known_for_9` Column from Remaining `info_` Columns

In [38]:
# Initializing known_for_9 column
df["known_for_9"] = ""

<IPython.core.display.Javascript object>

In [39]:
%%time

# Columns to check
cols_list = [
    'info_2',
    'info_3',
    'info_4',
    'info_5',
    'info_6',
    'info_7',
    'info_8',
    'info_9',
    'info_10',
    'info_11',
    'info_parenth'
]

# Extract to column
extract_to = 'known_for_9'

# For loop to find role in column and extract it as category to extract_to column
for column in cols_list:
    for category, category_set in known_for_dict.items():
        for role in category_set:
            dataframe = df[(df[column].notna()) & (df['known_for_1']!= '') & (df[extract_to]=='')]
            for index in dataframe.index:
                item = df.loc[index, column]
                if item:
                    if role in item:
                        df.loc[index, extract_to] = category
                        df.loc[index, column] = item.replace(role, '').strip()

# Checking number of values found and a sample of rows
print(f'There are {len(df[df[extract_to]!=""])} values in extract_to column.')
df[df[extract_to]!='']

There are 0 values in extract_to column.
CPU times: total: 7min 24s
Wall time: 7min 24s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,known_for_1,known_for_2,known_for_3,known_for_4,known_for_5,known_for_6,known_for_7,known_for_8,known_for_9


<IPython.core.display.Javascript object>

In [40]:
# Checking the number of missing values in known_for_1
print(
    f'There are {len(df[df["known_for_1"] == ""])} missing values in known_for_1 column.'
)

There are 8465 missing values in known_for_1 column.


<IPython.core.display.Javascript object>

#### Observations:
- We have exhuasted our search using the current version of `known_for_dict` and have under 10,000 remaining missing values in `known_for_1`.
- Let us examine some of the remaining unique values in `info_2` and update our lists and dictionary.

#### Checking Remaining `info_2` Values

In [42]:
# Obtaining values for column and their counts
col_values = df[df["known_for_1"] == ""]["info_2"].value_counts()

# Creating a list for values that occur more than set number of time
roles_list = [index for index in col_values.index if col_values[index] > 2]

# Checking length of list
print(f"We will examine the top {len(roles_list)} unique values in info_2.")

We will examine the top 476 unique values in info_2.


<IPython.core.display.Javascript object>

In [43]:
# # Using pop to check list items and add to associated dictionary below
# roles_list.pop()

<IPython.core.display.Javascript object>

#### Updating Category Lists for `known_for_dict`

In [44]:
# Appending category lists
sciences = [
    "volcanol",
    "gerontol",
    "pollster",
    "genealog",
    "software",
    "video game developer",
    "anaesthet",
    "geomorph",
    "carcinol",
    "weather",
    "aerodynamic",
    "limnolog",
    "control theorist",
    "patholog",
    "optometr",
    "neurolog",
    "endocrin",
    "anesthes",
    "obstetric",
    "zookeep",
    "game developer",
    "forest",
    "embryolog",
    "urolog",
    "arachnolog",
    "lichenolog",
    "anatom",
    "mineral",
    "enterolog",
    "sexolog",
    "bacteriolog",
    "gynecolog",
    "horticultur",
    "seismolog",
    "parasitolog",
    "physiolog",
    "primatolog",
    "hydrolog",
    "indolog",
    "etholog",
    "herbalis",
    "econometric",
    "cryptograph",
    "toxicolog",
    "haematolog",
    "hematolog",
    "ecolog",
    "ufolog",
    "crystallograph",
    "gynaecolog",
    "climatolog",
    "glaciolog",
    "demograph",
    "dentist",
    "archeolog",
    "ichthyolog",
    "nephrolog",
    "dermatolog",
    "veterinar",
    "physiolog",
    "horticult",
    "cancer research",
    "urban plan",
    "nutrition",
    "pharmac",
    "oncolog",
    "metallurg",
    "herpetolog",
    "ophthalmolog",
    "palaeontolog",
    "oceanograph",
    "agronom",
    "paediatric",
    "mycolog",
    "naturalis",
    "criminolog",
    "epidemiolog",
    "psychotherap",
    "neurolog",
    "paleontolog",
    "virolog",
    "psychoanalys",
    "wildlife",
    "biotech",
    "technolog",
]
politics_govt_law = [
    "justice",
    "anarch",
    "secretary",
    "partisan",
    "resist",
    "Resist",
    "policy",
    "chieftain",
    "communist",
    "Trotsky",
    "herald",
    "campaign",
    "prince",
    "insurgen",
    "detainee",
    "revolution",
    "elder",
    "Governor",
    "govern",
    "Vice",
    "peer",
    "landowner",
    "union",
    "sultan",
    "Sultan",
    "Senat",
    "represent",
    "Represent",
    "loyal",
    "Supreme Court",
    "Justice",
    "Chief Justice",
    "Conservative",
    "conservative",
    "Liberal",
    "liberal",
    "MP",
    "parliament" "pariliament",
    "Parliament",
    "colonial",
    "mayor",
    "Mayor",
    "ruler",
    "republican",
    "Republican",
    "Democrat",
    "democrat",
    "bureaucrat",
    "conspiracy theorist",
    "jihadist",
    "whistleblower",
    "prime minister",
    "count",
    "District",
    "Judge",
    "foreign minister",
    "Foreign Minister",
    "peeress",
    "legislator",
    "first lady",
    "First Lady",
    "courtier",
    "senior",
    "monarch",
    "statesman",
    "lobbyist",
    "solicitor",
    "senator",
    "representative",
    "nationalist",
    "protester",
    "noble",
    "prosecutor",
    "magistrate",
    "public official",
    "feminist",
    "dissident",
    "candidate",
    "congress",
    "administrator",
    "president",
    "Politician",
    "emir",
    "Emir",
]
law_enf_military_operator = [
    "veteran",
    "Veteran" "forester",
    "Navajo code talker",
    "security",
    "fighter",
    "paramilitary",
    "guerrilla",
    "rebel",
    "fighter ace",
    "flying ace",
    "firefighter",
    "Medal of Honor",
    "secret agent",
    "codebreaker",
    "Special Operations",
    "warlord",
    "Victoria Cross",
    "mercenary",
    "World War II",
    "colonel",
    "Marine",
    "Secret Service",
    "commander",
    "Air Chief",
    "Marshal",
    "marshal",
    "aviation",
    "airman",
    "spy",
]
sports = [
    "sport",
    "jumper",
    "athletic",
    "shot putter",
    "Olympian",
    "fencing",
    "bandy",
    "Banty",
    "rodeo",
    "rowing",
    "lacrosse",
    "yoga",
    "futsal",
    "heavyweight",
    "Heavyweight",
    "balloonist",
    "racewalker",
    "hurling",
    "biker",
    "scuba",
    "master of the horse",
    "shogi",
    "Football",
    "softball",
    "free diver",
    "greyhound trainer",
    "goalkeeper",
    "mountain",
    "boxing",
    "hunter",
    "angler",
    "aikidoka",
    "aikido",
    "cave diver",
    "alpinist",
    "powerlifter",
    "karate",
    "rowing",
    "coxswain",
    "skater",
    "skating",
    "Go player",
    "orienteer",
    "ten pin",
    "karateka",
    "wrestling",
    "announcer",
    "golf",
    "netball",
    "poker",
    "slalom",
    "canoe",
    "pool player",
    "NFL",
    "CFL",
    "CFL",
    "bowl",
    "pole vault",
    "strongman",
    "yachtsman",
    "snowboard",
    "skateboard",
    "archer",
    "swim",
    "squash",
    "climber",
    "climb",
    "shot put",
    "luger",
    "walker",
    "walk",
    "adventurer",
    "diver",
    "surfer",
    "surf",
    "explorer",
    "explore" "bullfighter",
    "sprint",
    "pitcher",
    "snooker",
    "rejoneador",
    "matador",
]

academia_humanities = [
    "Esperantist",
    "phonetician",
    "vexillologist",
    "Byzantinist",
    "logician",
    "Turkologist",
    "bioethicist",
    "Mayanist",
    "Hellenist",
    "crossword compiler",
    "cruciverbalist",
    "Hispanist",
    "Arabist",
    "semiotician",
    "Assyriologist",
    "literary theorist",
    "schoolmaster",
    "schoolteacher",
    "intellectual",
    "organizational theorist",
    "information theorist",
    "orientalist",
    "medievalist",
    "classicist",
    "archivist",
    "museum",
    "numismatist",
    "ethnologist",
    "lexicographer",
    "folklorist",
    "philatelist",
    "sinologist",
    "teacher",
    "Egyptologist",
    "Japanologist",
    "Iranologist",
    "Indologist",
]
business = [
    "retailer",
    "grocer",
    "auctioneer",
    "baker",
    "car dealer",
    "clothier",
    "food manufacturer",
    "manufacturer",
    "real estate",
    "shipowner",
    "company director",
    "distiller",
    "financial",
    "finance",
    "media owner",
    "printer",
    "management consultant",
    "investment manager",
    "vintner",
    "brewer",
    "jeweller",
    "magnate",
    "nightclub owner",
    "bookseller",
    "billionaire",
    "stockbroker",
    "farmer",
    "hotel",
    "accountant",
    "property developer",
    "investor",
    "financier",
    "winemaker",
]
crime = [
    "murder suspect",
    "suspect",
    "concentration camp guard",
    "drug dealer",
    "drug lord",
    "convict",
    "drug trafficker",
    "spree killer",
    "gangster",
    "mafia",
    "mob",
    "sex offender",
]
spiritual = [
    "Presbyterian",
    "spiritual",
    "Zen",
    "Buddhist",
    "monk",
    "ayatollah",
    "Ayatollah",
    "psychic",
    "yogi",
    "Marja",
    "Trappist",
    "Christian",
    "missionary",
    "Benedictine",
    "nun",
    "faith",
    "healer",
    "Methodist",
    "archdeacon",
    "Baptist",
    "cleric",
    "televangelist",
    "clergy",
    "astrolog",
    "evangelist",
    "minister",
    "pastor",
    "lama",
    "imam",
]
arts = [
    "milliner",
    "memoirist",
    "columnist",
    "bluegrass",
    "fiddler",
    "perfumer",
    "performer",
    "acting",
    "organ builder",
    "art patron",
    "TV",
    "reporter",
    "Pulitzer Prize",
    "script",
    "santoor",
    "mandolin",
    "oenologist",
    "radio",
    "host",
    "horn player",
    "cameraman",
    "tuba",
    "surfboard shaper",
    "impresario",
    "weaver",
    "oud player",
    "blues",
    "reporter",
    "animal trainer",
    "harmonica",
    "guitar",
    "movie",
    "woodworker",
    "R&B",
    "antique",
    "craftsman",
    "double bass",
    "keyboard",
    "drag queen",
    "trumpet",
    "hairstylist",
    "etiquette",
    "accordion",
    "radio",
    "mural",
    "Calypso",
    "calypso",
    "bassoon",
    "animation",
    "correspondent",
    "taekwondo",
    "potter",
    "studio",
    "illusionist",
    "magici",
    "circus",
    "documentar",
    "YouTube",
    "satirist",
    "beauty pageant",
    "baritone",
    "impressionist",
    "performer",
    "stunt",
    "hairdresser",
    "theatre",
    "announcer",
    "flutist",
    "flute",
    "clown",
    "harp",
    "bass player",
    "blog",
    "vlog",
    "show",
    "ventriloquist",
    "typographer",
    "calligrapher",
    "band manager",
    "tabla",
    "storyteller",
    "arranger",
    "news",
    "curator",
    "violist",
    "printmaker",
    "oboist",
    "sound",
    "beauty queen",
    "literary agent",
    "contralto",
    "ceramicist",
    "vocal",
    "ceramist",
    "banjo",
    "publicist",
    "flautist",
    "harpsichord",
    "decorator",
    "talent",
    "accordionist",
    "casting",
    "stage director",
    "theater",
    "humorist",
    "essayist",
    "biographer",
    "art collector",
    "puppeteer",
    "art dealer",
    "drama",
    "art director",
    "entertainer",
    "percussion",
    "clarinet",
    "director",
    "stage",
    "bandoneon",
    "choir",
    "Choir",
    "porn",
    "sarod",
    "instrument",
    "saxophon",
    "dialect coach",
]
social = [
    "heir",
    "volunteer",
    "public figure",
    "humanitarian",
    "social worker",
]
event_record_other = [
    "homeless",
    "student",
    "teenager",
    "fan of",
    "worker",
    "child",
    "boy",
    "girl",
    "employee",
    "longevity claimant",
    "record holder",
    "heaviest",
    "tallest",
    "shortest",
    "oldest",
    "youngest",
    "last",
    "first",
    "centenarian",
    "heaviest",
    "smallest",
    "muse",
]
other_species = other_species + [
    "elephant",
    "Great Dane",
    "greyhound",
    "thoroughbred",
]

<IPython.core.display.Javascript object>

#### Updating `known_for_dict` Dictionary of Category Keys and Specific Role Sets of Values

In [45]:
# Combining separate lists as sets into one dictionary
known_for_dict = {
    "event_record_other": set(event_record_other),
    "crime": set(crime),
    "social": set(social),
    "academia_humanities": set(academia_humanities),
    "business": set(business),
    "sciences": set(sciences),
    "sports": set(sports),
    "law_enf_military_operator": set(law_enf_military_operator),
    "politics_govt_law": set(politics_govt_law),
    "arts": set(arts),
    "spiritual": set(spiritual),
}

<IPython.core.display.Javascript object>

#### Observations:
- Now we will repeat extracting `known_for` values from the remaining `info_` columns (`info_1` excluded) with the updated dictionary.

#### Extracting Category to `known_for_1` with Updated `known_for_dict`

In [46]:
%%time

# Columns to check
cols_list = [
    'info_2',
    'info_3',
    'info_4',
    'info_5',
    'info_6',
    'info_7',
    'info_8',
    'info_9',
    'info_10',
    'info_11',
    'info_parenth'
]

# Extract to column
extract_to = 'known_for_1'

# For loop to find role in column and extract it as category to extract_to column
for column in cols_list:
    for category, category_set in known_for_dict.items():
        for role in category_set:
            dataframe = df[(df[column].notna()) & (df['known_for_1']!= '') & (df[extract_to]=='')]
            for index in dataframe.index:
                item = df.loc[index, column]
                if item:
                    if role in item:
                        df.loc[index, extract_to] = category
                        df.loc[index, column] = item.replace(role, '').strip()

# Checking number of values found and a sample of rows
print(f'There are {len(df[df[extract_to]!=""])} values in extract_to column.')
df[df[extract_to]!=''].sample(2)

There are 124187 values in extract_to column.
CPU times: total: 57 s
Wall time: 57 s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,known_for_1,known_for_2,known_for_3,known_for_4,known_for_5,known_for_6,known_for_7,known_for_8,known_for_9
76375,6,Max Annett,", 84, Australian Olympic rower.",https://en.wikipedia.org/wiki/Max_Annett,1,2015,July,,,er,,,,,,,,,,84.0,,Australia,,sports,sports,,,,,,,
112627,14,Aarón Padilla Gutiérrez,", 77, Mexican footballer , COVID-19.",https://en.wikipedia.org/wiki/Aar%C3%B3n_Padilla_Guti%C3%A9rrez,1,2020,June,"Pumas, Atlante, national team",,footer,COVID,,,,,,,,,77.0,,Mexico,,sports,,,,,,,,


<IPython.core.display.Javascript object>

#### Extracting Category to `known_for_2` with Updated `known_for_dict`

In [47]:
%%time

# Columns to check
cols_list = [
    'info_2',
    'info_3',
    'info_4',
    'info_5',
    'info_6',
    'info_7',
    'info_8',
    'info_9',
    'info_10',
    'info_11',
    'info_parenth'
]

# Extract to column
extract_to = 'known_for_2'

# For loop to find role in column and extract it as category to extract_to column
for column in cols_list:
    for category, category_set in known_for_dict.items():
        for role in category_set:
            dataframe = df[(df[column].notna()) & (df['known_for_2']!= '') & (df[extract_to]=='')]
            for index in dataframe.index:
                item = df.loc[index, column]
                if item:
                    if role in item:
                        df.loc[index, extract_to] = category
                        df.loc[index, column] = item.replace(role, '').strip()

# Checking number of values found and a sample of rows
print(f'There are {len(df[df[extract_to]!=""])} values in extract_to column.')
df[df[extract_to]!=''].sample(2)

There are 51108 values in extract_to column.
CPU times: total: 57.8 s
Wall time: 57.8 s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,known_for_1,known_for_2,known_for_3,known_for_4,known_for_5,known_for_6,known_for_7,known_for_8,known_for_9
26105,26,Fred Paine,", 78, American professional basketball player .",https://en.wikipedia.org/wiki/Fred_Paine,0,2004,October,Providence Steamrollers,,ional basket player,,,,,,,,,,78.0,,United States of America,,academia_humanities,sports,,,,,,,
51729,3,Tamás Eszes,", 47, Hungarian politician and paramilitary leader, suicide.",https://en.wikipedia.org/wiki/Tam%C3%A1s_Eszes_(politician),7,2011,November,,,ian paraary,suicide,,,,,,,,,47.0,,Hungary,,law_enf_military_operator,politics_govt_law,politics_govt_law,,,,,,


<IPython.core.display.Javascript object>

#### Extracting Category to `known_for_3` with Updated `known_for_dict`

In [48]:
%%time

# Columns to check
cols_list = [
    'info_2',
    'info_3',
    'info_4',
    'info_5',
    'info_6',
    'info_7',
    'info_8',
    'info_9',
    'info_10',
    'info_11',
    'info_parenth'
]

# Extract to column
extract_to = 'known_for_3'

# For loop to find role in column and extract it as category to extract_to column
for column in cols_list:
    for category, category_set in known_for_dict.items():
        for role in category_set:
            dataframe = df[(df[column].notna()) & (df['known_for_3']!= '') & (df[extract_to]=='')]
            for index in dataframe.index:
                item = df.loc[index, column]
                if item:
                    if role in item:
                        df.loc[index, extract_to] = category
                        df.loc[index, column] = item.replace(role, '').strip()

# Checking number of values found and a sample of rows
print(f'There are {len(df[df[extract_to]!=""])} values in extract_to column.')
df[df[extract_to]!=''].sample(2)

There are 14418 values in extract_to column.
CPU times: total: 53.7 s
Wall time: 53.7 s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,known_for_1,known_for_2,known_for_3,known_for_4,known_for_5,known_for_6,known_for_7,known_for_8,known_for_9
14448,28,Howard Browne,", 91, American science fiction editor and mystery writer.",https://en.wikipedia.org/wiki/Howard_Browne,2,1999,October,,,fiction or mystery r,,,,,,,,,,91.0,,United States of America,,sciences,arts,arts,,,,,,
36152,7,Phil Urso,", 82, American jazz tenor saxophonist and composer.",https://en.wikipedia.org/wiki/Phil_Urso,1,2008,April,,,ist r,,,,,,,,,,82.0,,United States of America,,arts,arts,arts,arts,,,,,


<IPython.core.display.Javascript object>

#### Extracting Category to `known_for_4` with Updated `known_for_dict`

In [49]:
%%time

# Columns to check
cols_list = [
    'info_2',
    'info_3',
    'info_4',
    'info_5',
    'info_6',
    'info_7',
    'info_8',
    'info_9',
    'info_10',
    'info_11',
    'info_parenth'
]

# Extract to column
extract_to = 'known_for_4'

# For loop to find role in column and extract it as category to extract_to column
for column in cols_list:
    for category, category_set in known_for_dict.items():
        for role in category_set:
            dataframe = df[(df[column].notna()) & (df['known_for_4']!= '') & (df[extract_to]=='')]
            for index in dataframe.index:
                item = df.loc[index, column]
                if item:
                    if role in item:
                        df.loc[index, extract_to] = category
                        df.loc[index, column] = item.replace(role, '').strip()

# Checking number of values found and a sample of rows
print(f'There are {len(df[df[extract_to]!=""])} values in extract_to column.')
df[df[extract_to]!=''].sample(2)

There are 2859 values in extract_to column.
CPU times: total: 52.6 s
Wall time: 52.6 s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,known_for_1,known_for_2,known_for_3,known_for_4,known_for_5,known_for_6,known_for_7,known_for_8,known_for_9
119374,11,Rubén Alfonso Ramírez,", 84, Guatemalan television presenter and politician, minister of education .",https://en.wikipedia.org/wiki/Rub%C3%A9n_Alfonso_Ram%C3%ADrez,1,2021,February,,,ion er ian,minister of ion,,,,,,,,,84.0,,Guatemala,,politics_govt_law,arts,arts,academia_humanities,,,,,
46848,11,Georges Rutaganda,", 51, Rwandan Hutu paramilitary leader, convicted war criminal, after long illness.",https://en.wikipedia.org/wiki/Georges_Rutaganda,6,2010,October,,,Hutu paraary,war,after long illness,,,,,,,,51.0,,Rwanda,,law_enf_military_operator,politics_govt_law,crime,crime,,,,,


<IPython.core.display.Javascript object>

#### Extracting Category to `known_for_5` with Updated `known_for_dict`

In [50]:
%%time

# Columns to check
cols_list = [
    'info_2',
    'info_3',
    'info_4',
    'info_5',
    'info_6',
    'info_7',
    'info_8',
    'info_9',
    'info_10',
    'info_11',
    'info_parenth'
]

# Extract to column
extract_to = 'known_for_5'

# For loop to find role in column and extract it as category to extract_to column
for column in cols_list:
    for category, category_set in known_for_dict.items():
        for role in category_set:
            dataframe = df[(df[column].notna()) & (df['known_for_5']!= '') & (df[extract_to]=='')]
            for index in dataframe.index:
                item = df.loc[index, column]
                if item:
                    if role in item:
                        df.loc[index, extract_to] = category
                        df.loc[index, column] = item.replace(role, '').strip()

# Checking number of values found and a sample of rows
print(f'There are {len(df[df[extract_to]!=""])} values in extract_to column.')
df[df[extract_to]!=''].sample(2)

There are 510 values in extract_to column.
CPU times: total: 52.6 s
Wall time: 52.6 s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,known_for_1,known_for_2,known_for_3,known_for_4,known_for_5,known_for_6,known_for_7,known_for_8,known_for_9
21880,25,Ed Bliss,", 90, American broadcast journalist, news editor and educator .",https://en.wikipedia.org/wiki/Ed_Bliss,15,2002,November,"Edward R Mur, Walter Cronkite",,ist,news or or,,,,,,,,,90.0,,United States of America,,arts,arts,academia_humanities,arts,sports,,,,
60989,11,Don Blackman,", 59, American jazz-funk pianist, singer and songwriter, cancer.",https://en.wikipedia.org/wiki/Don_Blackman,3,2013,April,,,funk ist,er r,cancer,,,,,,,,59.0,,United States of America,,arts,arts,arts,arts,arts,,,,


<IPython.core.display.Javascript object>

#### Extracting Category to `known_for_6` with Updated `known_for_dict`

In [51]:
%%time

# Columns to check
cols_list = [
    'info_2',
    'info_3',
    'info_4',
    'info_5',
    'info_6',
    'info_7',
    'info_8',
    'info_9',
    'info_10',
    'info_11',
    'info_parenth'
]

# Extract to column
extract_to = 'known_for_6'

# For loop to find role in column and extract it as category to extract_to column
for column in cols_list:
    for category, category_set in known_for_dict.items():
        for role in category_set:
            dataframe = df[(df[column].notna()) & (df['known_for_6']!= '') & (df[extract_to]=='')]
            for index in dataframe.index:
                item = df.loc[index, column]
                if item:
                    if role in item:
                        df.loc[index, extract_to] = category
                        df.loc[index, column] = item.replace(role, '').strip()

# Checking number of values found and a sample of rows
print(f'There are {len(df[df[extract_to]!=""])} values in extract_to column.')
df[df[extract_to]!=''].sample(2)

There are 67 values in extract_to column.
CPU times: total: 54.3 s
Wall time: 54.4 s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,known_for_1,known_for_2,known_for_3,known_for_4,known_for_5,known_for_6,known_for_7,known_for_8,known_for_9
86887,31,John Wetton,", 67, British singer-songwriter , colorectal cancer.",https://en.wikipedia.org/wiki/John_Wetton,25,2017,January,"""Only Time Will Tell"", ""Heat of the Moment"" ist Asia, Crimson",,er r,colorectal cancer,,,,,,,,,67.0,,United Kingdom of Great Britain and Northern Ireland,,arts,arts,arts,politics_govt_law,arts,arts,,,
33094,15,Yolanda King,", 51, American activist and actress, daughter and first-born child of civil rights leader Martin Luther King Jr.",https://en.wikipedia.org/wiki/Yolanda_King,131,2007,May,,,t,daughter first child of Martin Luther Jr,,,,,,,,,51.0,,United States of America,,politics_govt_law,arts,politics_govt_law,politics_govt_law,politics_govt_law,politics_govt_law,,,


<IPython.core.display.Javascript object>

#### Extracting Category to `known_for_7` with Updated `known_for_dict`

In [52]:
%%time

# Columns to check
cols_list = [
    'info_2',
    'info_3',
    'info_4',
    'info_5',
    'info_6',
    'info_7',
    'info_8',
    'info_9',
    'info_10',
    'info_11',
    'info_parenth'
]

# Extract to column
extract_to = 'known_for_7'

# For loop to find role in column and extract it as category to extract_to column
for column in cols_list:
    for category, category_set in known_for_dict.items():
        for role in category_set:
            dataframe = df[(df[column].notna()) & (df['known_for_7']!= '') & (df[extract_to]=='')]
            for index in dataframe.index:
                item = df.loc[index, column]
                if item:
                    if role in item:
                        df.loc[index, extract_to] = category
                        df.loc[index, column] = item.replace(role, '').strip()

# Checking number of values found and a sample of rows
print(f'There are {len(df[df[extract_to]!=""])} values in extract_to column.')
df[df[extract_to]!=''].sample(2)

There are 9 values in extract_to column.
CPU times: total: 52.4 s
Wall time: 52.4 s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,known_for_1,known_for_2,known_for_3,known_for_4,known_for_5,known_for_6,known_for_7,known_for_8,known_for_9
21679,27,Baby Lloyd Stallworth,", 61, American singer, dancer, songwriter, musician, choreographer, and recording artist, complications of diabetes.",https://en.wikipedia.org/wiki/Baby_Lloyd_Stallworth,24,2002,October,,,er,r,r,ian,er,and recording,complications of diabetes,,,,61.0,,United States of America,,arts,arts,arts,arts,arts,arts,arts,,
74489,24,Louis Renner,", 88, American Jesuit priest, historian and academic , specialist in Catholic history in Alaska.",https://en.wikipedia.org/wiki/Louis_Renner,4,2015,March,University of Alaska Fairs,,,ian c,specialist in y in Alaska,,,,,,,,88.0,,United States of America,,spiritual,spiritual,academia_humanities,academia_humanities,spiritual,academia_humanities,business,,


<IPython.core.display.Javascript object>

#### Extracting Category to `known_for_8` with Updated `known_for_dict`

In [53]:
%%time

# Columns to check
cols_list = [
    'info_2',
    'info_3',
    'info_4',
    'info_5',
    'info_6',
    'info_7',
    'info_8',
    'info_9',
    'info_10',
    'info_11',
    'info_parenth'
]

# Extract to column
extract_to = 'known_for_8'

# For loop to find role in column and extract it as category to extract_to column
for column in cols_list:
    for category, category_set in known_for_dict.items():
        for role in category_set:
            dataframe = df[(df[column].notna()) & (df['known_for_8']!= '') & (df[extract_to]=='')]
            for index in dataframe.index:
                item = df.loc[index, column]
                if item:
                    if role in item:
                        df.loc[index, extract_to] = category
                        df.loc[index, column] = item.replace(role, '').strip()

# Checking number of values found and a sample of rows
print(f'There are {len(df[df[extract_to]!=""])} values in extract_to column.')
df[df[extract_to]!=''].sample(2)

There are 1 values in extract_to column.


ValueError: Cannot take a larger sample than population when 'replace=False'

<IPython.core.display.Javascript object>

#### Extracting Category to `known_for_9` with Updated `known_for_dict`

In [54]:
%%time

# Columns to check
cols_list = [
    'info_2',
    'info_3',
    'info_4',
    'info_5',
    'info_6',
    'info_7',
    'info_8',
    'info_9',
    'info_10',
    'info_11',
    'info_parenth'
]

# Extract to column
extract_to = 'known_for_9'

# For loop to find role in column and extract it as category to extract_to column
for column in cols_list:
    for category, category_set in known_for_dict.items():
        for role in category_set:
            dataframe = df[(df[column].notna()) & (df['known_for_9']!= '') & (df[extract_to]=='')]
            for index in dataframe.index:
                item = df.loc[index, column]
                if item:
                    if role in item:
                        df.loc[index, extract_to] = category
                        df.loc[index, column] = item.replace(role, '').strip()

# Checking number of values found and a sample of rows
print(f'There are {len(df[df[extract_to]!=""])} values in extract_to column.')
df[df[extract_to]!=''].sample(2)

There are 0 values in extract_to column.


ValueError: a must be greater than 0 unless no samples are taken

<IPython.core.display.Javascript object>

#### Checking Remaining Missing Values in `known_for_1`

In [55]:
# Checking remaining missing values in known_for_1
print(
    f'There are {len(df[df["known_for_1"] == ""])} remaining missing values in column.'
)

There are 8465 remaining missing values in column.


<IPython.core.display.Javascript object>

#### Observations:
- Before we look to hard-coding more values into `known_for_dict`, let us proceed to search the other columns for missing values.

#### Extracting `known_for_1` from Other `info_` Columns

In [None]:
# List of columns to check
cols_list = [
    "info_3",
    "info_4",
    "info_5",
    "info_6",
    "info_7",
    "info_8",
    "info_9",
    "info_10",
    "info_11",
    "info_parenth",
]

# Column to extract to
extract_to = "known_for_1"

# For loop to extract value to extract_to column
for column in cols_list:
    for category, category_set in known_for_dict.items():
        for role in category_set:
            dataframe = df[(df[extract_to] == "") & (df[column].notna())]
            for index in dataframe.index:
                item = df.loc[index, column]
                if item:
                    if role in item:
                        df.loc[index, extract_to] = category
                        df.loc[index, column] = item.replace(role, "").strip()

# Checking number of values found and a sample of rows
print(f'There are {len(df[df[extract_to]!=""])} values in extract_to column.')

#### Checking Remaining Missing Values in `known_for_1`

In [None]:
# Checking remaining missing values in known_for_1
print(
    f'There are {len(df[df["known_for_1"] == ""])} remaining missing values in column.'
)

#### Observations:
- That iteration found ~ 500 missing values.
- Next we will do some hard-coding again.

#### Updating `known_for_dict`

In [None]:
# List of remaining values in info_2
list_to_check = df[df["known_for_1"] == ""]["info_2"].value_counts().index.to_list()

In [None]:
# Using pop to update known_for_dict
list_to_check.pop()

In [None]:
arts = [
    "yidaki",
    "special effects",
    "oudist",
    "ceramics",
    "Playmate",
    '"snake king"',
    "magazine",
    "rock",
    "creator",
    "bookbinder",
    "pipe organ",
    "phone phreak",
    "Theatre",
    "cast member",
    "Editor",
    "special effects",
    "Amityville Horror",
    "Broadway",
    "Poet",
    "glass",
    "Wings",
    "Fruitcake Lady",
    "BBC",
    "dulcimer",
    "Literature",
    "archeology",
    "Opera",
    "wigmaker",
    "media",
    "cheesemaker",
    "vibraphonist",
    "drum",
    "celebrity",
    "publicity",
    "toastmaster",
    "CNN",
    "pageant queen",
    "pop queen",
    "pageant",
    "ITN News at Ten",
    "Bayreuth Festival",
    "Animation",
    "Studio",
    "puppet",
    "Gallery",
    "Drum",
    "singing",
    "didgeridoo",
    "decorative flag",
    "NBC",
    "CBS",
    "bureau chief",
    "recorder",
    "instrument",
    "conga",
    "fiddle",
]
business = [
    "website",
    "E & J Gallo Winery",
    "Motel",
    "Häagen Dazs",
    "Corporation",
    "Earthlink",
    "retail",
    "builder",
    "Restaurant",
    "restaurant",
    "tycoon",
    "Air Tahiti Nui",
    "LexisNexis",
    "media magnate",
    "Capricorn Records",
    "Michelin",
    "salesman",
    "General Motors",
    "millionaire",
    "clothing magnate",
    "toymaker",
    "freenode",
    "CHC Helicopter",
    "Pullman Company",
    "Microsoft",
    "labor arbitrator",
    "Takefuji",
    "Ford",
    "Sullivan Bluth",
    "Benihana",
    "Jaguar Land Rover",
    "Industries",
    "proprietor",
    "Hardee",
    "Movado Group",
    "Manufacturers Hanover",
    "Arcade Publishing",
    "Hertz",
    "magnate",
    "Voice",
    "Monster",
    "ARCO",
    "Stagecoach Group",
    "Portmeirion Pottery",
    "MGM",
    "Chemoil",
    "UAW",
    "Media Monitors",
    "Taillevent",
    "builder",
    "Fatburger",
    "theme park",
]
politics_govt_law = [
    "Minister of",
    "Police",
    "Deputy Minister",
    "Secretary",
    "Speaker",
    "Legislat",
    "State",
    "National Council",
    "Law Lord",
    "Courts",
    "Clerk",
    "Attorney",
    "parliament",
    "Labor Party",
    "House of Commons",
    "Kuomintang",
    "Minister for Sport",
    "sheriff",
    "Party",
    "Black Panther",
    "Black Consciousness",
    "Sinn Féin",
    "Public Works Minister",
    "National Farmers' Federation",
    "World Health Organization",
    "WHO",
    "negotiator for People Republic of",
    "Emir",
    "Civil Rights",
    "county",
    "campaign",
    "NAACP",
    "Premier",
    "Foreign Affairs",
    "General Court",
    "General Assembly",
    "fascist",
    "VHP",
    "Minister for Shipping",
    "Military",
    "Tuʻi Pelehake",
    "city council",
    "Assembly",
    "Māori queen",
    "Chief Minister",
    "Federal",
    "Chief of the AmaKhuze Tribe",
    "commissioner",
    "consul",
    "Lord Lieutenant",
    "Kyabazinga of Busoga",
    "qadi",
    "Fascist",
    "emir",
    "governor",
    "City Council",
    "High Commissioner",
    "Privacy Commissioner",
    "Cabinet Minister",
    "Teachta Dála",
    "negotiator",
    "law lord",
    "constitutionalist",
    "Provisional",
    "Attorney General",
    "headship",
    "Democra",
    "Office of Management",
    "Popular Front",
    "Social Security",
    "Grand Master of the Order of",
    "rights",
]
sciences = [
    "co developer",
    "periodontist",
    "disk drive",
    "agriculturalist",
    "CAD",
    "therapy",
    "periodontist",
    "flavorist",
    "disease expert",
    "Mac OS X Server expert",
    "programming",
    "head of research",
    "cartographer",
    "public health",
    "weather",
    "nursing",
    "anaesthetics",
    "bee expert",
    "Physiology",
    "Medicine",
    "transplantologist",
    "Clean Language",
    "phrenolog",
    "Counseling",
    "counseling",
]
academia_humanities = [
    "preservationist",
    "Landmark Trust",
    "name expert",
    "literacy",
    "founder of Tennessee Temple University",
    "rector of Kelvinside Academy",
    "Studies",
    "Sovietologist",
    "Military Vehicle Technology Foundation",
    "mediaevalist",
    "Lenin Mausoleum",
    "dean",
    "Professor",
    "professor",
    "founder of the  Holocaust Memorial Museum",
]
event_record_other = [
    "recordholder",
    "widow of",
    "brother in law",
    "George Medal",
    "graduate",
    "Son of",
    "granddaughter of",
    "son in law",
    "billiards",
    "lottery winner",
    "mother of",
    "descendant of",
    "National Grandparents Day",
    "son of",
    "Mother of",
    "hermit",
    "Student",
    "who found the  TWA plane crash that killed Knute Rockne",
    " fan ",
    "member of the Vanderbilt family",
]
sports = [
    "contract bridge",
    "NHL",
    "goal",
    "surfboard",
    "Rugby",
    "Rodeo",
    "NASCAR",
    "MLB",
    "Assistant Secretary of Manchester United",
    "Eagles",
    "middleweight",
    "NBA",
    "Wimbledon",
    "Giro d'Italia",
    "derby",
    "Stanley Cup",
    "paralympian",
    "Lucha Libre",
    "Celtics",
    "lineman",
    "linebacker",
    "former owner of the Philadelphia ers who signed Julius Erving",
    "curling",
    "race caller",
    "Yoga",
    "cross country",
    "running",
    "Red Sox",
    "hammer throw",
    "checkers",
    "Marathon",
    "marathon",
    "runner",
    "competitive eater",
    "World Rally",
    "judo",
    "surf",
    "powerlift",
    "ice player",
    "pistol shoot",
]
law_enf_military_operator = [
    "Secret Intelligence",
    "Watergate scandal principal",
    "Constable",
    "constable",
    "Minister for Defence",
    "Commodore",
    "RAF",
    "Shin Bet",
    "Veteran",
    "servicewoman",
    "al Qaeda",
    "small arms expert",
    "Garioch Pursuivant",
    "KGB",
    "Defence Force",
    "Defense Force",
    "bombardier",
    "FARC",
    "accident investigator",
    "investigator",
    "rebel",
]
spiritual = [
    "Findhorn Foundation",
    "LDS",
    "Hasid",
    "Rabbi",
    "evangel",
    "Church",
    "Dean of York",
    "Bishop",
    "demonolog",
    "bhikkhu",
    "abbot",
    "christian",
    "motivational speaker",
    "fortune teller",
    "islam",
    "Ganden Tripa",
    "preacher",
    "Major",
    "Buddh",
    "Meditation",
    "Deacon",
    "Wicca",
    "wicca",
    "Unitarian",
]
social = [
    "Children Commissioner for",
    "Emmaüs",
    "peacemaker",
    "Twin Oaks",
    "Habitat for Humanity",
    "AIDS denialist",
    "Children World",
    "charity",
]
crime = [
    "accused",
    "hijacked a plane",
    "member of the Birmingham Six",
    "assassin",
    "Mafia boss",
    "gang member",
    "kidnapper",
    "pretender",
    "member of the Detroit Partnership",
]
other_species = other_species + [
    "panda",
    "rhinoceros",
    "bull",
    "terrier",
    "Mouser",
    "tiger",
    "parrot",
    "giraffe",
    "orangutan",
]

#### Updating `known_for_dict` Dictionary of Category Keys and Specific Role Sets of Values

In [None]:
# Combining separate lists as sets into one dictionary
known_for_dict = {
    "event_record_other": set(event_record_other),
    "crime": set(crime),
    "social": set(social),
    "academia_humanities": set(academia_humanities),
    "business": set(business),
    "sciences": set(sciences),
    "sports": set(sports),
    "law_enf_military_operator": set(law_enf_military_operator),
    "politics_govt_law": set(politics_govt_law),
    "arts": set(arts),
    "spiritual": set(spiritual),
}

#### Extracting `known_for_1` from All `info_` Columns

In [None]:
# List of columns to check
cols_list = [
    'info_1',
    'info_2,
    "info_3",
    "info_4",
    "info_5",
    "info_6",
    "info_7",
    "info_8",
    "info_9",
    "info_10",
    "info_11",
    "info_parenth",
]

# Column to extract to
extract_to = "known_for_1"

# For loop to extract value to extract_to column
for column in cols_list:
    for category, category_set in known_for_dict.items():
        for role in category_set:
            dataframe = df[(df[extract_to] == "") & (df[column].notna())]
            for index in dataframe.index:
                item = df.loc[index, column]
                if item:
                    if role in item:
                        df.loc[index, extract_to] = category
                        df.loc[index, column] = item.replace(role, "").strip()

# Checking number of values found and a sample of rows
print(f'There are {len(df[df[extract_to]!=""])} values in extract_to column.')

In [56]:
print("dunzo!")
chime.success()

dunzo!


<IPython.core.display.Javascript object>

In [None]:
df[df["info_2"] == "multi discipline pistol shooter"]