# Wikipedia Notable Life Expectancies

# [Notebook 5 of : Data Cleaning](https://github.com/teresahanak/wikipedia-life-expectancy/blob/main/wp_life_expect_data_clean4_thanak_2022_06_23.ipynb)

## Context

The


## Objective

The

### Data Dictionary

- Feature: Description

## Importing Necessary Libraries

In [None]:
# To structure code automatically
%load_ext nb_black

# To import/export sqlite databases
import sqlite3 as sql

# To save/open python objects in pickle file
import pickle

# To help with reading, cleaning, and manipulating data
import pandas as pd
import numpy as np
import re

# To define maximum number of columns to be displayed in a dataframe
pd.set_option("display.max_columns", None)
# To define the maximum number of rows to be displayed in a dataframe
pd.set_option("display.max_rows", 200)

# To suppress warnings
# import warnings

# warnings.filterwarnings("ignore")

# To set some visualization attributes
pd.set_option("max_colwidth", 150)

# To play auditory cue when cell has executed, has warning, or has error and set chime theme
import chime

chime.theme("zelda")

## Data Overview

### Reading, Sampling, and Checking Data Shape

In [None]:
# Reading the dataset: open the SQLite database file and load the whole
# wp_life_expect_clean3 table into a dataframe
# NOTE(review): conn is never closed in the visible part of the notebook --
# confirm whether later cells still need it, otherwise close it after the read
conn = sql.connect("wp_life_expect_clean3.db")
data = pd.read_sql("SELECT * FROM wp_life_expect_clean3", conn)

# Making a working copy so that data keeps the values as loaded
df = data.copy()

# Checking the shape
print(f"There are {df.shape[0]} rows and {df.shape[1]} columns.")

# Checking first 2 rows of the data
df.head(2)

In [None]:
# Checking last 2 rows of the data
df.tail(2)

In [None]:
# Checking a sample of the data
df.sample(5)

### Checking Data Types, Duplicates, and Null Values

In [None]:
# Checking data types and null values
df.info()

#### Loading `nation_map` from Pickle File to Dictionary nation_map

In [None]:
# Load the nation_map dictionary from its pickle file
# NOTE: pickle.load can execute arbitrary code while unpickling, so only open
# a nation_map.pkl that comes from a trusted source
with open("nation_map.pkl", "rb") as f:
    nation_map = pickle.load(f)

#### Loading `other_species` list from other_species.csv

In [None]:
# Loading other_species list: read the csv into a dataframe, then take its
# "species" column as a plain Python list
other_species_df = pd.read_csv("other_species.csv")
other_species = other_species_df["species"].tolist()

#### Observations:
- With our dataframe, `nation_map`, and `other_species` list loaded, we can proceed to extracting the other features.
- First, we will clean up the divided `info` columns by removing any remaining digits and nationality and country values.
- We will use the same functions from previous notebooks.

#### Function to Save Indices of Rows Matching Regular Expressions Pattern to a List and Print Number of Rows with Match 

In [None]:
# Define a function that takes dataframe, column name, and re pattern as arguments and returns list of indices
# for which column value matches re pattern
def rows_with_pattern(dataframe, column, pattern):
    """
    Takes input of dataframe, column name, and re pattern
    and returns list of indices for rows that contain match
    for pattern anywhere within value for given column.
    Also prints the number of matching rows.

    Values that are not strings (e.g. None or NaN for missing entries)
    are skipped rather than raising a TypeError, so the caller does not
    have to filter out missing values first.

    dataframe: dataframe
    column: column name
    pattern: re pattern (string or compiled pattern)

    returns: list of index labels, in dataframe index order
    """
    # Compile once instead of handing the raw pattern to re.search per row
    regex = re.compile(pattern)
    index_list = []

    for i in dataframe.index:
        item = dataframe.loc[i, column]
        # re.search only accepts strings; missing values cannot match
        if not isinstance(item, str):
            continue
        if regex.search(item):
            index_list.append(i)
    print(
        f"There are {len(index_list)} rows with matching pattern in column '{column}'."
    )
    return index_list

#### Function to Use rows_with_pattern Function for Multiple Regular Expression Patterns

In [None]:
# Define a function that calls rows_with_pattern function for multiple re patterns
# returning a single list of indices for all rows with any pattern match


def multiple_patterns(dataframe, column, patterns):
    """
    Runs rows_with_pattern once per re pattern and returns one combined list
    of the indices returned for each pattern (a row matching several patterns
    appears once per matching pattern).

    Each pattern is printed before its match count, followed by a blank line.

    dataframe: dataframe
    column: column name
    patterns: list of re patterns
    """
    matched_rows = []

    for regex in patterns:
        # Echo the pattern so the count printed by rows_with_pattern is labelled
        print(regex)
        matched_rows.extend(rows_with_pattern(dataframe, column, regex))
        print("")

    return matched_rows

### Removing Remaining Digits and Nationality/Country Values from Divided `info` Columns

#### List of Columns to Treat

In [None]:
# List of columns to treat: info_1 through info_11, followed by info_parenth
cols_lst = [f"info_{number}" for number in range(1, 12)] + ["info_parenth"]

#### Removing Digits

In [None]:
# Regular expression matching any single digit
pattern = r"\d"

# Gather, column by column, the indices of non-null rows that contain a digit
# (an index is added once for every column in which it matches)
rows_combined = []
for column in cols_lst:
    dataframe = df[df[column].notna()]
    rows_to_check = rows_with_pattern(dataframe, column, pattern)
    rows_combined.extend(rows_to_check)

# Checking a sample of rows
df.loc[rows_combined].sample(2)

In [None]:
# Strip digits from every treated column of every row flagged above
for column in cols_lst:
    for index in set(rows_combined):
        item = df.loc[index, column]
        # Nothing to do for missing or empty values
        if not item:
            continue
        match = re.search(pattern, item)
        if match:
            df.loc[index, column] = re.sub(pattern, "", item)

# Rechecking after treatment: repeat the per-column search so the printed
# counts show how many rows still contain the pattern
recheck_rows = []
for column in cols_lst:
    dataframe = df[df[column].notna()]
    rows_to_check = rows_with_pattern(dataframe, column, pattern)
    recheck_rows.extend(rows_to_check)

#### Removing Any Remaining Matches with  `nation_map` Keys and Values

In [None]:
%%time

# Remove any remaining nation_map keys (nationality) and values (country)
# from the treated columns
for column in cols_lst:
    # Rows holding a value in this column; computed once per column
    dataframe = df[df[column].notna()]
    for nationality, country in nation_map.items():
        for index in dataframe.index:
            # Re-read the cell each time, since an earlier pair may have trimmed it
            item = df.loc[index, column]
            if not item:
                continue
            if nationality not in item and country not in item:
                continue
            # Plain substring removal: nationality first, then country
            df.loc[index, column] = (
                item.replace(nationality, "").strip().replace(country, "").strip()
            )

#### Observations:
- After that bit of tidying, we can proceed to extracting `known_for_1` values.
- The bulk of these values should be in `info_2`, according to the Wikipedia defined fields, so we will start there.

## Extracting `known_for` Data
Our goal will be to have some broader categories into which the specific values will fit.  `known_for` is a diverse feature, in that an individual may be known for a long-term role or roles, a specific event, a relationship with another person who is famous, etc.  So, to some extent we will see what we find and adapt as we go.

Also, we will abandon searching left to right as an individual may fit more than one category, and in no particular order.  For example, Ronald Reagan is entered as "American actor and politician".  He is most known as the 40th president of the United States, so if we prioritized the first value, he would fit only into the category containing actor.  At the same time, it may have been his acting career that led to his political career.  Both arenas are relevant, so we will aim to capture all categories for an individual.  Later, when there are duplicate categories for an individual, we can remove the redundant values.

We will take the following approach:
1. create and check a list of unique values in `info_2` that have a minimum number repeated, sufficient to create sets for each category, but not so exhaustive to be time prohibitive to manually enter.
2. using the pop() method, add each role to its associated category's set, below.
3. combine the sets for each category into one dictionary.
4. search for the values in the dictionary and extract the category key value to a new column `known_for_1`, `known_for_2`, etc.

In [None]:
# Counting how often each distinct value occurs in info_2
col_values = df["info_2"].value_counts()

# Keeping only the values that occur more than 30 times
roles_list = col_values[col_values > 30].index.tolist()

# Checking length of list
print(f"We will examine the top {len(roles_list)} unique values in info_2.")

In [None]:
# # Using pop to check list items and add to associated dictionary below
# roles_list.pop()

In [None]:
# Creating lists for each category
politics_govt_law = [
    "politician",
    "economist",
    "attorney",
    "trade unionist",
    "unionist",
    "aristocrat",
    "diplomat",
    "lawyer",
    "activist",
    "civil rights",
    "federal",
    "judge",
    "political",
    "politics",
    "royal",
    "civil servant",
    "jurist",
    "judge",
    "conservationist",
    "government official",
    "government",
    "barrister",
    "militant",
    "environmentalist",
    "public servant",
    "King",
    "Queen",
    "Princess",
    "Prince",
    "President",
    "Prime Minister",
    "leader",
    "Nazi",
    "Administration",
    "Ambassador",
    "ambassador",
]

arts = [
    "actor",
    "dancer",
    "choreographer",
    "model",
    "television",
    "jazz",
    "singer",
    "composer",
    "conductor",
    "journalist",
    "writer",
    "saxophonist",
    "film director",
    "comedian",
    "photojournalist",
    "poet",
    "actress",
    "film",
    "editor",
    "drummer",
    "producer",
    "songwriter",
    "publisher",
    "author",
    "violinist",
    "rapper",
    "musician",
    "animator",
    "filmmaker",
    "pianist",
    "historian",
    "comic",
    "screenwriter",
    "fashion",
    "designer",
    "guitarist",
    "voice",
    "opera",
    "cinematographer",
    "playwright",
    "sculptor",
    "novelist",
    "photographer",
    "architect",
    "painter",
    "artist",
    "disc jockey",
    "dj",
    "DJ",
    "MC",
    "bridge player",
    "tenor",
    "trombonist",
    "filmmaker",
    "ballerina",
    "bassist",
    "film critic",
    "critic",
    "personality",
    "organist",
    "operatic",
    "lyricist",
    "translator",
    "visual artist",
    "soprano",
    "cellist",
    "broadcaster",
    "chef",
    "literary critic",
    "ballet",
    "illustrator",
    "theatre director",
    "trumpeter",
    "presenter",
    "sportscaster",
    "cartoonist",
    "sportswriter",
    "choral",
    "music",
    "arts",
    "dance",
]
sports = [
    "football",
    "footballer",
    "Olympic",
    "skier",
    "hockey",
    "soccer",
    "cricket",
    "soccer",
    "sprinter",
    "equestrian",
    "gymnast",
    "fencer",
    "chess",
    "wrestler",
    "swimmer",
    "basketball",
    "hurler",
    "sailor",
    "rower",
    "rugby",
    "athlete",
    "golfer",
    "boxer",
    "tennis",
    "cyclist",
    "racing",
    "driver",
    "cricketer",
    "baseball",
    "speedway rider",
    "speedway",
    "rider",
    "badminton",
    "sport shooter",
    "runner",
    "umpire",
    "judoka",
    "volleyball",
    "track and field",
    "track",
    "bobsledder",
    "canoer",
    "bodybuilder",
    "skater",
    "curler",
    "Olympic diver",
    "martial artist",
    "racer",
    "handball",
    "ski jumper",
    "racehorse trainer",
    "racecar driver",
    "hurdler",
    "polo",
    "Olympic shooter",
    "weightlifter",
    "Baseball",
    "mountaineer",
    "jockey",
    "Olympic sports shooter",
    "referee",
    "general manager",
    "sports",
    "sport",
    "athletics",
    "athletic",
]
sciences = [
    "engineer",
    "physicist",
    "geologist",
    "psychiatrist",
    "botanist",
    "biologist",
    "anthropologist",
    "astronomer",
    "biochemist",
    "scientist",
    "computer",
    "archaeologist",
    "psychologist",
    "sociologist",
    "physician",
    "chemist",
    "physicist",
    "mathematician",
    "cosmonaut",
    "pediatrician",
    "astronaut",
    "entomologist",
    "cardiologist",
    "doctor",
    "nurse",
    "immunologist",
    "meteorologist",
    "medical researcher",
    "ornithologist",
    "neuroscientist",
    "microbiologist",
    "zoologist",
    "geographer",
    "inventor",
    "geneticist",
    "surgeon",
    "astrophysicist",
    "statistician",
    "sciences",
    "science",
    "mathematics",
    "math",
    "physics",
    "chemistry",
    "biology",
    "epidemiology",
]

business = [
    "executive",
    "businessman",
    "banker",
    "entrepreneur",
    "real estate developer",
    "restaurateur",
    "businesswoman",
    "sports administrator",
    "business",
    "banking",
    "bank",
]
# Roles searched for under the academia_humanities category.
# "military historian" and "historian" are two separate entries: without the
# comma between them Python joins adjacent string literals into the single
# (never matching) term "military historianhistorian".
academia_humanities = [
    "scholar",
    "linguist",
    "educator",
    "philosopher",
    "academic",
    "military historian",
    "historian",
    "educationalist",
    "philologist",
    "librarian",
    "industrialist",
    "professor",
    "musicologist",
    "academia",
    "education",
    "college",
    "university",
    "humanities",
]
law_enf_military_operator = [
    "officer",
    "army",
    "Army",
    "police",
    "admiral",
    "soldier",
    "Air Force",
    "intelligence",
    "major",
    "lieutenant",
    "admiral",
    "fighter pilot",
    "pilot",
    "naval",
    "Navy",
    "aviator",
    "general",
    "CIA",
    "FBI",
    "law enforcement",
    "military",
    "police",
    "Marines",
    "marine",
    "Coast Guard",
]
spiritual = [
    "rabbi",
    "Catholic",
    "priest",
    "Anglican",
    "cardinal",
    "theologian",
    "prelate",
    "Orthodox",
    "Episcopal",
    "bishop",
    "Jesuit",
    "hierarch",
    "Islamic",
    "religious leader",
    "religious",
    "religion",
]
social = ["philanthropist", "socialite", "philanthropy"]
crime = [
    "serial killer",
    "murderer",
    "convicted",
    "terrorist",
    "mobster",
    "criminal",
    "suspect",
    "crime",
    "guilty",
]
event_other = ["Holocaust survivor", "victim", "survivor"]
# Roles searched for under the record category ("centenarian" was previously
# misspelled "centarian", which cannot match the correctly spelled word)
record = ["supercentenarian", "oldest person", "centenarian", "oldest"]
other_species.append("Tree")

#### Observations:
- We have a good start on `known_for_1` values for which to search.  Some other roles that have been observed previously we have added to the list also.
- Note that roles such as sportswriter and sports broadcaster, though associated with sports, are also included in arts, to align with the underlying nature of the work itself.
- Let us combine them into one dictionary, taking care to put arts last to avoid missing values for "martial artist" and to put spiritual before politics_govt_law so that "leader" in politics_govt_law comes after "religious leader" in spiritual.  Likewise, "general manager" in sports will come before "general" in law_enf_military_operator, and "military historian" in academia_humanities will come before "military" in law_enf_military_operator.
- We will also include an other_species category here, again putting it last so that trainer and breeder in sports come before racehorse in other_species.
- Then, we can proceed to extract the category to a new column, `known_for_1`.

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Sets of Values

In [None]:
# Building one dictionary of category name -> set of roles.
# The pairs are listed in the order the extraction loops will search them,
# and converting each list to a set collapses any repeated roles.
known_for_dict = {
    category_name: set(role_list)
    for category_name, role_list in (
        ("record", record),
        ("event_other", event_other),
        ("crime", crime),
        ("social", social),
        ("spiritual", spiritual),
        ("academia_humanities", academia_humanities),
        ("business", business),
        ("sciences", sciences),
        ("sports", sports),
        ("law_enf_military_operator", law_enf_military_operator),
        ("politics_govt_law", politics_govt_law),
        ("arts", arts),
        ("other_species", other_species),
    )
}

#### Extracting Category to `known_for_1` Column from `info_1`

In [None]:
# Initializing known_for_1 column
df["known_for_1"] = ""

In [None]:
%%time

# Source column to search and destination column for the matched category
column, extract_to = "info_1", "known_for_1"

# Try every role of every category, in known_for_dict order; a row is only
# eligible while its extract_to value is still empty, so the first match wins
for category, category_set in known_for_dict.items():
    for role in category_set:
        # Re-filter for each role because earlier roles may have filled rows
        dataframe = df[df[column].notna() & (df[extract_to] == "")]
        for index in dataframe.index:
            item = df.loc[index, column]
            if not item or role not in item:
                continue
            # Record the category and remove the matched role text from the source
            df.loc[index, extract_to] = category
            df.loc[index, column] = item.replace(role, "").strip()

# Checking number of values found and a sample of rows
print(f'There are {len(df[df[extract_to]!=""])} values in extract_to column.')
df[df[extract_to] != ""].sample(2)

#### Observations:
- Once again, the `info_1` column has provided a small sample on which to test our code, which appears to be working.
- We can move on to extracting additional `known_for` values in `info_1` to `known_for_2`.
- Sir Robin Brook is a good example of an individual who would have 3 categories with our approach--business, business, and sports.  So, we will have enough `known_for` columns to extract all values for all entries.  Removing these values has the added benefit of simplifying the next search for `cause_of_death`.

#### Extracting Category to `known_for_2` Column from `info_1`

In [None]:
# Initializing known_for_2 column
df["known_for_2"] = ""

In [None]:
%%time

# Source column to search and destination column for the matched category
column, extract_to = "info_1", "known_for_2"

# Try every role of every category, in known_for_dict order; only rows that
# already have known_for_1 and still lack extract_to are eligible
for category, category_set in known_for_dict.items():
    for role in category_set:
        # Re-filter for each role because earlier roles may have filled rows
        dataframe = df[
            df[column].notna()
            & (df["known_for_1"] != "")
            & (df[extract_to] == "")
        ]
        for index in dataframe.index:
            item = df.loc[index, column]
            if not item or role not in item:
                continue
            # Record the category and remove the matched role text from the source
            df.loc[index, extract_to] = category
            df.loc[index, column] = item.replace(role, "").strip()

# Checking number of values found and a sample of rows
print(f'There are {len(df[df[extract_to]!=""])} values in extract_to column.')
df[df[extract_to] != ""].sample(2)

#### Extracting Category to `known_for_3` Column from `info_1`

In [None]:
# Initializing known_for_3 column
df["known_for_3"] = ""

In [None]:
%%time

# Source column to search and destination column for the matched category
column, extract_to = "info_1", "known_for_3"

# Try every role of every category, in known_for_dict order; only rows that
# already have known_for_2 and still lack extract_to are eligible
for category, category_set in known_for_dict.items():
    for role in category_set:
        # Re-filter for each role because earlier roles may have filled rows
        dataframe = df[
            df[column].notna()
            & (df["known_for_2"] != "")
            & (df[extract_to] == "")
        ]
        for index in dataframe.index:
            item = df.loc[index, column]
            if not item or role not in item:
                continue
            # Record the category and remove the matched role text from the source
            df.loc[index, extract_to] = category
            df.loc[index, column] = item.replace(role, "").strip()

# Checking number of values found
print(f'There are {len(df[df[extract_to]!=""])} values in extract_to column.')

In [None]:
# Checking remaining unique values in info_1
df["info_1"].value_counts()

#### Observations:
- We have extracted all of the `known_for` information present in `info_1`.
- It is time to proceed with extracting the same from `info_2`, the column that should contain the bulk of this feature's values.

#### Extracting Category to `known_for_1` Column from `info_2`

In [None]:
%%time

# Source column to search and destination column for the matched category
column, extract_to = "info_2", "known_for_1"

# Try every role of every category, in known_for_dict order; a row is only
# eligible while its extract_to value is still empty, so the first match wins
for category, category_set in known_for_dict.items():
    for role in category_set:
        # Re-filter for each role because earlier roles may have filled rows
        dataframe = df[df[column].notna() & (df[extract_to] == "")]
        for index in dataframe.index:
            item = df.loc[index, column]
            if not item or role not in item:
                continue
            # Record the category and remove the matched role text from the source
            df.loc[index, extract_to] = category
            df.loc[index, column] = item.replace(role, "").strip()

# Checking number of values found and a sample of rows
print(f'There are {len(df[df[extract_to]!=""])} values in extract_to column.')
df[df[extract_to] != ""].sample(2)

#### Extracting Category to `known_for_2` Column from `info_2`

In [None]:
%%time

# Source column to search and destination column for the matched category
column, extract_to = "info_2", "known_for_2"

# Try every role of every category, in known_for_dict order; only rows that
# already have known_for_1 and still lack extract_to are eligible
for category, category_set in known_for_dict.items():
    for role in category_set:
        # Re-filter for each role because earlier roles may have filled rows
        dataframe = df[
            df[column].notna()
            & (df["known_for_1"] != "")
            & (df[extract_to] == "")
        ]
        for index in dataframe.index:
            item = df.loc[index, column]
            if not item or role not in item:
                continue
            # Record the category and remove the matched role text from the source
            df.loc[index, extract_to] = category
            df.loc[index, column] = item.replace(role, "").strip()

# Checking number of values found and a sample of rows
print(f'There are {len(df[df[extract_to]!=""])} values in extract_to column.')
df[df[extract_to] != ""].sample(2)

#### Extracting Category to `known_for_3` Column from `info_2`

In [None]:
%%time

# Source column to search and destination column for the matched category
column, extract_to = "info_2", "known_for_3"

# Try every role of every category, in known_for_dict order; only rows that
# already have known_for_2 and still lack extract_to are eligible
for category, category_set in known_for_dict.items():
    for role in category_set:
        # Re-filter for each role because earlier roles may have filled rows
        dataframe = df[
            df[column].notna()
            & (df["known_for_2"] != "")
            & (df[extract_to] == "")
        ]
        for index in dataframe.index:
            item = df.loc[index, column]
            if not item or role not in item:
                continue
            # Record the category and remove the matched role text from the source
            df.loc[index, extract_to] = category
            df.loc[index, column] = item.replace(role, "").strip()

# Checking number of values found and a sample of rows
print(f'There are {len(df[df[extract_to]!=""])} values in extract_to column.')
df[df[extract_to] != ""].sample(2)

#### Extracting Category to `known_for_4` Column from `info_2`

In [None]:
# Initializing known_for_4 column
df["known_for_4"] = ""

In [None]:
%%time

# Source column to search and destination column for the matched category
column, extract_to = "info_2", "known_for_4"

# Try every role of every category, in known_for_dict order; only rows that
# already have known_for_3 and still lack extract_to are eligible
for category, category_set in known_for_dict.items():
    for role in category_set:
        # Re-filter for each role because earlier roles may have filled rows
        dataframe = df[
            df[column].notna()
            & (df["known_for_3"] != "")
            & (df[extract_to] == "")
        ]
        for index in dataframe.index:
            item = df.loc[index, column]
            if not item or role not in item:
                continue
            # Record the category and remove the matched role text from the source
            df.loc[index, extract_to] = category
            df.loc[index, column] = item.replace(role, "").strip()

# Checking number of values found and a sample of rows
print(f'There are {len(df[df[extract_to]!=""])} values in extract_to column.')
df[df[extract_to] != ""].sample(2)

#### Extracting Category to `known_for_5` Column from `info_2`

In [None]:
# Initializing known_for_5 column
df["known_for_5"] = ""

In [None]:
%%time

# Column to check
column = 'info_2'

# Extract to column
extract_to = 'known_for_5'

# For loop to find role in column and extract it as category to extract_to column.
# A row is eligible only when the previous slot (known_for_4) is already filled
# and known_for_5 is still empty; the filter previously tested known_for_3,
# which let known_for_5 be filled on rows whose known_for_4 was still empty.
for category, category_set in known_for_dict.items():
    for role in category_set:
        # Re-filter for each role because earlier roles may have filled rows
        dataframe = df[(df[column].notna()) & (df['known_for_4']!= '') & (df[extract_to]=='')]
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    # Record the category and remove the matched role text
                    df.loc[index, extract_to] = category
                    df.loc[index, column] = item.replace(role, '').strip()

# Checking number of values found and the matching rows
print(f'There are {len(df[df[extract_to]!=""])} values in extract_to column.')
df[df[extract_to]!='']

In [None]:
# Initializing known_for_6 column
df["known_for_6"] = ""

In [None]:
%%time

# Column to check
column = 'info_2'

# Extract to column
extract_to = 'known_for_6'

# For loop to find role in column and extract it as category to extract_to column.
# A row is eligible only when the previous slot (known_for_5) is already filled
# and known_for_6 is still empty; the filter previously tested known_for_3,
# which let known_for_6 be filled on rows whose earlier slots were still empty.
for category, category_set in known_for_dict.items():
    for role in category_set:
        # Re-filter for each role because earlier roles may have filled rows
        dataframe = df[(df[column].notna()) & (df['known_for_5']!= '') & (df[extract_to]=='')]
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    # Record the category and remove the matched role text
                    df.loc[index, extract_to] = category
                    df.loc[index, column] = item.replace(role, '').strip()

# Checking number of values found
print(f'There are {len(df[df[extract_to]!=""])} values in extract_to column.')

In [None]:
# Checking the number of missing values in known_for_1
print(
    f'There are {len(df[df["known_for_1"] == ""])} missing values in known_for_1 column.'
)

#### Observations:
- We have exhausted our search of `info_2` using the current version of `known_for_dict` and have close to 10,000 remaining missing values in `known_for_1`.
- Let us examine some of the remaining unique values in `info_2` and amend our lists and dictionary.

#### Checking Remaining `info_2` Values

In [None]:
# Counting how often each distinct info_2 value occurs among the rows that
# still have no known_for_1 value
col_values = df[df["known_for_1"] == ""]["info_2"].value_counts()

# Keeping only the values that occur more than 2 times
roles_list = col_values[col_values > 2].index.tolist()

# Checking length of list
print(f"We will examine the top {len(roles_list)} unique values in info_2.")

In [None]:
# # Using pop to check list items and add to associated dictionary below
# roles_list.pop()

#### Updating Category Lists for `known_for_dict`

In [None]:
# Appending category lists
sciences = sciences + [
    "volcanologist",
    "gerontologist",
    "pollster",
    "genealogist",
    "software developer",
    "video game developer",
    "anaesthetist",
    "geomorphologist",
    "carcinologist",
    "weatherman",
    "aerodynamicist",
    "limnologist",
    "control theorist",
    "plant pathologist",
    "pathologist",
    "medical practitioner",
    "optometrist",
    "neuroendocrinologist",
    "endocrinologist",
    "anesthesiologist",
    "obstetrician",
    "zookeeper",
    "game developer",
    "forester",
    "embryologist",
    "urologist",
    "arachnologist",
    "lichenologist",
    "anatomist",
    "mineralogist",
    "gastroenterologist",
    "sexologist",
    "bacteriologist",
    "gynecologist",
    "horticulturalist",
    "seismologist",
    "parasitologist",
    "neurophysiologist",
    "primatologist",
    "hydrologist",
    "indologist",
    "ethologist",
    "herbalist",
    "econometrician",
    "cryptographer",
    "toxicologist",
    "haematologist",
    "hematologist",
    "plant ecologist",
    "ecologist",
    "ufologist",
    "crystallographer",
    "gynaecologist",
    "climatologist",
    "glaciologist",
    "demographer",
    "dentist",
    "archeologist",
    "ichthyologist",
    "nephrologist",
    "dermatologist",
    "veterinarian",
    "physiologist",
    "horticulturist",
    "cancer researcher",
    "urban planner",
    "nutritionist",
    "pharmac",
    "oncologist",
    "metallurgist",
    "herpetologist",
    "ophthalmologist",
    "palaeontologist",
    "oceanographer",
    "agronomist",
    "paediatrician",
    "mycologist",
    "naturalist",
    "criminologist",
    "epidemiologist",
    "psychotherapist",
    "neurologist",
    "paleontologist",
    "virologist",
    "psychoanalyst",
]
# Extending the politics_govt_law roles with terms found in the remaining
# info_2 values ("parliament" was previously misspelled "pariliament", so the
# lowercase word could only match through "parliamentarian")
politics_govt_law = politics_govt_law + [
    "justice",
    "anarchist",
    "secretary",
    "partisan",
    "resistance",
    "Resistance",
    "foreign policy",
    "chieftain",
    "communist",
    "Trotskyist",
    "herald",
    "human rights",
    "campaigner",
    "prince",
    "insurgent",
    "detainee",
    "Resistance",
    "revolutionary",
    "elder",
    "Governor",
    "General",
    "Vice",
    "Admiral",
    "peer",
    "landowner",
    "union",
    "sultan",
    "Sultan",
    "Senator",
    "Representative",
    "loyalist",
    "Supreme Court",
    "Justice",
    "Chief Justice",
    "Conservative",
    "conservative",
    "Liberal",
    "liberal",
    "MP",
    "parliamentarian",
    "parliament",
    "Parliament",
    "colonial",
    "mayor",
    "Mayor",
    "ruler",
    "republican",
    "Republican",
    "Democrat",
    "democrat",
    "bureaucrat",
    "conspiracy theorist",
    "jihadist",
    "whistleblower",
    "prime minister",
    "countess",
    "District",
    "Judge",
    "foreign minister",
    "Foreign Minister",
    "Islamist",
    "peeress",
    "legislator",
    "first lady",
    "First Lady",
    "courtier",
    "senior",
    "monarch",
    "statesman",
    "lobbyist",
    "solicitor",
    "senator",
    "representative",
    "nationalist",
    "protester",
    "noble",
    "prosecutor",
    "magistrate",
    "public official",
    "feminist",
    "dissident",
    "candidate",
    "congress",
    "administrator",
]
law_enf_military_operator = law_enf_military_operator + [
    "veteran",
    "forester",
    "Navajo code talker",
    "security",
    "fighter",
    "paramilitary",
    "guerrilla",
    "fighter ace",
    "flying ace",
    "firefighter",
    "Medal of Honor",
    "secret agent",
    "codebreaker",
    "Special Operations",
    "warlord",
    "Victoria Cross",
    "mercenary",
    "World War II",
    "colonel",
    "Marine",
    "Secret Service",
    "commander",
    "Air Chief",
    "Marshal",
    "marshal",
    "aviation",
    "airman",
    "spy",
]
sports = sports + [
    "sport",
    "jumper",
    "athletic",
    "shot putter",
    "Olympian",
    "fencing",
    "bandy",
    "Banty",
    "rodeo",
    "rowing",
    "lacrosse",
    "yoga",
    "futsal",
    "heavyweight",
    "Heavyweight",
    "balloonist",
    "racewalker",
    "hurling",
    "biker",
    "scuba",
    "master of the horse",
    "shogi",
    "Football",
    "softball",
    "free diver",
    "greyhound trainer",
    "goalkeeper",
    "mountain",
    "boxing",
    "hunter",
    "angler",
    "aikidoka",
    "aikido",
    "cave diver",
    "alpinist",
    "powerlifter",
    "karate",
    "rowing",
    "coxswain",
    "skater",
    "skating",
    "Go player",
    "orienteer",
    "orienteer",
    "ten pin",
    "karateka",
    "wrestling",
    "announcer",
    "golf",
    "netball",
    "poker",
    "slalom",
    "canoe",
    "pool player",
    "NFL",
    "CFL",
    "CFL",
    "bowl",
    "pole vault",
    "strongman",
    "yachtsman",
    "snowboard",
    "skateboard",
    "archer",
    "climber",
    "swim",
    "squash",
    "climber",
    "shot put",
    "luger",
    "walker",
    "adventurer",
    "diver",
    "surfer",
    "explorer",
    "bullfighter",
    "sprint",
    "pitcher",
]

academia_humanities = academia_humanities + [
    "Esperantist",
    "phonetician",
    "vexillologist",
    "Byzantinist",
    "logician",
    "Turkologist",
    "bioethicist",
    "Mayanist",
    "Hellenist",
    "crossword compiler",
    "cruciverbalist",
    "Hispanist",
    "Arabist",
    "semiotician",
    "Assyriologist",
    "literary theorist",
    "schoolmaster",
    "schoolteacher",
    "intellectual",
    "organizational theorist",
    "information theorist",
    "orientalist",
    "medievalist",
    "classicist",
    "archivist",
    "museum",
    "numismatist",
    "ethnologist",
    "lexicographer",
    "folklorist",
    "philatelist",
    "sinologist",
    "teacher",
    'Egyptologist',
    'Japanologist',
    'Iranologist',
    'Indologist'
    
]
business = business + [
    "retailer",
    "grocer",
    "auctioneer",
    "baker",
    "car dealer",
    "clothier",
    "food manufacturer",
    "manufacturer",
    "real estate",
    "shipowner",
    "company director",
    "distiller",
    "financial",
    "financial",
    "finance",
    "media owner",
    "printer",
    "management consultant",
    "investment manager",
    "vintner",
    "brewer",
    "jeweller",
    "shipping magnate",
    "nightclub owner",
    "bookseller",
    "billionaire",
    "stockbroker",
    "farmer",
    "hotel",
    "accountant",
    "property developer",
    "investor",
    "financier",
    "winemaker",
]
# Extending the crime roles with terms found in the remaining info_2 values
crime = crime + [
    "murder suspect",
    "suspect",
    "concentration camp guard",
    "drug dealer",
    "drug lord",
    "convict",
    "drug trafficker",
    "spree killer",
    "gangster",
]
# "snooker" used to sit in the crime list above, which would label snooker
# players as crime; it is a cue sport, so it is filed under sports instead
sports = sports + ["snooker"]
# Extending the spiritual role list with additional search terms
spiritual = [
    *spiritual,
    "Presbyterian",
    "spiritual",
    "Zen",
    "Buddhist",
    "monk",
    "ayatollah",
    "Ayatollah",
    "psychic",
    "yogi",
    "Marja",
    "Trappist",
    "Christian",
    "missionary",
    "Benedictine",
    "nun",
    "faith",
    "healer",
    "Methodist",
    "archdeacon",
    "Baptist",
    "cleric",
    "televangelist",
    "clergy",
    "astrolog",
    "evangelist",
    "minister",
    "pastor",
]
# Extending the arts role list with additional search terms
# Fixes: missing commas had silently concatenated adjacent string literals
# ("calypso" "bassoon" -> "calypsobassoon", "harp" "bass player" ->
# "harpbass player"), so none of those four roles could ever match.
# Exact duplicate entries ("reporter", "radio", "performer") were also removed.
arts = arts + [
    "milliner",
    "memoirist",
    "columnist",
    "bluegrass",
    "fiddler",
    "perfumer",
    "performer",
    "acting",
    "organ builder",
    "art patron",
    "TV",
    "reporter",
    "Pulitzer Prize",
    "script",
    "santoor",
    "mandolin",
    "oenologist",
    "radio",
    "host",
    "horn player",
    "cameraman",
    "tuba",
    "surfboard shaper",
    "impresario",
    "weaver",
    "oud player",
    "blues",
    "animal trainer",
    "harmonica",
    "guitar",
    "movie",
    "woodworker",
    "R&B",
    "antique",
    "craftsman",
    "double bass",
    "keyboard",
    "drag queen",
    "trumpet",
    "hairstylist",
    "etiquette",
    "accordion",
    "mural",
    "Calypso",
    "calypso",
    "bassoon",
    "animation",
    "correspondent",
    # NOTE(review): "taekwondo" looks like a sports role rather than an arts
    # role -- confirm the intended category
    "taekwondo",
    "potter",
    "studio",
    "illusionist",
    "magici",
    "circus",
    "documentar",
    "YouTube",
    "satirist",
    "beauty pageant",
    "baritone",
    "impressionist",
    "stunt",
    "hairdresser",
    "theatre",
    "announcer",
    "flutist",
    "flute",
    "clown",
    "harp",
    "bass player",
    "blog",
    "vlog",
    "show",
    "ventriloquist",
    "typographer",
    "calligrapher",
    "band manager",
    "tabla",
    "storyteller",
    "arranger",
    "news",
    "curator",
    "violist",
    "printmaker",
    "oboist",
    "sound",
    "beauty queen",
    "literary agent",
    "contralto",
    "ceramicist",
    "vocal",
    "ceramist",
    "banjo",
    "publicist",
    "flautist",
    "harpsichord",
    "decorator",
    "talent",
    "accordionist",
    "casting",
    "stage director",
    "theater",
    "humorist",
    "essayist",
    "biographer",
    "art collector",
    "puppeteer",
    "art dealer",
    "drama",
    "art director",
    "entertainer",
    "percussion",
    "clarinet",
    "director",
    "stage",
    "bandoneon",
    "choir",
    "Choir",
]
# Extending the social role list with additional search terms
social = [
    *social,
    "heir",
    "volunteer",
    "public figure",
    "humanitarian",
    "social worker",
]
# Extending the event_other role list with additional search terms
event_other = [
    *event_other,
    "homeless",
    "student",
    "teenager",
    "fan of",
    "worker",
]
# Extending the record role list with additional search terms
record = [
    *record,
    "longevity claimant",
    "record holder",
    "heaviest",
    "tallest",
    "shortest",
    "oldest",
    "youngest",
    "last",
    "first",
    "centenarian",
]
# Extending the other_species list with additional search terms
other_species = [
    *other_species,
    "elephant",
    "Great Dane",
    "greyhound",
    "thoroughbred",
    "bred",
]

#### Updating `known_for_dict` Dictionary of Category Keys and Specific Role Sets of Values

In [None]:
# Building the category -> set-of-roles dictionary from the separate lists;
# the pairs are listed in the order the keys should be inserted
known_for_dict = {
    category_name: set(role_list)
    for category_name, role_list in (
        ("record", record),
        ("event_other", event_other),
        ("crime", crime),
        ("social", social),
        ("academia_humanities", academia_humanities),
        ("business", business),
        ("sciences", sciences),
        ("sports", sports),
        ("law_enf_military_operator", law_enf_military_operator),
        ("politics_govt_law", politics_govt_law),
        ("arts", arts),
        ("spiritual", spiritual),
        ("other_species", other_species),
    )
}

#### Observations:
- Now we will repeat extracting `known_for` values from `info_2` with the updated dictionary.

#### Extracting Category to `known_for_1` Column from `info_2` with Updated `known_for_dict`

In [None]:
%%time

# Column to check
column = "info_2"

# Extract to column
extract_to = "known_for_1"

# For each role, tag every still-untagged row whose text contains the role with
# the role's category, then remove the role from the text.
# Vectorized string operations replace the original row-by-row loop.
for category, category_set in known_for_dict.items():
    for role in category_set:
        # Recomputed for every role: rows tagged by an earlier role are skipped
        pending = df[column].notna() & (df[extract_to] == "")
        # regex=False keeps this a plain substring test (roles such as "R&B")
        hits = pending & df[column].str.contains(role, regex=False, na=False)
        if hits.any():
            df.loc[hits, extract_to] = category
            df.loc[hits, column] = (
                df.loc[hits, column].str.replace(role, "", regex=False).str.strip()
            )

# Checking number of values found and a sample of rows
matched = df[df[extract_to] != ""]
print(f"There are {len(matched)} values in extract_to column.")
# Sample size is capped so the cell does not fail when fewer than 2 rows match
matched.sample(min(2, len(matched)))

#### Extracting Category to `known_for_2` Column from `info_2` with Updated `known_for_dict`

In [None]:
%%time

# Column to check
column = "info_2"

# Extract to column
extract_to = "known_for_2"

# For each role, tag every eligible row whose text contains the role with the
# role's category, then remove the role from the text. A row is eligible when
# it already has a known_for_1 value and extract_to is still empty.
# Vectorized string operations replace the original row-by-row loop.
for category, category_set in known_for_dict.items():
    for role in category_set:
        # Recomputed for every role: rows tagged by an earlier role are skipped
        pending = (
            df[column].notna() & (df["known_for_1"] != "") & (df[extract_to] == "")
        )
        # regex=False keeps this a plain substring test (roles such as "R&B")
        hits = pending & df[column].str.contains(role, regex=False, na=False)
        if hits.any():
            df.loc[hits, extract_to] = category
            df.loc[hits, column] = (
                df.loc[hits, column].str.replace(role, "", regex=False).str.strip()
            )

# Checking number of values found and a sample of rows
matched = df[df[extract_to] != ""]
print(f"There are {len(matched)} values in extract_to column.")
# Sample size is capped so the cell does not fail when fewer than 2 rows match
matched.sample(min(2, len(matched)))

#### Extracting Category to `known_for_3` Column from `info_2` with Updated `known_for_dict`

In [None]:
%%time

# Column to check
column = "info_2"

# Extract to column
extract_to = "known_for_3"

# For each role, tag every eligible row whose text contains the role with the
# role's category, then remove the role from the text. A row is eligible when
# it already has a known_for_2 value and extract_to is still empty.
# Vectorized string operations replace the original row-by-row loop.
for category, category_set in known_for_dict.items():
    for role in category_set:
        # Recomputed for every role: rows tagged by an earlier role are skipped
        pending = (
            df[column].notna() & (df["known_for_2"] != "") & (df[extract_to] == "")
        )
        # regex=False keeps this a plain substring test (roles such as "R&B")
        hits = pending & df[column].str.contains(role, regex=False, na=False)
        if hits.any():
            df.loc[hits, extract_to] = category
            df.loc[hits, column] = (
                df.loc[hits, column].str.replace(role, "", regex=False).str.strip()
            )

# Checking number of values found and a sample of rows
matched = df[df[extract_to] != ""]
print(f"There are {len(matched)} values in extract_to column.")
# Sample size is capped so the cell does not fail when fewer than 2 rows match
matched.sample(min(2, len(matched)))

#### Extracting Category to `known_for_4` Column from `info_2` with Updated `known_for_dict`

In [None]:
%%time

# Column to check
column = "info_2"

# Extract to column
extract_to = "known_for_4"

# For each role, tag every eligible row whose text contains the role with the
# role's category, then remove the role from the text. A row is eligible when
# it already has a known_for_3 value and extract_to is still empty.
# The prerequisite was previously known_for_2 (copy-paste slip); each pass
# should build on the column filled by the pass before it.
# Vectorized string operations replace the original row-by-row loop.
for category, category_set in known_for_dict.items():
    for role in category_set:
        # Recomputed for every role: rows tagged by an earlier role are skipped
        pending = (
            df[column].notna() & (df["known_for_3"] != "") & (df[extract_to] == "")
        )
        # regex=False keeps this a plain substring test (roles such as "R&B")
        hits = pending & df[column].str.contains(role, regex=False, na=False)
        if hits.any():
            df.loc[hits, extract_to] = category
            df.loc[hits, column] = (
                df.loc[hits, column].str.replace(role, "", regex=False).str.strip()
            )

# Checking number of values found and a sample of rows
matched = df[df[extract_to] != ""]
print(f"There are {len(matched)} values in extract_to column.")
# Sample size is capped so the cell does not fail when fewer than 2 rows match
matched.sample(min(2, len(matched)))

#### Extracting Category to `known_for_5` Column from `info_2` with Updated `known_for_dict`

In [None]:
%%time

# Column to check
column = "info_2"

# Extract to column
extract_to = "known_for_5"

# For each role, tag every eligible row whose text contains the role with the
# role's category, then remove the role from the text. A row is eligible when
# it already has a known_for_4 value and extract_to is still empty.
# The prerequisite was previously known_for_2 (copy-paste slip); each pass
# should build on the column filled by the pass before it.
# Vectorized string operations replace the original row-by-row loop.
for category, category_set in known_for_dict.items():
    for role in category_set:
        # Recomputed for every role: rows tagged by an earlier role are skipped
        pending = (
            df[column].notna() & (df["known_for_4"] != "") & (df[extract_to] == "")
        )
        # regex=False keeps this a plain substring test (roles such as "R&B")
        hits = pending & df[column].str.contains(role, regex=False, na=False)
        if hits.any():
            df.loc[hits, extract_to] = category
            df.loc[hits, column] = (
                df.loc[hits, column].str.replace(role, "", regex=False).str.strip()
            )

# Checking number of values found and a sample of rows
matched = df[df[extract_to] != ""]
print(f"There are {len(matched)} values in extract_to column.")
# Sample size is capped so the cell does not fail when fewer than 2 rows match
matched.sample(min(2, len(matched)))

In [None]:
# Checking the number of rows with an empty known_for_1 value
df.loc[df["known_for_1"] == ""].shape[0]

In [None]:
# Checking the info_2 value counts for rows with an empty known_for_1 value
df.loc[df["known_for_1"] == "", "info_2"].value_counts()

In [None]:
# Checking rows with an empty known_for_1 value whose info_2 is exactly "administrator"
df.loc[df["known_for_1"].eq("") & df["info_2"].eq("administrator")]

In [None]:
# Printing a completion message and calling chime's success cue
print("dunzo!")
chime.success()