# Wikipedia Notable Life Expectancies

# [Notebook 5 of : Data Cleaning](https://github.com/teresahanak/wikipedia-life-expectancy/blob/main/wp_life_expect_data_clean4_thanak_2022_06_23.ipynb)

## Context

The


## Objective

The

### Data Dictionary

- Feature: Description

## Importing Necessary Libraries

In [None]:
# To structure code automatically
%load_ext nb_black

# To import/export sqlite databases
import sqlite3 as sql

# To save/open python objects in pickle file
import pickle

# To help with reading, cleaning, and manipulating data
import pandas as pd
import numpy as np
import re

# For visualization
import matplotlib.pyplot as plt
import seaborn as sns

# To define maximum number of columns to be displayed in a dataframe
pd.set_option("display.max_columns", None)
# To define the maximum number of rows to be displayed in a dataframe
pd.set_option("display.max_rows", 200)

# To suppress warnings
# import warnings

# warnings.filterwarnings("ignore")

# To set some visualization attributes
pd.set_option("max_colwidth", 150)

# To play auditory cue when cell has executed, has warning, or has error and set chime theme
import chime

chime.theme("zelda")

## Data Overview

### Reading, Sampling, and Checking Data Shape

In [None]:
# Connecting to the SQLite database and pulling the whole table into pandas
conn = sql.connect("wp_life_expect_clean3.db")
data = pd.read_sql("SELECT * FROM wp_life_expect_clean3", conn)

# Working on a copy so that `data` keeps the values exactly as loaded
df = data.copy()

# Reporting the shape
print(f"There are {len(df)} rows and {len(df.columns)} columns.")

# Previewing the first 2 rows
df.head(n=2)

In [None]:
# Previewing the final 2 rows
df.tail(n=2)

In [None]:
# Previewing 5 randomly drawn rows
df.sample(n=5)

### Checking Data Types, Duplicates, and Null Values

In [None]:
# Checking data types and null values
# (df.info() prints each column's dtype and non-null count)
df.info()

#### Observations:
- Due to the diversity of entries for `known_for`, we may need to prioritize entries for the extraction of this feature.
- To do so, we will proceed with typecasting num_references as integer, so we can determine which entries have more references.
- We will also make a copy of the `info_parenth` column, as we will alter that column and the information it contains is no longer present in the original `info` column.

#### Typecasting `num_references` as Integer

In [None]:
# Typecasting num_references as int64.
# The column is replaced by name rather than written through df.loc[:, col]:
# on pandas >= 2.0 a .loc[:, col] assignment sets values in place and can keep
# the column's previous dtype, which would make the cast ineffective.
df["num_references"] = df["num_references"].astype("int64")

#### Loading `nation_map` from Pickle File to Dictionary nation_map

In [None]:
# Restoring the nation_map object from its pickle file
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# pickles this project wrote itself.
with open("nation_map.pkl", mode="rb") as pickle_file:
    nation_map = pickle.load(pickle_file)

#### Loading `other_species` list from other_species.csv

In [None]:
# Reading the species csv
other_species_df = pd.read_csv("other_species.csv")

# Collapsing to unique values, to avoid searching duplicates as we have been
# adding back to the same csv file (set() does not keep the csv's row order)
other_species = list(set(other_species_df["species"].tolist()))

### `num_references`

In [None]:
# Summary statistics of num_references
df["num_references"].describe()

In [None]:
# Distribution of num_references on a 15 x 5 inch figure
fig, ax = plt.subplots(figsize=(15, 5))
sns.histplot(data=df, x="num_references", ax=ax)
plt.show()

#### Observations:
- The distribution of `num_references` is highly right skewed.
- We will verify some of the links to make sure the pages are for individuals.

In [None]:
# Example of verifying links (repeated in groups of 100 until num_references <100)
# Rows ordered from most to fewest references; showing the first 10
df.sort_values(by="num_references", ascending=False).head(10)

#### Observations:
- Some links point to lists, such as the entry with the maximum `num_references` for Bill Andress.
- Also, there are several entries that point to a list of German World War II medal recipients.
- We will make a list of these links and change their `num_references` values to 1.

In [None]:
# Creating list of links containing "German" with over 20 references.
# Vectorized instead of visiting rows one at a time: regex=False keeps this a
# plain substring test and na=False treats a missing `info` value as "no
# match" (the row-wise `"German" in value` test raises on a missing value).
has_german = df["info"].str.contains("German", regex=False, na=False)
over_20_refs = df["num_references"] > 20

# Row labels meeting both conditions, and the links on those rows
checklist = df.index[has_german & over_20_refs].tolist()
link_list = df.loc[checklist, "link"].tolist()

In [None]:
# # Code used to go verify links
# print(link_list.pop())

In [None]:
# List of links that point to pages with lists of individuals
# Entries are compared for exact equality against the `link` column, so each
# URL must be spelled exactly as it is stored in the data.
link_is_list = [
    "https://en.wikipedia.org/wiki/Bill_Andress",
    "https://en.wikipedia.org/wiki/Joachim_Boosfeld",
    "https://en.wikipedia.org/wiki/Alfred_Eick",
    "https://en.wikipedia.org/wiki/Gerhard_Fischer_(soldier)",
    "https://en.wikipedia.org/wiki/Arthur_B%C3%BCssecke",
    "https://en.wikipedia.org/wiki/Hans-G%C3%BCnther_Lange",
    "https://en.wikipedia.org/wiki/Waldemar_von_Gazen",
    "https://en.wikipedia.org/wiki/Heinz_Angelmaier",
    "https://en.wikipedia.org/wiki/Herbert_Fries",
    "https://en.wikipedia.org/wiki/Friedrich_Carl_(officer)",
    "https://en.wikipedia.org/wiki/Karl-Heinz_Altermann",
    "https://en.wikipedia.org/wiki/Paul_Brasack",
    "https://en.wikipedia.org/wiki/Walther_Gerhold",
    "https://en.wikipedia.org/wiki/Siegfried_Gerstner",
    "https://en.wikipedia.org/wiki/Alois_Schnaubelt",
    "https://en.wikipedia.org/wiki/Karl-Heinz_Wiebe",
    "https://en.wikipedia.org/wiki/Hans-Hermann_Sturm",
    "https://en.wikipedia.org/wiki/Siegfried_Jamrowski",
    "https://en.wikipedia.org/wiki/Franz_Kieslich",
    "https://en.wikipedia.org/wiki/Wilhelm_Lehner",
    "https://en.wikipedia.org/wiki/Fritz_Langanke",
    "https://en.wikipedia.org/wiki/Achim_Wunderlich",
    "https://en.wikipedia.org/wiki/Viktor_Vitali",
    "https://en.wikipedia.org/wiki/Wilhelm_Noller",
    "https://en.wikipedia.org/wiki/Heinrich_Sonne",
    "https://en.wikipedia.org/wiki/Ludwig_Meister",
    "https://en.wikipedia.org/wiki/Oskar_Sch%C3%A4fer",
    "https://en.wikipedia.org/wiki/Karl_Brommann",
    "https://en.wikipedia.org/wiki/Friedrich_Adrario",
    "https://en.wikipedia.org/wiki/Werner_Freiherr_von_Beschwitz",
    "https://en.wikipedia.org/wiki/Hans-Georg_Borck",
    "https://en.wikipedia.org/wiki/Wilhelm_Weidenbr%C3%BCck",
    "https://en.wikipedia.org/wiki/Klaus_Feldt",
    "https://en.wikipedia.org/wiki/Erwin_Bachmann",
    "https://en.wikipedia.org/wiki/List_of_Knight%27s_Cross_of_the_Iron_Cross_recipients_(Bn%E2%80%93Bz)",
    # NOTE(review): this URL has a raw "ö" while the others are percent-encoded;
    # confirm it matches the stored `link` value exactly.
    "https://en.wikipedia.org/wiki/List_of_German_supercentenarians#Hermann_Dörnemann",
    "https://en.wikipedia.org/wiki/List_of_Knight%27s_Cross_of_the_Iron_Cross_recipients_(Ba%E2%80%93Bm)",
    "https://en.wikipedia.org/wiki/Family_of_Barack_Obama#Stanley_Armour_Dunham",
]

#### Changing `num_references` to 1 Where Entry Link Points to a List Page

In [None]:
# Creating list of row indexes with links that point to lists.
# Series.isin tests the whole column at once instead of looking up each row
# with .loc and scanning link_is_list for every row.
rows_to_treat = df.index[df["link"].isin(link_is_list)].tolist()

# Treating num_references for these entries
df.loc[rows_to_treat, "num_references"] = 1

# Checking a sample of rows
df.loc[rows_to_treat, :].sample(2)

#### Observations:
- With those entries treated, and the other top entries verified, let us take another look at the summary statistics and distribution of `num_references`.

In [None]:
# Summary statistics of num_references after the list-page entries were reset to 1
df["num_references"].describe()

In [None]:
# Distribution of the updated num_references on a 15 x 5 inch figure
fig, ax = plt.subplots(figsize=(15, 5))
sns.histplot(data=df, x="num_references", ax=ax)
plt.show()

#### Observations:
- The distribution of `num_references` remains highly right skewed.
- 25% of entries have 2 or fewer references, which may be a good cutoff to search for `known_for` values.
- Let us view the log transformation of the feature, both for the full dataset, and for `num_references` > 2.

In [None]:
# Adding column for log_num_references = log(num_references + 1); the +1
# keeps zero-reference entries finite (log(1) = 0).
# Computed on the whole column at once rather than per element via apply();
# the explicit float cast lets np.log work whatever numeric dtype is stored.
df["log_num_references"] = np.log(df["num_references"].astype("float64") + 1)

# Histplot of log_num_references
plt.figure(figsize=(15, 5))
sns.histplot(data=df, x="log_num_references")
plt.show()

In [None]:
# Histplot of log_num_references restricted to entries with num_references > 2
more_than_two_refs = df[df["num_references"] > 2]
fig, ax = plt.subplots(figsize=(15, 5))
sns.histplot(data=more_than_two_refs, x="log_num_references", ax=ax)
plt.show()

#### Observations:
- The distribution is certainly more normalized with the log transformation, but follows a consistent curve for values > 2.
- For this analysis, we will focus on these more noted entries.
- We can keep the full dataset intact at this point, but rely on these entries to create a dictionary for extracting `known_for` values.

## Extracting `known_for` Data
Our goal will be to have some broader categories into which the specific values will fit.  `known_for` is a diverse feature, in that an individual may be known for a long-term role or roles, a specific event, a relationship with another person who is famous, etc.  So, to some extent we will see what we find and adapt as we go.

Also, we will abandon searching left to right as an individual may fit more than one category, and in no particular order.  For example, Ronald Reagan is entered as "American actor and politician".  He is most known as the 40th president of The United States, so if we prioritized the first value, he would fit only into the category containing actor.  At the same time, it may have been his acting career that led to his political career.  Both arenas are relevant, so we will aim to capture all categories for an individual.  Later, when there are duplicate categories for an individual, we can remove the redundant values.

We will take the following approach:
1. create and check a list of unique values in `info_2` that have a minimum number repeated, sufficient to create sets for each category, but not so exhaustive to be time prohibitive to manually enter.
2. using the pop() method, add each role to its associated category's set, below.
3. combine the sets for each category into one dictionary.
4. search for the values in the dictionary and extract the category key value to a new column `known_for_1`, `known_for_2`, etc.

In [None]:
# Counting info_2 values among entries with more than 2 references
well_referenced = df[df["num_references"] > 2]
col_values = well_referenced["info_2"].value_counts()

# Keeping only the values that occur more than 20 times (count order preserved)
roles_list = col_values[col_values > 20].index.tolist()

# Checking length of list
print(f"We will examine the top {len(roles_list)} unique values in info_2.")

In [None]:
# # Using pop to check list items and add to associated dictionary below
# roles_list.pop()

In [None]:
# Creating lists for each category
# Each entry is a substring that the extraction loops test with `role in item`
# (case-sensitive), presumably why word stems (e.g. "politic") and both
# capitalizations (e.g. "Justice"/"justice") are listed.
politics_govt_law = [
    "retired lawyer Justice Department official known for prosecution of alleged war criminals",
    "prosecutor for war criminals",
    "criminal defense",
    "political scientist",
    "politic",
    "princess",
    "prince",
    "lawyer",
    "attorney",
    "judge",
    "Justice",
    "justice",
    "solicitor",
    "jurist",
    "jury",
    "juror",
    "econom",
    "barrister",
    "conservationis",
    "govern",
    "peer",
    "environmentalis",
    "human rights",
    "civil rights",
    "rights",
    "activist",
    "aristocrat",
    "diploma",
    "public servant",
    "federal",
    "royal",
    "consort",
    "civil servant",
    "ambassador",
    "Ambassador",
]

# Role substrings for the "arts" category. Entries are matched as
# case-sensitive substrings (`role in item`) by the extraction loops, so word
# stems such as "sing" or "comed" cover several word forms; "pop " keeps its
# trailing space on purpose. The duplicated "tenor" and "model" entries were
# removed (first occurrence kept).
arts = [
    "cartoon cat",
    "gospel",
    "producer",
    "disc jockey",
    "film director",
    "designer",
    "science fiction writer",
    "science fiction author",
    "author",
    "screen",
    "write",
    "bluegrass",
    "musician",
    "drama",
    "novelist",
    "journalist",
    "broadcaster",
    "conductor",
    "composer",
    "rock",
    "guitar",
    "actress",
    "actor",
    "model",
    "poet",
    "playwright",
    "puppet",
    "television director",
    "television",
    "TV",
    "host",
    "sculpt",
    "paint",
    "sing",
    "song",
    "radio",
    "presenter",
    "printmaker",
    "potter",
    "reggae",
    "music producer",
    "art dealer",
    "winemaker",
    "artist",
    "blues",
    "country singer",
    "country music",
    "jazz",
    "piano",
    "pianist",
    "choreograph",
    "translator",
    "film",
    "magic",
    "news",
    "anchor",
    "tenor",
    "comic",
    "book",
    "trombon",
    "voice",
    "critic",
    "bass player",
    "bassist",
    "bassoon",
    "bass guitar",
    "editor",
    "document",
    "opera",
    "organist",
    "keyboard",
    "manga",
    "playback",
    "folk",
    "balle",
    "personality",
    "costume",
    "cellist",
    "cello",
    "lyric",
    "soul",
    "soprano",
    "visual",
    "comed",
    "illustrat",
    "R&B",
    "chef",
    "literary",
    "theatre director",
    "theater director",
    "theatre",
    "theater",
    "saxophon",
    "graphic",
    "character",
    "trumpet",
    "pop ",
    "sportscaster",
    "drum",
    "viol",
    "publish",
    "photo",
    "animat",
    "rapper",
    "cinematograph",
    "cinema",
    "fashion",
    "cartoon",
    "architect",
    "MC",
]
# Role substrings for the "sports" category (case-sensitive substring matches).
# "jockey" would also match "disc jockey", which is listed under arts; the
# dictionary built from these lists places arts ahead of sports for that reason.
sports = [
    "jockey",
    "martial",
    "sports administrator",
    "sports",
    "basketball",
    "NBA",
    "baseball",
    "MLB",
    "handball",
    "cross country",
    "skier",
    "weightlift",
    "football",
    "gymnast",
    "olymp",
    "paralymp",
    "Paralymp",
    "hockey",
    "speedway",
    "rider",
    "chess",
    "runn",
    "cricket",
    "umpire",
    "track",
    "field",
    "athlet",
    "racecar",
    "driver",
    "wrestl",
    "badminton",
    "bodybuild",
    "racehorse trainer",
    "racing",
    "race car",
    "judo",
    "curl",
    "Baseball",
    "bridge player",
    "long distance",
    "distance",
    "sail",
    "sumo",
    "skate",
    "golf",
    "sprint",
    "tennis",
    "fencer",
    "fencing",
    "referee",
    "motorcycle",
    "rower",
    "rowing",
    "mountain",
    "rugby league",
    "rugby",
    "Rugby",
    "boxer",
    "soccer",
    "swim",
    "hurl",
    "cycl",
    "Olymp",
    "general manager",
]
# Role substrings for the "sciences" category. Entries are matched as
# case-sensitive substrings (`role in item`) by the extraction loops, so stems
# such as "biolog" cover "biologist", "biology", etc. The duplicated
# "psycholog" entry was removed (first occurrence kept).
sciences = [
    "game designer",
    "theoretical physicist",
    "molecular",
    "biolog",
    "doctor",
    "psycholog",
    "paleontolog",
    "neurologist",
    "pathologist",
    "psychoanalys",
    "organic chemist",
    "biochem",
    "chemist",
    "aeronautic",
    "engineer",
    "pharmacolog",
    "geophysi",
    "virolo",
    "neurosurg",
    "entomolog",
    "pediatric",
    "cardiolog",
    "physician",
    "nurs",
    "immunolog",
    "meteorolog",
    "ornitholog",
    "medical",
    "zoolog",
    "neuroscientist",
    "geograph",
    "statistic",
    "inventor",
    "genetic",
    "astrophysic",
    "surgeon",
    "geolog",
    "psychiatr",
    "botan",
    "anthropol",
    "astron",
    "archaeolog",
    "computer scien",
    "physicist",
    "mathematic",
]

# Role substrings for the "business" category (case-sensitive substring matches)
business = [
    "advertising executive",
    "executive",
    "financier",
    "real estate developer",
    "real estate",
    "mogul",
    "restaurant",
    "businesswoman",
    "businessman",
    "business",
    "trade unionist",
    "unionist",
    "industrialist",
    "banker",
    "banking",
    "entrepreneur",
]
# Role substrings for the "academia_humanities" category (case-sensitive
# substring matches; "Professor"/"professor" are both listed for that reason)
academia_humanities = [
    "academic administrator",
    "legal scholar",
    "literary scholar",
    "educat",
    "academ",
    "schol",
    "historian",
    "teacher",
    "philolog",
    "librar",
    "professor",
    "Professor",
    "musicologist",
    "linguist",
    "sociolog",
    "philosoph",
]
# Role substrings for the "law_enf_military_operator" category
# (case-sensitive substring matches, hence pairs such as "Navy"/"navy")
law_enf_military_operator = [
    "police officer",
    "police",
    "military officer",
    "officer",
    "air force",
    "Air Force",
    "Navy",
    "navy",
    "naval",
    "admiral",
    "Admiral",
    "marshal",
    "general",
    "lieutenant",
    "veteran",
    "Veteran",
    "intelligence",
    "Army",
    "army",
    "major",
    "militant",
    "pilot",
    "aviat",
    "informant",
    "soldier",
    "CIA",
    "IRA",
    "Marine",
    "marine",
]
# Role substrings for the "spiritual" category (case-sensitive substring matches)
spiritual = [
    "prelate",
    "pastor",
    "Buddh",
    "monk",
    "buddh",
    "Jesuit",
    "priest",
    "Episcopal",
    "Anglican",
    "bishop",
    "Bishop",
    "Islamic scholar",
    "religious leader",
    "Catholic",
    "Baptist",
    "evangel",
    "rabbi",
    "Rabbi",
    "cardinal",
    "theolog",
    "Jewish",
]
# Role substrings for the "social" category
social = ["social worker", "socialite", "philanthrop"]
# Role substrings for the "crime" category
# 'terrorist' will need separate treatment
crime = [
    "war criminal",
    "criminal",
    "murderer",
    "mobster",
    "serial killer",
    "convict",
]
# Role substrings for the "event_record_other" category
event_record_other = [
    "student",
    "survivor",
    "supercentenarian",
]
# Extending the species list loaded from other_species.csv with two more values
other_species = other_species + ["Tree", "cocker spaniel"]

In [None]:
# # Example code to quick-screen values that may overlap categories
# df.loc[[index for index in df.index if "scientist" in df.loc[index, "info"]], :]

#### Observations:
- We have a good start on `known_for_1` values for which to search.  Some other roles that have been observed previously we have added to the list also.
- Note that roles such as sportswriter and sports broadcaster, though associated with sports, are also included in arts, to align with the underlying nature of the work itself.
- Let us combine them into one dictionary, taking care to put `arts` before `sports`, so that "disc jockey" comes before "jockey".
- We will also include an other_species category here, but we will keep it on the side, for now.
- Then, we can proceed to extract the category to a new column, `known_for_1`.

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Sets of Values

In [None]:
# Building known_for_dict: category name -> set of role substrings.
# The pairs below are in search order (dicts keep insertion order), e.g. arts
# ahead of sports so that "disc jockey" is tried before "jockey".
# NOTE(review): set() removes duplicates but has no defined iteration order,
# so the order of roles *within* a category list is not preserved.
category_lists = [
    ("social", social),
    ("spiritual", spiritual),
    ("academia_humanities", academia_humanities),
    ("business", business),
    ("sciences", sciences),
    ("arts", arts),
    ("sports", sports),
    ("law_enf_military_operator", law_enf_military_operator),
    ("politics_govt_law", politics_govt_law),
    ("crime", crime),
    ("event_record_other", event_record_other),
]
known_for_dict = {name: set(roles) for name, roles in category_lists}

#### Extracting Category to `known_for` Column from `info_1`

In [None]:
# Initializing known_for_1 column with empty strings; the extraction loops
# treat "" as "no category assigned yet"
df["known_for_1"] = ""

In [None]:
%%time

# Column to check
column = "info_1"

# Extract to column
extract_to = "known_for_1"

# For each role (categories in known_for_dict order), tag every still-untagged
# row whose text contains the role and remove the role from the text.
# Vectorized with the .str accessor instead of visiting rows one at a time;
# regex=False keeps roles such as "R&B" literal, case-sensitive substrings.
for category, category_set in known_for_dict.items():
    for role in category_set:
        # Recomputed per role: rows tagged by an earlier role must be skipped
        open_rows = df[column].notna() & (df[extract_to] == "")
        hits = open_rows & df[column].str.contains(role, regex=False, na=False)
        if hits.any():
            df.loc[hits, extract_to] = category
            df.loc[hits, column] = (
                df.loc[hits, column].str.replace(role, "", regex=False).str.strip()
            )

# Checking number of values found and a sample of rows
print(f"There are {(df[extract_to] != '').sum()} values in {extract_to} column.")
df[df[extract_to] != ""].sample(2)

#### Observations:
- Once again, the `info_1` column has provided a small sample on which to test our code, which appears to be working.
- We can move on to extracting additional `known_for` values in `info_1` to `known_for_2`.
- Sir Robin Brook is a good example of an individual who would have 3 categories with our approach--business, business, and sports.  So, we will have enough `known_for` columns to extract all values for all entries.  Removing these values has the added benefit of simplifying the columns if we choose to search for `cause_of_death`.

#### Extracting Category to `known_for_2` Column from `info_1`

In [None]:
# Initializing known_for_2 column with empty strings; the extraction loops
# treat "" as "no category assigned yet"
df["known_for_2"] = ""

In [None]:
%%time

# Column to check
column = "info_1"

# Extract to column
extract_to = "known_for_2"

# For each role, tag rows that already have known_for_1, are still untagged in
# extract_to, and whose text contains the role; then remove the role from the
# text. Vectorized with the .str accessor instead of visiting rows one at a
# time; regex=False keeps roles literal, case-sensitive substrings.
for category, category_set in known_for_dict.items():
    for role in category_set:
        # Recomputed per role: rows tagged by an earlier role must be skipped
        open_rows = (
            (df["known_for_1"] != "") & df[column].notna() & (df[extract_to] == "")
        )
        hits = open_rows & df[column].str.contains(role, regex=False, na=False)
        if hits.any():
            df.loc[hits, extract_to] = category
            df.loc[hits, column] = (
                df.loc[hits, column].str.replace(role, "", regex=False).str.strip()
            )

# Checking number of values found and a sample of rows
print(f"There are {(df[extract_to] != '').sum()} values in {extract_to} column.")
df[df[extract_to] != ""].sample(2)

#### Extracting Category to `known_for_3` Column from `info_1`

In [None]:
# Initializing known_for_3 column with empty strings; the extraction loops
# treat "" as "no category assigned yet"
df["known_for_3"] = ""

In [None]:
%%time

# Column to check
column = "info_1"

# Extract to column
extract_to = "known_for_3"

# For each role, tag rows that already have known_for_2, are still untagged in
# extract_to, and whose text contains the role; then remove the role from the
# text. Vectorized with the .str accessor instead of visiting rows one at a
# time; regex=False keeps roles literal, case-sensitive substrings.
for category, category_set in known_for_dict.items():
    for role in category_set:
        # Recomputed per role: rows tagged by an earlier role must be skipped
        open_rows = (
            (df["known_for_2"] != "") & df[column].notna() & (df[extract_to] == "")
        )
        hits = open_rows & df[column].str.contains(role, regex=False, na=False)
        if hits.any():
            df.loc[hits, extract_to] = category
            df.loc[hits, column] = (
                df.loc[hits, column].str.replace(role, "", regex=False).str.strip()
            )

# Checking number of values found
print(f"There are {(df[extract_to] != '').sum()} values in {extract_to} column.")

In [None]:
# Checking remaining unique values in info_1 (matched roles have already been
# stripped out of the text by the extraction loops)
df["info_1"].value_counts()

#### Observations:
- We can update our dictionary and do another iteration on `info_1`.
- For now, we will keep the `other_species` list off to the side.

In [None]:
# Updating category lists and known_for_dict
arts = arts + ["choral"]
# "Tree" was already appended to other_species when the lists were first
# created; dict.fromkeys drops repeats while keeping first-seen order, so the
# list stays free of duplicate search terms.
other_species = list(dict.fromkeys(other_species + ["chimpanzee", "Tree"]))
sciences = sciences + ["materials scientist"]
politics_govt_law = politics_govt_law + ["King"]

# Combining separate lists as sets into one dictionary.
# Key order is the category search order used by the extraction loops.
known_for_dict = {
    "social": set(social),
    "spiritual": set(spiritual),
    "academia_humanities": set(academia_humanities),
    "business": set(business),
    "sciences": set(sciences),
    "arts": set(arts),
    "sports": set(sports),
    "law_enf_military_operator": set(law_enf_military_operator),
    "politics_govt_law": set(politics_govt_law),
    "crime": set(crime),
    "event_record_other": set(event_record_other),
}

#### Extracting Category to `known_for_1` Column from `info_1` with Updated `known_for_dict`

In [None]:
%%time

# Column to check
column = "info_1"

# Extract to column
extract_to = "known_for_1"

# For each role (categories in known_for_dict order), tag every still-untagged
# row whose text contains the role and remove the role from the text.
# Vectorized with the .str accessor instead of visiting rows one at a time;
# regex=False keeps roles literal, case-sensitive substrings.
for category, category_set in known_for_dict.items():
    for role in category_set:
        # Recomputed per role: rows tagged by an earlier role must be skipped
        open_rows = df[column].notna() & (df[extract_to] == "")
        hits = open_rows & df[column].str.contains(role, regex=False, na=False)
        if hits.any():
            df.loc[hits, extract_to] = category
            df.loc[hits, column] = (
                df.loc[hits, column].str.replace(role, "", regex=False).str.strip()
            )

# Checking number of values found and a sample of rows
print(f"There are {(df[extract_to] != '').sum()} values in {extract_to} column.")
df[df[extract_to] != ""].sample(2)

#### Extracting Category to `known_for_2` Column from `info_1` with Updated `known_for_dict`

In [None]:
%%time

# Column to check
column = "info_1"

# Extract to column
extract_to = "known_for_2"

# For each role, tag rows that already have known_for_1, are still untagged in
# extract_to, and whose text contains the role; then remove the role from the
# text. Vectorized with the .str accessor instead of visiting rows one at a
# time; regex=False keeps roles literal, case-sensitive substrings.
for category, category_set in known_for_dict.items():
    for role in category_set:
        # Recomputed per role: rows tagged by an earlier role must be skipped
        open_rows = (
            (df["known_for_1"] != "") & df[column].notna() & (df[extract_to] == "")
        )
        hits = open_rows & df[column].str.contains(role, regex=False, na=False)
        if hits.any():
            df.loc[hits, extract_to] = category
            df.loc[hits, column] = (
                df.loc[hits, column].str.replace(role, "", regex=False).str.strip()
            )

# Checking number of values found and a sample of rows
print(f"There are {(df[extract_to] != '').sum()} values in {extract_to} column.")
df[df[extract_to] != ""].sample(2)

#### Extracting Category to `known_for_3` Column from `info_1` with Updated `known_for_dict`

In [None]:
%%time

# Column to check
column = "info_1"

# Extract to column
extract_to = "known_for_3"

# For each role, tag rows that already have known_for_2, are still untagged in
# extract_to, and whose text contains the role; then remove the role from the
# text. Vectorized with the .str accessor instead of visiting rows one at a
# time; regex=False keeps roles literal, case-sensitive substrings.
for category, category_set in known_for_dict.items():
    for role in category_set:
        # Recomputed per role: rows tagged by an earlier role must be skipped
        open_rows = (
            (df["known_for_2"] != "") & df[column].notna() & (df[extract_to] == "")
        )
        hits = open_rows & df[column].str.contains(role, regex=False, na=False)
        if hits.any():
            df.loc[hits, extract_to] = category
            df.loc[hits, column] = (
                df.loc[hits, column].str.replace(role, "", regex=False).str.strip()
            )

# Checking number of values found
print(f"There are {(df[extract_to] != '').sum()} values in {extract_to} column.")

#### Observations:
- We have extracted all of the `known_for` information present in `info_1`.
- It is time to proceed with extracting the same from the remaining numbered `info_` columns.
- For now, we will wait to include `info_parenth` as the column may contain extraneous information that could generate errors.  Later, for missing `known_for_1` values we can search it.

#### Extracting Category to `known_for_1` Column from `info_2`

In [None]:
%%time

# Column to check
column = "info_2"

# Extract to column
extract_to = "known_for_1"

# For each role (categories in known_for_dict order), tag every still-untagged
# row whose text contains the role and remove the role from the text.
# Vectorized with the .str accessor instead of visiting rows one at a time;
# regex=False keeps roles literal, case-sensitive substrings.
for category, category_set in known_for_dict.items():
    for role in category_set:
        # Recomputed per role: rows tagged by an earlier role must be skipped
        open_rows = df[column].notna() & (df[extract_to] == "")
        hits = open_rows & df[column].str.contains(role, regex=False, na=False)
        if hits.any():
            df.loc[hits, extract_to] = category
            df.loc[hits, column] = (
                df.loc[hits, column].str.replace(role, "", regex=False).str.strip()
            )

# Checking number of values found and a sample of rows
print(f"There are {(df[extract_to] != '').sum()} values in {extract_to} column.")
df[df[extract_to] != ""].sample(2)

#### Extracting Category to `known_for_2` Column from `info_2`

In [None]:
%%time

# Column to check
column = "info_2"

# Extract to column
extract_to = "known_for_2"

# For each role, tag rows that already have known_for_1, are still untagged in
# extract_to, and whose text contains the role; then remove the role from the
# text. Vectorized with the .str accessor instead of visiting rows one at a
# time; regex=False keeps roles literal, case-sensitive substrings.
for category, category_set in known_for_dict.items():
    for role in category_set:
        # Recomputed per role: rows tagged by an earlier role must be skipped
        open_rows = (
            (df["known_for_1"] != "") & df[column].notna() & (df[extract_to] == "")
        )
        hits = open_rows & df[column].str.contains(role, regex=False, na=False)
        if hits.any():
            df.loc[hits, extract_to] = category
            df.loc[hits, column] = (
                df.loc[hits, column].str.replace(role, "", regex=False).str.strip()
            )

# Checking number of values found and a sample of rows
print(f"There are {(df[extract_to] != '').sum()} values in {extract_to} column.")
df[df[extract_to] != ""].sample(2)

#### Extracting Category to `known_for_3` Column from `info_2`

In [None]:
%%time

# Column to check
column = "info_2"

# Extract to column
extract_to = "known_for_3"

# For each role, tag rows that already have known_for_2, are still untagged in
# extract_to, and whose text contains the role; then remove the role from the
# text. Vectorized with the .str accessor instead of visiting rows one at a
# time; regex=False keeps roles literal, case-sensitive substrings.
for category, category_set in known_for_dict.items():
    for role in category_set:
        # Recomputed per role: rows tagged by an earlier role must be skipped
        open_rows = (
            (df["known_for_2"] != "") & df[column].notna() & (df[extract_to] == "")
        )
        hits = open_rows & df[column].str.contains(role, regex=False, na=False)
        if hits.any():
            df.loc[hits, extract_to] = category
            df.loc[hits, column] = (
                df.loc[hits, column].str.replace(role, "", regex=False).str.strip()
            )

# Checking number of values found and a sample of rows
print(f"There are {(df[extract_to] != '').sum()} values in {extract_to} column.")
df[df[extract_to] != ""].sample(2)

#### Extracting Category to `known_for_4` Column from `info_2`

In [None]:
# Initializing known_for_4 column with empty strings; the extraction loops
# treat "" as "no category assigned yet"
df["known_for_4"] = ""

In [None]:
%%time

# Column to check
column = "info_2"

# Extract to column
extract_to = "known_for_4"

# For each role, tag rows that already have known_for_3, are still untagged in
# extract_to, and whose text contains the role; then remove the role from the
# text. Vectorized with the .str accessor instead of visiting rows one at a
# time; regex=False keeps roles literal, case-sensitive substrings.
for category, category_set in known_for_dict.items():
    for role in category_set:
        # Recomputed per role: rows tagged by an earlier role must be skipped
        open_rows = (
            (df["known_for_3"] != "") & df[column].notna() & (df[extract_to] == "")
        )
        hits = open_rows & df[column].str.contains(role, regex=False, na=False)
        if hits.any():
            df.loc[hits, extract_to] = category
            df.loc[hits, column] = (
                df.loc[hits, column].str.replace(role, "", regex=False).str.strip()
            )

# Checking number of values found and a sample of rows
print(f"There are {(df[extract_to] != '').sum()} values in {extract_to} column.")
df[df[extract_to] != ""].sample(2)

#### Extracting Category to `known_for_5` Column from `info_2`

In [None]:
# Initializing known_for_5 column with empty strings; the extraction loops
# treat "" as "no category assigned yet"
df["known_for_5"] = ""

In [None]:
%%time

# Column to check
column = "info_2"

# Extract to column
extract_to = "known_for_5"

# For each role, tag rows that already have known_for_4, are still untagged in
# extract_to, and whose text contains the role; then remove the role from the
# text. Vectorized with the .str accessor instead of visiting rows one at a
# time; regex=False keeps roles literal, case-sensitive substrings.
for category, category_set in known_for_dict.items():
    for role in category_set:
        # Recomputed per role: rows tagged by an earlier role must be skipped
        open_rows = (
            (df["known_for_4"] != "") & df[column].notna() & (df[extract_to] == "")
        )
        hits = open_rows & df[column].str.contains(role, regex=False, na=False)
        if hits.any():
            df.loc[hits, extract_to] = category
            df.loc[hits, column] = (
                df.loc[hits, column].str.replace(role, "", regex=False).str.strip()
            )

# Checking number of values found and a sample of rows
print(f"There are {(df[extract_to] != '').sum()} values in {extract_to} column.")
df[df[extract_to] != ""].sample(2)

#### Extracting Category to `known_for_6` Column from `info_2`

In [None]:
# Initializing known_for_6 column with empty strings; the extraction loops
# treat "" as "no category assigned yet"
df["known_for_6"] = ""

In [None]:
%%time

# Column to check
column = "info_2"

# Extract to column
extract_to = "known_for_6"

# For each role, tag rows that already have known_for_5, are still untagged in
# extract_to, and whose text contains the role; then remove the role from the
# text. Vectorized with the .str accessor instead of visiting rows one at a
# time; regex=False keeps roles literal, case-sensitive substrings.
for category, category_set in known_for_dict.items():
    for role in category_set:
        # Recomputed per role: rows tagged by an earlier role must be skipped
        open_rows = (
            (df["known_for_5"] != "") & df[column].notna() & (df[extract_to] == "")
        )
        hits = open_rows & df[column].str.contains(role, regex=False, na=False)
        if hits.any():
            df.loc[hits, extract_to] = category
            df.loc[hits, column] = (
                df.loc[hits, column].str.replace(role, "", regex=False).str.strip()
            )

# Checking number of values found and a sample of rows
print(f"There are {(df[extract_to] != '').sum()} values in {extract_to} column.")
df[df[extract_to] != ""].sample(2)

#### Extracting Category to `known_for_7` Column from `info_2`

In [None]:
# Initializing known_for_7 column with empty strings; the extraction loops
# treat "" as "no category assigned yet"
df["known_for_7"] = ""

In [None]:
%%time

# Column to check
column = "info_2"

# Extract to column
extract_to = "known_for_7"

# For each role, tag rows that already have known_for_6, are still untagged in
# extract_to, and whose text contains the role; then remove the role from the
# text. Vectorized with the .str accessor instead of visiting rows one at a
# time; regex=False keeps roles literal, case-sensitive substrings.
for category, category_set in known_for_dict.items():
    for role in category_set:
        # Recomputed per role: rows tagged by an earlier role must be skipped
        open_rows = (
            (df["known_for_6"] != "") & df[column].notna() & (df[extract_to] == "")
        )
        hits = open_rows & df[column].str.contains(role, regex=False, na=False)
        if hits.any():
            df.loc[hits, extract_to] = category
            df.loc[hits, column] = (
                df.loc[hits, column].str.replace(role, "", regex=False).str.strip()
            )

# Checking number of values found and all of the rows
print(f"There are {(df[extract_to] != '').sum()} values in {extract_to} column.")
df[df[extract_to] != ""]

#### Extracting Category to `known_for_8` Column from `info_2`

In [None]:
# Initializing known_for_8 column with empty strings; the extraction loops
# treat "" as "no category assigned yet"
df["known_for_8"] = ""

In [None]:
%%time

# Column to check
column = "info_2"

# Extract to column
extract_to = "known_for_8"

# For each role, tag rows that already have known_for_7, are still untagged in
# extract_to, and whose text contains the role; then remove the role from the
# text. Vectorized with the .str accessor instead of visiting rows one at a
# time; regex=False keeps roles literal, case-sensitive substrings.
for category, category_set in known_for_dict.items():
    for role in category_set:
        # Recomputed per role: rows tagged by an earlier role must be skipped
        open_rows = (
            (df["known_for_7"] != "") & df[column].notna() & (df[extract_to] == "")
        )
        hits = open_rows & df[column].str.contains(role, regex=False, na=False)
        if hits.any():
            df.loc[hits, extract_to] = category
            df.loc[hits, column] = (
                df.loc[hits, column].str.replace(role, "", regex=False).str.strip()
            )

# Checking number of values found
print(f"There are {(df[extract_to] != '').sum()} values in {extract_to} column.")

In [None]:
# Counting entries with more than 2 references that still have no known_for_1 value
print(
    f'There are {((df["known_for_1"] == "") & (df["num_references"] > 2)).sum()} missing values in known_for_1 column for entries with more than 2 references.'
)

#### Observations:
- We have exhausted our search using the current version of `known_for_dict` and have ~8,000 remaining missing values in `known_for_1` for the entries with more than 2 references.
- Let us create a second version of our dictionary and search `info_2` again.

#### Checking Remaining `info_2` Values

In [None]:
# Counting each remaining info_2 value among entries that have more than 2 references
# and no known_for_1 value yet
col_values = df.loc[
    (df["known_for_1"] == "") & (df["num_references"] > 2), "info_2"
].value_counts()

# Keeping the values that occur more than 2 times, in descending-count order
roles_list = col_values[col_values > 2].index.tolist()

In [None]:
# # Using pop to check list items and add to associated dictionary below
# roles_list.pop()

#### Updating Category Lists for `known_for_dict_2`

In [None]:
# Assigning category lists

# sciences: text fragments that mark an entry as belonging to the sciences category.
# Fragments are matched as substrings of the info text, so truncated stems such as
# "urolog" cover several word endings.
sciences = [
    "administrator of NASA",
    "scientist administrator",
    "zookeeper administrator",
    "zookeeper",
    "computer hacker",
    "atmospheric scientist",
    "optomet",
    "reporter",
    "Pulitzer Prize",
    "anatom",
    "primatolog",
    "geomorpholog",
    "neuroendocrinolog",
    "game developer",
    "limnolog",
    "carcinolog",
    "urolog",
    "gerontolog",
    "hydrolog",
    "software developer",
    "neurophysiolog",
    "embryolog",
    "information scientist",
    "scientist",
    "parasitolo",
    "mammalog",
    "seismolog",
    "volcanolog",
    "sexolog",
    "bacteriolog",
    "etholog",
    "gastroenterolog",
    "video game developer",
    "gynecolog",
    "arachnolog",
    "lichenolog",
    "computer pioneer",
    "computer",
    "crystallograph",
    "ichthyolog",
    "ufolog",
    "toxico",
    "horticultur",
    "archeolog",
    "ecolog",
    "cytolog",
    "president of Space",
    "Labs president",
    "president of the Royal",
    "president of the AMA",
    "president of the Hort",
    "child psychotherapist",
    "haematolog",
    "hematolog",
    "obstetric",
    "gynaecolog",
    "cryptograph",
    "lexicograph",
    "organizational theorist",
    "information theorist",
    "dentist",
    "climatolog",
    "demograph",
    "administrator first leader of NASA",
    "science administrator",
    "hospital administrator",
    "physiolog",
    "nephrolog",
    "endocrinolog",
    "veterinar",
    "dermatolog",
    "glaciolog",
    "agronom",
    "urban planner",
    "pharmac",
    "naturalist",
    "nutritionist",
    "cancer researcher",
    "oceanograph",
    "oncolog",
    "herpetolog",
    "metallurg",
    "palaeontolog",
    "ophthalmolog",
    "mycolog",
    "paediatric",
    "epidemiolog",
    "criminolog",
    "cosmonaut",
    "psychotherap",
]
# politics_govt_law: text fragments that mark an entry as politics/government/law.
# Fragments are matched as substrings of the info text. Repeated entries ("statesman",
# "tribal leader", "resistance member") are listed once; membership is unchanged.
politics_govt_law = [
    "military commander",
    "President",
    "computer hacker",
    "elder",
    "herald",
    "landowner",
    "Politic",
    "Member of Parliament",
    "MP",
    "conspiracy theorist",
    "resistance member",
    "parliament",
    "Parliament",
    "communist revolutionary",
    "communist",
    "revolutionary",
    "Communist",
    "Prime Minister of the",
    "Prime Minister",
    "chieftain",
    "first lady",
    "First Lady",
    "foreign minister",
    "loyalist",
    "Arabian prince",
    "prince",
    "detainee",
    "noblewoman",
    "Treasurer",
    "parliamentarian",
    "congresswoman",
    "congressman",
    "congress",
    "Congress",
    "chairwoman",
    "tribal leader",
    "health official",
    "stateswoman",
    "statesman",
    "trans woman pioneer",
    "member of the stolen generations",
    "colonial official",
    "Resistance member",
    "colonial administrator",
    "whistleblower",
    "Conservative",
    "conservative",
    "Democrat",
    "democrat",
    "indigenous leader",
    "legislat",
    "pollster",
    "countess",
    "community leader",
    "feminis",
    "lobbyist",
    "separatist leader",
    "magistrate",
    "Islamist",
    "commissioner for children",
    "senator",
    "public figure",
    "traditional ruler",
    "courtier",
    "revolution",
    "Governor",
    "Māori leader",
    "nationalist",
    "prosecutor",
    "protester",
    "public official",
    "deputy minister",
    "Foreign minister",
    "Cabinet minister",
    "cabinet minister",
    "Prime minister",
    "Privy Council member former minister MP",
    "former interior minister of",
    "deputy former minister",
    "minister for cultural affairs",
    "chief minister",
    "minister Lord of Appeal",
    "interior minister",
    "minister for social welfare",
    "health minister",
    "MP minister",
    "highways minister",
    "minister for Industry",
    "finance minister",
    "Minister of Education",
    "first female minister",
    "minister of labor",
    "nobleman",
    "noble",
    "dissident",
    "prime minister",
]
# law_enf_military_operator: text fragments that mark an entry as law enforcement,
# military, or operator. Fragments are matched as substrings of the info text.
# "fighter ace" was listed three times and is now listed once; membership is unchanged.
law_enf_military_operator = [
    "freedom fighter",
    "warlord",
    "flying ace",
    "guerrilla fighter",
    "guerrilla",
    "fighter ace",
    "insurgent",
    "Navajo code talker",
    "firefighter",
    "resistance fighter",
    "rebel commander",
    "recipient of the Victoria Cross",
    "Victoria Cross",
    "servicewoman",
    "woman to serve in the Foreign",
    "seaman",
    "commander",
    "colonel",
    "Colonel",
    "Special Operations Executive agent",
    "Special Operations",
    "special operations",
    "codebreaker",
    "Air Chief Marshal",
    "Marshal",
    "al Qaeda leader",
    "al Qaeda",
    "jihadist",
    "Secret Service",
    "secret service",
    "president of the Fraternal Order",
    "nazi",
    "World War",
    "WWI",
    "Federal Bureau of Investigation",
    "corrections administrator",
    "rebel leader",
    "Medal of Honor recipient",
    "Medal of Honor",
    "military leader",
    "airman",
    "defense minister",
    "defence minister",
    "minister of defence",
    "spy",
]
# sports: text fragments that mark an entry as belonging to the sports category.
# Fragments are matched as substrings of the info text. Repeated entries ("yacht racer",
# "racer", "boxing") are listed once; membership is otherwise unchanged.
sports = [
    "squash administrator",
    "softball administrator",
    "Fame as an administrator",
    "darts administrat",
    "sports adminstrat",  # misspelled stem kept in case the source text carries the same typo
    "sports administrat",  # correctly spelled stem; the misspelled one cannot match "sports administrator"
    "sport administrat",
    "games administrat",
    "triathlon competitor administrator",
    "drag racer",
    "cave diver",
    "boxing",
    "heavyweight",
    "balloonist",
    "surfer",
    "Football",
    "rodeo",
    "yacht racer",
    "racer",
    "softball",
    "shot put",
    "greyhound trainer",
    "bobsled",
    "polo",
    "volleyball",
    "racehorse owner",
    "explorer",
    "darts",
    "adventure",
    "karate master",
    "karateka",
    "karate",
    "diver",
    "auto racer",
    "triple jumper",
    "pole vault",
    "Go player",
    "hunter",
    "ice danc",
    "taekwondo practitioner",
    "taekwondo",
    "racewalk",
    "pool player",
    "orienteer",
    "snowboard",
    "NFL",
    "ten pin bowler",
    "bowler",
    "ten pin",
    "bowling",
    "poker",
    "strongman",
    "high jump",
    "hurdles",
    "land speed",
    "land-speed",
    "figure skating",
    "skating",
    "lacrosse",
    "president of the Cin",
    "Chess Union president",
    "president of commun",
    "stock car racer",
    "president of the Western Hock",
    "president of the International Sk",
    "president of FIFA",
    "president of WBC",
    "president of the International Fed",
    "president of the World Sq",
    "horse breeder",
    "dog musher",
    "climber",
    "sport shoot",
    "netball",
    "squash player",
    "equestrian",
    "hurdler",
    "CFL",
    "ski jump",
    "motocross racer",
    "motocross",
    "bullfight",
    "horse trainer",
    "snooker",
]

# academia_humanities: text fragments that mark an entry as academia/humanities.
# Fragments are matched as substrings of the info text. Repeated entries ("museum",
# "university administrator") are listed once; membership is unchanged.
academia_humanities = [
    "communication theorist",
    "polymath",
    "schoolmaster",
    "Iranologist",
    "indologist",
    "museum founder",
    "museum",
    "Hispanist",
    "Mayanist",
    "museum curator",
    "curator",
    "phonetician",
    "calligraph",
    "university administrator",
    "president of Emory",
    "president of Amherst",
    "president of the Univ",
    "president of Camp",
    "president of Biola",
    "president of Tech",
    "president of the Center for",
    "president of Drexel",
    "university president",
    "president of the National Un",
    "president of Stanford",
    "president of the Kwansei",
    "president of the Chicago",
    "president of Ricks",
    "president of San D",
    "president of Nanchang U",
    "medievalis",
    "orientalist",
    "Indologist",
    "classicist",
    "archivist administrator",
    "college administrator",
    "archivist",
    "ethnolog",
    "numismatist",
    "philatelis",
    "Egyptologist",
    "sinolog",
]
# business: text fragments that mark an entry as belonging to the business category.
# Fragments are matched as substrings of the info text, so spacing inside a fragment
# (including a trailing space) is significant.
business = [
    "shipowner",
    "clothier",
    "distiller",
    "car dealer",
    "vintner",
    "company director",
    "billionaire",
    "retail",
    "shipping magnate",
    "magnate",
    "labor union leader",
    "labor union",
    "brewer",
    "farmer",
    "jeweller",
    "accountant",
    "accounting",
    "president of the United Food",
    "president of the National Mari",
    "president of United St",
    "president of Wal",
    "president of the United Auto ",
    "president of the Atchison",
    "president CEO of WD",
    "president of Pressman",
    "president of the Labour",
    "president of NALC",
    "president of the Amalgam",
    "president of SEIU",
    "president of the AFL",
    "nightclub owner",
    "dog walker",
    "stockbroker",
    "union leader",
    "credit union administrator",
    "hotelier",
    "trade union leader",
    "property developer",
    "labor leader",
    "investor",
    "restaurateur",
]
# crime: text fragments that mark an entry as belonging to the crime category.
# Fragments are matched as substrings of the info text, so the exact spacing inside a
# fragment (e.g. the double space in the first entry) is significant.
crime = [
    "terrorist in  Bali bombings",
    "drug dealer",
    "Jewish woman Gestapo",
    "executed for murder in",
    "first woman executed there",
    "waitress who converted to Islam",  # suicide bomber
    "organized crime figure",
    "murder suspect",
    "suspect",
    "concentration camp guard",
    "child molester",
    "gangster",
    "drug trafficker",
    "spree killer",
    "drug lord",
]
# spiritual: text fragments that mark an entry as belonging to the spiritual category.
# Fragments are matched as substrings of the info text. The repeated "ayatollah" entry
# is listed once; membership is unchanged.
spiritual = [
    "President of the Ramak",
    "grand ayatollah",
    "Mormon leader",
    "Mormon",
    "ayatollah",
    "Christian minister",
    "Christian",
    "missionary",
    "christian",
    "imam",
    "Marja",
    "Grand Ayatollah",
    "spiritual guru",
    "Presbyterian minister",
    "Presbyterian",
    "religious figure",
    "religious",
    "spiritual leader",
    "spiritual",
    "Islamic leader",
    "lama",
    "faith healer",
    "Methodist minister",
    "Methodist",
    "beautified catholic",
    "beatif",
    "Beatif",
    "president of Universal",
    "president of the Divine",
    "president of the Lutheran",
    "Fellowship president",
    "president of Ramak",
    "president of the National Counc",
    "president of the Rātana",
    "cleric",
    "Orthodox hierarch",
    "Orthodox",
    "hierarch",
    "clergy",
    "church administrator",
    "archdeacon",
    "deacon",
    "Deacon",
    "nun",
    "astrolog",
    "Islamic",
]
# arts: text fragments that mark an entry as belonging to the arts category.
# Fragments are matched as substrings of the info text. Repeated entries ("announcer",
# "accordion") are listed once; membership is unchanged.
arts = [
    "harmon",
    "movie director",
    "movie",
    "sarod",
    "performer",
    "orchestra leader",
    "orchestra",
    "surfboard shaper",
    "anaesthetist",
    "etiquette expert",
    "investigative reporter",
    "reporter",
    "hairstylist",
    "antiques expert",
    "antique",
    "dance",
    "calypsonian",
    "calypso",
    "Calypso",
    "milliner",
    "essayist",
    "accordion",
    "weaver",
    "gossip columnist",
    "columnist",
    "blog",
    "vlog",
    "stuntwoman",
    "stuntman",
    "tattooed woman",
    "craftswoman",
    "craftsman",
    "artisan",
    "music director",
    "announcer",
    "flutist",
    "flute",
    "beauty pageant contestant",
    "pageant",
    "impressionist",
    "illusionist",
    "illusion",
    "magic",
    "typograph",
    "auctioneer",
    "bandleader",
    "foreign correspondent",
    "public address announcer",
    "harpist",
    "caricatur",
    "crime reporter",
    "oboist",
    "oboe",
    "YouTuber",
    "hairdresser",
    "storyteller",
    "president of CBS",
    "president of the Motion",
    "president of Archie",
    "president of the Metro",
    "president of Def",
    "president of John",
    "president of the Mormon Tab",
    "tabla player",
    "music promoter",
    "children entertainer",
    "entertainer",
    "set decorator",
    "decorator",
    "contralto",
    "ventriloquist",
    "harpsichord",
    "beauty pageant winner",
    "arts administrator",
    "music administrator",
    "musical administrator",
    "cultural administrator",
    "Academy Awards administrator",
    "baritone",
    "vocal",
    "stunt",
    "beauty queen",
    "stage director",
    "sound mixer",
    "DJ",
    "band manager",
    "ceramist",
    "banjo",
    "arts patron",
    "publicist",
    "flautist",
    "music manager",
    "ceramicist",
    "casting director",
    "talent manager",
    "talent agent",
    "fiddle",
    "humorist",
    "art director",
    "biograph",
    "clarinet",
    "percussion",
    "art collector",
]
# social: text fragments that mark an entry as belonging to the social category.
# Fragments are matched as substrings of the info text.
social = [
    "president of the Shafeek",
    "president of Refugees",
    "president of Rotary",
    "child welfare expert",
    "child safety",
    "children advocate",
    "organised rescue of child",
    "humanitarian",
    "natural childbirth campaign",
    "scout leader",
]
# event_record_other: text fragments that mark an entry as known for an event, a record,
# or something else outside the occupational categories.
# Fragments are matched as substrings of the info text.
event_record_other = [
    "heiress",
    "cyberbullying victim",
    "victim",
    "homeless man",
    "brain dead woman",
    "heaviest woman",
    "heaviest",
    "tallest",
    "shortest",
    "longest",
    "oldest",
    "youngest",
    "obese woman",
    "woman with rare",
    "transgender woman",
    "mauled woman",
    "former comfort woman",
    "last living",
    "longevity claimant",
    "homicide victim",
    "teenager",
    "child beauty queen",
    "centenarian",
    "murder victim",
]
# other_species: extending the existing list with further non-human fragments.
# A new list is built (existing entries first, then the additions) and rebound to the name.
other_species = [
    *other_species,
    "elephant",
    "Great Dane",
    "greyhound",
    "trained thoroughbred racehorse",
    "Thoroughbred racehorse sire",
    "thoroughbred horse",
    "polar bear",
    "trained racehorse",
    "thoroughbred racehorse sire",
    "sheep",
    "presidential pet",
    "gorilla",
    "trained Thoroughbred racehorse",
    "racehorse sire",
    "Thoroughbred racehorse",
    "thoroughbred racehorse",
    "thoroughbred",
    "Thoroughbred",
    "racehorse",
]

In [None]:
# # Example code to quick-screen values that may overlap categories
# df.loc[
#     [
#         index
#         for index in df[df["known_for_1"] == ""].index
#         if "woman" in df.loc[index, "info"]
#     ],
#     :,
# ]

#### Observations:
- We will need to do at least one more iteration after this one to address certain terms that are used across categories, such as 'administrator', 'president', or 'minister'.  
- Some specific values are assigned below to avoid incorrect `known_for` values for these entries.

In [None]:
# Locating the additional entry that points to a list page
gotho_rows = df.index[df["link"] == "https://en.wikipedia.org/wiki/Mbah_Gotho"]

# Assigning value of 1 for num_references
df.loc[gotho_rows, "num_references"] = 1

# Updating log_num_references to agree (presumably the log of num_references; log of 1 is 0)
df.loc[gotho_rows, "log_num_references"] = 0

In [None]:
# Displaying the updated entry
df.loc[df["link"] == "https://en.wikipedia.org/wiki/Mbah_Gotho"]

In [None]:
# Moving "dog attack" into cause_of_death and clearing info_3, so the text is not
# picked up as an other_species match
index = df.index[
    df["link"] == "https://en.wikipedia.org/wiki/Death_of_Diane_Whipple"
]
df.loc[index, "cause_of_death"] = "dog attack"
df.loc[index, "info_3"] = None

In [None]:
# Recording this administrator as academia_humanities in known_for_1 and clearing info_2
index = df.index[
    df["link"] == "https://en.wikipedia.org/wiki/John_Blackburn_(educator)"
]
df.loc[index, "known_for_1"] = "academia_humanities"
df.loc[index, "info_2"] = None

In [None]:
# Clearing info_2 (which holds "administrator") for this entry; its category will be
# captured from the entry's other info
index = df.index[df["link"] == "https://en.wikipedia.org/wiki/John_E._Otto"]
df.loc[index, "info_2"] = None

In [None]:
# Moving "complications of diverticulitis" into cause_of_death and clearing info_3,
# so "diver" inside "diverticulitis" cannot be matched as a role
index = df.index[df["link"] == "https://en.wikipedia.org/wiki/Emanuel_Steward"]
df.loc[index, "cause_of_death"] = "complications of diverticulitis"
df.loc[index, "info_3"] = None

In [None]:
# Assigning arts to first known_for value for entry (individual is associated with arts and politics_govt_law)
# NOTE(review): the value is written to the info_1 column, not known_for_1, whereas the
# neighbouring fix cells assign categories to known_for_1 directly. Confirm that info_1 is
# searched later so "arts" is actually picked up; otherwise this should target known_for_1.
index = df[df["link"] == "https://en.wikipedia.org/wiki/Pavel_Dost%C3%A1l"].index
df.loc[index, "info_1"] = "arts"

In [None]:
# Clearing info_2 (which holds "minister") for this entry; its category will be
# captured from the entry's other info
index = df.index[df["link"] == "https://en.wikipedia.org/wiki/Jean_Chamant"]
df.loc[index, "info_2"] = None

In [None]:
# Recording the entry for the woman who served in the Foreign Legion as
# law_enf_military_operator in known_for_1 and clearing info_2
index = df.index[df["link"] == "https://en.wikipedia.org/wiki/Susan_Travers"]
df.loc[index, "known_for_1"] = "law_enf_military_operator"
df.loc[index, "info_2"] = None

#### Creating `known_for_dict_2` Dictionary of Category Keys and Specific Role Sets of Values

In [None]:
# Combining the separate role lists into one dictionary of category -> set of roles.
# The pair order below is the dictionary's insertion order, i.e. the order in which
# categories are tried during extraction. set() drops duplicates and does not keep
# the order of roles inside a list.
known_for_dict_2 = {
    category: set(roles)
    for category, roles in (
        ("crime", crime),
        ("social", social),
        ("academia_humanities", academia_humanities),
        ("business", business),
        ("sciences", sciences),
        ("sports", sports),
        ("law_enf_military_operator", law_enf_military_operator),
        ("arts", arts),
        ("spiritual", spiritual),
        ("politics_govt_law", politics_govt_law),
        ("event_record_other", event_record_other),
    )
}

#### Observations:
- Now we will repeat extracting `known_for` values from `info_2`.

#### Extracting Category to `known_for_1` from `info_2` with `known_for_dict_2`

In [None]:
%%time

# Dictionary version
search_dict = known_for_dict_2

# Column to check
column = "info_2"

# Extract to column
extract_to = "known_for_1"

# For each role, record its category in extract_to and strip the matched text from column.
# Only rows that still have text in column and no extract_to value yet are searched; the
# mask is rebuilt per role because matches fill extract_to.
# Categories are tried in dictionary order. Within a category, roles are tried longest-first
# (ties alphabetical) so overlapping roles resolve the same way on every run instead of
# depending on set iteration order.
for category, category_set in search_dict.items():
    for role in sorted(category_set, key=lambda text: (-len(text), text)):
        eligible = df[column].notna() & (df[extract_to] == "")
        # Plain substring test (regex=False), same as the `role in item` check
        matched = eligible & df[column].str.contains(role, regex=False, na=False)
        if matched.any():
            df.loc[matched, extract_to] = category
            df.loc[matched, column] = (
                df.loc[matched, column].str.replace(role, "", regex=False).str.strip()
            )

# Checking number of values found and a sample of rows
print(f"There are {(df[extract_to] != '').sum()} values in {extract_to} column.")
df[df[extract_to] != ""].sample(2)

#### Extracting Category to `known_for_2` from `info_2` with `known_for_dict_2`

In [None]:
%%time

# Dictionary version
search_dict = known_for_dict_2

# Column to check
column = "info_2"

# Extract to column
extract_to = "known_for_2"

# For each role, record its category in extract_to and strip the matched text from column.
# Only rows that already have a known_for_1 value, still have text in column, and have no
# extract_to value yet are searched; the mask is rebuilt per role because matches fill extract_to.
# Categories are tried in dictionary order. Within a category, roles are tried longest-first
# (ties alphabetical) so overlapping roles resolve the same way on every run instead of
# depending on set iteration order.
for category, category_set in search_dict.items():
    for role in sorted(category_set, key=lambda text: (-len(text), text)):
        eligible = (
            (df["known_for_1"] != "") & df[column].notna() & (df[extract_to] == "")
        )
        # Plain substring test (regex=False), same as the `role in item` check
        matched = eligible & df[column].str.contains(role, regex=False, na=False)
        if matched.any():
            df.loc[matched, extract_to] = category
            df.loc[matched, column] = (
                df.loc[matched, column].str.replace(role, "", regex=False).str.strip()
            )

# Checking number of values found and a sample of rows
print(f"There are {(df[extract_to] != '').sum()} values in {extract_to} column.")
df[df[extract_to] != ""].sample(2)

#### Extracting Category to `known_for_3` from `info_2` with `known_for_dict_2`

In [None]:
%%time

# Dictionary version
search_dict = known_for_dict_2

# Column to check
column = "info_2"

# Extract to column
extract_to = "known_for_3"

# For each role, record its category in extract_to and strip the matched text from column.
# Only rows that already have a known_for_2 value, still have text in column, and have no
# extract_to value yet are searched; the mask is rebuilt per role because matches fill extract_to.
# Categories are tried in dictionary order. Within a category, roles are tried longest-first
# (ties alphabetical) so overlapping roles resolve the same way on every run instead of
# depending on set iteration order.
for category, category_set in search_dict.items():
    for role in sorted(category_set, key=lambda text: (-len(text), text)):
        eligible = (
            (df["known_for_2"] != "") & df[column].notna() & (df[extract_to] == "")
        )
        # Plain substring test (regex=False), same as the `role in item` check
        matched = eligible & df[column].str.contains(role, regex=False, na=False)
        if matched.any():
            df.loc[matched, extract_to] = category
            df.loc[matched, column] = (
                df.loc[matched, column].str.replace(role, "", regex=False).str.strip()
            )

# Checking number of values found and a sample of rows
print(f"There are {(df[extract_to] != '').sum()} values in {extract_to} column.")
df[df[extract_to] != ""].sample(2)

#### Extracting Category to `known_for_4` from `info_2` with `known_for_dict_2`

In [None]:
%%time

# Dictionary version
search_dict = known_for_dict_2

# Column to check
column = "info_2"

# Extract to column
extract_to = "known_for_4"

# For each role, record its category in extract_to and strip the matched text from column.
# Only rows that already have a known_for_3 value, still have text in column, and have no
# extract_to value yet are searched; the mask is rebuilt per role because matches fill extract_to.
# Categories are tried in dictionary order. Within a category, roles are tried longest-first
# (ties alphabetical) so overlapping roles resolve the same way on every run instead of
# depending on set iteration order.
for category, category_set in search_dict.items():
    for role in sorted(category_set, key=lambda text: (-len(text), text)):
        eligible = (
            (df["known_for_3"] != "") & df[column].notna() & (df[extract_to] == "")
        )
        # Plain substring test (regex=False), same as the `role in item` check
        matched = eligible & df[column].str.contains(role, regex=False, na=False)
        if matched.any():
            df.loc[matched, extract_to] = category
            df.loc[matched, column] = (
                df.loc[matched, column].str.replace(role, "", regex=False).str.strip()
            )

# Checking number of values found and a sample of rows
print(f"There are {(df[extract_to] != '').sum()} values in {extract_to} column.")
df[df[extract_to] != ""].sample(2)

#### Extracting Category to `known_for_5` from `info_2` with `known_for_dict_2`

In [None]:
%%time

# Dictionary version
search_dict = known_for_dict_2

# Column to check
column = "info_2"

# Extract to column
extract_to = "known_for_5"

# For each role, record its category in extract_to and strip the matched text from column.
# Only rows that already have a known_for_4 value, still have text in column, and have no
# extract_to value yet are searched; the mask is rebuilt per role because matches fill extract_to.
# Categories are tried in dictionary order. Within a category, roles are tried longest-first
# (ties alphabetical) so overlapping roles resolve the same way on every run instead of
# depending on set iteration order.
for category, category_set in search_dict.items():
    for role in sorted(category_set, key=lambda text: (-len(text), text)):
        eligible = (
            (df["known_for_4"] != "") & df[column].notna() & (df[extract_to] == "")
        )
        # Plain substring test (regex=False), same as the `role in item` check
        matched = eligible & df[column].str.contains(role, regex=False, na=False)
        if matched.any():
            df.loc[matched, extract_to] = category
            df.loc[matched, column] = (
                df.loc[matched, column].str.replace(role, "", regex=False).str.strip()
            )

# Checking number of values found and a sample of rows
print(f"There are {(df[extract_to] != '').sum()} values in {extract_to} column.")
df[df[extract_to] != ""].sample(2)

#### Extracting Category to `known_for_6` from `info_2` with `known_for_dict_2`

In [None]:
%%time

# Dictionary version
search_dict = known_for_dict_2

# Column to check
column = "info_2"

# Extract to column
extract_to = "known_for_6"

# For each role, record its category in extract_to and strip the matched text from column.
# Only rows that already have a known_for_5 value, still have text in column, and have no
# extract_to value yet are searched; the mask is rebuilt per role because matches fill extract_to.
# Categories are tried in dictionary order. Within a category, roles are tried longest-first
# (ties alphabetical) so overlapping roles resolve the same way on every run instead of
# depending on set iteration order.
for category, category_set in search_dict.items():
    for role in sorted(category_set, key=lambda text: (-len(text), text)):
        eligible = (
            (df["known_for_5"] != "") & df[column].notna() & (df[extract_to] == "")
        )
        # Plain substring test (regex=False), same as the `role in item` check
        matched = eligible & df[column].str.contains(role, regex=False, na=False)
        if matched.any():
            df.loc[matched, extract_to] = category
            df.loc[matched, column] = (
                df.loc[matched, column].str.replace(role, "", regex=False).str.strip()
            )

# Checking number of values found and a sample of rows
print(f"There are {(df[extract_to] != '').sum()} values in {extract_to} column.")
df[df[extract_to] != ""].sample(2)

#### Extracting Category to `known_for_7` from `info_2` with `known_for_dict_2`

In [None]:
%%time

# Dictionary version
search_dict = known_for_dict_2

# Column to check
column = "info_2"

# Extract to column
extract_to = "known_for_7"

# For each role, record its category in extract_to and strip the matched text from column.
# Only rows that already have a known_for_6 value, still have text in column, and have no
# extract_to value yet are searched; the mask is rebuilt per role because matches fill extract_to.
# Categories are tried in dictionary order. Within a category, roles are tried longest-first
# (ties alphabetical) so overlapping roles resolve the same way on every run instead of
# depending on set iteration order.
for category, category_set in search_dict.items():
    for role in sorted(category_set, key=lambda text: (-len(text), text)):
        eligible = (
            (df["known_for_6"] != "") & df[column].notna() & (df[extract_to] == "")
        )
        # Plain substring test (regex=False), same as the `role in item` check
        matched = eligible & df[column].str.contains(role, regex=False, na=False)
        if matched.any():
            df.loc[matched, extract_to] = category
            df.loc[matched, column] = (
                df.loc[matched, column].str.replace(role, "", regex=False).str.strip()
            )

# Checking number of values found and the matching rows
print(f"There are {(df[extract_to] != '').sum()} values in {extract_to} column.")
df[df[extract_to] != ""]

#### Extracting Category to `known_for_8` from `info_2` with `known_for_dict_2`

In [None]:
%%time

# Dictionary version
search_dict = known_for_dict_2

# Column to check
column = "info_2"

# Extract to column
extract_to = "known_for_8"

# For each role, record its category in extract_to and strip the matched text from column.
# Only rows that already have a known_for_7 value, still have text in column, and have no
# extract_to value yet are searched; the mask is rebuilt per role because matches fill extract_to.
# Categories are tried in dictionary order. Within a category, roles are tried longest-first
# (ties alphabetical) so overlapping roles resolve the same way on every run instead of
# depending on set iteration order.
for category, category_set in search_dict.items():
    for role in sorted(category_set, key=lambda text: (-len(text), text)):
        eligible = (
            (df["known_for_7"] != "") & df[column].notna() & (df[extract_to] == "")
        )
        # Plain substring test (regex=False), same as the `role in item` check
        matched = eligible & df[column].str.contains(role, regex=False, na=False)
        if matched.any():
            df.loc[matched, extract_to] = category
            df.loc[matched, column] = (
                df.loc[matched, column].str.replace(role, "", regex=False).str.strip()
            )

# Checking number of values found
print(f"There are {(df[extract_to] != '').sum()} values in {extract_to} column.")

#### Checking Remaining Missing Values in `known_for_1` for Entries with > 2 References

In [None]:
# Counting entries with more than 2 references that still have no known_for_1 value
print(
    f'There are {((df["known_for_1"] == "") & (df["num_references"] > 2)).sum()} remaining missing values in column for entries with more than 2 references.'
)

#### Observations:
- We will do one more round of updating the dictionary and searching.
- For this iteration, we will include `other_species` in our search.

#### Creating `known_for_dict_3`

In [None]:
# Counting each remaining info_2 value among entries that have more than 2 references
# and no known_for_1 value yet
col_values = df.loc[
    (df["known_for_1"] == "") & (df["num_references"] > 2), "info_2"
].value_counts()

# Keeping the values that occur more than once, in descending-count order
roles_list = col_values[col_values > 1].index.tolist()

# Checking length of list
print(f"We will examine the top {len(roles_list)} unique values in info_2.")

In [None]:
# # Using pop to update known_for_dict
# roles_list.pop()

#### Updating Category Lists for `known_for_dict_3`

In [None]:
# Assigning category lists

# sciences: additional text fragments for the sciences category (third dictionary version).
# Truncated stems such as "malacolog" are presumably matched as substrings, as in the
# earlier extraction loops.
sciences = [
    "NASA mission director",
    "wildlife expert",
    "weatherman",
    "cryptologist",
    "herbalist",
    "butler",  # for White House or is politician also
    "number theorist",
    "sex therapist",
    "malacolog",
    "oenolog",
    "genealog",
    "futurist",
    "alternative medicine practitioner",
    "therapist",
    "logician",
    "beekeeper",
    "aerodynamicist",
    "graph theorist",
    "management theorist",
    "cosmolog",
    "health administrator",
    "probabilist",
    "orchidolog",
    "orthodont",
    "sleep researcher",
    "game programmer",
    "pulmonolo",
    "nematolog",
    "biotech",
    "hepatolog",
    "rheumatolog",
]
# politics_govt_law: additional text fragments for the politics/government/law category
# (third dictionary version). The repeated "Senator" entry is listed once; membership is
# unchanged. Spacing inside a fragment (including a trailing space) is part of the match text.
politics_govt_law = [
    "sultan",
    "Sultan",
    "Senator",  # sports must go before politics_gov_law in dict
    "presidential candidate",
    "monarch",
    "tribal chief",
    "District Judge",
    "campaigner for assisted suicide",
    "campaigner",
    "Resistance leader",
    "Hitler Youth leader",
    "anarchist",
    "disability advocate",
    "foreign policy expert",
    "foreign policy",
    "emir",
    "Emir",
    "white supremacist",
    "social reformer",
    "defector",
    "member of the  Resistance",
    "presidential aide",
    "bureaucrat",
    "pacifist",
    "prisoner",
    "town planner",
    "baron",
    "Interior Minister",
    "Premier",
    "boy demonstrator",
    "demonstrator",
    "Representative",
    "last heir of the Reuters family",
    "heir to the Sarawakan throne",
    "possible heir of Hastings Banda",
    "republican",
    "director of the Mint ",
    "Radio Peace",
    "director of Office of Tele",
]
# law_enf_military_operator: additional text fragments for the law enforcement / military /
# operator category (third dictionary version).
law_enf_military_operator = [
    "law enforcement official",
    "secret agent",
    "leader of Al Qaeda",
    "Al Qaeda leader",
    "Al Qaeda",
    "security guard",
    "nuclear strategist",
    "former leader of the Intelligence Service",
    "leader of Anbar Salvation Council",
    "leader of the Anbar Salvation Council",
    "leader of Tigers",
    "senior Hamas leader",
    "leader of the Izz",
    "militia leader",
    "OSS agent leader",
    "Lashkar e Jhangvi leader",
    "co leader of the coup",
    "mercenary leader",
    "mercenary",
    "prisoner of war",  # Must come before politics_govt_law due to prisoner
    "paramedic",
    "paramilitary",
    "Chief of the Air Staff",
    "bodyguard",
    "Mossad agent",
    "bomber crewman",
    "KGB",
    "lawman",
    "navigator",
    "Coast Guard",
    "coast guard",
    "gendarme",
    "rebel",
    "sheriff",
    "Secret Intelligence Service",
    "technical sergeant",
    "sergeant",
    "security official",
    "FBI",
    "Resistance fighter",
    "director of the Coast",
    "in flight director",
]
# sports: additional text fragments for the sports category (third dictionary version).
# The repeated "camogie" entry is listed once; membership is unchanged.
sports = [
    "Muay  fighter",
    "Muay Thai",
    "muay thai",
    "Orioles Senators",
    "Ottawa Senators",
    "fitness trainer",
    "yachtsman",
    "master of aikido",
    "aikido",
    "BASE jumper",
    "futsal",
    "alpin",
    "lawn bowl",
    "camogie",
    "backgammon",
    "cheerleader",
    "long jump",
    "canoeist",
    "outdoorsman",
    "trap shoot",
    "qigong master",
    "coxswain",
    "sharpshooter",
    "rejoneador",
    "goalkeeper",
    "billiards",
    "showjumper",
    "shogi player",
    "angler",
    "fisherman",
    "FIFA",
    "alpinist",
    "luger",
    "long jumper",
    "powerlift",
    "race walker",
    "Formula One team technical director",
    "NASCAR technical director",
]

# academia_humanities: additional text fragments for the academia/humanities category
# (third dictionary version).
academia_humanities = [
    "Japanolog",
    "Germanist",
    "Assyriolog",
    "museolog",
    "Esperant",
    "Hellenis",
    "Tibetolog",
    "semiotic",
    "bioethic",
    "crossword compiler",
    "museologist",
    "litterateur",
    "paleograph",
    "toponymist",
    "Albanolog",
    "art theorist",
    "lecturer",
    "vexillolog",
    "preservationist",
    "Arabist",
    "Hellenist",
    "intellectual",
    "university director",
    "director of the National Air",
    "director of the Metropolitan Museum",
]
# business: additional text fragments for the business category (third dictionary version).
business = [
    "media owner",
    "casino owner",
    "labor negotiator",
    "hedge fund manager",
    "club owner",
    "clothing manufacturer",
    "arms dealer",
    "financial analyst",
    "food manufacturer",
    "union official",
    "grocer",
    "land developer",
    "investment advisor",
    "management consultant",
    "investment guru Warren Buffett",
    "management guru",
    "rancher",
    "investment manager",
    "toy manufacturer",
    "managing director of Uls",
    "director of the Abu",
]
# crime: additional text fragments for the crime category (third dictionary version).
# The repeated "islamist terrorist group leader" entry is listed once; membership is unchanged.
crime = [
    "inmate",
    "mafia gang leader",
    "islamist terrorist group leader",
    "organized crime leader",
    'founder nominal leader of the " Mafia"',
    "leader Holocaust perpetrator",
    "perpetrator",
    "mafia leader",
    "leader of al Gama'a al Islamiyya",
    "leader of Jabhat al Nusra",
    "gang leader",
    "cartel leader",
    "Arabian terrorist",
    "fugitive",
    "'Ndrangheta boss",
    "bomber",  # Must come after law_enf_military_operator due to bomber crewman
    "serial burglar",
    "mafia hitman",
    "vigilante",
    "counterfeiter",
    "sex offender",
    "felon",
    "crime figure",
    "crime boss",
    "fraudster",
]
# spiritual: additional text fragments for the spiritual category (third dictionary version).
# The repeated "leader of the cult group" entry is listed once; membership is unchanged.
spiritual = [
    "minister",
    "psychic",
    "sect leader",
    "leader of the cult group",
    "leader of the Branch Davidian sect",
    "Sikh leader",
    "leader of the Apostolic United Brethren",
    "LDS leader",
    "young earth creationist leader",
    "leader in the Baháʼí Faith",
    # NOTE(review): no leading "l" — possibly deliberate so both "Leader" and "leader"
    # match, or a typo that leaves a stray letter behind after removal; confirm
    "eader of the Druids",
    "church leader",
    "leader in the LDS Church",
    "LDS Church youth leader",
    "leader in The Church",
    "Sufi leader",
    "Muslim",
    "Sufi",
    "Adventist leader",
    "cult leader",
    "theosophist leader",
    "Saints leader",
    "Faith leader",
    "fortune teller",
    "traditional healer",
    "cantor",
    "apostle",
    "preacher",
    "Zen master",
    "Zen",
    "Cardinal",
    "salvationist",
    "yogi",
]
# Role substrings that assign an entry to the "arts" category (matched as
# case-sensitive substrings of `info_2`).
# Longer phrases are listed before the shorter phrases they contain (e.g.
# "circus ringmaster" before "circus", "script supervisor" before "script").
# NOTE(review): "celebrity" is listed here and "arts" precedes "other_species" in
# known_for_dict_3, so an `info_2` value containing "celebrity cat" would be
# labelled "arts" first -- confirm that this is handled elsewhere.
arts = [
    "euphonium",
    "clown",
    "dialect coach",
    "bibliograph",
    "santoor",
    "bagpipe",
    "art gallery owner",
    "lutenist",
    "lute player",
    "timpanist",
    "muse",  # must come after academia_humanities due to museolog
    "circus ringmaster",
    "circus",
    "ringmaster",
    "lithograph",
    "satir",
    "perfume",
    "conga",
    "drag queen",
    "script supervisor",
    "script",
    "horn player",
    # Trailing space is part of the pattern -- presumably to avoid matching
    # longer words that start with "horn"; verify
    "horn ",
    "oud player",
    "cultural leader",
    "Hollywood",
    "arts leader",
    "leader of The Prisonaires",
    "band leader",
    "Māori kapa haka leader",
    "goldsmith",
    "watercolor",
    "tailor",
    "animal trainer",
    "pornographer",
    "uilleann piper",
    "science fiction",
    "game show contestant",
    "woodwork",
    "boogie woogie",
    "stripper",
    "The Wack Pack",
    "festival organizer",
    "sitarist",
    "paranormal investigator",
    "instrumentalist",
    "motivational speaker",
    "art gallerist",
    "concert promoter",
    "celebrity",
    "couturier",
    "music theorist",
    "art conservator",
    "Queen Pictures",
    "Queen Works",
    "tuba",
    "mandolin",
    "BBC guru",
    "writing guru",
    "theatrical agent",
    "B boy",
    "vedette",
    "woodcarver",
    "watchmaker",
    "impresario",
    "Beatles to their",
    "cameraman",
    "memoirist",
    "arts",
    "muralist",
    "music arranger",
    "arranger",
    "stage manager",
]
# Role substrings that assign an entry to the "social" category (matched as
# case-sensitive substrings of `info_2`).
social = [
    "leader of the Muscular Dystrophy Association",
    "urban homesteading leader",
    "scouting leader",
    "Girl Guides leader",
    "humanist",
    "community worker",
    "aid worker",
    "health advocate",
    "Scouter",
    # NOTE(review): looks like a misspelling of "philanthropist" -- presumably
    # mirrors the spelling found in the data; confirm before correcting
    "filantropist",
    "director of the Peace Corps",
    "director of UNICEF",
]
# Role substrings that assign an entry to the "event_record_other" category
# (matched as case-sensitive substrings of `info_2`).
# This category is the first key of known_for_dict_3, so these patterns are tried
# before those of every other category in that dictionary.
# NOTE(review): "woman" is a very broad pattern, and "prsistent" / the double space
# in "migrant child in  custody" look like typos -- presumably they mirror the text
# left in `info_2` after earlier extraction steps; confirm before changing.
event_record_other = [
    "woman",
    "migrant child in  custody",
    "prsistent vegetative state patient",
    "obese man",
    "amnesiac",
    "pensioner",
    "employee",
    "schoolboy",
    "intercepted suicide bomber at school",
    "flight attendant",
]
# Rebinding other_species to a new list: the roles collected so far followed by
# additional animal roles (the previously bound list object is left unmodified)
other_species = [
    *other_species,
    "rhinoceros",
    "eventing horse",
    "giraffe",
    "bucking bull",
    "racehorse broodmare",
    "show jumping horse",
    "celebrity cat",
    "panda",
    "dressage horse",
]

In [None]:
# # Example code to quick-screen values that may overlap categories
# df.loc[
#     [
#         index
#         for index in df[df["known_for_1"] == ""].index
#         if "director" in df.loc[index, "info"]
#     ],
#     :,
# ]

In [None]:
# Manually categorizing one entry, located by its Wikipedia link, as
# politics_govt_law and clearing its info_2 text
index = df.index[
    df["link"] == "https://en.wikipedia.org/wiki/Esther_Figueiredo_Ferraz"
]
df.loc[index, "known_for_1"] = "politics_govt_law"
df.loc[index, "info_2"] = None

#### Creating `known_for_dict_3` Dictionary of Category Keys and Specific Role Sets of Values

In [None]:
# Combining separate lists into one dictionary of category -> ordered roles.
# Key order is the order in which the extraction loops try the categories.
# The roles were previously stored as sets; iteration order of a set of strings is
# arbitrary and can change between kernel sessions (hash randomization), so a short
# role such as "circus" could be tried before "circus ringmaster" and the results
# were not reproducible. Each list is instead de-duplicated with dict.fromkeys,
# which keeps the roles in the order they were written.
known_for_dict_3 = {
    category: list(dict.fromkeys(roles))
    for category, roles in (
        ("event_record_other", event_record_other),
        ("social", social),
        ("academia_humanities", academia_humanities),
        ("business", business),
        ("sciences", sciences),
        ("sports", sports),
        ("law_enf_military_operator", law_enf_military_operator),
        ("politics_govt_law", politics_govt_law),
        ("arts", arts),
        ("spiritual", spiritual),
        ("crime", crime),
        ("other_species", other_species),
    )
}

#### Extracting Category to `known_for_1` Column from `info_2` with `known_for_dict_3`

In [None]:
%%time

# Dictionary version
search_dict = known_for_dict_3

# Column to check
column = "info_2"

# Extract to column
extract_to = "known_for_1"

# For each role, label the still-uncategorized rows whose `column` text contains it
# with the role's category, then strip the role text out of `column`.
# Vectorized string methods replace the row-by-row .loc loop; an empty string can
# never contain a role, so no separate emptiness check is needed.
for category, roles in search_dict.items():
    for role in roles:
        # Re-evaluated per role so a row labelled by an earlier role is skipped
        pending = df.loc[(df[extract_to] == "") & df[column].notna(), column]
        hits = pending[pending.str.contains(role, regex=False, na=False)]
        if hits.empty:
            continue
        df.loc[hits.index, extract_to] = category
        df.loc[hits.index, column] = hits.str.replace(role, "", regex=False).str.strip()

# Checking number of values found and a sample of rows
extracted = df[df[extract_to] != ""]
print(f"There are {len(extracted)} values in extract_to column.")
# Capping the sample size, since sample(2) raises when fewer than 2 rows exist
extracted.sample(min(2, len(extracted)))

#### Extracting Category to `known_for_2` Column from `info_2` with `known_for_dict_3`

In [None]:
%%time

# Dictionary version
search_dict = known_for_dict_3

# Column to check
column = "info_2"

# Extract to column
extract_to = "known_for_2"

# Only rows that already have a known_for_1 category are eligible; the mask is
# computed once because known_for_1 is not modified in this cell
has_previous = df["known_for_1"] != ""

# For each role, label the eligible rows whose `column` text contains it with the
# role's category, then strip the role text out of `column`.
# Vectorized string methods replace the row-by-row .loc loop.
for category, roles in search_dict.items():
    for role in roles:
        # Re-evaluated per role so a row labelled by an earlier role is skipped
        pending = df.loc[
            has_previous & (df[extract_to] == "") & df[column].notna(), column
        ]
        hits = pending[pending.str.contains(role, regex=False, na=False)]
        if hits.empty:
            continue
        df.loc[hits.index, extract_to] = category
        df.loc[hits.index, column] = hits.str.replace(role, "", regex=False).str.strip()

# Checking number of values found and a sample of rows
extracted = df[df[extract_to] != ""]
print(f"There are {len(extracted)} values in extract_to column.")
# Capping the sample size, since sample(2) raises when fewer than 2 rows exist
extracted.sample(min(2, len(extracted)))

#### Extracting Category to `known_for_3` Column from `info_2` with `known_for_dict_3`

In [None]:
%%time

# Dictionary version
search_dict = known_for_dict_3

# Column to check
column = "info_2"

# Extract to column
extract_to = "known_for_3"

# Only rows that already have a known_for_2 category are eligible; the mask is
# computed once because known_for_2 is not modified in this cell
has_previous = df["known_for_2"] != ""

# For each role, label the eligible rows whose `column` text contains it with the
# role's category, then strip the role text out of `column`.
# Vectorized string methods replace the row-by-row .loc loop.
for category, roles in search_dict.items():
    for role in roles:
        # Re-evaluated per role so a row labelled by an earlier role is skipped
        pending = df.loc[
            has_previous & (df[extract_to] == "") & df[column].notna(), column
        ]
        hits = pending[pending.str.contains(role, regex=False, na=False)]
        if hits.empty:
            continue
        df.loc[hits.index, extract_to] = category
        df.loc[hits.index, column] = hits.str.replace(role, "", regex=False).str.strip()

# Checking number of values found and a sample of rows
extracted = df[df[extract_to] != ""]
print(f"There are {len(extracted)} values in extract_to column.")
# Capping the sample size, since sample(2) raises when fewer than 2 rows exist
extracted.sample(min(2, len(extracted)))

#### Extracting Category to `known_for_4` Column from `info_2` with `known_for_dict_3`

In [None]:
%%time

# Dictionary version
search_dict = known_for_dict_3

# Column to check
column = "info_2"

# Extract to column
extract_to = "known_for_4"

# Only rows that already have a known_for_3 category are eligible; the mask is
# computed once because known_for_3 is not modified in this cell
has_previous = df["known_for_3"] != ""

# For each role, label the eligible rows whose `column` text contains it with the
# role's category, then strip the role text out of `column`.
# Vectorized string methods replace the row-by-row .loc loop.
for category, roles in search_dict.items():
    for role in roles:
        # Re-evaluated per role so a row labelled by an earlier role is skipped
        pending = df.loc[
            has_previous & (df[extract_to] == "") & df[column].notna(), column
        ]
        hits = pending[pending.str.contains(role, regex=False, na=False)]
        if hits.empty:
            continue
        df.loc[hits.index, extract_to] = category
        df.loc[hits.index, column] = hits.str.replace(role, "", regex=False).str.strip()

# Checking number of values found and a sample of rows
extracted = df[df[extract_to] != ""]
print(f"There are {len(extracted)} values in extract_to column.")
# Capping the sample size, since sample(2) raises when fewer than 2 rows exist
extracted.sample(min(2, len(extracted)))

#### Extracting Category to `known_for_5` Column from `info_2` with `known_for_dict_3`

In [None]:
%%time

# Dictionary version
search_dict = known_for_dict_3

# Column to check
column = "info_2"

# Extract to column
extract_to = "known_for_5"

# Only rows that already have a known_for_4 category are eligible; the mask is
# computed once because known_for_4 is not modified in this cell
has_previous = df["known_for_4"] != ""

# For each role, label the eligible rows whose `column` text contains it with the
# role's category, then strip the role text out of `column`.
# Vectorized string methods replace the row-by-row .loc loop.
for category, roles in search_dict.items():
    for role in roles:
        # Re-evaluated per role so a row labelled by an earlier role is skipped
        pending = df.loc[
            has_previous & (df[extract_to] == "") & df[column].notna(), column
        ]
        hits = pending[pending.str.contains(role, regex=False, na=False)]
        if hits.empty:
            continue
        df.loc[hits.index, extract_to] = category
        df.loc[hits.index, column] = hits.str.replace(role, "", regex=False).str.strip()

# Checking number of values found and a sample of rows
extracted = df[df[extract_to] != ""]
print(f"There are {len(extracted)} values in extract_to column.")
# Capping the sample size, since sample(2) raises when fewer than 2 rows exist
extracted.sample(min(2, len(extracted)))

#### Extracting Category to `known_for_6` Column from `info_2` with `known_for_dict_3`

In [None]:
%%time

# Dictionary version
search_dict = known_for_dict_3

# Column to check
column = "info_2"

# Extract to column
extract_to = "known_for_6"

# Only rows that already have a known_for_5 category are eligible; the mask is
# computed once because known_for_5 is not modified in this cell
has_previous = df["known_for_5"] != ""

# For each role, label the eligible rows whose `column` text contains it with the
# role's category, then strip the role text out of `column`.
# Vectorized string methods replace the row-by-row .loc loop.
for category, roles in search_dict.items():
    for role in roles:
        # Re-evaluated per role so a row labelled by an earlier role is skipped
        pending = df.loc[
            has_previous & (df[extract_to] == "") & df[column].notna(), column
        ]
        hits = pending[pending.str.contains(role, regex=False, na=False)]
        if hits.empty:
            continue
        df.loc[hits.index, extract_to] = category
        df.loc[hits.index, column] = hits.str.replace(role, "", regex=False).str.strip()

# Checking number of values found and a sample of rows
extracted = df[df[extract_to] != ""]
print(f"There are {len(extracted)} values in extract_to column.")
# Capping the sample size, since sample(2) raises when fewer than 2 rows exist
extracted.sample(min(2, len(extracted)))

#### Extracting Category to `known_for_7` Column from `info_2` with `known_for_dict_3`

In [None]:
%%time

# Dictionary version
search_dict = known_for_dict_3

# Column to check
column = "info_2"

# Extract to column
extract_to = "known_for_7"

# Only rows that already have a known_for_6 category are eligible; the mask is
# computed once because known_for_6 is not modified in this cell
has_previous = df["known_for_6"] != ""

# For each role, label the eligible rows whose `column` text contains it with the
# role's category, then strip the role text out of `column`.
# Vectorized string methods replace the row-by-row .loc loop.
for category, roles in search_dict.items():
    for role in roles:
        # Re-evaluated per role so a row labelled by an earlier role is skipped
        pending = df.loc[
            has_previous & (df[extract_to] == "") & df[column].notna(), column
        ]
        hits = pending[pending.str.contains(role, regex=False, na=False)]
        if hits.empty:
            continue
        df.loc[hits.index, extract_to] = category
        df.loc[hits.index, column] = hits.str.replace(role, "", regex=False).str.strip()

# Checking number of values found and displaying all rows that received a value
extracted = df[df[extract_to] != ""]
print(f"There are {len(extracted)} values in extract_to column.")
extracted

#### Extracting Category to `known_for_8` Column from `info_2` with `known_for_dict_3`

In [None]:
%%time

# Dictionary version
search_dict = known_for_dict_3

# Column to check
column = "info_2"

# Extract to column
extract_to = "known_for_8"

# Only rows that already have a known_for_7 category are eligible; the mask is
# computed once because known_for_7 is not modified in this cell
has_previous = df["known_for_7"] != ""

# For each role, label the eligible rows whose `column` text contains it with the
# role's category, then strip the role text out of `column`.
# Vectorized string methods replace the row-by-row .loc loop.
for category, roles in search_dict.items():
    for role in roles:
        # Re-evaluated per role so a row labelled by an earlier role is skipped
        pending = df.loc[
            has_previous & (df[extract_to] == "") & df[column].notna(), column
        ]
        hits = pending[pending.str.contains(role, regex=False, na=False)]
        if hits.empty:
            continue
        df.loc[hits.index, extract_to] = category
        df.loc[hits.index, column] = hits.str.replace(role, "", regex=False).str.strip()

# Checking number of values found
print(f"There are {len(df[df[extract_to] != ''])} values in extract_to column.")

#### Checking Remaining Missing Values in `known_for_1` for Entries with More than 2 References

In [None]:
# Counting entries with more than 2 references that still have no known_for_1 value
still_missing = (df["known_for_1"] == "") & (df["num_references"] > 2)
print(f"There are {still_missing.sum()} remaining missing values in column.")

In [None]:
# Tallying the remaining `info_2` values, most frequent first, for entries with
# more than 2 references that still have no known_for_1 value
unlabelled = (df["known_for_1"] == "") & (df["num_references"] > 2)
df.loc[unlabelled, "info_2"].value_counts()

In [None]:
# Printing a completion message and playing the chime success sound
print("dunzo!")
chime.success() 
