# Wikipedia Notable Life Expectancies
# [Notebook 6: Data Cleaning Part 5](https://github.com/teresahanak/wikipedia-life-expectancy/blob/main/wp_life_expect_data_clean5_thanak_2022_07_26.ipynb)
### Context

The
### Objective

The
### Data Dictionary
- Feature: Description

### Importing Libraries

In [1]:
# To structure code automatically
%load_ext nb_black

# To import/export sqlite databases
import sqlite3 as sql

# To save/open python objects in pickle file
import pickle

# To help with reading, cleaning, and manipulating data
import pandas as pd
import numpy as np
import re

# To define maximum number of columns to be displayed in a dataframe
pd.set_option("display.max_columns", None)
# To define the maximum number of rows to be displayed in a dataframe
pd.set_option("display.max_rows", 200)

# To supress warnings
# import warnings

# warnings.filterwarnings("ignore")

# To set some visualization attributes
pd.set_option("max_colwidth", 150)

# To play auditory cue when cell has executed, has warning, or has error and set chime theme
import chime

chime.theme("zelda")

<IPython.core.display.Javascript object>

## Data Overview

### [Reading](), Sampling, and Checking Data Shape

In [2]:
# Reading the dataset
conn = sql.connect("wp_life_expect_clean4.db")
data = pd.read_sql("SELECT * FROM wp_life_expect_clean4", conn)

# Making a working copy
df = data.copy()

# Checking the shape
print(f"There are {df.shape[0]} rows and {df.shape[1]} columns.")

# Checking first 2 rows of the data
df.head(2)

There are 98045 rows and 48 columns.


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,info_1_0,info_2_1,info_2_2,info_2_3,info_3_0,info_3_1,info_3_2,info_4_0,info_4_1,info_4_2,info_5_0,info_5_1,info_5_2,info_6_0,info_6_1,info_7_0,info_8_0,info_8_1,info_9_0,info_10_0,info_11_0,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
0,1,William Chappell,", 86, British dancer, ballet designer and director.",https://en.wikipedia.org/wiki/William_Chappell_(dancer),21,1994,January,,86.0,,United Kingdom of Great Britain and Northern Ireland,,,3.091042,,,,,ballet designer,director,,,,,,,,,,,,,,,,0,0,0,0,0,1,0,0,0,0,0,0,1
1,1,Raymond Crotty,", 68, Irish economist, writer, and academic.",https://en.wikipedia.org/wiki/Raymond_Crotty,12,1994,January,,68.0,,Ireland,,,2.564949,,,,,writer,,,and academic,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,1,0,0,0,1


<IPython.core.display.Javascript object>

In [3]:
# Checking last 2 rows of the data
df.tail(2)

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,info_1_0,info_2_1,info_2_2,info_2_3,info_3_0,info_3_1,info_3_2,info_4_0,info_4_1,info_4_2,info_5_0,info_5_1,info_5_2,info_6_0,info_6_1,info_7_0,info_8_0,info_8_1,info_9_0,info_10_0,info_11_0,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
98043,9,Aamir Liaquat Hussain,", 50, Pakistani journalist and politician, MNA .",https://en.wikipedia.org/wiki/Aamir_Liaquat_Hussain,99,2022,June,", since",50.0,,Pakistan,,"2002 2007, since 2018",4.60517,,politician,,,MNA,,,,,,,,,,,,,,,,,0,0,0,0,0,1,0,0,0,0,0,0,1
98044,9,Zou Jing,", 86, Chinese engineer, member of the Chinese Academy of Engineering.",https://en.wikipedia.org/wiki/Zou_Jing_(engineer),3,2022,June,,86.0,,"China, People's Republic of",,,1.386294,,,,,member of the Academy of Engineering,,,,,,,,,,,,,,,,,1,0,0,0,0,0,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

In [4]:
# Checking a sample of the data
df.sample(5)

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,info_1_0,info_2_1,info_2_2,info_2_3,info_3_0,info_3_1,info_3_2,info_4_0,info_4_1,info_4_2,info_5_0,info_5_1,info_5_2,info_6_0,info_6_1,info_7_0,info_8_0,info_8_1,info_9_0,info_10_0,info_11_0,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
56148,26,Robert Austin Larter,", 90, Canadian politician.",https://en.wikipedia.org/wiki/Robert_Austin_Larter,3,2015,December,,90.0,,Canada,,,1.386294,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,1,0,0,0,1
89845,7,Vinod Singh,", 57, Indian politician, Uttar Pradesh MLA , COVID-19.",https://en.wikipedia.org/wiki/Vinod_Singh_(Gonda_politician),4,2021,May,,57.0,,India,,1996 2017,1.609438,,,,,Uttar Pradesh MLA,,,COVID,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,1,0,0,0,1
84677,15,Jole Santelli,", 51, Italian politician, Deputy , cardiac arrest.",https://en.wikipedia.org/wiki/Jole_Santelli,6,2020,October,and President of Calabria since,51.0,,Italy,,2001 2020 and President of Calabria since 2020,1.94591,,,,,Deputy,,,cardiac arrest,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,1,0,0,0,1
16402,16,Richard B. Sewall,", 95, American professor of English and writer.",https://en.wikipedia.org/wiki/Richard_B._Sewall,7,2003,April,,95.0,,United States of America,,,2.079442,,writer,,,,,,,,,,,,,,,,,,,,0,0,0,1,0,0,0,0,0,0,0,0,1
85159,9,Israel Horovitz,", 81, American playwright and screenwriter .",https://en.wikipedia.org/wiki/Israel_Horovitz,27,2020,November,", ,",81.0,,United States of America,,", ,",3.332205,,screenwriter,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,1,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

### Checking Data Types, Duplicates, and Null Values

In [5]:
# Checking data types and null values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98045 entries, 0 to 98044
Data columns (total 48 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   day                        98045 non-null  object 
 1   name                       98045 non-null  object 
 2   info                       98045 non-null  object 
 3   link                       98045 non-null  object 
 4   num_references             98045 non-null  int64  
 5   year                       98045 non-null  int64  
 6   month                      98045 non-null  object 
 7   info_parenth               36660 non-null  object 
 8   age                        98045 non-null  float64
 9   cause_of_death             13 non-null     object 
 10  place_1                    97891 non-null  object 
 11  place_2                    8115 non-null   object 
 12  info_parenth_copy          36660 non-null  object 
 13  log_num_references         98045 non-null  flo

<IPython.core.display.Javascript object>

#### Observations:
- With our dataset loaded, we can pick up where we left off with extracting known_for values by rebuilding `known_for_dict`.
- We will proceed with the next `info_2` column, `info_2_1`, as `info_2` is the Wikipedia field that contains the majority of `known_for` information.

### Extracting `known_for` Continued

#### Finding `known_for` Roles in `info_2_1`

In [6]:
# # Obtaining values for column and their counts
# roles_list = df["info_2_1"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [7]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [8]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_2_1"].notna()].index
#             if "coach" in df.loc[index, "info_2_1"]
#         ],
#         "info_2_1",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [9]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_list, key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [10]:
# # Example code to quick-check a specific entry
# df[df["info_2_1"] == "headmaster"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [11]:
# Creating lists for each category and sorting by decreasing length and removing duplicates

politics_govt_law = [
    "politician who served as President of the Vermont State Senate",
    "politician Senator from Delaware from to",
    "politician in Valencian Community",
    "former Newfoundland politician",
    "anti communist politician",
    "Conservative politician",
    "Labour Party politician",
    "oppositional politician",
    "nationalist politician",
    "Republican politician",
    "pan Turkic politician",
    "communist politician",
    "post war politician",
    "one time politician",
    "Unionist politician",
    "eventual politician",
    "Marxist politician",
    "Labour politician",
    "Green politician",
    "politician from",
    "nazi politician",
    "East politician",
    "politician in",
    "a politician",
    "politician",
    "speechwriter for President",
    "defence expert",
    "acting president of",
    "anti divorce activist",
    "former head of the WHO AIDS program",
    "head of the dynasty",
    "head of the F D A",
]
politics_govt_law = sorted(
    list(set(politics_govt_law)), key=lambda x: len(x), reverse=True
)

arts = [
    "screenwriter specialising in comedies",
    "the wife of writer Dylan Thomas",
    "television screenplay writer",
    "songwriter for Guns N' Roses",
    "pasacalle singer songwriter",
    "writer in Gujarati language",
    "singer songwriter",
    "television screenwriter",
    "science fiction writer",
    "roll singer songwriter",
    "writer of TV comedies",
    "writer for children",
    "comics screenwriter",
    "radio script writer",
    "short story writer",
    "non fiction writer",
    "Gros Ventre writer",
    "singer songwriter",
    "television writer",
    "comic book writer",
    "screenplay writer",
    "film music writer",
    "soap opera writer",
    "theme songwriter",
    "writer publicist",
    "a mystery writer",
    "children writer",
    "writer on music",
    "cookbook writer",
    "fantasy writer",
    "fashion writer",
    "mystery writer",
    "fiction writer",
    "culture writer",
    "dialect writer",
    "western writer",
    "travel writer",
    "comics writer",
    "comedy writer",
    "horror writer",
    "jingle writer",
    "script writer",
    "screenwriter",
    "scriptwriter",
    "radio writer",
    "crime writer",
    "score writer",
    "dance writer",
    "story writer",
    "prose writer",
    "food writer",
    "film writer",
    "ghostwriter",
    "songwriter",
    "copywriter",
    "vocal coach involved in the Wrong Door Raid",
    "vocal session arranger",
    "Yiddish vocalist",
    "blues vocalist",
    "vocal coach",
    "vocalist",
    "acting coach",
    "track maker",
    "drama",
    "radio dramatist",
    "drama teacher",
    "drama coach",
    "dramaturge",
    "dramatist",
    "voice artist voice of the Flower Pot Men",
    "television voice actor",
    "voice dubbing artist",
    "voice of Baby Huey",
    "voice over artist",
    "voice over actor",
    "a voice teacher",
    "voice actress",
    "voice teacher",
    "voice artist",
    "voice actor",
    "voice coach",
    "former head of the Horticultural Society",
]
arts = sorted(list(set(arts)), key=lambda x: len(x), reverse=True)

sports = [
    "general manager of the Dallas Cowboys professional football team",
    "former owner of the Cleveland Cavaliers basketball team",
    "radio announcer for the Lakers basketball team since",
    "first head coach of the women national team",
    "principal of the Toyota F racing team",
    "national softball team member",
    "Formula One team principal",
    "baseball team part owner",
    "women basketball teams",
    "national team captain",
    "national team manager",
    "football team owner",
    "baseball team owner",
    "cricket team coach",
    "partial team owner",
    "sports team owner",
    "racing team owner",
    "hockey team owner",
    "race team owner",
    "team manager",
    "team leader",
    "team owner",
    "field team",
    "coach in the National Hockey League",
    "World Hockey Association defenseman",
    "Commonwealth Games athletics coach",
    "college athletics administrator",
    "athletics director",
    "athletics coach",
    "former owner of the Cleveland Cavaliers basketball team",
    "for the Lakers basketball team since",
    "Ohio State University basketball player",
    "high school basketball coach",
    "wheelchair basketball player",
    "college basketball player",
    "women basketball teams",
    "NBL basketball player",
    "basketball executive",
    "basketball",
    "basketball player",
    "basketball coach",
    "general manager of the Dallas Cowboys professional football team",
    "football placekicker with the Dallas Cowboys",
    "chairman of Bradford City football club",
    "professional rugby league footballer",
    "died on the football pitch in Lyon",
    "footballer for Newport County",
    "professional football player",
    "rugby league football player",
    "World Cup football referee",
    "high school football coach",
    "gridiron football player",
    "rugby league footballer",
    "football administrator",
    "football club chairman",
    "football club director",
    "college football coach",
    "World Cup footballer",
    "football team owner",
    "football club owner",
    "football executive",
    "CFL football coach",
    "football chairman",
    "football official",
    "football manager",
    "football referee",
    "football coach",
    "footballer",
    "Pittsburgh Penguins coach",
    "coach for the NFL Giants",
    "NFL player",
    "silver medallist handball player",
    "beach handball coach",
    "handball player",
    "figure skating",
    "Olympic figure skater",
    "figure skating coach",
    "speed skater",
    "international coach",
    "Olympic field hockey player",
    "Olympic hockey player",
    "ice hockey executive",
    "field hockey player",
    "field hockey coach",
    "ice hockey player",
    "hockey team owner",
    "ice hockey coach",
    "hockey player",
    "Hall of Fame coach",
    "cricket administrator",
    "test cricket umpire",
    "cricket team coach",
    "cricket player",
    "cricket umpire",
    "cricketer",
    "Norwich City record goalscorer",
    "goaltending coach",
    "cup winning coach",
    "Paralympic athlete",
    "Paralympic coach",
    'professional wrestler known as "Bad News Brown"',
    "professional wrestling ring",
    "professional wrestling manager",
    "world champion arm wrestler",
    "professional wrestler",
    "wrestling",
    "wrestling manager",
    "Olympic wrestler",
    "wrestling coach",
    "wrestler",
    "assistant coach",
    "executive coach",
    "Victorian coach",
    "baseball player Philadelphia Athletics",
    "baseball official scorer",
    "baseball team part owner",
    "baseball club owner",
    "baseball team owner",
    "baseball executive",
    "baseball player",
    "baseball coach",
    "pitching coach",
    "first woman to swim the Channel in both directions",
    "first Olympic swimming medallist",
    "first woman to swim the Channel",
    "bronze medal winning swimmer",
    "silver medal winning swimmer",
    "Olympic swimmer",
    "swimming coach",
    "swimmer",
    "strength coach",
    "national coach",
    "field athlete who won four gold medals at the Summer Olympics",
    "candidate for National Olympic Committee president",
    "second oldest national Olympic competitor",
    "Olympic silver medal winning pentathlete",
    "president of the Olympic Committee",
    "the nation first Olympic champion",
    "first Olympic swimming medallist",
    "Olympic gold medallist in discus",
    "the first Olympic gold medalist",
    "winner of the first Olympic m",
    "oldest living former Olympian",
    "Olympic long distance runner",
    "Olympic field hockey player",
    "three time Olympic medalist",
    "Olympic gold medal winner",
    "Olympic water polo player",
    "Olympic lightweight boxer",
    "oldest surviving Olympian",
    "Olympic silver medallist",
    "Olympic bronze medalist",
    "Olympic silver medalist",
    "Olympic sports shooter",
    "Olympic Games champion",
    "Olympic gold medalist",
    "Olympic sport shooter",
    "Olympic figure skater",
    "Olympic hockey player",
    "Olympic sharpshooter",
    "Olympic long jumper",
    "Olympic competitor",
    "Olympic ski jumper",
    "Olympic bobsledder",
    "Olympic medallist",
    "Olympic champion",
    "Olympic medalist",
    "Olympic wrestler",
    "Olympic sprinter",
    "Olympic Champion",
    "Olympic official",
    "Olympic swimmer",
    "Olympic shooter",
    "Olympic hurdler",
    "Olympic athlete",
    "Senior Olympian",
    "Olympic fencer",
    "field Olympian",
    "Olympic skier",
    "Olympic coach",
    "Olympiian",
    "Olympian",
    "Olympics",
    "college coach",
    "cycling advocate",
    "professional road bicycle racer",
    "professional cyclist",
    "road racing cyclist",
    "motorcycle racer",
    "cycling",
    "cycling advocate",
    "racing cyclist",
    "cycling coach",
    "road cyclist",
    "cyclist",
    "bronze medalist fencer",
    "fencing trainer",
    "Olympic fencer",
    "fencing master",
    "fencing coach",
    "foil fencer",
    "fencer",
    "tennis player",
    "tennis umpire",
    "tennis coach",
    "sports analyst",
    "sports media",
    "radio sports",
    "television sports",
    "Olympic sports shooter",
    "TF sports",
    "sports administrator",
    "sports team owner",
    "sports club owner",
    "sports executive",
    "sports official",
    "sports director",
    "sports shooter",
    "sports coach",
    "sportswoman",
    "sportsman",
    "underwater diver",
    "diving coach",
    "scuba diver",
    "diver",
    "javelin thrower",
    "discus thrower",
    "weight thrower",
    "rowing coach",
    "rower",
    "field athlete who won four gold medals at the Summer Olympics",
    "former chairman of Sheffield United",
    "field athlete",
    "field coach",
    "outfielder",
    "field team",
    "racetrack operator",
    "track athlete",
    "track coach",
    "first head coach of the women national team",
    "head coach",
]
sports = sorted(list(set(sports)), key=lambda x: len(x), reverse=True)

sciences = [
    "an authority on psychoanalysis",
    "bee breeding authority",
    "technical authority",
    "team member of the Manhattan Project",
    "scientific divulgator",
    "a pioneer in the field of vitro fertilization",
    "pioneer in the field of computer graphics",
    "field biologist",
    "former head of the Psychological Association",
]
sciences = sorted(list(set(sciences)), key=lambda x: len(x), reverse=True)

business_farming = [
    "uniform supplier to the International Ice Hockey Federation",
    "owner of Ellen Tracy sportswear",
    "sportswear pioneer",
    "the one time head of the Gucci fashion house",
]
business_farming = sorted(
    list(set(business_farming)), key=lambda x: len(x), reverse=True
)

academia_humanities = [
    "one of the world leading authorities on book conservation",
    "leading authority on Melanesian culture",
    "authority on Francis Drake",
    "authority on alabaster",
    "foremost encyclopedist",
    "debate coach",
    "head teacher",
    "headmaster",
]
academia_humanities = sorted(
    list(set(academia_humanities)), key=lambda x: len(x), reverse=True
)

law_enf_military_operator = [
    "division commander",
]
law_enf_military_operator = sorted(
    list(set(law_enf_military_operator)), key=lambda x: len(x), reverse=True
)

spiritual = [
    "the acting head of the UGCC",
    "head of the Sikh Dharma in the western hemisphere",
]
spiritual = sorted(list(set(spiritual)), key=lambda x: len(x), reverse=True)

social = []
social = sorted(list(set(social)), key=lambda x: len(x), reverse=True)

crime = [
    "founder of the Bandidos Motorcycle Club",
]
crime = sorted(list(set(crime)), key=lambda x: len(x), reverse=True)

event_record_other = [
    "ex wife of former football player O J Simpson",
    "championed by Edward R Murrow",
    "lung patient whose wrong transplant made headlines",
    "made national headlines",
]
event_record_other = sorted(
    list(set(event_record_other)), key=lambda x: len(x), reverse=True
)

other_species = []
other_species = sorted(list(set(other_species)), key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [12]:
# Dropping entry with link that points to event rather than individual's page
index = df[
    df["link"] == "https://en.wikipedia.org/wiki/Death_of_Paolo_Gislimberti"
].index
df.drop(index, inplace=True)
df.reset_index(inplace=True, drop=True)

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [13]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2_1`

In [14]:
%%time

# Column to check
column = 'info_2_1'

# Start dataframe
dataframe = df[df[column].notna()]

# For loop to find role in column and extract it as category
for category, category_lst in known_for_dict.items():
    for role in category_lst:
        for index in dataframe.index:
                item = df.loc[index, column]
                if item:
                    if role in item:
                        df.loc[index, category] = 1
                        df.loc[index, column] = item.replace(role, '').strip()

# Calculating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

CPU times: total: 44 s
Wall time: 44 s


<IPython.core.display.Javascript object>

#### Checking Updated `num_categories` Value Counts

In [15]:
# Checking updated num_categories value counts
df["num_categories"].value_counts()

1    94696
2     3245
0       84
3       19
Name: num_categories, dtype: int64

<IPython.core.display.Javascript object>

#### Observations:
- Most likely, the number of entries without any category will not update until we search `info_1` columns. 
- The remaining search of `info_2` and above will either result in redundant categorization or adding categories to entries that have at least 1 category already.
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2_1`

In [16]:
# # Obtaining values for column and their counts
# roles_list = df["info_2_1"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [17]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [18]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_2_1"].notna()].index
#             if "actress" in df.loc[index, "info_2_1"]
#         ],
#         "info_2_1",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [19]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_list, key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [20]:
# [index for index in df.index if "policy specialist" in df.loc[index, "info"]]

<IPython.core.display.Javascript object>

In [21]:
# # Example code to quick-check a specific entry
# df[df["info_2_1"] == "husband of actress"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [22]:
# Creating lists for each category and sorting by decreasing length and removing duplicates

politics_govt_law = [
    "political campaign manager",
    "film subject",
    "political fundraiser",
    "band chief",
    "mother of singer Carly Simon",
    "open housing activist",
    "n minister of Housing",
    "housing activist",
    "diplomat for the Holy See",
    "diplomatic advisor",
    "diplomatic analyst",
    "diplomat in the",
    "diplomat",
]
politics_govt_law = sorted(
    list(set(politics_govt_law)), key=lambda x: len(x), reverse=True
)

arts = [
    'author of the book "The Last Victim"',
    "author on",
    "children author; creator of Shrek",
    "author of books on healthy living",
    "author of philosophical fiction",
    "author of The Crime Victim Book",
    "only child author Jack Kerouac",
    "Pulitzer Prize winning author",
    "author on traditional customs",
    "author of books on",
    "author on capital punishment",
    "author of scores for films",
    "author of children books",
    "author of short stories",
    "author on the Southwest",
    "science fiction author",
    "author on horticulture",
    "children book author",
    "cooking books author",
    "dark fantasy author",
    "best selling author",
    "non fiction author",
    "short story author",
    "author on",
    "comic book author",
    "television author",
    "cinematic author",
    "cookbook author",
    "children author",
    "wildlife author",
    "thriller author",
    "fantasy author",
    "crime author",
    "author of",
    "writer on",
    "writer on the",
    "writer",
    "artistic director of the Brecon Jazz Festival",
    "general manager of Montreux Jazz Festival",
    "Manager of Jazz musician Erroll Garner",
    "television production manager",
    "public relations manager",
    "road manager for Outkast",
    "director of pornographic films",
    "film production illustrator",
    "director in Bollywood films",
    "author of scores for films",
    "film producer of musicals",
    "film fight choreographer",
    "film television director",
    "filmmaker",
    "film festival director",
    "documentary filmmaker",
    "aerial film operator",
    "film character actor",
    "underwater filmmaker",
    "film studio manager",
    "silent film actress",
    "film poster artist",
    "film choreographer",
    "composer for film",
    "composer of film",
    "film pioneer",
    "producer in film",
    "film executive",
    "film director",
    "film producer",
    "film composer",
    "film actress",
    "film critic",
    "film scorer",
    "film scores",
    "film actor",
    "film maker",
    "filmmaker",
    "recording engineer",
    "sound engineer",
    "audio engineer",
    "promoter of funding for the arts",
    "musical theater actor",
    "theater director",
    "theater producer",
    "theater manager",
    "theater critic",
    "theater actor",
    "theater owner",
    "musical theatre actress Tony Award",
    "Broadway theatre",
    "avant garde theatre producer",
    "Broadway theatre performer",
    "musical theatre performer",
    "folk theatre artist",
    "theatre director",
    "theatre producer",
    "theatre designer",
    "theatre actress",
    "theatre manager",
    "theatre critic",
    "theatre actor",
    "theatre",
    "reality talent show finalist",
    "talent manager",
    "talent agent",
    "comic spokesman for music trio The Limeliters",
    "Manager of Jazz musician Erroll Garner",
    "pioneer of the musical genres of and",
    "guru of the independent music scene",
    "musical theatre actress Tony Award",
    "composer of electronic music",
    "musical instrument collector",
    "director of music",
    "composer of classical music",
    "inspirational music",
    "traditional music performer",
    "music publishing executive",
    "country & western musician",
    "television music composer",
    "electronic music composer",
    "traditional folk musician",
    "musical theatre performer",
    "film producer of musicals",
    "music industry executive",
    "country music journalist",
    "Baroque music specialist",
    "music business executive",
    "music festival organiser",
    "classical music satirist",
    "musical stage performer",
    "computer music composer",
    "easy listening musician",
    "country music guitarist",
    "television theme music",
    "rock music journalist",
    "roll session musician",
    "musical theater actor",
    "historian of musicals",
    "music promoter",
    "country music singer",
    "house music producer",
    "music video director",
    "spoken word musician",
    "rockabilly musician",
    "music administrator",
    "music group founder",
    "electronic musician",
    "brass band musician",
    "bass music producer",
    "music singer",
    "music news reporter",
    "bluegrass musician",
    "surf rock musician",
    "surf music pioneer",
    "musician of Foghat",
    "pop music arranger",
    "new wave musician",
    "musical performer",
    "clarinet musician",
    "music journalist",
    "musical director",
    "session musician",
    "musical producer",
    "new age musician",
    "country musician",
    "pop music singer",
    "musical composer",
    "calypso musician",
    "musical arranger",
    "music supervisor",
    "music executive",
    "music publisher",
    "studio musician",
    "music collector",
    "music producer",
    "music director",
    "music arranger",
    "blues musician",
    "music promoter",
    "music composer",
    "salsa musician",
    "musical singer",
    "mento musician",
    "disco musician",
    "jazz musician",
    "folk musician",
    "roll musician",
    "rock musician",
    "music manager",
    "funk musician",
    "musician from",
    "music critic",
    "pop musician",
    "R&B musician",
    "music patron",
    "music editor",
    "music agent",
    "swing music",
    "musician",
    "a founder of rock band Molly Hatchet",
    "producer; husband of Angela Lansbury",
    "composer from the band Savage Rose",
    "co founder of the band Space",
    "front man of band Baltimora",
    "founder of the band Bathory",
    "big band tenor saxophonist",
    "husband of Celia Cruz",
    "bluegrass band leader",
    "brass band musician",
    "husband of actress",
    "big band leader",
    "big band singer",
    "Dansband artist",
    "jazz bandleader",
    "bandleader on",
    "band manager",
    "bandy player",
    "one man band",
    "band leader",
    "bandleader",
    "academic fraudster",
    "daughter of actor Jean Louis Trintignant",
    "actor better known as Huracán Ramírez",
    "daughter of actor Marlon Brando",
    "the father of actor Kevin",
    "Emmy Award winning actor",
    "musical theater actor",
    "film character actor",
    "former child actor",
    "daughter of actor",
    "television actor",
    "character actor",
    "art benefactor",
    "theater actor",
    "comedic actor",
    "theatre actor",
    "wife of actor",
    "screen actor",
    "comedy actor",
    "stage actor",
    "comic actor",
    "child actor",
    "film actor",
    "Fox actor",
    "TV actor",
    "television journalism pioneer",
    "country music journalist",
    "investigative journalist",
    "road safety journalist",
    "journalist for the BBC",
    "television journalist",
    "rock music journalist",
    "literary journalist",
    "journalism",
    "journalist murdered",
    "fashion journalist",
    "journalist editor",
    "music journalist",
    "photo journalist",
    "photojournalist",
    "food journalist",
    "journal editor",
    "journalist on",
    "TV journalist",
    "journalist",
    'classical singer known as the "nightingale of Punjab"',
    "swing singer with Benny Goodman",
    "country music singer",
    "advertising creative",
    "music singer",
    "advertising artist",
    "lăutărească singer",
    "soca parang singer",
    "rockabilly singer",
    "kleinkunst singer",
    "classical singer",
    "pop music singer",
    "bel canto singer",
    "playback singer",
    "big band singer",
    "operatic singer",
    "pop rock singer",
    "Schlager singer",
    "the lead singer",
    "country singer",
    "doo wop singer",
    "chamber singer",
    "session singer",
    "mor lam singer",
    "Western singer",
    "cabaret singer",
    "norteño singer",
    "musical singer",
    "reggae singer",
    "ballad singer",
    "Tejano singer",
    "Celtic singer",
    "sevdah singer",
    "blues singer",
    "opera singer",
    "mambo singer",
    "disco singer",
    "soul singer",
    "jazz singer",
    "roll singer",
    "folk singer",
    "rock singer",
    "R&B singer",
    "pop singer",
    "folksinger",
    "singer",
    "arts philanthropist",
    "musical theatre actress Tony Award",
    "Tony Award winning stage actress",
    "Tony Award winning actress",
    "pornographic actress",
    "silent film actress",
    "television actress",
    "husband of actress",
    "aspiring actress",
    "theatre actress",
    "stage actress",
    "movie actress",
    "radio actress",
    "film actress",
    "TV actress",
    "actress",
]
arts = sorted(list(set(arts)), key=lambda x: len(x), reverse=True)

sports = [
    "coach",
    "general manager of the National Basketball Association Sacramento Kings",
    "secretary general of the Basketball Association of the",
    "former member of the National Basketball Team",
    "racing manager to Queen Elizabeth II",
    "pioneer in thoroughbred horse racing",
    "horse racing",
    "amateur racing driver",
    "racing driver",
    "first manager of Toronto Blue Jays",
    "member of the College Football Hall of Fame",
    "a member of the Pro Football Hall of Fame",
    "member of the Pro Football Hall of Fame",
    "member of Pro Football Hall of Fame",
    "director in the Football League",
    "manager in the Football League",
    "Football World Cup champion",
    "manager in the Negro leagues",
    "minor league manager",
    "rugby league player",
    "league player",
    "test engineer",
    "manager of Burnley",
    "manager gold medal",
    "equipment manager",
    "general manager",
    "hurling manager",
    "then manager",
]
sports = sorted(list(set(sports)), key=lambda x: len(x), reverse=True)

sciences = [
    "communications engineer",
    "environmental engineer",
    "aeronautical engineer",
    "agricultural engineer",
    "engineering",
    "race chassis engineer",
    "electronics engineer",
    "astronautic engineer",
    "electrical engineer",
    "engineering manager",
    "earthquake engineer",
    "structural engineer",
    "automotive engineer",
    "aerospace engineer",
    "aeroplane engineer",
    "chemical engineer",
    "military engineer",
    "computer engineer",
    "aircraft engineer",
    "engineer for NASA",
    "nuclear engineer",
    "railway engineer",
    "systems engineer",
    "harbor engineer",
    "rocket engineer",
    "civil engineer",
    "radar engineer",
    "radio engineer",
    "led MIT CAD project",
    "co designer of NASA Apollo Lunar Module",
    "engineer for NASA",
    "a NASA",
    "NASA executive",
    "NASA",
    "NASA official",
    "NASA manager",
    "chiropractor",
    "pioneer in combatting polio throughusing penicillin",
    "nursing",
]
sciences = sorted(list(set(sciences)), key=lambda x: len(x), reverse=True)

business_farming = [
    "motor company manager",
    "hedge fund pioneer",
    "hedge fund manager",
    "project manager",
    "factory co owner involved in Fisher Price toy recall",
    "residential contractor",
    "contractor",
    "advertising executive",
]
business_farming = sorted(
    list(set(business_farming)), key=lambda x: len(x), reverse=True
)

academia_humanities = [
    "textbook author",
    "collection manager",
    "ethnomusicologist",
    "enthomusicologist",
    "musicologist",
    "academies",
    "academic Parkinson disease",
    "ancient history academic",
    "academic administrator",
    "academy founder",
    "an academic teacher",
    "academic teacher",
    "culture academic",
    "academician",
    "an academic",
    "academic",
]
academia_humanities = sorted(
    list(set(academia_humanities)), key=lambda x: len(x), reverse=True
)

law_enf_military_operator = [
    "Warsaw Uprising insurgent",
]
law_enf_military_operator = sorted(
    list(set(law_enf_military_operator)), key=lambda x: len(x), reverse=True
)

spiritual = [
    "second wife of author L Ron Hubbard",
]
spiritual = sorted(list(set(spiritual)), key=lambda x: len(x), reverse=True)

social = [
    "charity fundraiser",
    "fundraiser",
    "philanthropist known for his contribution to Central Park",
    "philanthropist;",
    "philanthropist in West New Province",
    "prominent philanthropist",
]
social = sorted(list(set(social)), key=lambda x: len(x), reverse=True)

crime = []
crime = sorted(list(set(crime)), key=lambda x: len(x), reverse=True)

event_record_other = [
    "private security contractor",
    "factory worker",
]
event_record_other = sorted(
    list(set(event_record_other)), key=lambda x: len(x), reverse=True
)

other_species = [
    "racing trotter",
    "animal actor",
]
other_species = sorted(list(set(other_species)), key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [23]:
# Hard-coding cause_of_death for entry with value in info_2_1
index = df[df["link"] == "https://en.wikipedia.org/wiki/Rutherford_Aris"].index
df.loc[index, "cause_of_death"] = "Parkinson disease"


# Hard-coding cause_of_death for entry with value in info_2_1
index = df[df["link"] == "https://en.wikipedia.org/wiki/Rub%C3%A9n_Espinosa"].index
df.loc[index, "cause_of_death"] = "murdered"

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [24]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2_1`

In [25]:
%%time

# Column to check
column = 'info_2_1'

# Start dataframe
dataframe = df[df[column].notna()]

# For loop to find role in column and extract it as category
for category, category_lst in known_for_dict.items():
    for role in category_lst:
        for index in dataframe.index:
                item = df.loc[index, column]
                if item:
                    if role in item:
                        df.loc[index, category] = 1
                        df.loc[index, column] = item.replace(role, '').strip()

# Calculating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

CPU times: total: 50.2 s
Wall time: 50.2 s


<IPython.core.display.Javascript object>

#### Checking Updated `num_categories` Value Counts

In [26]:
# Checking updated num_categories value counts
df["num_categories"].value_counts()

1    93605
2     4322
0       82
3       35
Name: num_categories, dtype: int64

<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2_1`

In [27]:
# # Obtaining values for column and their counts
# roles_list = df["info_2_1"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [28]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [29]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_2_1"].notna()].index
#             if "sex" in df.loc[index, "info_2_1"]
#         ],
#         "info_2_1",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [30]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_list, key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [31]:
# # Example code to quick-check a specific entry
# df[df["info_2_1"] == "sex educator"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [32]:
# Creating lists for each category and sorting by decreasing length and removing duplicates

politics_govt_law = [
    "FEMA director",
    "the first black federal prosecutor in history",
    "of the Communist Party of Great",
    "co founder of the Crips turned anti gang activist",
    "environmental activist with Friends of the Earth",
    "political activist; former wife of Charles Vidor",
    "jailed Tiananmen Square democracy activist",
    "peace activist with War Resisters League",
    "anti caste discrimination activist",
    "activist in the peace movement",
    "nuclear disarmament activist",
    "Muslim women rights activist",
    "anti death penalty activist",
    "transgender rights activist",
    "anti nuclear power activist",
    "indigenous affairs activist",
    "alternative energy activist",
    "voter registration activist",
    "disability rights activist",
    "indigenous rights activist",
    "anti pornography activist",
    "pro independence activist",
    "Jewish community activist",
    "early gay rights activist",
    "anti consumerism activist",
    "Rural Solidarity activist",
    "civil liberties activist",
    "prisoner rights activist",
    "veteran affairs activist",
    "ex gay movement activist",
    "consumer rights activist",
    "family planning activist",
    "gender equality activist",
    "anti apartheid activist",
    "anti Apartheid activist",
    "sustainability activist",
    "Māori language activist",
    "pro euthanasia activist",
    "lesbian rights activist",
    "a civil rights activist",
    "social justice activist",
    "anti communism activist",
    "environmental activist",
    "anti abortion activist",
    "animal rights activist",
    "Romani people activist",
    "anti poaching activist",
    "tribal rights activist",
    "Native rights activist",
    "prison reform activist",
    "voting rights activist",
    "Romani rights activist",
    "pro democracy activist",
    "civil rights activist",
    "human rights activist",
    "women rights activist",
    "independence activist",
    "anti nuclear activist",
    "arms control activist",
    "right to die activist",
    "anti Kremlin activist",
    "anti fascist activist",
    "conservative activist",
    "anti alcohol activist",
    "trans rights activist",
    "anti suicide activist",
    "transvestite activist",
    "a Trotskyist activist",
    "political activist in",
    "tuberculosis activist",
    "LGBT rights activist",
    "free speech activist",
    "trade union activist",
    "anti police activist",
    "transgender activist",
    "gay rights activist",
    "anti junta activist",
    "euthanasia activist",
    "resistance activist",
    "gun rights activist",
    "men rights activist",
    "indigenous activist",
    "activist for Jewish",
    "disability activist",
    "Trotskyist activist",
    "political activist",
    "community activist",
    "communist activist",
    "democracy activist",
    "paralysis activist",
    "HIV AIDS activists",
    "nutrition activist",
    "anti war activist",
    "cultural activist",
    "feminist activist",
    "anti gay activist",
    "Buddhist activist",
    "taxpayer activist",
    "maritime activist",
    "pro life activist",
    "activist of birth",
    "peace activist in",
    "intersex activist",
    "internet activist",
    "language activist",
    "student activist",
    "social activists",
    "freedom activist",
    "atheist activist",
    "antiwar activist",
    "social activist",
    "Jewish activist",
    "Romani activist",
    "rights activist",
    "church activist",
    "peace activist",
    "civic activist",
    "labor activist",
    "women activist",
    "trans activist",
    "union activist",
    "LGBTQ activist",
    "LGBT activist",
    "AIDS activist",
    "PZPR activist",
    "anti activist",
    "Roma activist",
    "BDSM activist",
    "gay activist",
    "HIV activist",
    "PEN activist",
    "Neo activist",
    "activist",
    "advocate for education reform",
    "sexual education advocate",
]
politics_govt_law = sorted(
    list(set(politics_govt_law)), key=lambda x: len(x), reverse=True
)

arts = [
    "author",
    "actor",
    'composer known as "Boule Noire"',
    "wife of composer Henry Cowell",
    "composer of radio jingles",
    "composer in classical",
    "television composer",
    "bolero composer",
    "choral composer",
    "composer from",
    "R&B composer",
    "producer of pornographic movies",
    "producer for television",
    "theatrical producer of",
    "entertainment producer",
    "television producer",
    "theatrical producer",
    "producer impresario",
    "R&B record producer",
    "dancehall producer",
    "Broadway producer",
    "ceremony producer",
    "producer for and",
    "record producer",
    "radio producer",
    "movie producer",
    "house producer",
    "media producer",
    "BBC producer",
    "TV producer",
    "producer",
    "Pulitzer Prize winning poet",
    "former poet laureate",
    "existentialist poet",
    "poetry translator",
    "humorist poet of",
    "spoken word poet",
    "surrealist poet",
    "poetry promoter",
    "poetry",
    "poet",
    "founding general director of Michigan Opera Theatre",
    "artistic director of the Melbourne Theatre Company",
    "director of the Willie Clancy Summer School",
    "artistic director of Ballet",
    "television director winner",
    "director of the City Opera",
    "television news director",
    "director of photography",
    "second unit director",
    "television director",
    "director general of",
    "director of dubbing",
    "animation director",
    "newspaper director",
    "assistant director",
    "roll tour director",
    "programme director",
    "executive director",
    "artistic director",
    "festival director",
    "creative director",
    "director of plays",
    "casting director",
    "gallery director",
    "dubbing director",
    "concert director",
    "choral director",
    "ballet director",
    "circus director",
    "stage director",
    "opera director",
    "stunt director",
    "choir director",
    "radio director",
    "media director",
    "video director",
    "movie director",
    "sound director",
    "news director",
    "art director",
    "TV director",
    "a director",
    "comedian Saturday Night Live",
    "stand up comedian",
    "comedian",
]
arts = sorted(list(set(arts)), key=lambda x: len(x), reverse=True)

sports = [
    "chess composer",
    "director of the Tour de",
    "athletic director",
    "sporting director",
    "race director",
    "second highest scorer in Iowa State University history",
]
sports = sorted(list(set(sports)), key=lambda x: len(x), reverse=True)

sciences = [
    "former director of Oak Ridge National Laboratory",
    "former director of SLAC",
    "director of the Fish",
    "research director",
]
sciences = sorted(list(set(sciences)), key=lambda x: len(x), reverse=True)

business_farming = [
    "businessman best known for rescuing Howard Hughes in plane crash",
    "business support executive",
    "business",
    "business consultant",
    "business executive",
    "business magnate",
    "business leader",
    "business tycoon",
    "businessperson",
    "business owner",
    "businesswoman",
    "business man",
    "businessman",
    "managing director of Lego",
    "corporate director",
    "company director",
]
business_farming = sorted(
    list(set(business_farming)), key=lambda x: len(x), reverse=True
)

academia_humanities = [
    "the founding director of the Maritime Museum of the Atlantic",
    "archaeologist who was director of the Museum of",
    "director of the Advanced Placement Program",
    "library director",
    "museum director",
    "historian at Columbia University",
    "an expert on the history of medieval",
    "historian of the Manhattan Project",
    "Pulitzer Prize winning historian",
    "preserver of historic buildings",
    "Himalayan expedition historian",
    "professor emeritus of history",
    "historian of Latin studies",
    "historic preservationist",
    "architectural historian",
    "historian of philosophy",
    "historian of the Nation",
    "historical revisionist",
    "intellectual historian",
    "Osage Nation historian",
    "deaf culture historian",
    "revisionist historian",
    "world fair historian",
    "historian of",
    "Wyatt Earp historian",
    "literary historian",
    "historian of ideas",
    "research historian",
    "printing historian",
    "cultural historian",
    "railroad historian",
    "history professor",
    "ancient historian",
    "kitchen historian",
    "railway historian",
    "social historian",
    "apple historian",
    "psychohistorian",
    "local historian",
    "dress historian",
    "oral historian",
    "art historian",
    "pre historian",
    "prehistorian",
    "historian of",
    "art history",
    "historian",
    "on education topics",
    "advocate for women education",
    "education worker in Eastern",
    "educational administrator",
    "early childhood educator",
    "educational consultant",
    "education proponent",
    "Holocaust educator",
    "education official",
    "educational leader",
    "educationalist",
    "FAMU educator",
    "educationist",
    "educator",
]
academia_humanities = sorted(
    list(set(academia_humanities)), key=lambda x: len(x), reverse=True
)

law_enf_military_operator = [
    "director of the National Security Agency",
    "director of the Mossad",
    "director of the FBI",
    "of the most decorated women in military history",
]
law_enf_military_operator = sorted(
    list(set(law_enf_military_operator)), key=lambda x: len(x), reverse=True
)

spiritual = []
spiritual = sorted(list(set(spiritual)), key=lambda x: len(x), reverse=True)

social = [
    "philanthropist",
    "wonen health educator",
    "AIDS educator",
]
social = sorted(list(set(social)), key=lambda x: len(x), reverse=True)

crime = []
crime = sorted(list(set(crime)), key=lambda x: len(x), reverse=True)

event_record_other = [
    "the oldest person ever documented in history",
]
event_record_other = sorted(
    list(set(event_record_other)), key=lambda x: len(x), reverse=True
)

other_species = []
other_species = sorted(list(set(other_species)), key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [33]:
# Dropping values for info_2_0 == manager as all values already categorized to avoid incorrect additional category
df.loc[
    [index for index in df.index if df.loc[index, "info_2_1"] == "manager"], "info_2_1"
] = ""

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [34]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2_1`

In [35]:
%%time

# Column to check
column = 'info_2_1'

# Start dataframe
dataframe = df[df[column].notna()]

# For loop to find role in column and extract it as category
for category, category_lst in known_for_dict.items():
    for role in category_lst:
        for index in dataframe.index:
                item = df.loc[index, column]
                if item:
                    if role in item:
                        df.loc[index, category] = 1
                        df.loc[index, column] = item.replace(role, '').strip()

# Calculating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

CPU times: total: 37.8 s
Wall time: 37.8 s


<IPython.core.display.Javascript object>

#### Checking Updated `num_categories` Value Counts

In [36]:
# Checking updated num_categories value counts
df["num_categories"].value_counts()

1    91761
2     6127
0       78
3       78
Name: num_categories, dtype: int64

<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2_1`

In [37]:
# # Obtaining values for column and their counts
# roles_list = df["info_2_1"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [38]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [39]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_2_1"].notna()].index
#             if "executive" in df.loc[index, "info_2_1"]
#         ],
#         "info_2_1",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [40]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_list, key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [41]:
# # Example code to quick-check a specific entry
# df[df["info_2_1"] == "chief executive"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [42]:
# Creating lists for each category and sorting by decreasing length and removing duplicates

politics_govt_law = [
    "Native rights lawyer",
    "human rights lawyer",
    "civil rights lawyer",
    "disbarred lawyer",
    "criminal lawyer",
    "trial lawyer",
    "labor lawyer",
    "lawyer",
    "hereditary peer",
    "mentor teacher of Vladimir Putin",
    "son of novelist John Buchan",
    "judge for the Middle District of Alabama",
    "the first female federal judge",
    "administrative law judge",
    "Supreme Court judge",
    "first female judge",
    "High Court judge",
    "federal judge",
    "tribal judge",
    "chief judge",
    "state judge",
    "jurist who was the th Governor of Virginia",
    "jurist",
    "publisher of Hitler",
    "the PLO executive committee",
]
politics_govt_law = sorted(
    list(set(politics_govt_law)), key=lambda x: len(x), reverse=True
)

arts = [
    "composer",
    "director",
    "playwright with cerebral palsy",
    "playwright",
    "editor for United Press International",
    "editor of the Daily Express",
    "editor of science fiction",
    "contributing editor for",
    "founding editor of the",
    "editor of comic books",
    "magazine editor of GQ",
    "editor for DC Comics",
    "pulp magazine editor",
    "founding editor of",
    "television editor",
    "newspaper editor",
    "magazine editor",
    "fashion editor",
    "script editor",
    "editor of the",
    "choral editor",
    "sound editor",
    "news editor",
    "book editor",
    "copy editor",
    "art editor",
    "editor of",
    "editor",
    "illustrator of children books",
    "children books illustrator",
    "children book illustrator",
    "trading card illustrator",
    "book cover illustrator",
    "magazine illustrator",
    "book illustrator",
    "illustrator",
    "brass instrument",
    "television special effects designer",
    "television costume designer",
    "roleplaying game designer",
    "Disney landscape designer",
    "designer of the flag of",
    "Gothic Revival designer",
    "architectural designer",
    "postage stamp designer",
    "environmental designer",
    "album cover designer",
    "bridal wear designer",
    "production designer",
    "instrument designer",
    "furniture designer",
    "character designer",
    "interior designer",
    "banknote designer",
    "typeface designer",
    "of design",
    "graphic designer",
    "fashion designer",
    "costume designer",
    "jewelry designer",
    "wargame designer",
    "makeup designer",
    "glass designer",
    "organ designer",
    "type designer",
    "coin designer",
    "game designer",
    "book designer",
    "set designer",
    "ballet mistress",
    "ballet",
    "ballet dancer",
    "acting",
    "music",
    "piano",
    "dance critic",
    "dancer",
    "dance",
    "Tony Award winning choreographer",
    "stage choreographer",
    "choreographer",
    "founding conductor of Brooklyn Philharmonic Orchestra",
    "conductor of the Nashville Symphony Orchestra",
    "choral conductor",
    "choir conductor",
    "opera conductor",
    "conductor",
    "novelist in the Kannada language",
    "James Bond continuation novelist",
    "mystery novelist",
    "graphic novelist",
    "crime novelist",
    '"one of the nation most accomplished medallic artists"',
    "cover production artist for DC Comics",
    "pioneering manhua artist",
    "butter sculpture artist",
    "representational artist",
    "science fiction artist",
    "visualization artist",
    "stained glass artist",
    "installation artist",
    "language dub artist",
    "performance artist",
    "album cover artist",
    "comic strip artist",
    "psychedelic artist",
    "comic book artist",
    "storyboard artist",
    "minimalist artist",
    "conceptual artist",
    "recording artist",
    "pictorial artist",
    "landscape artist",
    "bluegrass artist",
    "fish skin artist",
    "graffiti artist",
    "national artist",
    "graphic artist",
    "ceramic artist",
    "textile artist",
    "cartoon artist",
    "digital artist",
    "kinetic artist",
    "dubbing artist",
    "make up artist",
    "session artist",
    "fantasy artist",
    "plastic artist",
    "visual artist",
    "comics artist",
    "makeup artist",
    "street artist",
    "poster artist",
    "layout artist",
    "video artist",
    "album artist",
    "light artist",
    "artist model",
    "cover artist",
    "comic artist",
    "pop artist",
    "assemblage sculptor",
    "conceptual sculptor",
    "glass sculptor",
    "bust sculptor",
    "sculptor",
    "surrealist painter",
    "landscape painter",
    "portrait painter",
    "painter",
    "television presenter",
    "Chicago children television entertainer",
    "first television host of the game show",
    "pioneer of public access television",
    "television special effects designer",
    "children television personality",
    "reality television personality",
    "children television presenter",
    "reality television contestant",
    "the early years of television",
    "radio television personality",
    "television costume designer",
    "radio television presenter",
    "television show contestant",
    "cable television executive",
    "television news executive",
    "television game show host",
    "television color analyst",
    "children television host",
    "reality television star",
    "television movie scores",
    "television broadcasting",
    "radio television critic",
    "television personality",
    "television broadcaster",
    "television commentator",
    "television news anchor",
    "a television show host",
    "television presenter",
    "television announcer",
    "television executive",
    "television show host",
    "television performer",
    "television editor",
    "television anchor",
    "television scores",
    "television host",
    "television chef",
    "television set",
    "television",
    "television broadcaster",
    "BBC radio broadcaster",
    "broadcaster known as",
    "radio broadcaster",
    "broadcaster",
    "literary book publisher",
    "publisher of magazine",
    "publisher of from to",
    "numismatic publisher",
    "newspaper publisher",
    "magazine publisher",
    "bass guitarist for The Four Seasons",
    "guitarist of Big Country",
    "rock n' roll guitarist",
    "pedal steel guitarist",
    "blues rock guitarist",
    "classical guitarist",
    "flamenco guitarist",
    "blues guitarist",
    "metal guitarist",
    "swing guitarist",
    "roll guitarist",
    "rock guitarist",
    "jazz guitarist",
    "bass guitarist",
    "soul guitarist",
    "R&B guitarist",
    "a guitarist",
    "guitarist",
    "experimental photographer",
    "Holocaust photographer",
    "wildlife photographer",
    "portrait photographer",
    "magazine photographer",
    "nature photographer",
    "blues photographer",
    "photograph subject",
    "art photographer",
    "photo essayist",
    "photographer",
    "pianist specializing in Bach",
    "concert pianist",
    "swing pianist",
    "jazz pianist",
    "fortepianist",
    "R&B pianist",
    "pianist",
    "production company executive",
    "public relations executive",
    "communications executive",
]
arts = sorted(list(set(arts)), key=lambda x: len(x), reverse=True)

sports = [
    "pilates teacher",
    "Aikikai teacher",
    "aikido teacher",
    "mixed martial artist",
    "martial artist",
    "line judge",
    "for the Cincinnati Reds",
]
sports = sorted(list(set(sports)), key=lambda x: len(x), reverse=True)

sciences = [
    "who made critical contributions to the development of radar",
    "specialized in turbulence",
    "computer research executive",
    "medical research executive",
]
sciences = sorted(list(set(sciences)), key=lambda x: len(x), reverse=True)

business_farming = [
    "automobile manufacturing executive",
    "manufacturing executive",
    "telecommunications executive",
    "financial sector executive",
    "chief executive of Leyland",
    "pharmaceutical executive",
    "executive of Apple Corps",
    "manufacturing executive",
    "agricultural executive",
    "construction executive",
    "electronics executive",
    "technology executive",
    "healthcare executive",
    "mail order executive",
    "investment executive",
    "theme park executive",
    "automobile executive",
    "corporate executive",
    "insurance executive",
    "beverage executive",
    "catering executive",
    "company executive",
    "finance executive",
    "banking executive",
    "transit executive",
    "airline executive",
    "retail executive",
    "mining executive",
    "energy executive",
    "resort executive",
    "parts executive",
    "sales executive",
    "chief executive",
    "food executive",
    "coal executive",
    "oil executive",
    "gem executive",
]
business_farming = sorted(
    list(set(business_farming)), key=lambda x: len(x), reverse=True
)

academia_humanities = [
    "translator of philosophy",
    "translator of literature",
    "literary translator",
    "language translator",
    "Quechua translator",
    "translator",
    "MIT professor",
    "Alexander von Humboldt professor of geography at UCLA",
    "professor of",
    "professor at the Academy of Theatre",
    "professor at the University of Chicago",
    "professor at Brigham Young University",
    "professor at the University of Warsaw",
    "professor at George Mason University",
    "professor at Seton Hall University",
    "professor at the University of",
    "professor of clinical",
    "professor at Columbia University",
    "professor at Stanford University",
    "professor of constitutional",
    "professor of Hebrew Literature",
    "philosophy professor",
    "professor of ancient languages",
    "professor of quantum",
    "professor in",
    "professor at University of",
    "emeritus professor at Yale",
    "professor of Latin studies",
    "A&M University professor",
    "professor of philosohy",
    "professor of literature",
    "communication professor",
    "professor of Egyptology",
    "professor of rhetoric",
    "university professor",
    "philosophy professor",
    "University professor",
    "literature professor",
    "professor of Studies",
    "professor emeritus",
    "literary professor",
    "Emeritus professor",
    "college professor",
    "MIT professor",
    "professor of",
    "a professor",
    "professor",
    "university teacher",
    "school teacher",
    "civics teacher",
    "speech teacher",
    "schoolteacher",
    "an executive at SRI International",
]
academia_humanities = sorted(
    list(set(academia_humanities)), key=lambda x: len(x), reverse=True
)

law_enf_military_operator = []
law_enf_military_operator = sorted(
    list(set(law_enf_military_operator)), key=lambda x: len(x), reverse=True
)

spiritual = [
    "meditation teacher",
    "metaphysical teacher",
    "spiritual leader of the Republic of from to",
    "spiritual teacher",
    "spiritual leader",
    "spiritualist",
    "religious teacher",
    "Dzogchen teacher",
    "Qira'at teacher",
]
spiritual = sorted(list(set(spiritual)), key=lambda x: len(x), reverse=True)

social = [
    "non profit executive",
    "charity executive",
]
social = sorted(list(set(social)), key=lambda x: len(x), reverse=True)

crime = []
crime = sorted(list(set(crime)), key=lambda x: len(x), reverse=True)

event_record_other = ["Hundreds accused him of sexual abuse the year after his death"]
event_record_other = sorted(
    list(set(event_record_other)), key=lambda x: len(x), reverse=True
)

other_species = []
other_species = sorted(list(set(other_species)), key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [43]:
# Hard-coding info_2_1 value for entry to correctly categorize
index = df[df["link"] == "https://en.wikipedia.org/wiki/Harry_Campion"].index
df.loc[index, "info_2_1"] = "government"

# Hard-coding info_2_1 value for entry to correctly categorize
index = df[df["link"] == "https://en.wikipedia.org/wiki/Hugh_Clegg_(academic)"].index
df.loc[index, "info_2_1"] = ""

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [44]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2_1`

In [45]:
%%time

# Column to check
column = 'info_2_1'

# Start dataframe
dataframe = df[df[column].notna()]

# For loop to find role in column and extract it as category
for category, category_lst in known_for_dict.items():
    for role in category_lst:
        for index in dataframe.index:
                item = df.loc[index, column]
                if item:
                    if role in item:
                        df.loc[index, category] = 1
                        df.loc[index, column] = item.replace(role, '').strip()

# Calculating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

CPU times: total: 43.8 s
Wall time: 43.8 s


<IPython.core.display.Javascript object>

#### Checking Updated `num_categories` Value Counts

In [46]:
# Checking updated num_categories value counts
df["num_categories"].value_counts()

1    91227
2     6661
3       80
0       76
Name: num_categories, dtype: int64

<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2_1`

In [47]:
# # Obtaining values for column and their counts
# roles_list = df["info_2_1"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [48]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [49]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_2_1"].notna()].index
#             if "physic" in df.loc[index, "info_2_1"]
#         ],
#         "info_2_1",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [50]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_list, key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [51]:
# # Example code to check additional category for specific entries
# df.loc[
#     [
#         index
#         for index in df.index
#         if df.loc[index, "info_2_1"] == "judge"
#         and df.loc[index, "politics_govt_law"] == 0
#     ],
#     :,
# ]

<IPython.core.display.Javascript object>

In [52]:
# # Example code to quick-check a specific entry
# df[df["info_2_1"] == "physical fitness advocate"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [53]:
# Creating lists for each category and sorting by decreasing length and removing duplicates

politics_govt_law = [
    "judge",
    "local government executive",
    "political executive",
    "union executive",
    "environmental economics",
    "information economics",
    "supply side economist",
    "free market economist",
    "political economist",
    "economic",
    "economics expert",
    "economist",
    "economics",
    "economic",
    "public health administrator",
    "top official in the rebel government",
    "minister in the Whitlam government",
    "member of the Vichy government",
    "former government minister",
    "local government executive",
    "government policy advisor",
    "government administrator",
    "government whistleblower",
    "local government leader",
    "government official",
    "government minister",
    "government advisor",
    "government adviser",
    "government",
    "a colonial administrator",
    "colonial administrator",
    "scientific administrator",
    "science administrator",
    "medical administrator",
    "public administrator",
    "Conservative life peer",
    "Liberal Democrat peer",
    "female life peer",
    "a life peer",
    "life peer",
    "peer",
    "critic of systems analysis",
    "critic of Vladimir Putin",
    "political critic",
    "social critic",
    "Republican member of the House of Representatives",
    "public leader from the North Eastern state of",
    "the only president of the Republic of Herzeg",
    "former prime minister of the Republic",
    "statesman in the Democratic Republic",
    "Apostolic Nuncio to the Republic of",
    "Vice President of the Republic of",
    "First Lady of the Republic of on",
    "founding father of the Republic",
    "public health administrator",
    "public health whistleblower",
    "public health researcher",
    "public health physician",
    "public relations official",
    "public health campaigner",
    "public health official",
    "public health pioneer",
    "public policy adviser",
    "public administrator",
    "public intellectual",
    "public official",
    "public servant",
    "international civil servant",
    "other civil rights leaders",
    "LGBT civil rights advocate",
    "civil rights advocate",
    "civil rights attorney",
    "civil rights leader",
    "a civil servant",
    "civil servant",
    "civil leader",
    "ecofeminist",
    "feminist",
    "trades union official",
    "trade union leader",
    "medical unionist",
    "union executive",
    "trade unionist",
    "union leader",
    "union worker",
    "unionist",
]
politics_govt_law = sorted(
    list(set(politics_govt_law)), key=lambda x: len(x), reverse=True
)

arts = [
    "novelist",
    "artist",
    "publisher of",
    "publisher",
    "media relations executive",
    "broadcasting executive",
    "live event executive",
    "MGM studio executive",
    "newspaper executive",
    "magazine executive",
    "studio executive",
    "record executive",
    "media executive",
    "radio executive",
    "news executive",
    "opera administrator",
    "arts administrator",
    "sister of supermodel Angela Lindvall",
    "fashion model",
    "spokesmodel",
    "modeler",
    "model",
    "inventor of the plastic pink flamingo",
    "inventor of board game Cluedo",
    "board game inventor",
    "roll entrepreneur",
    "founder of Reader response criticism",
    "critic of Scientology",
    "entertainment critic",
    "architecture critic",
    "restaurant critic",
    "visual art critic",
    "cinema critic for",
    "literary critic",
    "cultural critic",
    "urban  critic",
    "media critic",
    "food critic",
    "jazz critic",
    "wine critic",
    "arts critic",
    "book critic",
    "art critic",
    "public relations specialist",
    "public address announcer",
    "public relations expert",
    "public figure",
    "publicist",
    "Picasso biographer",
    "biographer",
    "architect of perestroika",
    "architectural",
    "restoration architect",
    "architecture",
    "landscape architect",
    "architecture critic",
    "interior architect",
    "garden architect",
    "course architect",
    "architecture",
    "architect",
    "transatlantic commentator",
    "media commentator",
    "radio commentator",
    "food commentator",
    "TV commentator",
    "commentator",
    "printmaker",
]
arts = sorted(list(set(arts)), key=lambda x: len(x), reverse=True)

sports = [
    "sporting executive",
    "boxing executive",
    "sport executive",
    "disability sport administrator",
    "motorsport administrator",
    "athletic administrator",
    "athletic trainer",
    "soccer administrator",
    "boxing administrator",
    "sport administrator",
    "polo administrator",
    "rugby union international player",
    "rugby union international",
    "rugby union player",
    "rugby union playe",
    "rugby player",
]
sports = sorted(list(set(sports)), key=lambda x: len(x), reverse=True)

sciences = [
    "aerospace executive",
    "co inventor of the laser with Charles Townes",
    "inventor of the Dodrill GMR heart machine",
    "inventor of the first digital computer",
    "co inventor of the nicotine patch",
    "inventor of parenteral nutrition",
    "inventor of the mini roundabout",
    "inventor of the hovercraft",
    "inventor of gaffer tape",
    "engineer",
    "founding physician in chief of Children Hospital",
    'physicist known as "The Voice of JPL"',
    "radiation health physics pioneer",
    "physical fitness advocate",
    "public health physician",
    "mathematical physicist",
    "theoretical physicist",
    "detonation physicist",
    "emergency physician",
    "military physician",
    "particle physicist",
    "nuclear physicist",
    "physical chemist",
    "plasma physicist",
    "family physician",
    "sports physician",
    "astrophysicist",
    "biophysicist",
    "geophysicist",
    "physicist",
    "physician",
    "physics",
]
sciences = sorted(list(set(sciences)), key=lambda x: len(x), reverse=True)

business_farming = [
    "software executive",
    "R&D executive",
    "microcomputer entrepreneur",
    "high tech entrepreneur",
    "financial entrepreneur",
    "internet entrepreneur",
    "mining entrepreneur",
    "IT entrepreneur",
    "public relations consultant",
    "investment banker",
    "mortgage banker",
    "merchant banker",
    "banker",
]
business_farming = sorted(
    list(set(business_farming)), key=lambda x: len(x), reverse=True
)

academia_humanities = [
    "teacher",
    "university administrator",
    "cultural administrator",
    "college administrator",
    "philosopher of",
    "philosopher of",
    "philosophy of",
    "humanist philosopher",
    "cultural philosopher",
    "philosopher",
    "philosophy",
    "classical scholar who deciphered Linear B",
    "scholar who specialized in Coptic art",
    "scholar of library science",
    "studies scholar",
    "Dead Sea Scrolls scholar",
    "leading scholar of Mon",
    "Shakespearean scholar",
    "scholar of literature",
    "Shakespeare scholar",
    "literature scholar",
    "classical scholar",
    "literary scholar",
    "scholar of",
    "Napoleon scholar",
    "oriental scholar",
    "Judaica scholar",
    "Tolkien scholar",
    "Saxon scholar",
    "art scholar",
    "scholar",
    "critical theorist",
    "public schools superintendent",
    "architectural conservationist",
    "architecture preservationist",
]
academia_humanities = sorted(
    list(set(academia_humanities)), key=lambda x: len(x), reverse=True
)

law_enf_military_operator = [
    "first civilian to receive the Intelligence Medal of Merit",
]
law_enf_military_operator = sorted(
    list(set(law_enf_military_operator)), key=lambda x: len(x), reverse=True
)

spiritual = [
    "advocate of liberation theology",
    "Protestant theologian",
    "Christian theologian",
    "theology",
    "lay theologian",
    "ecotheologian",
    "theologian",
    "public speaker",
]
spiritual = sorted(list(set(spiritual)), key=lambda x: len(x), reverse=True)

social = [
    "public bookcase proponent",
]
social = sorted(list(set(social)), key=lambda x: len(x), reverse=True)

crime = []
crime = sorted(list(set(crime)), key=lambda x: len(x), reverse=True)

event_record_other = [
    "Holocaust survivor following escape from Sobibór",
    "survivor of the Munich massacre",
    "concentration camp survivor",
    "botched execution survivor",
    "sexual assault survivor",
    "kidnapping survivor",
    "Holocaust survivor",
    "holocaust survivor",
    "ebola survivor",
]
event_record_other = sorted(
    list(set(event_record_other)), key=lambda x: len(x), reverse=True
)

other_species = []
other_species = sorted(list(set(other_species)), key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [54]:
# Hard-coding info_2_1 value for entry to correctly categorize
index = df[df["link"] == "https://en.wikipedia.org/wiki/John_Leland_Atwood"].index
df.loc[index, "info_2_1"] = ""

# Hard-coding info_2_1 value for entry to correctly categorize
index = df[df["link"] == "https://en.wikipedia.org/wiki/Zhu_Yuli"].index
df.loc[index, "info_2_1"] = ""

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [55]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2_1`

In [56]:
%%time

# Column to check
column = 'info_2_1'

# Start dataframe
dataframe = df[df[column].notna()]

# For loop to find role in column and extract it as category
for category, category_lst in known_for_dict.items():
    for role in category_lst:
        for index in dataframe.index:
                item = df.loc[index, column]
                if item:
                    if role in item:
                        df.loc[index, category] = 1
                        df.loc[index, column] = item.replace(role, '').strip()

# Calculating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

CPU times: total: 30.5 s
Wall time: 30.5 s


<IPython.core.display.Javascript object>

#### Checking Updated `num_categories` Value Counts

In [57]:
# Checking updated num_categories value counts
df["num_categories"].value_counts()

1    90435
2     7436
3       98
0       75
Name: num_categories, dtype: int64

<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2_1`

In [58]:
# # Obtaining values for column and their counts
# roles_list = df["info_2_1"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [59]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [60]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_2_1"].notna()].index
#             if "instructor" in df.loc[index, "info_2_1"]
#         ],
#         "info_2_1",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [61]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_list, key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [62]:
# # Example code to check additional category for specific entries
# df.loc[
#     [
#         index
#         for index in df.index
#         if df.loc[index, "info_2_1"] == "executive"
#         and df.loc[index, "num_categories"] == 0
#     ],
#     :,
# ]

<IPython.core.display.Javascript object>

In [63]:
# # Example code to quick-check a specific entry
# df[df["info_2_1"] == "media ecologist"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [64]:
# Creating lists for each category and sorting by decreasing length and removing duplicates

politics_govt_law = [
    "urban designer",
    "conservationist",
    "presidential military advisor",
    "political scientist",
    "influential patent attorney",
    "district attorney",
    "attorney general",
    "attorney",
    "environmentalist",
    "political science at the University of",
    "political campaign chairwoman",
    "political power broker",
    "political strategist",
    "political campaigner",
    "political consultant",
    "political scientist",
    "political dissident",
    "political candidate",
    "political operative",
    "political prisoner",
    "political reformer",
    "political analyst",
    "political advisor",
    "political adviser",
    "political refugee",
    "political insider",
    "political science",
    "political leader",
    "political figure",
    "political aide",
    "political",
    "ambassador to Arabia",
    "ambassador to and",
    "ambassador to the",
    "ambassador to",
    "ambassador",
    "political science at the University of",
    "prisoner of conscience",
    "political dissident",
    "confined dissident",
    "a dissident",
    "dissident",
    "chief legal counsel to the RNC",
    "legal reformer",
    "legal analyst",
    "legal",
    "mistress of the minister Joseph Goebbels",
    "struggle veteran",
    "doctor of law",
    "aristocrat",
]
politics_govt_law = sorted(
    list(set(politics_govt_law)), key=lambda x: len(x), reverse=True
)

arts = [
    "inventor of the action figure",
    "urban design critic",
    "golf course designer",
    "racing car designer",
    "product designer",
    'lyricist of "Arrivederci Roma"',
    "lyricist in the language",
    "lyricist of",
    "lyricist",
    "essayist with the stage name Buddy Blue",
    "essayist",
    "vaudeville entertainer",
    "nightclub entertainer",
    "children entertainer",
    "entertainment",
    "entertainer",
    "Grammy winning arranger",
    "arranger",
    "comic strip cartoonist",
    "Superman cartoons",
    "cartoonists",
    "cartoonist",
    "weather presenter",
    "radio presenter",
    "news presenter",
    "TV presenter",
    "presenter",
    "columnist for the San Francisco Chronicle",
    "syndicated columnist",
    "newspaper columnist",
    "magazine columnist",
    "literary columnist",
    "tabloid columnist",
    "advice columnist",
    "humor columnist",
    "columnist",
    "Cultural Medallion winner",
    "collector of Beatles memorabilia",
    "fine arts collector",
    "antique collector",
    "record collector",
    "song collector",
    "art collector",
    "social media personality",
    "variety show personality",
    "reality show personality",
    "internet personality",
    "Internet personality",
    "YouTube personality",
    "radio personality",
    "media personality",
    "TV personality",
    "personality",
    "James Bond stunt",
    "stunt performer",
    "movie stuntman",
    "stunt double",
    "stuntperson",
    "stuntwoman",
    "stunt man",
    "stuntman",
    "Chicago blues harmonica player",
    "tin whistle player",
    "flugelhorn player",
    "harmonica player",
    "saxophone player",
    "clarinet player",
    "trombone player",
    "keyboard player",
    "Alghoza player",
    "ukulele player",
    "trumpet player",
    "marimba player",
    "yueqin player",
    "tabla player",
    "banjo player",
    "viola player",
    "kora player",
    "bass player",
    "oud player",
    "science fiction",
    "animal trainer",
    "festival organiser",
    "festival promoter",
    "festival founder",
    "tourism promoter",
    "organist choirmaster Washington National Cathedral",
    "concert organist",
    "blues organist",
    "organist",
    "radio announcer",
    "announcer",
    "blogger",
    "computer animation",
    "animator",
    "media ecologist",
]
arts = sorted(list(set(arts)), key=lambda x: len(x), reverse=True)

sports = [
    "football player",
    "football",
    "sports imposter",
    "sports",
    "International Cricket Council match referee",
    "NHL referee",
    "referee",
    "assistant racehorse trainer",
    "racehorse trainer",
    "horse trainer",
    "youth trainer",
    "international umpire",
    "umpire",
    "underwater explorer",
    "marine explorer",
    "polar explorer",
    "explorer",
    "mountain climber",
    "mountain runner",
    "mountaineer",
]
sports = sorted(list(set(sports)), key=lambda x: len(x), reverse=True)

sciences = [
    "chief designer of the Ada programming language",
    "designer of golf clubs",
    "industrial designer",
    "automobile designer",
    "jet engine designer",
    "aircraft designer",
    "course designer",
    "engine designer",
    "yacht designer",
    "boat designer",
    "psychologist from the Antilles",
    "pastoral psychologist",
    "parapsychologist",
    "psychologist",
    "mathematical logician",
    "applied mathematician",
    "mathematician",
    "mathematics",
    "computer scientist developed diehard tests",
    'social scientist who coined the term ""',
    "theoretical computer scientist",
    "information research scientist",
    "pioneering computer scientist",
    "pharmaceutical scientist",
    "climate change scientist",
    "environmental scientist",
    "communication scientist",
    "conservation scientist",
    "agricultural scientist",
    "nutritional scientist",
    "planetary scientist",
    "fisheries scientist",
    "cognitive scientist",
    "materials scientist",
    "fuel cell scientist",
    "computer scientist",
    "forensic scientist",
    "research scientist",
    "nuclear scientist",
    "medical scientist",
    "climate scientist",
    "systems scientist",
    "social scientist",
    "rocket scientist",
    "paper scientist",
    "space scientist",
    "color scientist",
    "neuroscientist",
    "food scientist",
    "soil scientist",
    "plant collector",
    "skull collector",
    "computer science researcher",
    "planetary science pioneer",
    "information science",
    "science adviser",
    "popular science",
    "social science",
    "psychotherapist",
    "art therapist",
    "therapist",
    "psychoanalyst",
    "medical aid developer",
    "medical practitioner",
    "medical researcher",
    "medical scientist",
    "medical pioneer",
    "medical doctor",
    "medical",
    "molecular genetics pioneer",
    "cancer genetic researcher",
    "population geneticist",
    "behavioral geneticist",
    "molecular geneticist",
    "plant geneticist",
    "geneticist",
    "botanist",
    "Auschwitz concentration camp doctor during World War II",
    "theoretical ecologist",
    "plant ecologist",
    "gynaecologist",
    "gynecologist",
]
sciences = sorted(list(set(sciences)), key=lambda x: len(x), reverse=True)

business_farming = [
    "entrepreneur",
    "tourism pioneer",
    "wife of industrialist Charles W Engelhard Jr",
    "industrialist merged Studebaker",
    "industrialist",
    "restaurateur",
]
business_farming = sorted(
    list(set(business_farming)), key=lambda x: len(x), reverse=True
)

academia_humanities = [
    "former curator at the Smithsonian Institution",
    "curator at the Museum",
    "museum curator",
    "art curator",
    "curator",
    "linguistic anthropologist",
    "linguistics",
    "linguist",
    "polymath",
    "sociologist from",
    "sociologist",
    "sociology",
    "industrial archaeologist",
    "classical archaeologist",
    "archaeologist",
    "cultural anthropologist",
    "social anthropologist",
    "palaeoanthropologist",
    "paleoanthropologist",
    "anthropologist",
    "stamp collector",
    "library science pioneer",
    "book collector",
]
academia_humanities = sorted(
    list(set(academia_humanities)), key=lambda x: len(x), reverse=True
)

law_enf_military_operator = [
    "military intelligence expert",
    "secessionist Biafran military commander",
    "military commander during World War II",
    "military drill instructor",
    "military junta leader",
    "paramilitary leader",
    "military strategist",
    "military commander",
    "military official",
    "military officer",
    "military veteran",
    "military leader",
    "military figure",
    "paramilitary",
    "recipient of the Medal of Honor for his actions in World War II",
    "Medal of Honor recipient for actions in War",
    "Medal of Honor recipient in World War II",
    "a recipient of the Medal of Honor",
    "recipient of the Medal of Honor",
    "recipient the Medal of Honor",
    "Medal of Freedom recipient",
    "Medal of Honor recipient",
    "Military Medal recipient",
    "George Medal",
    "the last surviving soldier to have taken part in the Christmas truce of",
    "decorated soldier",
    "Wehrmacht soldier",
    "rebel soldier",
    "WWII soldier",
    "soldier",
    "trainer of Violette Szabó",
    "cosmonaut trainer",
    "spy during World War II",
    "spy for the Union",
    "spymaster",
    "Metropolitan police officer",
    "political police commander",
    "former police chief",
    "police informant",
    "police official",
    "police officer",
    "police chief",
    "policeman",
    "Women Airforce Service Pilots veteran",
    "International Brigades veteran",
    "last known World War I veteran",
    "World War II veteran",
    "World War I veteran",
    "Civil War veteran",
    "veterans advocate",
    "military veteran",
    "army veteran",
    "war veteran",
    "WW veteran",
    "World War II army officer",
    "army officer",
    "army general",
    "army veteran",
    "army colonel",
]
law_enf_military_operator = sorted(
    list(set(law_enf_military_operator)), key=lambda x: len(x), reverse=True
)

spiritual = [
    "Southern Baptist minister",
    "Congregational minister",
    "Nonconformist minister",
    "Presbyterian minister",
    "Methodist minister",
    "ordained minister",
    "Baptist minister",
    "baptist minister",
    "missionary",
]
spiritual = sorted(list(set(spiritual)), key=lambda x: len(x), reverse=True)

social = ["socialite", "humanitarian"]
social = sorted(list(set(social)), key=lambda x: len(x), reverse=True)

crime = [
    "convicted murderer executed by hanging at dawn in Changi Prison for manipulating",
    "convicted accomplice in the Freedom Summer murders",
    "convicted attempted murderer",
    "convicted child sex offender",
    "convicted triple murderer",
    "convicted state terrorist",
    "convicted drug trafficker",
    "convicted child molester",
    "convicted manslaughterer",
    "convicted war criminal",
    "convicted sex offender",
    "convicted child rapist",
    "convicted child abuser",
    "convicted extortionist",
    "convicted conspirator",
    "convicted bank robber",
    "convicted fraudster",
    "convicted embezzler",
    "convicted terrorist",
    "convicted art thief",
    "convicted racketeer",
    "convicted kidnapper",
    "convicted murderer",
    "convicted criminal",
    "convicted con man",
    "convicted rapist",
    "convicted killer",
    "convicted robber",
    "convicted bomber",
    "convicted felon",
    "murder convict",
    "convicted spy",
    "convict",
    "corrupt",
]
crime = sorted(list(set(crime)), key=lambda x: len(x), reverse=True)

event_record_other = [
    "alleged spy",
]
event_record_other = sorted(
    list(set(event_record_other)), key=lambda x: len(x), reverse=True
)

other_species = ["champion sire", "sire old age", "active sire", "sire"]
other_species = sorted(list(set(other_species)), key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [65]:
# Hard-coding info_2_1 for entry to correctly categorize
index = df[df["link"] == "https://en.wikipedia.org/wiki/Frederick_D._Sulcer"].index
df.loc[index, "info_2_1"] = "marketing"

# Hard-coding info_2_1 for entry to correctly categorize
index = df[
    df["link"] == "https://en.wikipedia.org/wiki/Matthew_Young_(civil_servant)"
].index
df.loc[index, "info_2_1"] = "CEO Panini Group"

# Hard-coding info_2_1 for entry to correctly categorize
index = df[df["link"] == "https://en.wikipedia.org/wiki/Harvey_R._Blau"].index
df.loc[index, "info_2_1"] = "CEO Griffon"

# Hard-coding info_2_1 for entry to correctly categorize
index = df[df["link"] == "https://en.wikipedia.org/wiki/Moussa_Benhamadi"].index
df.loc[index, "info_2_1"] = "CEO business"

# Dropping remaining info_2_1 values == 'executive' as redundant to first category
df.loc[
    [index for index in df.index if df.loc[index, "info_2_1"] == "executive"],
    "info_2_1",
] = ""

# Hard-coding info_2_1 for entry to correctly categorize
index = df[df["link"] == "https://en.wikipedia.org/wiki/Roger_O._Egeberg"].index
df.loc[index, "info_2_1"] = "political"

# Hard-coding info_2_1 for entry to correctly categorize
index = df[df["link"] == "https://en.wikipedia.org/wiki/Kenneth_H._Wood"].index
df.loc[index, "info_2_1"] = "religious"

# Hard-coding info_2_1 for entry to correctly categorize
index = df[df["link"] == "https://en.wikipedia.org/wiki/Chittaranjan_Mitra"].index
df.loc[index, "info_2_1"] = "university official"

# Hard-coding info_2_1 for entry to correctly categorize
index = df[df["link"] == "https://en.wikipedia.org/wiki/Paul_B._Ferrara"].index
df.loc[index, "info_2_1"] = "forensic"

# Hard-coding info_2_1 for entry to correctly categorize
index = df[df["link"] == "https://en.wikipedia.org/wiki/Sandy_D%27Alemberte"].index
df.loc[index, "info_2_1"] = "university official"

# Dropping remaining info_2_1 values == 'executive' as redundant to first category
df.loc[
    [index for index in df.index if df.loc[index, "info_2_1"] == "administrator"],
    "info_2_1",
] = ""

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [66]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2_1`

In [67]:
%%time

# Column to check
column = 'info_2_1'

# Start dataframe
dataframe = df[df[column].notna()]

# For loop to find role in column and extract it as category
for category, category_lst in known_for_dict.items():
    for role in category_lst:
        for index in dataframe.index:
                item = df.loc[index, column]
                if item:
                    if role in item:
                        df.loc[index, category] = 1
                        df.loc[index, column] = item.replace(role, '').strip()

# Calculating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

CPU times: total: 45.7 s
Wall time: 45.7 s


<IPython.core.display.Javascript object>

#### Checking Updated `num_categories` Value Counts

In [68]:
# Checking updated num_categories value counts
df["num_categories"].value_counts()

1    89634
2     8216
3      120
0       74
Name: num_categories, dtype: int64

<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2_1`

In [69]:
# # Obtaining values for column and their counts
# roles_list = df["info_2_1"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [70]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [71]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_2_1"].notna()].index
#             if "official" in df.loc[index, "info_2_1"]
#         ],
#         "info_2_1",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [72]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_list, key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [73]:
# # Example code to check additional category for specific entries
# df.loc[
#     [
#         index
#         for index in df.index
#         if df.loc[index, "info_2_1"] == "Nobel Prize laureate"
#         and df.loc[index, "num_categories"] == 0
#     ],
#     :,
# ]

<IPython.core.display.Javascript object>

In [74]:
# # Example code to quick-check a specific entry
# df[df["info_2_1"] == "banking official"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [75]:
# Creating lists for each category and sorting by decreasing length and removing duplicates

politics_govt_law = [
    "automotive safety advocate",
    "mental health care advocate",
    "RMS Titanic conspiracy theorist",
    "conspiracy theorist",
    "former prime minister",
    "minister of state",
    "cabinet minister",
    "foreign minister",
    "former minister",
    "deputy minister",
    "prime minister",
    "minister",
    "nonviolence advocate",
    "one of the leaders of the national emancipation movement",
    "a deputy leader of the Party of Regions",
    "leader of the Communist Party of the",
    "a leader of the Communist Party of",
    "opposition leader in Ingushetia",
    "leader of Communist Party of",
    "Corsican nationalism leader",
    "revisionist Zionist leader",
    "co leader of coup d'état",
    "Hmong community leader",
    "Kongu community leader",
    "independentism leader",
    "environmental leader",
    "Alaska Native leader",
    "coup d'état leader",
    "leader of the FNLA",
    "opposition leader",
    "leader of the GDR",
    "communist leader",
    "Communist leader",
    "leader of Group",
    "Islamist leader",
    "first leader of",
    "workers' leader",
    "tribal leader",
    "civic leader",
    "labor leader",
    "Māori leader",
    "coup leader",
    "defector",
    "urban planner",
    "law lord",
    "law",
    "courtier of Queen Elizabeth II",
    "courtier",
    "magistrate",
    "landowner",
    "oil industry whistleblower",
    "Scientology whistleblower",
    "whistleblower",
    "statesman",
    "Holocaust denier",
    "politologist",
    "United Nations disarmament official",
    "official during World War II",
    "State Department official",
    "later East y official",
    "banking official",
    "WHO official",
]
politics_govt_law = sorted(
    list(set(politics_govt_law)), key=lambda x: len(x), reverse=True
)

arts = [
    "caricaturist",
    "Oscar winner",
    "woodcarver",
    "carver",
    "critic",
    "radio talk show host",
    "variety show host",
    "radio show host",
    "game show host",
    "talk show host",
    "ghost hunter",
    "TV show host",
    "radio host",
    "TV host",
    "host of",
    "media theorist",
    "communicator",
    "cinematographer",
    "first woman weathercaster in the",
    "radio newscaster",
    "surf forecaster",
    "crop forecaster",
    "newscaster",
    "podcaster",
    "free jazz violinist",
    "roll violinist",
    "viola da gamba",
    "violinist",
    "violist",
    "wife of leader founder of Ladysmith Black Mambazo",
    "memoirist",
    "DJ of Run DMC",
    "trance DJ",
    "bass DJ",
    "rock DJ",
    "DJ",
    "crime reporter",
    "news reporter",
    "reporter",
    "satirist",
    "substitute anchor for WNBC",
    "news anchor",
    "anchor",
    "viral video performer",
    "nightclub performer",
    "Broadway performer",
    "theremin performer",
    "cabaret performer",
    "performer",
    "jazz bassoonist",
    "jazz bassist",
    "bassoonist",
    "bassist",
    "bass MC",
    "bass DJ",
    "chef",
    "cellist",
    "disc jockey",
    "beauty pageant contestant",
    "beauty pageant winner",
    "beauty queen",
    "escapologist",
    "Screen Actors Guild official",
]
arts = sorted(list(set(arts)), key=lambda x: len(x), reverse=True)

sports = [
    "the first ever Sidecarcross World Championship",
    "Grand Prix race car driver",
    "ARCA race car driver owner",
    "reincarnation researcher",
    "touring car racer",
    "racecar driver",
    "car builder",
    "Gold Glove winning first baseman One of the first Hispanic players in the",
    "father of San Francisco Giants ballplayer Barry Bonds",
    "professional poker player",
    "water polo player",
    "volleyball player",
    "streetball player",
    "billiards player",
    "lacrosse player",
    "softball player",
    "checkers player",
    "snooker player",
    "croquet player",
    "soccer player",
    "squash player",
    "poker player",
    "chess player",
    "polo player",
    "Go player",
    "player",
    "boxing trainer",
    "trainer",
    "boxing promoter",
    "yoga instructor",
    "adventurer",
    "a member of the MLB Hall of Fame",
    "member of the MLB Hall of Fame",
    "modern pentathlete",
    "masters athlete",
    "pentathlete",
    "decathlete",
    "triathlete",
    "athlete",
    "boxing official",
]
sports = sorted(list(set(sports)), key=lambda x: len(x), reverse=True)

sciences = [
    "inventor",
    "paediatric cardiologist",
    "health care consultant",
    "car constructor",
    "carcinologist",
    "cardiologist",
    "cartographer",
    "scientist",
    "flat Earth conspiracy theorist",
    "control theorist",
    "science",
    "sex researcher at Johns Hopkins University",
    "sudden infant death syndrome researcher",
    "leading researcher into category theory",
    "eye tissue transplant researcher",
    "reproductive medicine researcher",
    "Down syndrome researcher",
    "reincarnation researcher",
    "paranormal researcher",
    "scientific researcher",
    "toxicology researcher",
    "behavioral researcher",
    "stem cell researcher",
    "clinical researcher",
    "HIV AIDS researcher",
    "leukemia researcher",
    "security researcher",
    "diabetes researcher",
    "opinion researcher",
    "polymer researcher",
    "cancer researcher",
    "social researcher",
    "autism researcher",
    "sleep researcher",
    "dream researcher",
    "peace researcher",
    "nurse researcher",
    "AIDS researcher",
    "UFO researcher",
    "pioneer of plant ecology",
    "ecologist",
    "doctor",
    "theoretical chemist",
    "freshwater chemist",
    "agrochemist",
    "biochemist",
    "geochemist",
    "chemistry",
    "chemist",
    "statistician",
    "virologist",
    "psychiatrist",
    "criminologist",
    "renowned plastic surgeon",
    "orthopedic surgeon",
    "plastic surgeon",
    "surgeon",
    "theoretical astronomer",
    "radio astronomer",
    "gastronomist",
    "astronomer",
    "reproductive endocrinologist",
    "comparative physiologist",
    "developmental biologist",
    "paediatric cardiologist",
    "evolutionary biologist",
    "structural geologist",
    "molecular biologist",
    "gastroenterologist",
    "plant physiologist",
    "plant pathologist",
    "neurophysiologist",
    "marine biologist",
    "neuropathologist",
    "cryptozoologist",
    "endocrinologist",
    "myriapodologist",
    "ophthalmologist",
    "geomorphologist",
    "microbiologist",
    "epidemiologist",
    "pharmacologist",
    "paleontologist",
    "neurobiologist",
    "epileptologist",
    "cell biologist",
    "parasitologist",
    "ophthamologist",
    "astrobiologist",
    "glycobiologist",
    "rheumatologist",
    "criminologist",
    "ornithologist",
    "meteorologist",
    "methodologist",
    "lichenologist",
    "carcinologist",
    "arachnologist",
    "pteridologist",
    "haematologist",
    "physiologist",
    "immunologist",
    "cardiologist",
    "toxicologist",
    "entomologist",
    "seismologist",
    "speleologist",
    "hematologist",
    "futurologist",
    "malacologist",
    "a mycologist",
    "mineralogist",
    "enzymologist",
    "pathologist",
    "neurologist",
    "radiologist",
    "osteologist",
    "phycologist",
    "histologist",
    "cosmologist",
    "virologist",
    "oncologist",
    "sexologist",
    "mycologist",
    "rheologist",
    "gemologist",
    "cytologist",
    "bryologist",
    "ecologist",
    "biologist",
    "geologist",
    "zoologist",
    "ufologist",
    "enologist",
]
sciences = sorted(list(set(sciences)), key=lambda x: len(x), reverse=True)

business_farming = [
    "creator of Pernod Ricard",
    "carpet distributor",
    "farmer",
    "billionaire",
    "international financier",
    "a financier",
    "financier",
]
business_farming = sorted(
    list(set(business_farming)), key=lambda x: len(x), reverse=True
)

academia_humanities = [
    "principal theorist",
    "consciousness theorist",
    "intelligence theorist",
    "evolution theorist",
    "cultural theorist",
    "literary theorist",
    "design theorist",
    "play theorist",
    "art theorist",
    "World War II researcher",
    "cultural researcher",
    "university lecturer",
    "lecturer",
    "librarian",
    "bibliophile",
    "folklorist of cultures",
    "folklorist",
    "sound archivist",
    "archivist",
    "epistemologist",
    "philologist",
    "ethnologist",
    "lexicographer",
    "official pronouncer of the Scripps National Spelling Bee from to",
    "university official",
]
academia_humanities = sorted(
    list(set(academia_humanities)), key=lambda x: len(x), reverse=True
)

law_enf_military_operator = [
    "hostage negotiator",
    "spy",
    "flight instructor",
    "test pilot who twice held the world flight altitude record",
    "World War II Tuskegee Airman fighter pilot",
    "the first female combat pilot of the world",
    "an ace nightfighter pilot in World War II",
    "fighter pilot during World War II",
    "World War II fighter pilot",
    "World War II bomber pilot",
    "record setting test pilot",
    "World War II pilot",
    "War fighter pilot",
    "aerobatics pilot",
    "fighter pilot",
    "bomber pilot",
    "glider pilot",
    "USAAF pilot",
    "test pilot",
    "WASP pilot",
    "pilot",
    "Lancaster aircraft",
    "leader of Operation Halyard",
    "Al Shabaab leader",
    "recipient of the Knight Cross of the Iron Cross during World War II",
    "member of the Tuskegee Airmen during World War II",
    "commander of the World War II Tuskegee Airmen",
    "member of the Resistance during World War II",
    "member of the Waffen SS during World War II",
    "World War II Tuskegee Airman fighter pilot",
    "Colditz Castle escapee during World War II",
    "an ace nightfighter pilot in World War II",
    "destroyer commander during World War II",
    "resistance fighter during World War II",
    "squadron commander during World War II",
    "resistance member during World War II",
    "Wehrmacht general during World War II",
    "during World War II",
    "Chetnik commander during World War II",
    "World War II prisoner of war escapee",
    "U boat commander during World War II",
    "prisoner of war during World War II",
    "mine specialist during World War II",
    "anti resisister during World War II",
    "World War II foreign",
    "highly decorated World War II hero",
    "SOE operative during World War II",
    "fighter pilot during World War II",
    "World War II submarine commander",
    "war criminal during World War II",
    "flying ace during World War II",
    "World War II resistance member",
    "World War II resistance worker",
    "SS captain during World War II",
    "SS officer during World War II",
    "navigator during World War II",
    "SEO agent during World War II",
    "World War II partisan fighter",
    "general during World War II",
    "World War II fighter pilot",
    "World War II bomber pilot",
    "World War II codebreaker",
    "World War II fighter ace",
    "World War II flying ace",
    "World War II aviator",
    "World War II pilot",
    "after World War II",
    "World War II hero",
    "naval secretary",
    "naval officer",
    "naval aviator",
    "naval",
    "intelligence official in the Central Intelligence Agency",
    "intelligence official",
    "Pentagon official",
    "Al Qaeda official",
    "security official",
]
law_enf_military_operator = sorted(
    list(set(law_enf_military_operator)), key=lambda x: len(x), reverse=True
)

spiritual = [
    "Catholic cardinal",
    "first cardinal",
    "cardinal",
    "religious leader",
    "Shinshu Buddhist priest",
    "Zen Buddhist priest",
    "Church of priest",
    "Catholic priest",
    "Anglican priest",
    "an priest",
    "priest",
    "gospel",
    "expert on biblical manuscripts",
    "biblical",
    "bible",
    "Orthodox rabbi",
    "Reform rabbi",
    "chief rabbi",
    "rabbi",
]
spiritual = sorted(list(set(spiritual)), key=lambda x: len(x), reverse=True)

social = [
    "scouting leader",
    "Jewish community leader",
    "Muslim community leader",
    "social worker",
    "pastor of the Worldwide Church of God",
    "megachurch pastor",
    "Baptist pastor",
    "pastor",
]
social = sorted(list(set(social)), key=lambda x: len(x), reverse=True)

crime = [
    "carjacker",
    "hostage taker",
    "rapist",
    "father murderer of Marvin Gaye",
    "murderer of Gianni Versace",
    "mass murderer",
    "high ranking leader of the Sinaloa Cartel",
    "self confessed serial killer",
    "suspected serial killer",
    "serial killer",
    "spree killer",
    "killer",
    "fraudster",
    "war criminal during World War II",
    "gangland criminal",
]
crime = sorted(list(set(crime)), key=lambda x: len(x), reverse=True)

event_record_other = [
    "Islamic jihad hostage",
    "hostage killed in",
    "ISIS hostage",
    "AQAP hostage",
    "suspected murderer of Olof Palme",
    "suspected murderer",
    "suspected murderer of rapper Tupac Shakur",
    'later pardoned of being World War II propagandist "Tokyo Rose"',
    "murder victim whose killing was documented in the movie: Dear Zachary",
    "anthrax attack victim",
    "killing spree victim",
    "vilification victim",
    "kidnapping victim",
    "Unabomber victim",
    "victims advocate",
    "assault victim",
    "murder victim",
    "kidnap victim",
    "hazing victim",
    "polio victim",
    "ETA victim",
    "neo victim",
    "victim",
    "alleged war criminal",
]
event_record_other = sorted(
    list(set(event_record_other)), key=lambda x: len(x), reverse=True
)

other_species = []
other_species = sorted(list(set(other_species)), key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [76]:
# Hard-coding info_2_1 value for entry to correctly categorize
index = df[df["link"] == "https://en.wikipedia.org/wiki/William_Salice"].index
df.loc[index, "info_2_1"] = "arts"

# Dropping info_2_1 value for entries where value is redundant for category
df.loc[
    [index for index in df.index if df.loc[index, "info_2_1"] == "promoter"], "info_2_1"
] = ""

# Hard-coding info_2_1 value for entry to correctly categorize
index = df[df["link"] == "https://en.wikipedia.org/wiki/Canaan_Banana"].index
df.loc[index, "info_2_1"] = "religous"

# Hard-coding info_2_1 value for entry to correctly categorize
index = df[df["link"] == "https://en.wikipedia.org/wiki/Bobbi_Jean_Baker"].index
df.loc[index, "info_2_1"] = "religous"

# Dropping info_2_1 value for entries where value is redundant for category
df.loc[
    [index for index in df.index if df.loc[index, "info_2_1"] == "scout"], "info_2_1"
] = ""

# Dropping info_2_1 value for entries where value is redundant for category
df.loc[
    [
        index
        for index in df.index
        if df.loc[index, "info_2_1"] == "Nobel Prize laureate"
    ],
    "info_2_1",
] = ""

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [77]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2_1`

In [78]:
%%time

# Column to check
column = 'info_2_1'

# Start dataframe
dataframe = df[df[column].notna()]

# For loop to find role in column and extract it as category
for category, category_lst in known_for_dict.items():
    for role in category_lst:
        for index in dataframe.index:
                item = df.loc[index, column]
                if item:
                    if role in item:
                        df.loc[index, category] = 1
                        df.loc[index, column] = item.replace(role, '').strip()

# Calculating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

CPU times: total: 53.1 s
Wall time: 53.1 s


<IPython.core.display.Javascript object>

#### Checking Updated `num_categories` Value Counts

In [79]:
# Checking updated num_categories value counts
df["num_categories"].value_counts()

1    89132
2     8707
3      134
0       71
Name: num_categories, dtype: int64

<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2_1`

In [80]:
# # Obtaining values for column and their counts
# roles_list = df["info_2_1"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [81]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [82]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_2_1"].notna()].index
#             if "militant" in df.loc[index, "info_2_1"]
#         ],
#         "info_2_1",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [83]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_list, key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [84]:
# # Example code to check additional category for specific entries
# df.loc[
#     [
#         index
#         for index in df.index
#         if df.loc[index, "info_2_1"] == "official"
#         and df.loc[index, "num_categories"] == 0
#     ],
#     :,
# ]

<IPython.core.display.Javascript object>

In [85]:
# # Example code to quick-check a specific entry
# df[df["info_2_1"] == "murdered by militants in the West Bank city of Ramallah"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [86]:
# Creating lists for each category and sorting by decreasing length and removing duplicates

politics_govt_law = [
    "former Conservative Party Chancellor of the Exchequer",
    "presidential candidate for the Libertarian Party",
    "member of the Communist Party of",
    "chairman of the Communist Party",
    "the Communist Party of Great",
    "partner of nationalist Subhas Chandra Bose",
    "partner in Jacoby & Meyers",
    "parliamentarian for the Peronist party",
    "party functionary",
    "fourth Governor general of",
    "th Prime Minister of land",
    "Deputy Prime Minister of",
    "the th Prime Minister of",
    "first Prime Minister of",
    "former Prime Minister",
    "Prime Minister of",
    "Prime Minister",
    "barrister",
    "anti Castro lobbyist",
    "labor lobbyist",
    "lobbyist",
    "advocate for women reproductive rights",
    "women reproductive health advocate",
    "advocate of freedom of the press",
    "disabled people rights advocate",
    "intellectual freedom advocate",
    "reproductive rights advocate",
    "anti death penalty advocate",
    "indigenous rights advocate",
    "assisted suicide advocate",
    "patients' rights advocate",
    "abortion rights advocate",
    "patient rights advocate",
    "tribal rights advocate",
    "death penalty advocate",
    "animal rights advocate",
    "women rights advocate",
    "human rights advocate",
    "anti torture advocate",
    "independence advocate",
    "anti tobacco advocate",
    "gun control advocate",
    "deaf rights advocate",
    "free speech advocate",
    "sex worker advocate",
    "euthanasia advocate",
    "gay rights advocate",
    "Men rights advocate",
    "marijuana advocate",
    "peace advocate",
    "LGBT advocate",
    "Vice President of the National Congress",
    "Presidential candidate George McGovern",
    "wife of Vice President Hubert Humphrey",
    "spokesman for President Richard Nixon",
    "Socialist candidate for President in",
    "President of the United Auto Workers",
    "member of Presidential commissions",
    "Vice President of the Commission",
    "President of during The Junta",
    "President Georges Pompidou",
    "fifth President of from to",
    "four term President of",
    "President of istan",
    "vice President of",
    "laureate of the Nobel Memorial Prize in Economic Sciences",
    "revolutionary",
    "solicitor",
    "scientific adviser",
    "prosecutor at the Nuremberg war crimes trials",
    "prosecutor",
]
politics_govt_law = sorted(
    list(set(politics_govt_law)), key=lambda x: len(x), reverse=True
)

arts = [
    "designer",
    "proponent of the Hollow Earth concept",
    "Pulitzer Prize winning foreign correspondent",
    "award winning foreign correspondent",
    "Army art correspondent",
    "foreign correspondent",
    "news correspondent",
    "correspondent",
    "radio innovator",
    "radio analyst",
    "engraver",
    "club owner",
    "winner of the Pulitzer Prize",
    "Pulitzer prize winning",
    "Pulitzer Prize winner",
    "Pulitzer prize winner",
    "magician",
    "clarinetist",
    "studio potter",
    "potter",
    "humorist",
    "choirmaster",
    "film",
    "rapper",
    "flautist",
]
arts = sorted(list(set(arts)), key=lambda x: len(x), reverse=True)

sports = [
    "caster",
    "last living participant in the first F World Championship race",
    "limited partner for the Yankees",
    "general secretary of FIFA",
    "a former owner of the Toronto Maple Leafs",
    "part owner of the Atlanta Braves",
    "owner of Thoroughbred racehorses",
    "thoroughbred racehorse owner",
    "Thoroughbred racehorse owner",
    "co owner of the New Nets",
    "owner of Coolmore Stud",
    "sport franchise owner",
    "billiard hall owner",
    "Rangers part owner",
    "horse stable owner",
    "owner of the Jets",
    "racehorse owner",
    "former President of the ICSD",
    "middle distance runner",
    "long distance runner",
    "triathlon runner",
    "marathon runner",
    "runner",
]
sports = sorted(list(set(sports)), key=lambda x: len(x), reverse=True)

sciences = [
    "car",
    "automotive",
    "expert on the geology of Earth",
    "computer programmer",
    "programmer",
    "recipient of the Nobel Prize in Physiology or Medicine",
    "recipient of the Nobel Prize in Chemistry",
    "recipient of the Nobel Prize in Physics",
    "recipient of the Nobel Prize in Physiology or Medicine",
    "Nobel Prize in Physiology or Medicine laureate",
    "recipient of the Nobel Prize in Chemistry",
    "a winner of the Nobel Prize in Physics in",
    "co winner of Nobel Prize in Chemistry in",
    "recipient of the Nobel Prize in Physics",
    "co winner of Nobel Prize in Physics in",
    "winner of the Nobel Prize in Physics",
    "Nobel Prize for Chemistry laureate",
    "Nobel Prize Laureate",
    "a pioneer in artificial intelligence",
    "pioneer in artificial intelligence",
    "artificial intelligence pioneer",
    "demographer",
    "logician in the analytic tradition",
    "scientific expert",
    "scientific",
    "naturalist",
]
sciences = sorted(list(set(sciences)), key=lambda x: len(x), reverse=True)

business_farming = [
    "fitness centre owner",
    "recipient of the World Food Prize",
]
business_farming = sorted(
    list(set(business_farming)), key=lambda x: len(x), reverse=True
)

academia_humanities = [
    "theorist in Arte Povera movement",
    "theorist",
    "researcher",
    "instructor",
    "Yiddish language preservationist",
    "art preservationist",
    "preservationist",
    "sinologist",
    "philatelist",
    "intellectual",
    "pedagogist",
    "pedagogue",
    "ethnographer",
]
academia_humanities = sorted(
    list(set(academia_humanities)), key=lambda x: len(x), reverse=True
)

law_enf_military_operator = [
    "military",
    "recipient of the Navy Cross",
    "Navy SEAL sniper",
    "Navy",
    "Navy officer",
    "lieutenant general in the Marine Corps",
    "Illinois inspector general",
    "Air Force major general",
    "a general of the Army",
    "lieutenant general",
    "brigadier general",
    "a brigade general",
    "airforce general",
    "Colonel general",
    "major general",
    "ARVN general",
    "WW II flying ace",
    "WWII flying ace",
    "flying ace",
    "Hero of the Union for his actions during WWII",
    "squadron navigator during WWII",
    "gunner during WW II",
    "WWII who claimed to have been the inspiration for",
    "RAF radio operator",
    "radio operator",
    "recipient of the Knight Cross of the Iron Cross",
    "Distinguished Flying Cross recipient",
    "recipient of the Victoria Cross",
    "recipient of nine purple hearts",
    "recipient of the George Cross",
    "recipient of the Silver Star",
    "recipient of the Navy Cross",
    "Legion of honour recipient",
    "Legion of Honour recipient",
    "Military Cross recipient",
    "Victoria Cross recipient",
    "George Cross recipient",
    "Iron Cross recipient",
    "recipient of the Param Vir Chakra",
    "recipient of the Maha Vir Chakra",
    "the first chief of intelligence agency",
    "Chief of intelligence of South",
    "intelligence sergeant major",
    "MI intelligence officer",
    "intelligence specialist",
    "intelligence officer",
    "intelligence analyst",
    "intelligence expert",
    "intelligence chief",
    "astronaut candidate",
    "astronaut",
    "Hero of the Union for his actions during WWII",
    "double Hero of the Union",
    "Hero of the Union",
    "National Hero of",
    "Lieutenant General of the People Army of",
    "member of the National Liberation Army",
    "leader of the Boricua Popular Army",
    "Women Army Corps officer",
    "Army art correspondent",
    "officer in People Army",
    "commander of the Army",
    "a general of the Army",
    "Army first Sergeant",
    "chief of Army Staff",
    "anti fascist militant",
    "Montoneros militant",
    "Islamist militant",
    "far left militant",
    "lmilitant leader",
]
law_enf_military_operator = sorted(
    list(set(law_enf_military_operator)), key=lambda x: len(x), reverse=True
)

spiritual = [
    "young Earth creationist",
    "leader of religious movement The Family International",
    "Catholic lay leader",
    "Christian leader",
    "church leader",
    "Muslim leader",
    "sect leader",
    "chaplain",
    "general superintendent",
    "Christian evangelist",
    "evangelist",
    "Islamic preacher",
    "preacher",
    "motivational speaker",
    "UNICEF Committee President",
    "Salvation Army officer",
]
spiritual = sorted(list(set(spiritual)), key=lambda x: len(x), reverse=True)

social = [
    "advocate for people with disabilities",
    "Order of recipient",
    "Nobel Peace Prize nominee",
]
social = sorted(list(set(social)), key=lambda x: len(x), reverse=True)

crime = [
    'nominal leader of the " Mafia"',
    "Sinaloa Cartel leader",
    "war criminal",
    "criminal",
    "sex offender known as the",
]
crime = sorted(list(set(crime)), key=lambda x: len(x), reverse=True)

event_record_other = [
    "alleged child sex offender",
    "supercentenarian",
    "centenarian",
    "murdered by militants in the West Bank city of Ramallah",
]
event_record_other = sorted(
    list(set(event_record_other)), key=lambda x: len(x), reverse=True
)

other_species = []
other_species = sorted(list(set(other_species)), key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [87]:
# Hard-coding info_2_1 value to correctly categorize entry
index = df[df["link"] == "https://en.wikipedia.org/wiki/Archie_Butterworth"].index
df.loc[index, "info_2_1"] = "scientific"

# Hard-coding info_2_1 value to correctly categorize entry
index = df[df["link"] == "https://en.wikipedia.org/wiki/Carroll_Shelby"].index
df.loc[index, "info_2_1"] = "scientific"

# Hard-coding info_2_1 value to correctly categorize entry
index = df[df["link"] == "https://en.wikipedia.org/wiki/Vasily_Bakalov"].index
df.loc[index, "info_2_1"] = "scientific"

# Dropping info_2_1 = 'researcher' value for entries already in sciences
df.loc[
    [
        index
        for index in df.index
        if df.loc[index, "info_2_1"] == "researcher" and df.loc[index, "sciences"] == 1
    ],
    "info_2_1",
] = ""

# Hard-coding info_2_1 value to correctly categorize entry
index = df[df["link"] == "https://en.wikipedia.org/wiki/Mihailo_%C4%8Canak"].index
df.loc[index, "info_2_1"] = "planner"

# Dropping info_2_1 = 'researcher' value for entries already in sciences
df.loc[
    [
        index
        for index in df.index
        if df.loc[index, "info_2_1"] == "instructor" and df.loc[index, "sports"] == 1
    ],
    "info_2_1",
] = ""

# Dropping info_2_1 = 'researcher' value for entries already in sciences
df.loc[
    [index for index in df.index if df.loc[index, "info_2_1"] == "official"],
    "info_2_1",
] = ""


# Dropping info_2_1 = 'researcher' value for entries already in sciences
df.loc[
    [index for index in df.index if df.loc[index, "info_2_1"] == "Nobel Prize winner"],
    "info_2_1",
] = ""


# Dropping info_2_1 = 'researcher' value for entries already in sciences
df.loc[
    [index for index in df.index if df.loc[index, "info_2_1"] == "Nobel laureate"],
    "info_2_1",
] = ""

# Dropping info_2_1 = 'researcher' value for entries already in sciences
df.loc[
    [index for index in df.index if df.loc[index, "info_2_1"] == "s"], "info_2_1",
] = ""

# Hard-coding cause_of_death for entry with value in info_2_1
index = df[df["link"] == "https://en.wikipedia.org/wiki/Eliyahu_Asheri"].index
df.loc[index, "cause_of_death"] = "murdered by militants"

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [88]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2_1`

In [89]:
%%time

# Column to check
column = 'info_2_1'

# Start dataframe
dataframe = df[df[column].notna()]

# For loop to find role in column and extract it as category
for category, category_lst in known_for_dict.items():
    for role in category_lst:
        for index in dataframe.index:
                item = df.loc[index, column]
                if item:
                    if role in item:
                        df.loc[index, category] = 1
                        df.loc[index, column] = item.replace(role, '').strip()

# Calculating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

CPU times: total: 30.1 s
Wall time: 30.1 s


<IPython.core.display.Javascript object>

#### Checking Updated `num_categories` Value Counts

In [90]:
# Checking updated num_categories value counts
df["num_categories"].value_counts()

1    88920
2     8909
3      148
0       67
Name: num_categories, dtype: int64

<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2_1`

In [91]:
# Obtaining values for column and their counts
roles_list = df["info_2_1"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [138]:
# Code to check each value
roles_list.pop()

'owner'

<IPython.core.display.Javascript object>

In [139]:
# Create specific_roles_list for above popped value
specific_roles_list = (
    df.loc[
        [
            index
            for index in df[df["info_2_1"].notna()].index
            if "owner" in df.loc[index, "info_2_1"]
        ],
        "info_2_1",
    ]
    .value_counts()
    .index.tolist()
)

<IPython.core.display.Javascript object>

In [140]:
# Viewing list sorted by descending length to copy to dictionary below and screen values
sorted(specific_roles_list, key=lambda x: len(x), reverse=True)

['owner of the opposition news website',
 'muffler repair shop owner',
 'record label owner',
 'bookshop owner',
 'vineyard owner',
 'boutique owner',
 'airline owner',
 'gallery owner',
 'casino owner',
 'studio owner',
 'museum owner',
 'media owner',
 'hotel owner',
 'owner']

<IPython.core.display.Javascript object>

In [None]:
# Example code to check additional category for specific entries
df.loc[
    [
        index
        for index in df.index
        if df.loc[index, "info_2_1"] == "judge"
        and df.loc[index, "politics_govt_law"] == 0
    ],
    :,
]

In [137]:
# Example code to quick-check a specific entry
df[df["info_2_1"] == "radio"]

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,info_1_0,info_2_1,info_2_2,info_2_3,info_3_0,info_3_1,info_3_2,info_4_0,info_4_1,info_4_2,info_5_0,info_5_1,info_5_2,info_6_0,info_6_1,info_7_0,info_8_0,info_8_1,info_9_0,info_10_0,info_11_0,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
2430,12,Talbot Duckmanton,", 73, Australian broadcaster and radio and television administrator.",https://en.wikipedia.org/wiki/Talbot_Duckmanton,11,1995,June,,73.0,,Australia,,,2.484907,,radio,television administrator,,,,,,,,,,,,,,,,,,,0,0,0,0,0,1,0,0,0,0,0,0,1
6722,4,Buck Barry,", 80, American actor and radio and television personality.",https://en.wikipedia.org/wiki/Buck_Barry,5,1997,December,,80.0,,United States of America,,,1.791759,,radio,television personality,,,,,,,,,,,,,,,,,,,0,0,0,0,0,1,0,0,0,0,0,0,1
6905,2,Frank Muir,", 77, English comedy writer and radio and television personality.",https://en.wikipedia.org/wiki/Frank_Muir,16,1998,January,,77.0,,United Kingdom of Great Britain and Northern Ireland,,,2.833213,,radio,television personality,,,,,,,,,,,,,,,,,,,0,0,0,0,0,1,0,0,0,0,0,0,1
6996,17,Cliffie Stone,", 80, American musician and radio and TV personality.",https://en.wikipedia.org/wiki/Cliffie_Stone,3,1998,January,,80.0,,United States of America,,,1.386294,,radio,TV personality,,,,,,,,,,,,,,,,,,,0,0,0,0,0,1,0,0,0,0,0,0,1
8190,25,Cas Walker,", 96, American politician and radio and TV personality.",https://en.wikipedia.org/wiki/Cas_Walker,8,1998,September,,96.0,,United States of America,,,2.197225,,radio,TV personality,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,1,0,0,0,1
11567,28,V. E. Howard,", 88, American minister and radio evangelist.",https://en.wikipedia.org/wiki/V._E._Howard,3,2000,September,,88.0,,United States of America,,,1.386294,,radio,,,,,,,,,,,,,,,,,,,,0,0,1,0,0,0,0,0,0,0,0,0,1
55804,29,Buddy Moreno,", 103, American musician and radio and television personality.",https://en.wikipedia.org/wiki/Buddy_Moreno,11,2015,November,,103.0,,United States of America,,,2.484907,,radio,television personality,,,,,,,,,,,,,,,,,,,0,0,0,0,0,1,0,0,0,0,0,0,1
88894,3,Brother Stair,", 87, American Pentecostal evangelical pastor and radio preacher .",https://en.wikipedia.org/wiki/Brother_Stair,22,2021,April,,87.0,,United States of America,,,3.135494,,radio,,,,,,,,,,,,,,,,,,,,0,0,1,0,0,0,0,0,0,0,0,0,1
93223,8,Tony MacMahon,", 82, Irish button accordion player and radio and television broadcaster, subject of .",https://en.wikipedia.org/wiki/Tony_MacMahon,14,2021,October,,82.0,,Ireland,,,2.70805,,radio,television broadcaster,,subject of,,,,,,,,,,,,,,,,,0,0,0,0,0,1,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [126]:
# Creating lists for each category and sorting by decreasing length and removing duplicates

politics_govt_law = [
    "longtime advisor to Jimmy Carter",
    "anti apartheid campaigner",
    "prisoner of war rights campaigner",
    "anti war campaigner",
     'community leader',
]
politics_govt_law = sorted(
    list(set(politics_govt_law)), key=lambda x: len(x), reverse=True
)

arts = [
    "founder of the Amadeus Quartet",
    "pioneer of video art",
    "computer art pioneer",
    "patron of the arts",
    "fetish art pioneer",
    "the Heartbreakers",
    "art impresario",
    "arts advocate",
    "news articles",
    "arts patron",
    "art patron",
    "two time Academy Award winner",
    "Academy Award winner",
     'leader of the Mary Kaye Trio',
     'orchestra leader',
    'radio'
]
arts = sorted(list(set(arts)), key=lambda x: len(x), reverse=True)

sports = [
    "founder of Spartak Moscow",
]
sports = sorted(list(set(sports)), key=lambda x: len(x), reverse=True)

sciences = [
    "open heart surgery pioneer",
    "earthquake expert",
    "Turing Award winner",
    "a leader in controlled fusion research",
    "group leader in the Manhattan Project",
    "a leader in satellite communications",
]
sciences = sorted(list(set(sciences)), key=lambda x: len(x), reverse=True)

business_farming = [
    "son of Wal Mart founder Sam Walton",
    "co founder of Walmart",
    "chartered accountant",
]
business_farming = sorted(
    list(set(business_farming)), key=lambda x: len(x), reverse=True
)

academia_humanities = []
academia_humanities = sorted(
    list(set(academia_humanities)), key=lambda x: len(x), reverse=True
)

law_enf_military_operator = [
    "wartime commander",
    "quartermaster",
    "partisan",
    "War prisoner of war escapee",
    "prisoner of war",
    "award winner",
    "game warden",
    "war hero",
    "warlord",
     'a resistance movement leader',
     'leader of the Grey Wolves',
     'resistance leader',
     'guerrilla leader',
     'rebel leader',
    'general'
]
law_enf_military_operator = sorted(
    list(set(law_enf_military_operator)), key=lambda x: len(x), reverse=True
)

spiritual = [
    "Dean of Hobart from to",
    "award winning Christian",
]
spiritual = sorted(list(set(spiritual)), key=lambda x: len(x), reverse=True)

social = []
social = sorted(list(set(social)), key=lambda x: len(x), reverse=True)

crime = [
    "assassin of Martin Luther King Jr",
    "member of the Cali Cartel",
    "art forger",
    "murderer",
]
crime = sorted(list(set(crime)), key=lambda x: len(x), reverse=True)

event_record_other = []
event_record_other = sorted(
    list(set(event_record_other)), key=lambda x: len(x), reverse=True
)

other_species = []
other_species = sorted(list(set(other_species)), key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [115]:
# Dropping entry with link that points to husband's page
index = df[df["link"] == "https://en.wikipedia.org/wiki/Jean_Vollum"].index
df.drop(index, inplace=True)
df.reset_index(inplace=True, drop=True)

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [None]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,

}

#### Extracting Category from `info_2_1`

In [None]:
%%time

# Column to check
column = 'info_2_1'

# Start dataframe
dataframe = df[df[column].notna()]

# For loop to find role in column and extract it as category
for category, category_lst in known_for_dict.items():
    for role in category_lst:
        for index in dataframe.index:
                item = df.loc[index, column]
                if item:
                    if role in item:
                        df.loc[index, category] = 1
                        df.loc[index, column] = item.replace(role, '').strip()

# Calculating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

#### Checking Updated `num_categories` Value Counts

In [None]:
# Checking updated num_categories value counts
df["num_categories"].value_counts()

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

In [None]:
print("dunzo!")

# Sound notification when cell executes
chime.success()

#### Finding `known_for` Roles in `info_2_1`

In [None]:
# Obtaining values for column and their counts
roles_list = df["info_2_1"].value_counts(ascending=True).index.tolist()

In [None]:
# Code to check each value
roles_list.pop()

In [None]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_2_1"].notna()].index
#             if "general" in df.loc[index, "info_2_1"]
#         ],
#         "info_2_1",
#     ]
#     .value_counts()
#     .index.tolist()
# )

In [None]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_list, key=lambda x: len(x), reverse=True)

In [None]:
# # Example code to check additional category for specific entries
# df.loc[
#     [
#         index
#         for index in df.index
#         if df.loc[index, "info_2_1"] == "judge"
#         and df.loc[index, "politics_govt_law"] == 0
#     ],
#     :,
# ]

In [None]:
# # Example code to quick-check a specific entry
# df[df["info_2_1"] == "outlaw country music singer songwriter"]

#### Creating Lists for Each `known_for` Category

In [None]:
# Creating lists for each category and sorting by decreasing length and removing duplicates

politics_govt_law = []
politics_govt_law = sorted(list(set(politics_govt_law)), key=lambda x: len(x), reverse=True)  

arts = []
arts = sorted(list(set(arts)), key=lambda x: len(x), reverse=True)  

sports = []
sports = sorted(list(set(sports)), key=lambda x: len(x), reverse=True) 

sciences = []
sciences = sorted(list(set(sciences)), key=lambda x: len(x), reverse=True) 

business_farming = []
business_farming = sorted(list(set(business_farming)), key=lambda x: len(x), reverse=True)  

academia_humanities = []
academia_humanities = sorted(list(set(academia_humanities)), key=lambda x: len(x), reverse=True)  

law_enf_military_operator = []
law_enf_military_operator = sorted(list(set(law_enf_military_operator)), key=lambda x: len(x), reverse=True)  

spiritual = []
spiritual = sorted(list(set(spiritual)), key=lambda x: len(x), reverse=True)  

social = []
social = sorted(list(set(social)), key=lambda x: len(x), reverse=True)  

crime = []
crime = sorted(list(set(crime)), key=lambda x: len(x), reverse=True)  

event_record_other = []
event_record_other = sorted(list(set(event_record_other)), key=lambda x: len(x), reverse=True)  

other_species = []
other_species = sorted(list(set(other_species)), key=lambda x: len(x), reverse=True) 

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [None]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,

}

#### Extracting Category from `info_2_1`

In [None]:
%%time

# Column to check
column = 'info_2_1'

# Start dataframe
dataframe = df[df[column].notna()]

# For loop to find role in column and extract it as category
for category, category_lst in known_for_dict.items():
    for role in category_lst:
        for index in dataframe.index:
                item = df.loc[index, column]
                if item:
                    if role in item:
                        df.loc[index, category] = 1
                        df.loc[index, column] = item.replace(role, '').strip()

# Calculating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

#### Checking Updated `num_categories` Value Counts

In [None]:
# Checking updated num_categories value counts
df["num_categories"].value_counts()

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Observations:
- It is time to export our dataframe and start a new notebook.

### Exporting Dataset to SQLite Database [wp_life_expect_clean5.db]()

In [None]:
# # Exporting dataframe

# # Saving dataset in a SQLite database
# conn = sql.connect("wp_life_expect_clean5.db")
# df.to_sql("wp_life_expect_clean5", conn, index=False)

# # Chime notification when cell executes
# chime.success()

# [Proceed to Data Cleaning Part 6]()