# Wikipedia Notable Life Expectancies
# [Notebook 13: Data Cleaning Part 12](https://github.com/teresahanak/wikipedia-life-expectancy/blob/main/wp_life_expect_data_clean12_thanak_2022_08_03.ipynb)
### Context

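The dataset contains one row per notable individual listed on Wikipedia, with the date of death, age, a free-text `info` description, a link to the Wikipedia page, and the number of references on that page.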
### Objective

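The objective of this notebook is to continue cleaning the dataset by extracting `known_for` categories from the remaining role descriptions in `info_2`.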
### Data Dictionary
- Feature: Description

### Importing Libraries

In [1]:
# To structure code automatically
%load_ext nb_black

# To import/export sqlite databases
import sqlite3 as sql

# To save/open python objects in pickle file
import pickle

# To help with reading, cleaning, and manipulating data
import pandas as pd
import numpy as np
import re

# To define maximum number of columns to be displayed in a dataframe
pd.set_option("display.max_columns", None)
# To define the maximum number of rows to be displayed in a dataframe
pd.set_option("display.max_rows", 200)

# To suppress warnings
# import warnings

# warnings.filterwarnings("ignore")

# To set the maximum width of a column in a displayed dataframe
# (full option name, matching the other display options above)
pd.set_option("display.max_colwidth", 150)

# To play auditory cue when cell has executed, has warning, or has error and set chime theme
import chime

chime.theme("zelda")

<IPython.core.display.Javascript object>

## Data Overview

### Reading, Sampling, and Checking Data Shape

In [2]:
# Loading the wp_life_expect_clean11 table from its SQLite database
conn = sql.connect("wp_life_expect_clean11.db")
data = pd.read_sql(sql="SELECT * FROM wp_life_expect_clean11", con=conn)

# Working on a copy so the frame as loaded stays untouched
df = data.copy()

# Reporting the number of rows and columns
print(f"There are {df.shape[0]} rows and {df.shape[1]} columns.")

# Previewing the first 2 rows
df.head(n=2)

There are 98056 rows and 38 columns.


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
0,1,William Chappell,", 86, British dancer, ballet designer and director.",https://en.wikipedia.org/wiki/William_Chappell_(dancer),21,1994,January,,,,ballet designer and director,,,,,,,,,86.0,,United Kingdom of Great Britain and Northern Ireland,,,3.091042,0,0,0,0,0,1,0,0,0,0,0,0,1
1,1,Raymond Crotty,", 68, Irish economist, writer, and academic.",https://en.wikipedia.org/wiki/Raymond_Crotty,12,1994,January,,,,writer,and academic,,,,,,,,68.0,,Ireland,,,2.564949,0,0,0,0,0,0,0,0,1,0,0,0,1


<IPython.core.display.Javascript object>

In [3]:
# Previewing the final 2 rows of the working copy
df.tail(n=2)

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
98054,9,Aamir Liaquat Hussain,", 50, Pakistani journalist and politician, MNA .",https://en.wikipedia.org/wiki/Aamir_Liaquat_Hussain,99,2022,June,", since",,,MNA,,,,,,,,,50.0,,Pakistan,,"2002 2007, since 2018",4.60517,0,0,0,0,0,1,0,0,1,0,0,0,2
98055,9,Zou Jing,", 86, Chinese engineer, member of the Chinese Academy of Engineering.",https://en.wikipedia.org/wiki/Zou_Jing_(engineer),3,2022,June,,,,member of the Academy of Engineering,,,,,,,,,86.0,,"China, People's Republic of",,,1.386294,1,0,0,0,0,0,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

In [4]:
# Checking a sample of the data
# (seeded so the displayed rows are the same on every Restart & Run All)
df.sample(5, random_state=42)

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
95342,14,Peter Seabrook,", 86, British gardening writer and television broadcaster, heart attack.",https://en.wikipedia.org/wiki/Peter_Seabrook,7,2022,January,,,,heart attack,,,,,,,,,86.0,,United Kingdom of Great Britain and Northern Ireland,,,2.079442,0,0,0,0,0,1,0,0,0,0,0,0,1
21619,25,Gad Navon,", 84, Moroccan-born Former Chief Israeli Military Rabbi, cancer.",https://en.wikipedia.org/wiki/Gad_Navon,8,2006,June,,,Former Chief Military Rabbi,cancer,,,,,,,,,84.0,,Morocco,,,2.197225,0,0,0,0,0,0,0,0,0,0,0,0,0
16115,11,Neville Colman,", 57, South African-American hematologist and forensic DNA expert, gastric cancer.",https://en.wikipedia.org/wiki/Neville_Colman,21,2003,February,,,hematologist and forensic DNA expert,gastric cancer,,,,,,,,,57.0,,South Africa,United States of America,,3.091042,0,0,0,0,0,0,0,0,0,0,0,0,0
23759,26,Eleanor Josephine Macdonald,", 101, American cancer researcher.",https://en.wikipedia.org/wiki/Eleanor_Josephine_Macdonald,19,2007,July,,,cancer researcher,,,,,,,,,,101.0,,United States of America,,,2.995732,0,0,0,0,0,0,0,0,0,0,0,0,0
10937,2,Bob Homme,", 81, American-Canadian television actor, known for his role as The Friendly Giant.",https://en.wikipedia.org/wiki/Bob_Homme,20,2000,May,,,,known for his role as The Friendly Giant,,,,,,,,,81.0,,United States of America,Canada,,3.044522,0,0,0,0,0,1,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

### Checking Data Types, Duplicates, and Null Values

In [5]:
# Checking data types and null values
# (prints each column's non-null count and dtype)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98056 entries, 0 to 98055
Data columns (total 38 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   day                        98056 non-null  object 
 1   name                       98056 non-null  object 
 2   info                       98056 non-null  object 
 3   link                       98056 non-null  object 
 4   num_references             98056 non-null  int64  
 5   year                       98056 non-null  int64  
 6   month                      98056 non-null  object 
 7   info_parenth               36661 non-null  object 
 8   info_1                     22 non-null     object 
 9   info_2                     98024 non-null  object 
 10  info_3                     48896 non-null  object 
 11  info_4                     10264 non-null  object 
 12  info_5                     1265 non-null   object 
 13  info_6                     181 non-null    obj

<IPython.core.display.Javascript object>

#### Observations:
- With our dataset loaded, we can pick up where we left off with extracting known_for values by rebuilding `known_for_dict`.

### Extracting `known_for` Continued

#### Finding `known_for` Roles in `info_2`

In [148]:
# # Obtaining values for column and their counts
# roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [147]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [146]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_2"].notna()].index
#             if "religion" in df.loc[index, "info_2"]
#         ],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [145]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_list, key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [144]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "economics editor"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [143]:
# Creating lists for each category
# Each entry is a substring that is searched for in info_2; a match flags the
# category and removes the substring from the text, so a longer phrase must be
# listed before any shorter phrase it contains.
# "before <category>" notes mark entries that must be searched before an
# overlapping entry in that other category's list.
politics_govt_law = [
    "revolutionary socialist and workers' leader",
    "' leader",  # before business_farming
    "Tlingit elder",
    "Governor of Benue State",
    "veterans advocate",
    "traditional ruler of Ife",
    "ruler of Ras al Khaimah",
    "traditional ruler",
    "ruler of the",
    "ruler",
]

# Substrings of info_2 that flag the arts category.
# The list is a sequence of keyword groups; within each group entries run from
# longest to shortest so a specific phrase is matched before the generic
# keyword it contains.
# NOTE(review): entries with a double space (e.g. "music  chef") presumably mirror
# text left behind by earlier removal passes - confirm before normalizing.
arts = [
    # "graphic" group
    "graphic designer and pioneer in the field of computer graphics",
    "photographic director and videographer",
    "illustrator and graphic designer",
    "graphic designer and typographer",
    "television graphic designer",
    "graphic and type designer",
    "pornographic performer",
    "graphic designer",
    "gay pornographic",
    "pornographic",
    "graphic and",
    "graphic",
    # "theatre" group
    "theatre and opera director and stage designer",
    "theatre director and voice coach",
    "theatre and opera administrator",
    "television and theatre director",
    "and Broadway theatre performer",
    "humorist and theatre director",
    "theatre director and designer",
    "theatre director and theorist",
    "theatre and concert director",
    "opera and theatre director",
    "theatre owner and manager",
    "theatre director and",
    "theatre impresario",
    "theatre publicist",
    "theatre director",
    "theatre designer",
    "theatre promoter",
    "theatre and",
    "theatre",
    # "chef" group
    "celebrity chef and television personality",
    "pastry chef and television personality",
    "chef and reality show contestant",
    "chef and television personality",
    "Michelin Star winning chef",
    "pioneering television chef",
    "Cajun chef and humorist",
    "famed New Orleans chef",
    "chef and gastronomist",
    "television chef",
    "celebrity chef",
    "internet chef",
    "pastry chef",
    "Creole chef",
    "music  chef",
    "head chef",
    "chef and",
    "chef",
    # stunt and customizer roles
    "stunt performer",
    "movie stuntman",
    "car customizer",
    "customizer",
    "stuntwoman",
    # "disc jockey" / "disk jockey" group
    "BBC disc jockey and guru of the independent music scene",  # before sports
    "radio disc jockey and proponent of Pinoy rock",
    "Hall of Fame disc jockey and television host",
    "disc jockey and television personality",
    "disk jockey and sound system operator",
    "disc jockey and music news reporter",
    "disc jockey and record collector",
    "disk jockey known as 'Nightbird'",
    "disc jockey and television host",
    "game show host and disc jockey",
    "former BBC Radio disc jockey",
    "country music disc jockey",
    "BBC Radio disc jockey",
    "footwork disc jockey",
    "reggae disc jockey",
    "radio disc jockey",
    "radio disk jockey",
    "disc jockey and",
    "and disc jockey",
    "disc jockey",
    "disk jockey",
    # "illustrator" group
    "children book and magazine illustrator",
    "conceptual designer and illustrator",
    "illustrator for the original books",
    "concept designer and illustrator",
    "printmaker and book illustrator",
    "photo essayist and illustrator",
    "illustrator of children books",
    "illustrator and watercolorist",
    "caricaturist and illustrator",
    "illustrator and caricaturist",
    "science fiction illustrator",
    "magazine cover illustrator",
    "printmaker and illustrator",
    "children book illustrator",
    "illustrator and designer",
    "commercial illustrator",
    "comic book illustrator",
    "botanical illustrator",
    "children illustrator",
    "fashion illustrator",
    "fantasy illustrator",
    "comics illustrator",
    "manga illustrator",
    "comic illustrator",
    "book illustrator",
    "bird illustrator",
    "and illustrator",
    "illustrator and",
    "illustrator",
    "nurseryman",  # before sciences
    # "editor" group
    "correspondent and editor for United Press International",
    "editor in chief of King Features Syndicate",
    "Pulitzer Prize winning newspaper editor",
    "comic book and pulp magazine editor",
    "newspaper editor of the from until",
    "United Press International editor",
    "founding editor of stomach cancer",
    "wood carver and magazine editor",
    "sound designer and sound editor",
    "science fiction fanzine editor",
    "magazine and newspaper editor",
    "editorial page editor for the",
    "editor of black publications",
    "founding editor of magazine",
    "editor in chief of magazine",
    "Oscar winning sound editor",
    "newspaper editor in chief",
    "Composer and music editor",
    "sound designer and editor",
    "book and magazine editor",
    "science fiction editor",
    "newspaper chief editor",
    "photojournalism editor",
    "visual effects editor",
    "women magazine editor",
    "games magazine editor",
    "and newspaper editor",
    "mystery novel editor",
    "secretary and editor",
    "Disney comics editor",
    "and magazine editor",
    "book review editor",
    "managing editor of",
    "comic book editor",
    "publishing editor",
    "comic  and editor",
    "photo editor and",
    "newspaper editor",
    "magazine editor",
    "literary editor",
    "editor in chief",
    "fashion editor",
    "fiction editor",
    "sound editor",
    "photo editor",
    "music editor",
    "book editor",
    "news editor",
    "CNET editor",
    "and editor",
    "editor and",
    "editor of",
    "editor",
]
# Substrings of info_2 that flag the sports category.
# Two keyword groups ("racer", then "jockey"); within each group entries run
# from longest to shortest so a specific phrase is matched before the generic
# keyword it contains. Every entry appears once: a repeated entry could never
# match, because the first pass already removes the phrase from the text.
sports = [
    # "racer" group
    "professional road bicycle racer who won two stages of the Tour de",
    "Grand Prix motorcycle and short circuit road racer",
    "short circuit motorcycle road racer",
    "sport sailor and maxi yacht racer",
    "Grand Prix motorcycle road racer",
    "motorcycle and touring car racer",
    "professional road bicycle racer",
    "motor racer and IndyCar driver",
    "motorcycle sidecar road racer",
    "Hall of Fame motorcycle racer",
    "jet car driver and drag racer",
    "automobile racer and designer",
    "professional motocross racer",
    "motorcycle builder and racer",
    "Grand Prix motorcycle racer",
    "Paralympic wheelchair racer",
    "motorcycle speedway racer",
    "drag racer and crew chief",
    "racer and television host",
    "off road motorcycle racer",
    "motorcycle and auto racer",
    "Hall of Fame drag racer",
    "motorcycle rally racer",
    "motorcycle road racer",
    "Moto motorcycle racer",
    "land speed racer and",
    "motorcycle racer and",
    "motorcross racer and",
    "horse harness racer",
    "mountain bike racer",
    "road bicycle racer",
    "disabled ski racer",
    "hillclimbing racer",
    "powerboating racer",
    "cyclo cross racer",
    "touring car racer",
    "motorcycle racer",
    "alpine ski racer",
    "automobile racer",
    "wheelchair racer",
    "Alpine ski racer",
    "sprint car racer",
    "motocross racer",
    "stock car racer",
    "motorbike racer",
    "motorboat racer",
    "ski cross racer",
    "NHRA drag racer",
    "off road racer",
    "bicycle racer",
    "sidecar racer",
    "Air racer and",
    "barrel racer",
    "MotoGP racer",
    "yacht racer",
    "motor racer",
    "rally racer",
    "drag racer",
    "auto racer",
    "air racer",
    "ski racer",
    "and racer",
    "racer",
    # "jockey" group
    "female jockey and pioneer in thoroughbred horse racing",
    "jockey and first woman in to receive a jockey licence",
    "National Hunt jockey and horse trainer",
    "race horse trainer and jockey mentor",
    "jockey in thoroughbred horse racing",
    "jockey in thoroughbred racing",
    "horse trainer and jockey",
    "jockey and horse trainer",
    "National Champion jockey",
    "National Hunt jockey",
    "horse racing jockey",
    "Hall of Fame jockey",
    "jockey and trainer",
    "jockey",
]
# Substrings of info_2 that flag the sciences category.
# Keyword groups (ornithologist, zoologist, nurse, medical researcher); within
# each group entries run from longest to shortest so a specific phrase is
# matched before the generic keyword it contains.
sciences = [
    "paleontologist and ornithologist",
    "ichthyologist and ornithologist",
    "ornithologist and",
    "and ornithologist",
    "ornithologist",
    "zoologist and advocate of evolutionary epistemology",
    "zoologist and neurophysiologist",
    "palaeontologist and zoologist",
    "paleontologist and zoologist",
    "soil zoologist and ecologist",
    "immunologist and zoologist",
    "zoologist and ecologist",
    "invertebrate zoologist",
    "zoologist  science",
    "medical zoologist",
    "turtle zoologist",
    "cryptozoologist",
    "and zoologist",
    "zoologist and",
    "zoologist",
    "healthcare advocate and registered nurse",
    "first nurse to earn a master degree",
    "nurse and nurse researcher",
    "nurse and nurse tutor",
    "mental health nurse",
    "nurse and nursing",
    "registered nurse",
    "Navy nurse",
    "Army nurse",
    "nurse and",
    "and nurse",
    "nurse",
    "endocrinologist and medical researcher",
    "pediatrician and medical researcher",
    "immunologist and medical researcher",
    "neurologist and medical researcher",
    "medical researcher in immunology",
    "biomedical researcher",
    "medical researcher",
]

# Substrings of info_2 that flag the business_farming category
# (hotelier group, then farmer group, each longest to shortest).
business_farming = [
    "hotelier and casino owner",
    "hotelier and retailer",
    "hotelier and",
    "hotelier",
    "potato farmer and long distance runner",
    "farmer and landowner",
    "farmer and lobbyist",
    "rice farmer",
    "and farmer",
    "farmers'",
    "farmer",
]

# Substrings of info_2 that flag the academia_humanities category.
academia_humanities = [
    "and Broadway theatre preservationist",  # before arts
]
# Substrings of info_2 that flag the law_enf_military_operator category.
# All entries contain "military"; they run from longest to shortest so a
# specific phrase is matched before the bare keyword at the end.
law_enf_military_operator = [
    "colonel in the Army and of the most decorated women in military history",
    "Army General who commanded military operations in the War from to",
    "paramilitary intelligence chief and clandestine agent",
    "National Liberation Army paramilitary leader",
    "Sandinista guerrilla and military leader",
    "scientific military intelligence expert",
    "Prime Minister of and military leader",
    "Northern Alliance military commander",
    "Serb military commander in the War",
    "military communications listener",
    "military commander and warlord",
    "republican paramilitary leader",
    "military and security official",
    "head of military intelligence",
    "warlord and military figure",
    "Serb paramilitary commander",
    "Biafran military commander",
    "Chetnik military commander",
    "Hutu paramilitary leader",
    "ISIL military commander",
    "and paramilitary leader",
    "and military commander",
    "paramilitary commander",
    "military commander and",
    "era military commander",
    "military intelligence",
    "loyalist paramilitary",
    "military interpreter",
    "military veteran and",
    "military leader and",
    "paramilitary leader",
    "marine and military",
    "military commander",
    "military  designer",
    "military official",
    "military veteran",
    "and paramilitary",
    "military leader",
    "military office",
    "Hamas military",
    "military chief",
    "paramilitary",
    "military man",
    "military",
]

# Substrings of info_2 that flag the spiritual category (longest first).
spiritual = ["religion and apologetics", "of religion", "religion"]
# Substrings of info_2 that flag the social category.
social = [
    "animal welfare campaigner",
    "social worker and Righteous Among the Nations",
    "youth social worker",
    "and social worker",
    "social worker",
    "charity fundraiser",
]
# Substrings of info_2 that flag the crime category.
crime = ["and forger", "forger", "drug lord"]
# Substrings of info_2 that flag the event_record_other category.
event_record_other = [
    "chef and construction worker",  # before arts
    "construction worker",
    "ebola survivor",
    "anthrax attack victim",
]
# No roles for other_species in this iteration; the name is still defined so the
# dictionary of categories can be built with every key.
other_species = []

<IPython.core.display.Javascript object>

In [150]:
# # Example code to quickly sort list in correct descending length search order to copy to dictionary
# temp = sorted(list(set(law_enf_military_operator)), key=lambda x: len(x), reverse=True)
# temp

<IPython.core.display.Javascript object>

In [149]:
# Setting cause_of_death by hand for the entry whose cause appears in info_2,
# locating the row by its Wikipedia link
index = df.index[df["link"].eq("https://en.wikipedia.org/wiki/Hideo_Ogata")]
df.loc[index, "cause_of_death"] = "stomach cancer"

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [151]:
# Mapping each known_for category name to its list of role substrings.
# Keyword order is the order in which categories are searched, so a category
# whose roles are marked "before <other>" must come ahead of that other one.
known_for_dict = dict(
    social=social,
    spiritual=spiritual,
    academia_humanities=academia_humanities,
    politics_govt_law=politics_govt_law,
    business_farming=business_farming,
    law_enf_military_operator=law_enf_military_operator,
    crime=crime,
    event_record_other=event_record_other,
    other_species=other_species,
    arts=arts,
    sciences=sciences,
    sports=sports,
)

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [152]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['num_categories']!=0].sample(2)

CPU times: total: 3min 30s
Wall time: 3min 30s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
94152,22,Kim Friele,", 86, Norwegian LGBT rights activist.",https://en.wikipedia.org/wiki/Kim_Friele,6,2021,November,,,,,,,,,,,,,86.0,,Norway,,,1.94591,0,0,0,0,0,0,0,0,1,0,0,0,1
85059,4,Ken Hensley,", 75, English singer-songwriter .",https://en.wikipedia.org/wiki/Ken_Hensley,10,2020,November,"Uriah Heep, Blackfoot, Toe Fat",,,,,,,,,,,,75.0,,United Kingdom of Great Britain and Northern Ireland,,"Uriah Heep, Blackfoot, Toe Fat",2.397895,0,0,0,0,0,1,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [153]:
# Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

There are 9762 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [191]:
# # Obtaining values for column and their counts
# roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [190]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [189]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_2"].notna()].index
#             if "Jesuit" in df.loc[index, "info_2"]
#         ],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [188]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_list, key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [187]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "press director"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [186]:
# Creating lists for each category
# Each entry is a substring that is searched for in info_2; a match flags the
# category and removes the substring from the text, so a longer phrase must be
# listed before any shorter phrase it contains.
# "before <category>" notes mark entries that must be searched before an
# overlapping entry in that other category's list.
politics_govt_law = [
    "managing director of the Abu Dhabi Investment Authority",  # before arts
    "director of Office of Telecommunications Policy",
    "director of the Mint from to",
    "Native advocate",
    "Peace",
]

# Substrings of info_2 that flag the arts category.
# Two keyword groups ("director", then "organist"); within each group entries
# run from longest to shortest so a specific phrase is matched before the
# generic keyword it contains.
# NOTE(review): oddly spaced or truncated entries (e.g. "short  and casting director",
# "operatic ic director") presumably mirror text left behind by earlier removal
# passes - confirm before normalizing.
arts = [
    # "director" group
    "director of most episodes of Monty Python Flying Circus",
    "theater director who staged plays on and off Broadway",
    "publishing director of Burke Peerage Limited",
    "opera director and set and costume designer",
    "theatrical director and opera librettist",
    "director of John H Johnson Fashion Fair",
    "music director and music group founder",
    "cameraman and director of photography",
    "director and lyricist in the language",
    "operatic baritone and opera director",
    "television and music video director",
    "opera director and administrator",
    "festival director and cover girl",
    "managing director of BBC Radio",
    "organist and musical director",
    "television and radio director",
    "organist and choral director",
    "rock and roll tour director",
    "Emmy Award winning director",
    "short  and casting director",
    "theater and opera director",
    "opera director and manager",
    "opera director and hazzan",
    "public relations director",
    "theater director and mime",
    "radio program director",
    "marching band director",
    "college band director",
    "music video director",
    "operatic ic director",
    "commercial director",
    "theatrical director",
    "assistant director",
    "festival director",
    "Broadway director",
    "and news director",
    "director of Radio",
    "theater director",
    "casting director",
    "company director",
    "B movie director",
    "gallery director",
    "screen director",
    "choral director",
    "design director",
    "opera director",
    "music director",
    "movie director",
    "media director",
    "anime director",
    "radio director",
    "voice director",
    "press director",
    "band director",
    "set director",
    "director and",
    "director",
    # "organist" group
    "organist of the St Peter Basilica in Rome",
    "harpsichordist and organist",
    "organist and harpsichordist",
    "chorister and organist",
    "classical organist and",
    "cantor and organist",
    "cathedral organist",
    "classical organist",
    "concert organist",
    "stadium organist",
    "organist and",
    "organist",
]
# Substrings of info_2 that flag the sports category.
sports = [
    "track and field coach and athletic director",  # before arts
    "manager and director in the Football League",
    "collegiate athletic director",
    "college athletic director",
    "runner and race director",
    "athletic director",
    "sporting director",
]
# Substrings of info_2 that flag the sciences category.
sciences = [
    "NASA mission director",  # before arts
]

# Substrings of info_2 that flag the business_farming category.
business_farming = [
    "toy manufacturer and managing director of Lego",  # before arts
    "managing director of Ulsterbus and Citybus",
    "business director",
    "funeral director",
]
# Substrings of info_2 that flag the academia_humanities category.
academia_humanities = [
    "deputy director of the National Air and Space Museum",  # before arts
    "director of the National Gallery of Art from to",
    "director of the Metropolitan Museum of Art",
    "director of the Cleveland Museum of Art",
    "oral history archive director",
    "curator and museum director",
    "library director",
    "museum director",
]
# Substrings of info_2 that flag the law_enf_military_operator category.
law_enf_military_operator = [
    "Army Lieutenant General and director of the National Security Agency",  # before arts
    "first director of the Coast Guard Women Reserve",
    "former CIA director",
    "director of the FBI",
]
# Substrings of info_2 that flag the spiritual category.
spiritual = [
    "church music director and concert", # before arts
    "Jesuit monk and",
    "Jesuit cleric",
    "Jesuit",
]
# Substrings of info_2 that flag the social category.
social = [
    "director of the Peace Corps",  # before arts
    "Peace Corps",  # before politics_govt_law
]
# No roles for these categories in this iteration; the names are still defined
# so the dictionary of categories can be built with every key.
crime = []
event_record_other = []
other_species = []

<IPython.core.display.Javascript object>

In [192]:
# Setting info_1 by hand so the spiritual category can be captured for this entry,
# locating the row by its Wikipedia link
index = df.index[df["link"].eq("https://en.wikipedia.org/wiki/Pasquale_Borgomeo")]
df.loc[index, "info_1"] = "priest"

# Setting info_2 by hand so the business category can be captured for this entry
index = df.index[df["link"].eq("https://en.wikipedia.org/wiki/Jim_Service")]
df.loc[index, "info_2"] = "business director"  # added to dictionary

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [193]:
# Mapping each known_for category name to its list of role substrings.
# Keyword order is the order in which categories are searched, so a category
# whose roles are marked "before <other>" must come ahead of that other one.
known_for_dict = dict(
    social=social,
    spiritual=spiritual,
    academia_humanities=academia_humanities,
    business_farming=business_farming,
    sciences=sciences,
    politics_govt_law=politics_govt_law,
    law_enf_military_operator=law_enf_military_operator,
    crime=crime,
    event_record_other=event_record_other,
    other_species=other_species,
    sports=sports,
    arts=arts,
)

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [194]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['num_categories']!=0].sample(2)

CPU times: total: 1min 3s
Wall time: 1min 3s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
76982,2,Michael Bauman,", 69, American theologian.",https://en.wikipedia.org/wiki/Michael_Bauman,4,2019,October,,,,,,,,,,,,,69.0,,United States of America,,,1.609438,0,0,1,0,0,0,0,0,0,0,0,0,1
90907,24,David Lee Hunter,", 87, American mathematician.",https://en.wikipedia.org/wiki/David_Lee_Hunter,5,2021,June,,,,,,,,,,,,,87.0,,United States of America,,,1.791759,1,0,0,0,0,0,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [195]:
# Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

There are 9538 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [297]:
# # Obtaining values for column and their counts
# roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [296]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [295]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_2"].notna()].index
#             if "bounty hunter" in df.loc[index, "info_2"]
#         ],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [294]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_list, key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [298]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "bounty hunter and reality television personality"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [299]:
# Creating lists for each category
# Each entry is a substring that is searched for in info_2; a match flags the
# category and removes the substring from the text, so a longer phrase must be
# listed before any shorter phrase it contains.
politics_govt_law = [
    "deaf rights advocate",
    "government official and gun control advocate",
    "consumer advocate and government official",
    "local government official",
    "and government official",
    "government official",
]

# Substrings of info_2 that flag the arts category
# ("television personality" group, longest to shortest, after "radio preacher").
arts = [
    "radio preacher",
    "entertainer and television personality",
    "and reality television personality",
    "radio and television personality",
    "reality television personality",
    "and television personality",
    "television personality",
]
sports = [
    "national hockey team and Pittsburgh Penguins coach",
    "ice hockey trainer and equipment manager",
    "ice hockey Hall of Fame player and coach",
    "Detroit Red Wings hockey player in the s",
    "women baseball and field hockey player",
    "field hockey player and administrator",
    "professional hockey player and coach",
    "ice hockey administrator and referee",
    "ice hockey referee and administrator",
    "professional ice hockey defenseman",
    "Hall of Fame field hockey player",
    "Hall of Fame ice hockey linesman",
    "field hockey player and manager",
    "roller hockey player and coach",
    "field hockey player and coach",
    "ice hockey coach and manager",
    "field hockey representative",
    "professional hockey player",
    "ice hockey administrator",
    "college ice hockey coach",
    "hockey player and coach",
    "NHL ice hockey referee",
    "ice hockey goaltender",
    "ice hockey defenceman",
    "sledge hockey player",
    "field hockey player",
    "ice hockey referee",
    "ice hockey coach",
    "hockey official",
    "hockey referee",
    "hockey player",
    "field hockey",
    "ice hockey",
    "hockey",
    "competitive figure skater as a teenager",
    "figure skater and figure skating coach",
    "pair skater and figure skating",
    "roller derby skater and coach",
    "short track speed skater",
    "long distance ice skater",
    "figure skater and coach",
    "skate and snowboarder",
    "skateboard innovator",
    "Roller derby skater",
    "roller derby skater",
    "figure skater",
    "speed skater",
    "pair skater",
    "skater",
    "weightlifter and fitness centre owner",
    "champion Paralympic weightlifter",
    "world champion weightlifter",
    "heavyweight weightlifter",
    "weightlifter",
    "sailor and nightclub owner",
    "sailor and adventurer",
    "sailboat designer",
    "sailor and coach",
    "land sailor",
    "sailor and",
    "assailant",
    "sailor",
]
sciences = [
    "sailplane designer and pioneer",  # before sports
    "ichthyologist",
    "yacht designer",
    "sceptic",
    "immunologist and eye tissue transplant researcher",
    "gastroenterologist and immunologist",
    "immunologist and cancer researcher",
    "immunologist and AIDS researcher",
    "pathologist and immunologist",
    "virologist and immunologist",
    "cancer immunologist",
    "immunologist",
    "aerodynamics expert at",
    "aeroplane designer",
    "aerospace pioneer",
    "aerodynamicist",
    "aerospace",
    "aero",
]

business_farming = [
    "diamond merchant",
    "wine collector and dealer",
    "wine collector and dealer",  # before academia_humanities
    "pastoral and tourism pioneer",  # before spiritual
    "financier and venture capitalist",
    "venture capitalist and financier",
    "property developer and financier",
    "billionaire financier",
    "corporate  financier",
    "and a financier",
    "financier and",
    "and financier",
    "financier",
    "pawnbroker",
]
academia_humanities = [
    "ichthyologist and musical instrument collector",
    "collector of Harry Houdini memorabilia",
    "diamond merchant and antique collector",
    "optical illusion collector and sceptic",
    "and music collector",
    "toy car collector",
    "antique collector",
    "record collector",
    "book collector",
    "collector",
]
law_enf_military_operator = [
    "sea captain sailor",  # before sports
    "navy sailor",
    "bounty hunter",
]
spiritual = [
    "evangelist and pastor of the Worldwide Church of God",
    "pastor at the University Baptist Church in Waco",
    "Pentecostal evangelical pastor and",
    "Evangelical Lutheran pastor",
    "Independent Baptist pastor",
    "Baptist megachurch pastor",
    "pastor and evangelist",
    "pastor and exorcist",
    "Pentecostal pastor",
    "evangelical pastor",
    "Protestant pastor",
    "megachurch pastor",
    "pastoral theology",
    "reformist pastor",
    "Christian pastor",
    "Baptist pastor",
    "gospel  pastor",
    "senior pastor",
    "pastor and",
    "and pastor",
    "pastor",
]
social = []
crime = ["assailant", "fugitive from justice", "fugitive"]  # before sports
event_record_other = [
    "potato chip collector",  # before academia_humanities
]
other_species = [
    "skateboarding bulldog",  # before sports
]

<IPython.core.display.Javascript object>

In [None]:
# # Example code to quickly sort list in correct descending length search order to copy to dictionary
# temp = sorted(list(set(law_enf_military_operator)), key=lambda x: len(x), reverse=True)
# temp

In [300]:
# Overriding info_2 for specific entries, identified by their link, so that the
# military / sea-captain roles below are captured by the category search
info_2_overrides = {
    "https://en.wikipedia.org/wiki/John_Leake_(NAAFI_manager)": "navy sailor",  # added to dict
    "https://en.wikipedia.org/wiki/Ted_Briggs": "navy sailor",
    "https://en.wikipedia.org/wiki/Robert_Walker_(sailor)": "navy sailor",
    "https://en.wikipedia.org/wiki/Robert_Stinnett": "navy sailor",
    "https://en.wikipedia.org/wiki/Molly_Kool": "sea captain sailor",  # added to dict
    "https://en.wikipedia.org/wiki/Susan_Clark_(sailor)": "sea captain sailor",
}

# Applying each override to every row carrying that link
for link, role in info_2_overrides.items():
    df.loc[df["link"] == link, "info_2"] = role

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [301]:
# Gathering the per-category role lists under their category names.
# Insertion order is kept, and it is the order in which the extraction cell
# walks the categories, so it decides which category claims an overlapping role first.
known_for_dict = dict(
    social=social,
    business_farming=business_farming,
    sciences=sciences,
    politics_govt_law=politics_govt_law,
    law_enf_military_operator=law_enf_military_operator,
    crime=crime,
    event_record_other=event_record_other,
    other_species=other_species,
    arts=arts,
    spiritual=spiritual,
    academia_humanities=academia_humanities,
    sports=sports,
)

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [302]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['num_categories']!=0].sample(2)

CPU times: total: 1min 28s
Wall time: 1min 28s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
5525,29,Pupul Jayakar,", 81, Indian cultural activist and writer.",https://en.wikipedia.org/wiki/Pupul_Jayakar,13,1997,March,,,,,,,,,,,,,81.0,,India,,,2.639057,0,0,0,0,0,1,0,0,1,0,0,0,2
8165,19,Ran Laurie,", 83, British physician, Olympic gold medallist and father of Hugh Laurie, Parkinson's disease.",https://en.wikipedia.org/wiki/Ran_Laurie,11,1998,September,,,,Olympic gold medallist and father of Hugh Laurie,Parkinson disease,,,,,,,,83.0,,United Kingdom of Great Britain and Northern Ireland,,,2.484907,1,0,0,0,0,0,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [303]:
# Counting the entries whose category total is still zero
uncategorized_count = (df["num_categories"] == 0).sum()
print(f"There are {uncategorized_count} entries without any known_for category.")

There are 9016 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [527]:
# # Obtaining values for column and their counts
# roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [526]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [528]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_2"].notna()].index
#             if "confectioner" in df.loc[index, "info_2"]
#         ],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [524]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_list, key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [523]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "nun and confectioner"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [522]:
# Creating lists for each category
# Each string is a literal substring that the extraction cell looks for in `info_2`
# (`role in item`).  Roles are tried in list order and a matched role is removed
# from the text, so within each group the longer/more specific phrases are
# generally placed ahead of the shorter ones they contain.
politics_govt_law = [
    "neo fascist",
    "barrister and",
    "barrister",
    "viceroy",
    "princess and a Muhammad Ali Dynasty member",
    "princess and grandmother of King Felipe VI",
    "princess and granddaughter of Mehmed V",
    "princess of the House of Savoy",
    "princess and expatriate",
    "Sealandic princess",
    "and princess",
    "princess",
    "anti torture advocate",
]

arts = [
    "science fiction and multi winner",
    "science fiction bibliographer",
    "science fiction expert",
    "science fiction and",
    "science fiction fan",
    "science fiction",
    "lyricist known for writing musicals with Adolph Green including",
    "country music performer and member of the Grand Ole Opry",
    "music manager and spokesperson for the Bay City Rollers",
    "electronic music programmer and keyboardist",
    "costume designer of Broadway musicals",
    "classical music radio program host",
    "talent manager and music promoter",
    "founder of exotica musical genre",
    "big band and pop music arranger",
    "music venue owner and promoter",
    "calypsonian and music promoter",
    "bandleader and music arranger",
    "country music and rodeo star",
    '"father of bluegrass" music',
    "classical music impresario",
    "country music entertainer",
    "music promoter and agent",
    "electronic music pioneer",
    "country music performer",
    "percussionist and music",
    "music manager and agent",
    "classical music manager",
    "father of Chicano music",
    "pioneer of Celtic music",
    "and musical instrument",
    "musical administrator",
    "folk music researcher",
    "country music fiddler",
    "music hall performer",
    "music  administrator",
    "folk music promoter",
    "rock music promoter",
    "performer of music",
    "country music star",
    "traditional music",
    "cellist and music",
    "musical arranger",
    "music researcher",
    "musical lyricist",
    "music publicist",
    "music video and",
    "classical music",
    "music education",
    "music promoter",
    "music arranger",
    "music manager",
    "music website",
    "musical agent",
    "music expert",
    "music patron",
    "folk music",
    "and music",
    "musical",
    "music",
    "carnival designer",
    "violist and cellist",
    "classical cellist",
    "and cellist",
    "cellist",
    "Hall of Fame talk radio host",
    "television and radio host",
    "Hall of Fame radio host",
    "talk radio host and",
    "talk radio host",
    "and radio host",
    "radio host and",
    "radio host",
    "ballerina and ballet mistress",
    "ballerina and",
    "ballerina",
    "tenor saxophone player",
    "operatic lyric tenor",
    "operatic tenor",
    "countertenor",
    "opera tenor",
    "lyric tenor",
    "heldentenor",
    "tenor and",
    "tenor",
    "microwave cooking consultant",
    "TV cooking show host",
    "cooking show host",
    "television cook",
    "and cook",
    "cook",
]
sports = [
    "world champion bridge player",
    "professional bridge player",
    "contract bridge player",
    "bridge player",
    "thoroughbred horse breeder",
    "and horse breeder",
    "horse breeder",
    "Paralympic wheelchair curler",
    "world champion curler",
    "Hall of Fame curler",
    "curler",
    "middle distance runner and Commonwealth Games gold medallist",
    "middle distance runner and former world record holder",
    "middle distance runner and",
    "middle and long distance runner",
    "orienteer and mountain runner",
    "marathon and triathlon runner",
    "long distance runner and",
    "middle distance runner",
    "long distance runner",
    "steeplechase runner",
    "orienteering runner",
    "runner and coach",
    "distance runner",
    "sprint runner",
    "fell runner",
    "runner",
    "judoka",
    "race car driver and member of the NASCAR Hall of Fame",
    "NASCAR driver and ARCA race car driver owner",
    "Formula One and Grand Prix race car driver",
    "race car driver and hot rod builder",
    "race car driver and team owner",
    "Hall of Fame race car driver",
    "race car driver and mechanic",
    "Formula One race car driver",
    "race car driver and owner",
    "NASCAR race car driver",
    "race car driver and",
    "race car driver",
    "Baseball player who was the first to come out as gay",
    "Baseball player for the Philadelphia Athletics",
    "Baseball player and manager",
    "Baseball player and coach",
    "former Baseball player",
    "Baseball player",
    "Hall of Fame gymnast",
    "gymnastics coach",
    "rhythmic gymnast",
    "gymnast",
    "racecar driver and member of the NASCAR Hall of Fame",
    "professional racecar driver",
    "and racecar driver",
    "racecar driver",
    "baseball umpire and supervisor",
    "Hall of Fame baseball umpire",
    "baseball umpire",
    "Hall of Fame badminton player",
    "badminton player and coach",
    "badminton player and",
    "badminton player",
    "Test cricket umpire",
    "test cricket umpire",
    "cricket umpire",
]
sciences = [
    "paleontologist who revolutionized understanding of dinosaurs",
    "entomologist and paleontologist",
    "malacologist and paleontologist",
    "vertebrate paleontologist",
    "paleontologist",
    "arachnologist and myriapodologist",
    "entomologist and arachnologist",
    "arachnologist and",
    "arachnologist",
    "cardiologist who invented the technique of coronary bypass surgery",
    "cardiologist and expert on hypertension",
    "paediatric cardiologist",
    "pediatric cardiologist",
    "cardiologist",
    "child psychoanalyst",
    "psychoanalyst",
    "gynaecologist who is among the oldest men to have fathered a child",
    "gynaecologist and reproductive medicine researcher",
    "obstetrician and gynaecologist",
    "gynecologist and obstetrician",
    "obstetrician and gynecologist",
    "entomologist and ecologist",
    "statistical ecologist",
    "landscape ecologist",
    "and media ecologist",
    "plant ecologist",
    "paleoecologist",
    "deep ecologist",
    "gynaecologist",
    "geo ecologist",
    "gynecologist",
    "ecologist",
    "virologist credited with eradicating polio in",
    "epidemiologist and virologist",
    "virologist and paediatrician",
    "plant virologist",
    "virologist",
    "criminologist",
    "pharmacologist and biodynamic agriculturalist",
    "Nobel Prize winning pharmacologist",
    "physiologist and pharmacologist",
    "pharmacologist and physiologist",
    "behavioral pharmacologist",
    "clinical pharmacologist",
    "psychopharmacologist",
    "neuropharmacologist",
    "pharmacologist",
    "pathologist who specialized in sickle cell anemia and hematology",
    "pathologist and cancer researcher",
    "neurologist and neuropathologist",
    "pathologist and toxicologist",
    "veterinary pathologist",
    "paediatric pathologist",
    "forensic pathologist",
    "chemical pathologist",
    "cancer pathologist",
    "dermatopathologist",
    "animal pathologist",
    "plant pathologist",
    "phytopathologist",
    "oral pathologist",
    "neuropathologist",
    "pathologist and",
    "pathologist",
    "sexologist and psychotherapist",
    "child psychotherapist",
    "psychotherapist",
]

business_farming = ["founder of PowerBar", "winemaker and", "winemaker", "confectioner"]
academia_humanities = [
    "library",
    "emeritus curator at the Smithsonian Institution National Museum of Natural History",
    "archeologist and former curator at the Smithsonian Institution",
    'museum curator and one of the "Monuments Men"',
    "manuscripts curator at the Museum",
    "Smithsonian Institution curator",
    "egyptologist and curator",
    "educationist and curator",
    "curator of contemporary",
    "curator of paintings",
    "photography curator",
    "and museum curator",
    "museum curator",
    "and curator",
    "curator",
]
law_enf_military_operator = [
    "astronaut in Mercury",
    "candidate astronaut",
    "former astronaut",
    "NASA astronaut",
    "astronaut",
    "resistant",
    "air marshal and Director General of Intelligence",
    "air marshal and George Cross recipient",
    "Air Force air marshal",
    "air marshall",
    "air marshal",
]
spiritual = [
    "Christianity preacher and gospel",
    "prelate from  Catholic Association",
    "prelate of the Catholic Church",
    "prelate in the Catholic Church",
    "catholic prelate and cardinal",
    "Catholic clandestine prelate",
    "Catholic laicized prelate",
    "Catholic Cardinal prelate",
    "Orthodox Old Rite prelate",
    "Eastern Orthodox prelate",
    "United Methodist prelate",
    "Church of South prelate",
    "Episcopalian prelate",
    "Catholic ex prelate",
    "Episcopal prelate",
    "Apostolic prelate",
    "Mar Thoma prelate",
    "episcopal prelate",
    "Church of prelate",
    "Angelican prelate",
    "Orthodox prelate",
    "Lutheran prelate",
    "Maronite prelate",
    "Coptic prelate",
    "Jewish prelate",
    "Mormon prelate",
    "prelate",
    "biblical",
    "Catholic nun and",
    "Apostolic nuncio",
    "Benedictine nun",
    "Poor Clare nun",
    "Catholic nun",
    "and nun",
    "nun",
]
social = [
    "and confectioner",  # before business_farming
]
crime = [
    "terrorist involved in the Glasgow International Airport attack",
    "founder and commander in chief of terrorist organization FARC",
    "terrorist and a commander of Abu Sayyaf",
    "islamist terrorist group leader",
    "Arabian suspected terrorist",
    "terrorist in Bali bombings",
    "al Qaeda terrorist",
    "domestic terrorist",
    "Arabian terrorist",
    "and terrorist",
    "terrorist",
]
event_record_other = []
other_species = []

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [529]:
# Gathering the per-category role lists under their category names.
# Insertion order is kept, and it is the order in which the extraction cell
# walks the categories, so it decides which category claims an overlapping role first.
known_for_dict = dict(
    social=social,
    spiritual=spiritual,
    academia_humanities=academia_humanities,
    business_farming=business_farming,
    sciences=sciences,
    politics_govt_law=politics_govt_law,
    law_enf_military_operator=law_enf_military_operator,
    crime=crime,
    event_record_other=event_record_other,
    other_species=other_species,
    arts=arts,
    sports=sports,
)

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [None]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['num_categories']!=0].sample(2)

#### Checking the Number of Rows without a First Category

In [None]:
# Counting the entries whose category total is still zero
uncategorized_count = (df["num_categories"] == 0).sum()
print(f"There are {uncategorized_count} entries without any known_for category.")

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

In [304]:
# Checkpoint cell: prints a marker and plays a chime so it is obvious when
# execution has reached this point
print("dunzo!")

# Sound notification when cell executes
chime.success()

dunzo!


<IPython.core.display.Javascript object>

#### Finding `known_for` Roles in `info_2`

In [None]:
# Listing the distinct info_2 values, least frequent first and most frequent last
info_2_counts = df["info_2"].value_counts(ascending=True)
roles_list = list(info_2_counts.index)

In [None]:
# Code to check each value
# pop() removes and returns the last element of roles_list; when the list was built
# from value_counts(ascending=True), that is the most frequent value still unreviewed.
# Re-running this cell steps through the values one at a time.
roles_list.pop()

In [None]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_2"].notna()].index
#             if "general" in df.loc[index, "info_2"]
#         ],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

In [None]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_list, key=lambda x: len(x), reverse=True)

In [None]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "outlaw country music singer songwriter"]

#### Creating Lists for Each `known_for` Category

In [None]:
# Starting every category list empty for the next iteration
# (arranged in the order the categories are placed in known_for_dict)
social = []
spiritual = []
academia_humanities = []
business_farming = []
sciences = []
politics_govt_law = []
law_enf_military_operator = []
crime = []
event_record_other = []
other_species = []
arts = []
sports = []

In [None]:
# # Example code to quickly sort list in correct descending length search order to copy to dictionary
# temp = sorted(list(set(law_enf_military_operator)), key=lambda x: len(x), reverse=True)
# temp

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [None]:
# Gathering the per-category role lists under their category names.
# Insertion order is kept, and it is the order in which the extraction cell
# walks the categories, so it decides which category claims an overlapping role first.
known_for_dict = dict(
    social=social,
    spiritual=spiritual,
    academia_humanities=academia_humanities,
    business_farming=business_farming,
    sciences=sciences,
    politics_govt_law=politics_govt_law,
    law_enf_military_operator=law_enf_military_operator,
    crime=crime,
    event_record_other=event_record_other,
    other_species=other_species,
    arts=arts,
    sports=sports,
)

#### Extracting Category from `info_2`

In [None]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['num_categories']!=0].sample(2)

#### Checking the Number of Rows without a First Category

In [None]:
# Counting the entries whose category total is still zero
uncategorized_count = (df["num_categories"] == 0).sum()
print(f"There are {uncategorized_count} entries without any known_for category.")

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Observations:
- It is time to export our dataframe and start a new notebook.

### Exporting Dataset to SQLite Database [wp_life_expect_clean12.db]()

In [None]:
# # Exporting dataframe

# # Saving dataset in a SQLite database
# conn = sql.connect("wp_life_expect_clean12.db")
# df.to_sql("wp_life_expect_clean12", conn, index=False)

# # Chime notification when cell executes
# chime.success()

# [Proceed to Data Cleaning Part 13]()