# Wikipedia Notable Life Expectancies
# [Notebook 8: Data Cleaning Part 7](https://github.com/teresahanak/wikipedia-life-expectancy/blob/main/wp_life_expect_data_clean7_thanak_2022_07_26.ipynb)
### Context

The
### Objective

The
### Data Dictionary
- Feature: Description

### Importing Libraries

In [1]:
# To structure code automatically
%load_ext nb_black

# To import/export sqlite databases
import sqlite3 as sql

# To save/open python objects in pickle file
import pickle

# To help with reading, cleaning, and manipulating data
import pandas as pd
import numpy as np
import re

# To define maximum number of columns to be displayed in a dataframe
pd.set_option("display.max_columns", None)
# To define the maximum number of rows to be displayed in a dataframe
pd.set_option("display.max_rows", 200)

# To supress warnings
# import warnings

# warnings.filterwarnings("ignore")

# To set some visualization attributes
pd.set_option("max_colwidth", 150)

# To play auditory cue when cell has executed, has warning, or has error and set chime theme
import chime

chime.theme("zelda")

<IPython.core.display.Javascript object>

## Data Overview

### Reading, Sampling, and Checking Data Shape

In [2]:
# Reading the dataset
conn = sql.connect("wp_life_expect_clean6.db")
data = pd.read_sql("SELECT * FROM wp_life_expect_clean6", conn)

# Making a working copy
df = data.copy()

# Checking the shape
print(f"There are {df.shape[0]} rows and {df.shape[1]} columns.")

# Checking first 2 rows of the data
df.head(2)

There are 98060 rows and 38 columns.


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
0,1,William Chappell,", 86, British dancer, ballet designer and director.",https://en.wikipedia.org/wiki/William_Chappell_(dancer),21,1994,January,,,dancer,ballet designer and director,,,,,,,,,86.0,,United Kingdom of Great Britain and Northern Ireland,,,3.091042,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,Raymond Crotty,", 68, Irish economist, writer, and academic.",https://en.wikipedia.org/wiki/Raymond_Crotty,12,1994,January,,,economist,writer,and academic,,,,,,,,68.0,,Ireland,,,2.564949,0,0,0,0,0,0,0,0,0,0,0,0,0


<IPython.core.display.Javascript object>

In [3]:
# Checking last 2 rows of the data
df.tail(2)

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
98058,9,Aamir Liaquat Hussain,", 50, Pakistani journalist and politician, MNA .",https://en.wikipedia.org/wiki/Aamir_Liaquat_Hussain,99,2022,June,", since",,,MNA,,,,,,,,,50.0,,Pakistan,,"2002 2007, since 2018",4.60517,0,0,0,0,0,1,0,0,1,0,0,0,2
98059,9,Zou Jing,", 86, Chinese engineer, member of the Chinese Academy of Engineering.",https://en.wikipedia.org/wiki/Zou_Jing_(engineer),3,2022,June,,,engineer,member of the Academy of Engineering,,,,,,,,,86.0,,"China, People's Republic of",,,1.386294,0,0,0,0,0,0,0,0,0,0,0,0,0


<IPython.core.display.Javascript object>

In [4]:
# Checking a sample of the data
df.sample(5)

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
69373,7,Al Capps,", 79, American record producer, arranger and composer.",https://en.wikipedia.org/wiki/Al_Capps,5,2018,June,,,record producer,arranger and composer,,,,,,,,,79.0,,United States of America,,,1.791759,0,0,0,0,0,0,0,0,0,0,0,0,0
69476,13,Tom Gear,", 69, American politician, member of the Virginia House of Delegates .",https://en.wikipedia.org/wiki/Tom_Gear,15,2018,June,,,,member of the Virginia House of Delegates,,,,,,,,,69.0,,United States of America,,2002 2010,2.772589,0,0,0,0,0,0,0,0,1,0,0,0,1
57812,8,Elizabeth Roemer,", 87, American astronomer.",https://en.wikipedia.org/wiki/Elizabeth_Roemer,8,2016,April,,,astronomer,,,,,,,,,,87.0,,United States of America,,,2.197225,0,0,0,0,0,0,0,0,0,0,0,0,0
34207,29,Jim Seymour,", 64, American football player .",https://en.wikipedia.org/wiki/Jim_Seymour_(American_football),5,2011,March,Chicago Bears,,,,,,,,,,,,64.0,,United States of America,,Chicago Bears,1.791759,0,0,0,0,0,0,1,0,0,0,0,0,1
40341,20,Jaouad Akaddar,", 28, Moroccan footballer, cardiac arrest.",https://en.wikipedia.org/wiki/Jaouad_Akaddar,4,2012,October,,,,cardiac arrest,,,,,,,,,28.0,,Morocco,,,1.609438,0,0,0,0,0,0,1,0,0,0,0,0,1


<IPython.core.display.Javascript object>

### Checking Data Types, Duplicates, and Null Values

In [5]:
# Checking data types and null values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98060 entries, 0 to 98059
Data columns (total 38 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   day                        98060 non-null  object 
 1   name                       98060 non-null  object 
 2   info                       98060 non-null  object 
 3   link                       98060 non-null  object 
 4   num_references             98060 non-null  int64  
 5   year                       98060 non-null  int64  
 6   month                      98060 non-null  object 
 7   info_parenth               36661 non-null  object 
 8   info_1                     22 non-null     object 
 9   info_2                     98028 non-null  object 
 10  info_3                     48896 non-null  object 
 11  info_4                     10264 non-null  object 
 12  info_5                     1265 non-null   object 
 13  info_6                     181 non-null    obj

<IPython.core.display.Javascript object>

#### Observations:
- With our dataset loaded, we can pick up where we left off with extracting known_for values by rebuilding `known_for_dict`.

### Extracting `known_for` Continued

#### Finding `known_for` Roles in `info_2`

In [10]:
# Obtaining values for column and their counts
roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [49]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [48]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [index for index in df.index if "Catholic prelate" in df.loc[index, "info"]],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [47]:
# # Code to check each specific value
# specific_roles_list.pop()

<IPython.core.display.Javascript object>

In [50]:
# # Example code to quick-screen values that may overlap categories
# df.loc[[index for index in df.index if "defrocked" in df.loc[index, "info"]]]

<IPython.core.display.Javascript object>

In [51]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "defrocked Catholic prelate"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [52]:
# Creating lists for each category
politics_govt_law = [
    "Patriotic",
]

arts = []
sports = []
sciences = []

business_farming = []
academia_humanities = []
law_enf_military_operator = []
spiritual = [
    "Syro Malabar Catholic prelate",
    "Eastern Catholic prelate",
    "clandestine Catholic prelate",
    "Old Catholic prelate",
    "Catholic prelate and theologian",
    "Catholic prelate and first cardinal",
    "Maronite Catholic prelate",
    "Catholic prelate and Cardinal",
    "Coptic Catholic prelate",
    "Catholic prelate and bishop",
    "Catholic prelate and cardinal",
    "Catholic prelate and",
    "and Catholic prelate",
    "Catholic prelate",
]
social = []
crime = [
    "defrocked",
]
event_record_other = []
other_species = []

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [53]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [54]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['spiritual'] ==1].sample(2)

CPU times: total: 8.34 s
Wall time: 8.32 s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
53450,27,Peter Celestine Elampassery,", 76, Indian Roman Catholic prelate, Bishop of Jammu-Srinagar .",https://en.wikipedia.org/wiki/Peter_Celestine_Elampassery,9,2015,May,,,,Bishop of Jammu Srinagar,,,,,,,,,76.0,,India,Italy,1998 2014,2.302585,0,0,1,0,0,0,0,0,0,0,0,0,1
63852,2,Leon Lemmens,", 63, Belgian Roman Catholic prelate, Auxiliary Bishop of Mechelen-Brussels , leukemia.",https://en.wikipedia.org/wiki/Leon_Lemmens,10,2017,June,since,,,Auxiliary Bishop of Mechelen Brussels,leukemia,,,,,,,,63.0,,Belgium,Italy,since 2011,2.397895,0,0,1,0,0,0,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [55]:
#### Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

There are 44902 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [56]:
# Obtaining values for column and their counts
roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [263]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [261]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [index for index in df.index if "physicist" in df.loc[index, "info"]], "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [259]:
# # Code to check each specific value
# specific_roles_list.pop()

<IPython.core.display.Javascript object>

In [260]:
# # Example code to quick-screen values that may overlap categories
# df.loc[
#     [index for index in df.index if "physicist and science" in df.loc[index, "info"]]
# ]

<IPython.core.display.Javascript object>

In [264]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "health physicist"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [265]:
# Creating lists for each category
politics_govt_law = ["arms control expert", "refusenik"]

arts = []
sports = []
sciences = [
    "physicist and group leader in the Manhattan Project",
    "physicist and molecular biologist",
    "physicist and computer research executive",
    "Nobel Prize winning biophysicist",
    "physicist and grandson of Yuan Shikai",
    "physicist and polymer researcher",
    "physicist and hydrodynamicist",
    "nuclear physicist and inventor",
    "physicist and a leader in controlled fusion research",
    "computational physicist and the father of plasma based acceleration techniques",
    "biophysicist and theoretical ecologist",
    "physicist and co winner of Nobel Prize in Physics in",
    "leading physicist in the study of waves",
    "physicist and Nobel laureate",
    "physicist ane engineer",
    "astrophysicist and radio astronomer",
    "physicist and artificial intelligence pioneer",
    "physicist and civil engineer",
    'physicist who coined the term "black hole"',
    "physicist who co discovered the Wigner Seitz cell",
    "physicist and former director of SLAC",
    "physicist at Uppsala University",
    "physicist who won the Nobel Prize for Physics in",
    "physicist who built the first laser",
    "physicist who was a pioneer of solid state physics",
    "molecular biophysicist and crystallographer",
    "physicist and member of the Manhattan Project",
    "physicist and color scientist",
    "marine geologist and geophysicist",
    "theoretical physicist and astronomer",
    "physicist and electronics engineer",
    "physicist and inventor of the first digital computer",
    "pioneering biophysicist and virologist",
    "physicist Nobel Prize in Physics laureate",
    "theoretical physicist and magneto ionic theory pioneer",
    "theoretical physicist and Nobel Prize laureate",
    "physicist and jet engine designer",
    "geophysicist and oceanographer",
    "physicist and winner of the Nobel Prize in Physics",
    "physicist known for the Casimir effect",
    "nuclear physicist who worked at the Manhattan Project Metallurgical Laboratory",
    "physicist and team member of the Manhattan Project",
    "biophysicist and biochemist",
    "experimental physicist and scientist",
    "physicist and radiation health physics pioneer",
    "physicist and co inventor of the laser with Charles Townes",
    "nuclear engineer and physicist",
    'physicist known as "the father of Pulsed Power"',
    "physicist and physical chemist",
    "chemist and nuclear physicist",
    "physicist and recipient of the Nobel Prize in Physics",
    "differential geometer and mathematical physicist",
    "physicist and statistician",
    "physicist and father of Joan Baez and Mimi Fariña",
    "nuclear physicist and engineer",
    "physicist and microbiologist",
    "nuclear physicist and ufologist",
    "chemist and biophysicist",
    "condensed matter physicist",
    "physicist and researcher",
    "geophysicist and structural geologist",
    "mesoscopic physicist",
    "physicist and specialist in solid state laser",
    "physicist and aircraft designer",
    "physicist specialized in theoretical catalysis",
    "biologist and biophysicist",
    "thermal physicist",
    "atomic physicist",
    "biophysicist and science",
    "research physicist",
    "theoretical physicist and nuclear engineer",
    "neurophysicist",
    "experimental nuclear physicist",
    "health physicist",
    "physicist and parapsychologist",
    "physicist and skeptic",
    "solid state physicist",
    "biophysicist and virologist",
    "atmospheric physicist",
    "physicist and geneticist",
    "electrical engineer and physicist",
    "climate physicist",
    "nuclear and particle physicist",
    "explosives engineer and physicist",
    "physicist and neurobiologist",
    "mathematical physicist and cosmologist",
    "metallurgist and physicist",
    "mathematical geophysicist and seismologist",
    "East physicist",
    "theoretical physicist and astrophysicist",
    "Nobel Prize winning physicist",
    "optical physicist",
    "metal physicist",
    "metal and detonation physicist",
    "solar physicist",
    "oceanographic physicist",
    "geophysicist and planetary scientist",
    "astroparticle physicist",
    "accelerator physicist",
    "engineer and physicist",
    "molecular biophysicist",
    "physicist and radio astronomer",
    "physicist and meteorologist",
    "physicist and computer scientist",
    "astronomer and physicist",
    "physicist and chemist",
    "chemical physicist",
    "physicist and electrical engineer",
    "physicist and astronomer",
    "astronomer and astrophysicist",
    "medical physicist",
    "space physicist",
    "plasma physicist",
    "chemist and physicist",
    "physicist and inventor",
    "experimental physicist",
    "physicist and engineer",
    "mathematical physicist",
    "particle physicist",
    "biophysicist",
    "geophysicist",
    "and nuclear physicist",
    "nuclear physicist and",
    "nuclear physicist",
    "and astrophysicist",
    "astrophysicist and",
    "astrophysicist",
    "and theoretical physicist",
    "theoretical physicist and",
    "theoretical physicist",
    "physicist and",
    "and physicist",
    "physicist",
]

business_farming = []
academia_humanities = []
law_enf_military_operator = []
spiritual = []
social = []
crime = []
event_record_other = []
other_species = []

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [267]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [268]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['sciences'] ==1].sample(2)

CPU times: total: 1min 8s
Wall time: 1min 8s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
80861,13,Jens Erik Fenstad,", 84, Norwegian mathematician, COVID-19.",https://en.wikipedia.org/wiki/Jens_Erik_Fenstad,7,2020,April,,,,COVID,,,,,,,,,84.0,,Norway,,,2.079442,1,0,0,0,0,0,0,0,0,0,0,0,1
10037,27,Robert Mills,", 72, American physicist.",https://en.wikipedia.org/wiki/Robert_Mills_(physicist),8,1999,October,,,,,,,,,,,,,72.0,,United States of America,,,2.197225,1,0,0,0,0,0,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [269]:
#### Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

There are 44009 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [271]:
# Obtaining values for column and their counts
roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [273]:
# Code to check each value
roles_list.pop()

'architect'

<IPython.core.display.Javascript object>

In [274]:
# Create specific_roles_list for above popped value
specific_roles_list = (
    df.loc[
        [index for index in df.index if "architect" in df.loc[index, "info"]], "info_2",
    ]
    .value_counts()
    .index.tolist()
)

<IPython.core.display.Javascript object>

In [406]:
# # Code to check each specific value
# specific_roles_list.pop()

<IPython.core.display.Javascript object>

In [407]:
# # Example code to quick-screen values that may overlap categories
# df.loc[[index for index in df.index if "architect and art" in df.loc[index, "info"]]]

<IPython.core.display.Javascript object>

In [408]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "naval architect"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [409]:
# Creating lists for each category
politics_govt_law = ["asylum seeker"]

arts = [
    "architectural and interior designer",
    "photographer and architect",
    "Southwestern style architect",
    "architectural lighting designer",
    "church architect and Gothic Revival designer",
    "medieval architectural",
    "architect and organ designer",
    "landscape and garden architect",
    "set costume designer and architect",
    "architect and raconteur",
    "architect in Oregon",
    "architect of perestroika",
    "architect and acoustician",
    "architecture critic for",
    "interior designer and architect",
    "architect and historic",
    "architect and photographer",
    "furniture designer and architect",
    "architect and interior designer",
    "architect and graphic designer",
    "architect and art collector",
    "and course architect",
    "horticultural architect",
    "architect and designer of the flag of",
    "architect and furniture designer",
    "furniture designer and interior architect",
    "architect and landscape architect",
    "architecture and blues",
    "temple architect and sculptor",
    "bridge architect",
    "architectural critic",
    "industrial designer and architect",
    "architect and caveman",
    "literature and architecture",
    "space architect and spaceport planner",
    "architect and industrial designer",
    "town planner and architect",
    "architectural photographer",
    "architect and sculptor",
    "potter and architect",
    "architecture critic",
    "naval architect",
    "architect and urban designer",
    "golf course architect",
    "designer and architect",
    "architect and architectural",
    "sculptor and architect",
    "architect and town planner",
    "modernist architect",
    "architect and urban planner",
    "architect and designer",
    "landscape architect and",
    "landscape architect",
    "and architectural",
    "architectural",
    "of architecture",
    "and restoration architect",
    "architecture",
    "architect and",
    "and architect",
    "architect",
]
sports = []
sciences = [
    "computer architect and high tech",  # before arts
]

business_farming = []
academia_humanities = [
    "antique and architecture preservationist",  # before arts
]
law_enf_military_operator = []
spiritual = []
social = []
crime = []
event_record_other = []
other_species = []

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [410]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [411]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['arts'] ==1].sample(2)

CPU times: total: 32.3 s
Wall time: 32.3 s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
61274,21,Michèle Morgan,", 96, French film actress .",https://en.wikipedia.org/wiki/Mich%C3%A8le_Morgan,54,2016,December,", ,",,,,,,,,,,,,96.0,,France,,", ,",4.007333,0,0,0,0,0,1,0,0,0,0,0,0,1
8156,17,Geoffrey Dutton,", 76, Australian author and historian.",https://en.wikipedia.org/wiki/Geoffrey_Dutton,13,1998,September,,,,,,,,,,,,,76.0,,Australia,,,2.639057,0,0,0,1,0,1,0,0,0,0,0,0,2


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [412]:
#### Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

There are 43450 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [414]:
# Obtaining values for column and their counts
roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [559]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [417]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [index for index in df.index if "photographer" in df.loc[index, "info"]],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [558]:
# # Code to check each specific value
# specific_roles_list.pop()

<IPython.core.display.Javascript object>

In [556]:
# # Example code to quick-screen values that may overlap categories
# df.loc[[index for index in df.index if "photographer of the" in df.loc[index, "info"]]]

<IPython.core.display.Javascript object>

In [555]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "public relations executive and photographer"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [554]:
# Creating lists for each category
politics_govt_law = []

arts = [
    "crime photographer",
    "photographer and illustrator",
    "adult photographer",
    "photographer of children",
    'photographer who pioneered "environmental portraiture"',
    "photographer at the fall of Saigon",
    "photographer of indigenous peoples in",
    "underwater photographer and filmmaker",
    "photographer and founder of",
    "photographer and editor",
    "photographer and camera operator",
    "underwater nature photographer",
    "photographer and news executive",
    "fashion and portrait photographer",
    "photographer based in San Francisco",
    "photographer born in Mérida",
    "photographer and secret FBI",
    "photographer during World War II",
    "glamour photographer and director of pornographic films",
    "photographer and war correspondent",
    "newspaper photographer",
    "double bassist and photographer",
    "photographer and photo essayist",
    "portrait photographer",
    "wilderness photographer",
    "press photographer",
    "music producer and photographer",
    "publisher and photographer",
    "photographer and art critic",
    "photographer and ballet dancer",
    "photographer and publicist",
    "photographer and theatre director",
    "engraver and photographer",
    "advertising photographer",
    "erotic photographer",
    "graphic designer and photographer",
    "photographer and art director",
    "public relations executive and photographer",
    "punk rock and art photographer",
    "fine art photographer",
    "photographer and documentary filmmaker",
    "environmental photographer",
    "photographer and blogger",
    "photographer and cinematographer",
    "photographer and biographer",
    "printmaker and photographer",
    "jazz and blues photographer",
    "commercial photographer",
    "aerial photographer and director",
    "newspaper and magazine photographer",
    "photographer and film maker",
    "filmmaker and photographer",
    "photographer and graphic designer",
    "street photographer",
    "photographer and model",
    "model and photographer",
    "news photographer",
    "and wildlife photographer",
    "wildlife photographer",
    "jazz photographer",
    "aerial photographer",
    "documentary photographer",
    "celebrity photographer",
    "photographer and filmmaker",
    "art photographer",
    "Pulitzer Prize winning photographer",
    "fashion photographer and",
    "fashion photographer",
    "photographer of the",
    "and Holocaust photographer",
    "photographer and",
    "and photographer",
    "photographer",
]
sports = []
sciences = []

business_farming = []
academia_humanities = []
law_enf_military_operator = []
spiritual = []
social = []
crime = []
event_record_other = []
other_species = []

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [560]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [561]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['arts'] ==1].sample(2)

CPU times: total: 37 s
Wall time: 37 s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
54073,13,J. R. Gach,", 63, American radio personality, diabetes.",https://en.wikipedia.org/wiki/J._R._Gach,6,2015,July,,,,diabetes,,,,,,,,,63.0,,United States of America,,,1.94591,0,0,0,0,0,1,0,0,0,0,0,0,1
45890,11,Garry Robbins,", 56, Canadian professional wrestler and actor , heart attack.",https://en.wikipedia.org/wiki/Garry_Robbins,6,2013,December,", ,",,professional wrestler,heart attack,,,,,,,,,56.0,,Canada,,", ,",1.94591,0,0,0,0,0,1,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [562]:
#### Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

There are 42922 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [564]:
# Obtaining values for column and their counts
roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [692]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [691]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [index for index in df.index if "economist" in df.loc[index, "info"]], "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [690]:
# # Code to check each specific value
# specific_roles_list.pop()

<IPython.core.display.Javascript object>

In [688]:
# # Example code to quick-screen values that may overlap categories
# df.loc[[index for index in df.index if "health economist" in df.loc[index, "info"]]]

<IPython.core.display.Javascript object>

In [687]:
# # Example code to quick-check a specific entry
# df[
#     df["info_2"]
#     == "economist who did pioneering research in linear programming and environmental economics"
# ]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [686]:
# Creating lists for each category
politics_govt_law = [
    "jurist and economist",
    "Marxian economist and founding editor of the",
    "economist specializing in public economics and information economics",
    "supply side economist",
    "libertarian economist",
    "monetarist and free market economist",
    "economist and government adviser",
    "economist who examined f endowment",
    "economist and banking official",
    "economist and government advisor",
    "Marxian economist and a Trotskyist activist and",
    "economist and government minister",
    "economist and Nobel laureate",
    "macroeconomist",
    "monetary economist",
    "economist and government official",
    "public servant and economist",
    "labor economist",
    "economist and communist",
    "economist who did pioneering research in linear",
    "development economist and",
    "Marxist economist",
    "political scientist and economist",
    "Gandhian economist",
    "aristocrat and economist",
    "economist and PZPR activist",
    "economist and political scientist",
    "economist and politologist",
    "economist and taxpayer activist",
    "administrator and economist",
    "economist and government policy advisor",
    "economist and policy adviser",
    "economist and social activist",
    "economist and political adviser",
    "economist and lobbyist",
    "economist and laureate of the Nobel Memorial Prize in Economic Sciences",
    "political economist and activist",
    "lawyer and economist",
    "economist and Nobel Prize laureate",
    "Marxian economist",
    "development economist",
    "civil servant and economist",
    "economist and political activist",
    "feminist economist",
    "economist and public servant",
    "Nobel Prize winning economist",
    "health economist",
    "agricultural economist",
    "and political economist",
    "political economist",
    "economist and an",
    "and economist",
    "economist and",
    "economist",
]

arts = []
sports = []
sciences = [
    "home economist",  # before politics_govt_law
]

business_farming = []
academia_humanities = []
law_enf_military_operator = []
spiritual = []
social = []
crime = ["convicted embezzler"]
event_record_other = []
other_species = []

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [693]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [694]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['politics_govt_law'] ==1].sample(2)

CPU times: total: 28.5 s
Wall time: 28.6 s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
53988,6,Sir John Lambert,", 94, British diplomat, Ambassador to Tunisia .",https://en.wikipedia.org/wiki/John_Lambert_(diplomat),5,2015,July,,,,Ambassador to,,,,,,,,,94.0,,United Kingdom of Great Britain and Northern Ireland,,1977 1981,1.791759,0,0,0,0,0,0,0,0,1,0,0,0,1
70929,23,Shantaram Potdukhe,", 86, Indian politician, MP .",https://en.wikipedia.org/wiki/Shantaram_Potdukhe,6,2018,September,,,,MP,,,,,,,,,86.0,,India,,1980 1996,1.94591,0,0,0,0,0,0,0,0,1,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [695]:
#### Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

There are 42456 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [697]:
# Obtaining values for column and their counts
roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [699]:
# Code to check each value
roles_list.pop()

'judge'

<IPython.core.display.Javascript object>

In [826]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[[index for index in df.index if "judge" in df.loc[index, "info"]], "info_2",]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [825]:
# # Code to check each specific value
# specific_roles_list.pop()

<IPython.core.display.Javascript object>

In [827]:
# # Example code to quick-screen values that may overlap categories
# df.loc[[index for index in df.index if "judge and legal" in df.loc[index, "info"]]]

<IPython.core.display.Javascript object>

In [828]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "circuit judge and tabloid columnist"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [829]:
# Creating lists for each category
politics_govt_law = [
    "military judge and",
    "judge on the Supreme Court of Queensland",
    "judge on the Oregon Supreme Court",
    "senior judge for the Central District Court",
    "judge and Representative from Alabama",
    "appeals court judge",
    "High Court judge",
    "intellectual property lawyer and High Court judge",
    "judge of the ACT Supreme Court",
    "judge and hereditary peer",
    "judge and Law Lord",
    "circuit judge for the Court of Appeals for the Ninth Circuit",
    "senior judge of the District Court for the Southern District of",
    "judge and feminist",
    "judge of the District Court for the Western District of Missouri",
    "judge and public servant",
    "District Court judge",
    "Bankruptcy Court judge",
    "who was the first female Supreme Court judge",
    "State judge and prosecutor at the Nuremberg war crimes trials",
    "judge and influential patent attorney",
    "district judge overseeing desegregation in the South",
    "judge and former Lord Chief Justice",
    "judge and peer",
    "civil rights lawyer and the first female federal judge",
    "federal judge who crafted the mass settlement of asbestos lawsuits",
    "senior federal judge and the first black federal prosecutor in history",
    "judge on the Court of Appeals for the Third Circuit",
    "prominent judge sitting in highest court",
    "former chief judge of the Court of Appeals for the Third Circuit",
    "Superior Court judge who presided over the Charles Manson trial",
    "and Ohio judge for years",
    "judge and Vice Chancellor of the Supreme Court",
    "City family court judge and first female judge",
    "senior judge of the Family Division of the High Court",
    "former chief judge",
    "senior judge of the District Court for the Southern District of Alabama and judge for the Middle District of Alabama",
    "senior federal appellate judge",
    "civil rights activist and judge",
    "first female judge of",
    "judge in the",
    "judge and chairperson of the Electoral Commission",
    "judge and political activist",
    "judge and anti apartheid activist",
    "lawyer and Supreme Court judge",
    "judge and disability rights campaigner",
    "senior judge of the Court of Appeals for the Ninth Circuit",
    "jurist and judge",
    "judge and independence activist",
    "senior judge of the District Court for the District of New",
    "attorney and tribal judge",
    "judge and prosecutor",
    "judge and civil servant",
    "judge and ombudsman",
    "jurist and Supreme Court judge",
    "senior federal judge",
    "lawyer and state judge",
    "colonial official and judge",
    "judge of the High Court of and",
    "Navajo judge",
    "senior and chief judge",
    "legislator and federal judge",
    "senior circuit judge",
    "judge and legal",
    "judge and barrister",
    "judge and law lord",
    "district judge and",
    "district judge",
    "judge and life peer",
    "chief judge",
    "senior judge of the District Court for the Eastern District of",
    "attorney and judge",
    "district court judge",
    "judge and lawyer",
    "judge and jurist",
    "state judge",
    "Supreme Court judge",
    "barrister and judge",
    "senior judge",
    "lawyer and judge",
    "federal judge and",
    "federal judge",
    "circuit judge and",
    "judge and",
    "and judge",
    "judge",
]

arts = []
sports = [
    "dog show judge",  # before politics_govt_law
    "boxing judge and",
    "draughts player and judge",
]
sciences = []

business_farming = []
academia_humanities = []
law_enf_military_operator = []
spiritual = []
social = []
crime = []
event_record_other = []
other_species = []

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [830]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "sports": sports,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [831]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['politics_govt_law'] ==1].sample(2)

CPU times: total: 47.7 s
Wall time: 47.7 s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
73511,28,Norma Paulus,", 85, American lawyer and politician, Oregon Secretary of State , complications from dementia.",https://en.wikipedia.org/wiki/Norma_Paulus,37,2019,February,,,,Oregon Secretary of State,complications from dementia,,,,,,,,85.0,,United States of America,,1977 1985,3.637586,0,0,0,0,0,0,0,0,1,0,0,0,1
63136,12,Sheila Abdus-Salaam,", 65, American judge, member of the New York Court of Appeals , suicide by drowning.",https://en.wikipedia.org/wiki/Sheila_Abdus-Salaam,27,2017,April,since,,,member of the Court of Appeals,suicide by drowning,,,,,,,,65.0,,United States of America,,since 2013,3.332205,0,0,0,0,0,0,0,0,1,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [832]:
#### Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

There are 41858 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [834]:
# Obtaining values for column and their counts
roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [896]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [895]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [index for index in df.index if "military officer" in df.loc[index, "info"]],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [894]:
# # Code to check each specific value
# specific_roles_list.pop()

<IPython.core.display.Javascript object>

In [897]:
# # Example code to quick-screen values that may overlap categories
# df.loc[[index for index in df.index if "coup leader" in df.loc[index, "info"]]]

<IPython.core.display.Javascript object>

In [898]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "Karen military officer"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [899]:
# Creating lists for each category
politics_govt_law = []

arts = []
sports = []
sciences = []

business_farming = []
academia_humanities = []
law_enf_military_operator = [
    "WWII military officer",
    "military officer and intelligence official",
    "Karen military officer",
    "Air Force military officer",
    "military officer and war veteran",
    "Resistance member and military officer",
    "military officer and resistance fighter",
    "military officer and National Hero of",
    "military officer of World War I and World War II",
    "and later military officer",
    "military officer and veteran affairs",
    "CIA paramilitary officer",
    "military officer and Hero of the Union",
    "military officer and coup leader",
    "military officer and pilot",
    "and military officer",
    "military officer and",
    "military officer",
]
spiritual = []
social = []
crime = [
    "human trafficker",
]
event_record_other = []
other_species = []

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [None]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

#### Extracting Category from `info_2`

In [None]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['arts'] ==1].sample(2)

#### Checking the Number of Rows without a First Category

In [None]:
#### Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

In [833]:
print("dunzo!")

# Sound notification when cell executes
chime.success()

dunzo!


<IPython.core.display.Javascript object>

#### Finding `known_for` Roles in `info_2`

In [None]:
# Obtaining values for column and their counts
roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

In [None]:
# Code to check each value
roles_list.pop()

In [None]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [index for index in df.index if "writer" in df.loc[index, "info"]], "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

In [None]:
# # Code to check each specific value
# specific_roles_list.pop()

In [None]:
# # Example code to quick-screen values that may overlap categories
# df.loc[[index for index in df.index if "and science writer" in df.loc[index, "info"]]]

In [None]:
# # Example code to quick-screen values that may overlap categories
# df.loc[
#     [
#         index
#         for index in df.index
#         if "outlaw country music singer songwriter" in df.loc[index, "info"]
#     ]
# ]

In [None]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "outlaw country music singer songwriter"]

#### Creating Lists for Each `known_for` Category

In [None]:
# Creating lists for each category
politics_govt_law = []

arts = []
sports = [


]
sciences = []

business_farming = []
academia_humanities = []
law_enf_military_operator = []
spiritual = []
social = []
crime = []
event_record_other = []
other_species = []

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [None]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

#### Extracting Category from `info_2`

In [None]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['arts'] ==1].sample(2)

#### Checking the Number of Rows without a First Category

In [None]:
#### Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Observations:
- It is time to export our dataframe and start a new notebook.

### Exporting Dataset to SQLite Database [wp_life_expect_clean7.db]()

In [None]:
# # Exporting dataframe

# # Saving dataset in a SQLite database
# conn = sql.connect("wp_life_expect_clean7.db")
# df.to_sql("wp_life_expect_clean7", conn, index=False)

# # Chime notification when cell executes
# chime.success()

# [Proceed to Data Cleaning Part 8 ]()