# Wikipedia Notable Life Expectancies
# [Notebook  9: Data Cleaning Part 8](https://github.com/teresahanak/wikipedia-life-expectancy/blob/main/wp_life_expect_data_clean8_thanak_2022_07_26.ipynb)
### Context

The
### Objective

The
### Data Dictionary
- Feature: Description

### Importing Libraries

In [1]:
# To structure code automatically
%load_ext nb_black

# To import/export sqlite databases
import sqlite3 as sql

# To save/open python objects in pickle file
import pickle

# To help with reading, cleaning, and manipulating data
import pandas as pd
import numpy as np
import re

# To define maximum number of columns to be displayed in a dataframe
pd.set_option("display.max_columns", None)
# To define the maximum number of rows to be displayed in a dataframe
pd.set_option("display.max_rows", 200)

# To supress warnings
# import warnings

# warnings.filterwarnings("ignore")

# To set some visualization attributes
pd.set_option("max_colwidth", 150)

# To play auditory cue when cell has executed, has warning, or has error and set chime theme
import chime

chime.theme("zelda")

<IPython.core.display.Javascript object>

## Data Overview

### Reading, Sampling, and Checking Data Shape

In [2]:
# Reading the dataset
conn = sql.connect("wp_life_expect_clean7.db")
data = pd.read_sql("SELECT * FROM wp_life_expect_clean7", conn)

# Making a working copy
df = data.copy()

# Checking the shape
print(f"There are {df.shape[0]} rows and {df.shape[1]} columns.")

# Checking first 2 rows of the data
df.head(2)

There are 98059 rows and 38 columns.


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
0,1,William Chappell,", 86, British dancer, ballet designer and director.",https://en.wikipedia.org/wiki/William_Chappell_(dancer),21,1994,January,,,dancer,ballet designer and director,,,,,,,,,86.0,,United Kingdom of Great Britain and Northern Ireland,,,3.091042,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,Raymond Crotty,", 68, Irish economist, writer, and academic.",https://en.wikipedia.org/wiki/Raymond_Crotty,12,1994,January,,,,writer,and academic,,,,,,,,68.0,,Ireland,,,2.564949,0,0,0,0,0,0,0,0,1,0,0,0,1


<IPython.core.display.Javascript object>

In [3]:
# Checking last 2 rows of the data
df.tail(2)

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
98057,9,Aamir Liaquat Hussain,", 50, Pakistani journalist and politician, MNA .",https://en.wikipedia.org/wiki/Aamir_Liaquat_Hussain,99,2022,June,", since",,,MNA,,,,,,,,,50.0,,Pakistan,,"2002 2007, since 2018",4.60517,0,0,0,0,0,1,0,0,1,0,0,0,2
98058,9,Zou Jing,", 86, Chinese engineer, member of the Chinese Academy of Engineering.",https://en.wikipedia.org/wiki/Zou_Jing_(engineer),3,2022,June,,,engineer,member of the Academy of Engineering,,,,,,,,,86.0,,"China, People's Republic of",,,1.386294,0,0,0,0,0,0,0,0,0,0,0,0,0


<IPython.core.display.Javascript object>

In [4]:
# Checking a sample of the data
df.sample(5)

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
70763,12,Gerald LaValle,", 86, American politician, member of the Pennsylvania Senate .",https://en.wikipedia.org/wiki/Gerald_LaValle,11,2018,September,,,,member of the Senate,,,,,,,,,86.0,,United States of America,,1990 2008,2.484907,0,0,0,0,0,0,0,0,1,0,0,0,1
43122,2,Ernie Field,", 70, English boxer, cancer.",https://en.wikipedia.org/wiki/Ernie_Field,23,2013,May,,,boxer,cancer,,,,,,,,,70.0,,United Kingdom of Great Britain and Northern Ireland,,,3.178054,0,0,0,0,0,0,0,0,0,0,0,0,0
24505,7,Jacques Hébert,", 84, Canadian politician, Senator .",https://en.wikipedia.org/wiki/Jacques_H%C3%A9bert_(Canadian_politician),4,2007,December,,,,Senator,,,,,,,,,84.0,,Canada,,1983 1998,1.609438,0,0,0,0,0,0,0,0,1,0,0,0,1
97912,29,Joel Moses,", 80, Israeli-American mathematician and computer scientist .",https://en.wikipedia.org/wiki/Joel_Moses,5,2022,May,Macsyma,,,,,,,,,,,,80.0,,Israel,United States of America,Macsyma,1.791759,1,0,0,0,0,0,0,0,0,0,0,0,1
18594,26,Marianna Komlos,", 35, Canadian bodybuilder, fitness model and professional wrestler, breast cancer.",https://en.wikipedia.org/wiki/Marianna_Komlos,14,2004,September,,,bodybuilder,fitness model and professional wrestler,breast cancer,,,,,,,,35.0,,Canada,,,2.70805,0,0,0,0,0,0,0,0,0,0,0,0,0


<IPython.core.display.Javascript object>

### Checking Data Types, Duplicates, and Null Values

In [5]:
# Checking data types and null values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98059 entries, 0 to 98058
Data columns (total 38 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   day                        98059 non-null  object 
 1   name                       98059 non-null  object 
 2   info                       98059 non-null  object 
 3   link                       98059 non-null  object 
 4   num_references             98059 non-null  int64  
 5   year                       98059 non-null  int64  
 6   month                      98059 non-null  object 
 7   info_parenth               36661 non-null  object 
 8   info_1                     22 non-null     object 
 9   info_2                     98027 non-null  object 
 10  info_3                     48895 non-null  object 
 11  info_4                     10264 non-null  object 
 12  info_5                     1265 non-null   object 
 13  info_6                     181 non-null    obj

<IPython.core.display.Javascript object>

#### Observations:
- With our dataset loaded, we can pick up where we left off with extracting known_for values by rebuilding `known_for_dict`.

### Extracting `known_for` Continued

#### Finding `known_for` Roles in `info_2`

In [6]:
# Obtaining values for column and their counts
roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [7]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [8]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [index for index in df.index if "educator" in df.loc[index, "info"]], "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [9]:
# # Code to check each specific value
# specific_roles_list.pop()

<IPython.core.display.Javascript object>

In [10]:
# # Example code to quick-screen values that may overlap categories
# df.loc[[index for index in df.index if "sex educator" in df.loc[index, "info"]]]

<IPython.core.display.Javascript object>

In [11]:
# # Example code to quick-screen values that may overlap categories
# df.loc[
#     [
#         index
#         for index in df.index
#         if "outlaw country music singer songwriter" in df.loc[index, "info"]
#     ]
# ]

<IPython.core.display.Javascript object>

In [12]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "specialist in studies and educator"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [13]:
# Creating lists for each category
politics_govt_law = [
    "struggle veteran",
]

arts = []
sports = []
sciences = [
    "and wonen health",
]

business_farming = []
academia_humanities = [
    "educator and scholar",
    "educator and Māori language proponent",
    "and early childhood educator",
    "educator and professor",
    "scholar and educator",
    "educator and public schools superintendent",
    "AIDS educator and",
    "AIDS educator",
    "educator and director of the Advanced Placement Program",
    "educator and librarian",
    "adult educator",
    "university educator",
    "linguist and educator",
    "librarian and educator",
    "specialist in studies and educator",
    "Native educator",
    "educator and debate coach",
    "language linguist and educator",
    "Sanskrit scholar and educator",
    "educator and musicologist",
    "educator and anthropologist",
    "teacher and educator",
    "educator and college administrator",
    "Inuit educator",
    "educator and university administrator",
    "translator and educator",
    "and educator",
    "educator and",
    "educator",
]
law_enf_military_operator = []
spiritual = []
social = []
crime = []
event_record_other = []
other_species = []

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [14]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [15]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['academia_humanities'] ==1].sample(2)

CPU times: total: 16 s
Wall time: 16 s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
61725,17,Malcolm Peat,", 84, Canadian academic.",https://en.wikipedia.org/wiki/Malcolm_Peat,13,2017,January,,,,,,,,,,,,,84.0,,Canada,,,2.639057,0,0,0,1,0,0,0,0,0,0,0,0,1
20881,16,Richard P. McCormick,", 89, American historian, professor at Rutgers University, expert on early American political history and New Jersey history, illness.",https://en.wikipedia.org/wiki/Richard_P._McCormick,3,2006,January,,,,professor at Rutgers University,expert on early political history and New history,illness,,,,,,,89.0,,United States of America,,,1.386294,0,0,0,1,0,0,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [16]:
# Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

There are 37846 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [17]:
# Obtaining values for column and their counts
roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [18]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [19]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [index for index in df.index if "chemist" in df.loc[index, "info"]], "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [20]:
# # Code to check each specific value
# specific_roles_list.pop()

<IPython.core.display.Javascript object>

In [21]:
# # Example code to quick-screen values that may overlap categories
# df.loc[[index for index in df.index if "houngan" in df.loc[index, "info"]]]

<IPython.core.display.Javascript object>

In [22]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "chemist and astronaut candidate"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [23]:
# Creating lists for each category
politics_govt_law = []

arts = []
sports = []
sciences = [
    "theoretical chemist and Nobel Prize winner",
    "biochemist and molecular genetics pioneer",
    "native chemist",
    "biochemist and reproductive endocrinologist",
    "chemist and a winner of the Nobel Prize in Physics in",
    "chemist and Nobel Prize Laureate",
    "Nobel Prize winner in chemistry",
    "chemist and biologist",
    "biochemist and microbiologist",
    "nutritionist and biochemist",
    "neuroscience biochemist",
    "quantum chemist",
    "research chemist",
    "theoretical chemist and computer scientist",
    "research chemist and inventor",
    "chemist known for his work on the Manhattan Project",
    "biochemist and enzymologist",
    "scientist in the field of electrochemistry",
    "physical chemist at AT&T Bell Laboratories",
    "biochemist and Nobel Prize for Chemistry laureate",
    "industrial chemist best known for his work on polymers",
    "neurochemist and glycobiologist",
    "biochemist and virologist",
    "biochemist and recipient of the Nobel Prize in Physiology or Medicine",
    "physical and theoretical chemist",
    "chemist and mineralogist",
    "biochemist and cancer researcher",
    "biochemist and pharmacologist and recipient of the Nobel Prize in Physiology or Medicine",
    "nuclear chemist and recipient of the Nobel Prize in Chemistry",
    "chemist and co winner of Nobel Prize in Chemistry in",
    "botanist and chemist",
    "biochemist and protein crystallographer",
    "South wine chemist",
    "chemist and nuclear scientist",
    "oncologist and chemist",
    "chemist and science",
    "chemist and mass spectrometrist",
    "marine and freshwater chemist",
    "biochemist and molecular biologist",
    "protein chemist",
    "chemistry doctor",
    "Southern biochemist",
    "biochemist and developmental biologist",
    "biochemist and pharmacologist",
    "chemist and materials scientist",
    "biological chemist",
    "synthetic organic chemist",
    "atmospheric chemist",
    "chemist and researcher",
    "environmental chemist",
    "geochemist and planetary scientist",
    "physiologist and biochemist",
    "chemist and crystallographer",
    "scientist and agrochemist",
    "biochemist and physiologist",
    "biochemist and medical researcher",
    "textile chemist",
    "geologist and geochemist",
    "Congress chemist",
    "polymer chemist and inventor",
    "polymer chemist",
    "metallurgist and physical chemist",
    "chemist and astronaut candidate",
    "chemist and statistician",
    "cytologist and biochemist",
    "pharmacologist and biochemist",
    "natural product chemist",
    "chemist and Nobel laureate",
    "biochemist and Nobel Prize laureate",
    "biochemist and geneticist",
    "soil chemist and",
    "soil chemist",
    "biochemist and nutritionist",
    "pharmaceutical chemist",
    "Nobel Prize winning biochemist",
    "pharmacologist and chemist",
    "chemist and Nobel Prize laureate",
    "chemist and inventor",
    "biophysical chemist",
    "analytical chemist",
    "nuclear chemist",
    "electrochemist",
    "theoretical chemist",
    "inorganic chemist",
    "Nobel Prize winning chemist",
    "geochemist and",
    "geochemist",
    "physical chemist",
    "of organic chemistry and a",
    "of organic chemistry",
    "organic chemist and",
    "organic chemist",
    "of biochemistry",
    "and biochemist",
    "biochemist and",
    "biochemist",
    "of chemistry",
    "chemistry",
    "food chemist and",
    "chemist and",
    "and chemist",
    "chemist",
]

business_farming = []
academia_humanities = [
    "the first Rector Magnificus of the University of Twente",
]
law_enf_military_operator = []
spiritual = [
    "houngan",
]
social = []
crime = [
    "underground LSD chemist and",  # before sciences
]
event_record_other = []
other_species = []

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [24]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "sciences": sciences,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [25]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['sciences'] ==1].sample(2)

CPU times: total: 54.2 s
Wall time: 54.2 s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
76938,28,Vitaly Voloshinov,", 72, Russian physicist.",https://en.wikipedia.org/wiki/Vitaly_Voloshinov,18,2019,September,,,,,,,,,,,,,72.0,,Russia,,,2.944439,1,0,0,0,0,0,0,0,0,0,0,0,1
33999,4,Alenush Terian,", 90, Iranian astronomer and physicist.",https://en.wikipedia.org/wiki/Alenush_Terian,8,2011,March,,,,,,,,,,,,,90.0,,Iran,,,2.197225,1,0,0,0,0,0,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [26]:
# Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

There are 37258 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [27]:
# Obtaining values for column and their counts
roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [28]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [29]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [index for index in df.index if "rugby union player" in df.loc[index, "info"]],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [30]:
# # Code to check each specific value
# specific_roles_list.pop()

<IPython.core.display.Javascript object>

In [31]:
# # Example code to quick-screen values that may overlap categories
# df.loc[
#     [
#         index
#         for index in df.index
#         if "rugby union player and administrator" in df.loc[index, "info"]
#     ]
# ]

<IPython.core.display.Javascript object>

In [32]:
# # Example code to quick-screen values that may overlap categories
# df.loc[
#     [
#         index
#         for index in df.index
#         if "outlaw country music singer songwriter" in df.loc[index, "info"]
#     ]
# ]

<IPython.core.display.Javascript object>

In [33]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "rugby union player and administrator"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [34]:
# Creating lists for each category
politics_govt_law = []

arts = []
sports = [
    "rugby union player and World War II fighter",
    "rugby union player and TV",
    "former rugby union player for and Neath RFC",
    "All Blacks rugby union player",
    "rugby league and rugby union player",
    "rugby union player and president of the Rugby Union",
    "rugby union player and national team captain",
    "boxer and rugby union player",
    "Hall of Fame rugby union player and coach",
    "rugby union player and selector",
    "rugby union player and manager",
    "rugby union player and administrator",
    "rugby union player and coach",
    "international rugby union player",
    "rugby union player for",
    "rugby union player and",
    "and rugby union player",
    "rugby union player",
]
sciences = []

business_farming = []
academia_humanities = []
law_enf_military_operator = []
spiritual = []
social = []
crime = []
event_record_other = []
other_species = []

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [35]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [36]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['sports'] ==1].sample(2)

CPU times: total: 9.95 s
Wall time: 9.93 s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
49371,15,James Cama,", 56, American martial artist and teacher.",https://en.wikipedia.org/wiki/James_Cama,9,2014,August,,,,,,,,,,,,,56.0,,United States of America,,,2.302585,0,0,0,0,0,0,1,0,0,0,0,0,1
21126,13,Jimmy Johnstone,", 61, Scottish football player, voted Celtic's best ever, motor neurone disease.",https://en.wikipedia.org/wiki/Jimmy_Johnstone,54,2006,March,,,,voted Celtic best ever,motor neurone disease,,,,,,,,61.0,,Scotland,,,4.007333,0,0,0,0,0,0,1,0,0,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [37]:
# Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

There are 36986 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [38]:
# Obtaining values for column and their counts
roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [39]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [40]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [index for index in df.index if "rugby league player" in df.loc[index, "info"]],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [41]:
# # Code to check each specific value
# specific_roles_list.pop()

<IPython.core.display.Javascript object>

In [42]:
# # Example code to quick-screen values that may overlap categories
# df.loc[
#     [
#         index
#         for index in df.index
#         if "rugby league player and administrator" in df.loc[index, "info"]
#     ]
# ]

<IPython.core.display.Javascript object>

In [43]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "rugby league player involved in match fixing scandal"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [44]:
# Creating lists for each category
politics_govt_law = []

arts = []
sports = [
    "Hall of Fame rugby league player",
    "Papua New rugby league player",
    "rugby league player and captain",
    "rugby league player and international coach",
    "Hall of Fame rugby league player and national team captain",
    "football and rugby league player",
    "rugby league player for Wigan and Great",
    "rugby league player for Great and Hull KR",
    "rugby league player for Great",
    "rugby league player and referee",
    "rugby league player and administrator",
    "rugby union and rugby league player",
    "rugby league player and coach",
    "and rugby league player",
    "rugby league player and",
    "rugby league player",
]
sciences = []

business_farming = []
academia_humanities = []
law_enf_military_operator = []
spiritual = []
social = []
crime = ["involved in match fixing scandal"]
event_record_other = []
other_species = []

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [45]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [46]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['sports'] ==1].sample(2)

CPU times: total: 9.52 s
Wall time: 9.51 s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
79926,14,Galen Head,", 72, Canadian ice hockey player .",https://en.wikipedia.org/wiki/Galen_Head,7,2020,March,Detroit Red Wings,,,,,,,,,,,,72.0,,Canada,,Detroit Red Wings,2.079442,0,0,0,0,0,0,1,0,0,0,0,0,1
5974,3,Rufe Gentry,", 79, American baseball player.",https://en.wikipedia.org/wiki/Rufe_Gentry,3,1997,July,,,,,,,,,,,,,79.0,,United States of America,,,1.386294,0,0,0,0,0,0,1,0,0,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [47]:
# Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

There are 36706 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [48]:
# Obtaining values for column and their counts
roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [49]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [50]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [index for index in df.index if "sociologist" in df.loc[index, "info"]],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [51]:
# # Code to check each specific value
# specific_roles_list.pop()

<IPython.core.display.Javascript object>

In [52]:
# # Example code to quick-screen values that may overlap categories
# df.loc[
#     [index for index in df.index if "historical sociologist" in df.loc[index, "info"]]
# ]

<IPython.core.display.Javascript object>

In [53]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "sociologist and critic of systems analysis"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [54]:
# Creating lists for each category
politics_govt_law = []

arts = []
sports = [
    "caver",
]
sciences = [
    "sociologist and communication scientist",
    "sociologist and social justice",
    "sociologist and social",
    "sociologist and psychiatrist",
    "sociologist and critic of systems analysis",
    "sociologist of science",
    "criminologist and sociologist",
    "gerontologist and sociologist",
    "urban sociologist",
    "sociologist and patients' rights advocate",
    "sociologist and statistician",
    "environmental sociologist",
    "sociologist and demographer",
    "sociologist and criminologist",
    "sociologist from",
    "sociologist and",
    "sociologist",
]

business_farming = []
academia_humanities = []
law_enf_military_operator = []
spiritual = []
social = []
crime = []
event_record_other = []
other_species = []

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [55]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [56]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['sciences'] ==1].sample(2)

CPU times: total: 9.84 s
Wall time: 9.83 s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
15988,18,Virginia Heinlein,", 86, American chemist, biochemist and engineer.",https://en.wikipedia.org/wiki/Virginia_Heinlein,9,2003,January,,,,biochemist and engineer,,,,,,,,,86.0,,United States of America,,,2.302585,1,0,0,0,0,0,0,0,0,0,0,0,1
4777,29,Richard Duffin,", 87, American physicist.",https://en.wikipedia.org/wiki/Richard_Duffin,9,1996,October,,,,,,,,,,,,,87.0,,United States of America,,,2.302585,1,0,0,0,0,0,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [57]:
# Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

There are 36474 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [58]:
# Obtaining values for column and their counts
roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [59]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [60]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [index for index in df.index if "psychologist" in df.loc[index, "info"]],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [61]:
# # Code to check each specific value
# specific_roles_list.pop()

<IPython.core.display.Javascript object>

In [62]:
# # Example code to quick-screen values that may overlap categories
# df.loc[
#     [
#         index
#         for index in df.index
#         if "and pastoral psychologist" in df.loc[index, "info"]
#     ]
# ]

<IPython.core.display.Javascript object>

In [63]:
# # Example code to quick-screen values that may overlap categories
# df.loc[
#     [
#         index
#         for index in df.index
#         if "outlaw country music singer songwriter" in df.loc[index, "info"]
#     ]
# ]

<IPython.core.display.Javascript object>

In [64]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "educational psychologist"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [65]:
# Creating lists for each category
politics_govt_law = ["children ombudsman"]

arts = []
sports = []
sciences = [
    "clinical psychologist and researcher",
    "psychologist renowned for his critical studies of hypnosis",
    "psychologist and expert in the work of Alfred Adler",
    "psychologist and sex researcher at Johns Hopkins University",
    "psychologist and methodologist",
    "psychologist known for his pioneering work in autism treatment",
    "psychologist and women reproductive health",
    "traffic psychologist",
    "psychologist and researcher",
    "psychologist and former head of the Psychological Association",
    "correctional psychologist and criminologist",
    "behavioral geneticist and psychologist",
    "psychologist and psychoanalyst",
    "psychologist at University",
    "existential psychologist",
    "psychologist who specialized in developmental psychology",
    "psychologist and a pioneer in experimental psychology",
    "child development psychologist",
    "development psychologist",
    "psychologist and statistician",
    "pediatric psychologist",
    "psychologist from the Antilles",
    "social psychologist and sexologist",
    "psychologist and sleep researcher",
    "psychologist and computer science researcher",
    "social psychologist and criminologist",
    "parapsychologist and ufologist",
    "mathematical psychologist and",
    "mathematical psychologist",
    "engineering psychologist",
    "psychiatrist and parapsychologist",
    "psychologist and anti divorce",
    "evolutionary psychologist",
    "sexologist and psychologist",
    "systems psychologist",
    "environmental psychologist",
    "psychologist and self help",
    "and pastoral psychologist",
    "organizational psychologist",
    "neuroscientist and psychologist",
    "child psychologist and",
    "child psychologist",
    "psychologist and parapsychologist",
    "experimental psychologist",
    "psychologist and neuroscientist",
    "parapsychologist and",
    "and parapsychologist",
    "parapsychologist",
    "neuropsychologist",
    "cognitive psychologist",
    "developmental psychologist",
    "clinical psychologist",
    "social psychologist",
    "behavioral psychologist",
    "Jungian psychologist and",
    "and psychologist",
    "psychologist and",
    "psychologist",
]

business_farming = []
academia_humanities = []
law_enf_military_operator = []
spiritual = []
social = []
crime = []
event_record_other = []
other_species = []

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [66]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [67]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['sciences'] ==1].sample(2)

CPU times: total: 31.4 s
Wall time: 31.4 s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
82178,10,Duilio Arigoni,", 91, Swiss chemist.",https://en.wikipedia.org/wiki/Duilio_Arigoni,4,2020,June,,,,,,,,,,,,,91.0,,Switzerland,,,1.609438,1,0,0,0,0,0,0,0,0,0,0,0,1
35517,18,Maurice M. Rapport,", 91, American neuroscience biochemist.",https://en.wikipedia.org/wiki/Maurice_M._Rapport,5,2011,August,,,,,,,,,,,,,91.0,,United States of America,,,1.791759,1,0,0,0,0,0,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [68]:
# Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

There are 36147 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [70]:
# Obtaining values for column and their counts
roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [482]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [481]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [index for index in df.index if "engineer" in df.loc[index, "info"]], "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [480]:
# # Code to check each specific value
# specific_roles_list.pop()

<IPython.core.display.Javascript object>

In [467]:
# # Example code to quick-screen values that may overlap categories
# df.loc[[index for index in df.index if "sound engineer" in df.loc[index, "info"]]]

<IPython.core.display.Javascript object>

In [None]:
# # Example code to quick-screen values that may overlap categories
# df.loc[
#     [
#         index
#         for index in df.index
#         if "outlaw country music singer songwriter" in df.loc[index, "info"]
#     ]
# ]

In [464]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "recording engineer"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [477]:
# Creating lists for each category
politics_govt_law = []

arts = [
    "Grammy Award winning sound engineer and music producer",
    "sound engineer and record producer",  # before sciences
    "car design engineer",
    "recording engineer and producer",
    "recording engineer and record producer",
    "Academy Award winning sound engineer",
    "sound engineer and founder of the BBC Radiophonic Workshop",
    "recording engineer and studio owner",
    "audio engineer and producer",
    "music producer and engineer",
    "record producer and audio engineer",
    "audio engineer and video game developer",
    "audio mastering engineer",
    "recording engineer and record label owner",
    "film recording engineer",
    "Grammy award winning music engineer",
    "record producer and recording engineer",
    "music engineer and producer",
    "and recording engineer",
    "film sound engineer",
    "audio engineer and record producer",
    "sound engineer",
    "recording engineer",
]
sports = []
sciences = [
    "electrical and radio engineer",
    "engineer and statistician",
    "electrical engineer and co founder",
    "videogame console engineer",
    "engineer and glider pilot",
    'engineer nicknamed the "Father of the Corvette "',
    "mechanical engineer and inventor",
    "integrated circuit engineer",
    "chemical engineer in unleaded gasoline",
    "computer hardware engineer",
    "electronics engineer for Sony",
    "diving engineer",
    "executive engineer",
    "electrical engineer and university",
    "electrical engineer and scientist",
    "and rocket engineer",
    "nuclear scientist and chemical engineer",
    "welding engineer",
    "scientist and aerospace engineer",
    "ceramic engineer",
    "chemical engineer and safety consultant",
    "audio engineer and electronics engineer",
    "chemical engineer and pharmaceutical executive",
    "mining engineer and geologist",
    "consulting engineer",
    "engineer and geotechnician",
    "engineering seismologist",
    "engineer in charge during the Chernobyl disaster",
    "pioneering computer engineer",
    "spacecraft engineer",
    "engineer and bullet train pioneer",
    "engineer and optician",
    "aircraft automotive engineer",
    "helicopter and aerospace engineer",
    "engineer and electronic computing pioneer",
    "rocket engineer who worked at the Jet Propulsion Laboratory",
    "aerospace engineer and a pioneer in helicopter design",
    "ice road engineer",
    "rail engineer",
    "engineer and aerospace executive",
    "engineer and co designer of NASA Apollo Lunar Module",
    "aerospace engineer and member of the",
    "communication satellites engineer",
    "aeronautics and astronautic engineer",
    "biomedical engineering pioneer",
    "traffic engineer and inventor of the mini roundabout",
    "chemical engineer for the Procter & Gamble company",
    "aeronautical engineer who invented the tandem rotor placement in helicopter design",
    "traffic engineer",
    "Toyota engineer",
    "scientist and electrical engineer",
    "inventor and chemical engineer",
    "engineer and cryptographer",
    "computer engineer and scientist",
    "aeronautical engineer and space scientist",
    "engineer and R&D executive",
    "civil engineer and industrial",
    "thermal engineer",
    "automotive executive and engineer",
    "geologist and earthquake engineer",
    "metallurgist and chemical engineer",
    "engineer and food scientist",
    "naval electronics engineer",
    "control systems engineer",
    "automotive engineer and executive",
    "ship engineer",
    "mechanical engineer and product",
    "explosives engineer and inventor",
    "aerospace engineer and fluid dynamicist",
    "nuclear material engineer",
    "plant breeding engineer",
    "marine engineer and executive",
    "industrial engineer and cycling",
    "video game engineer",
    "materials scientist and engineer",
    "engineering manager",
    "agronomist and engineer",
    "engineer and control theorist",
    "naval engineer and",
    "construction executive and civil engineer",
    "agricultural engineer and",
    "refrigeration engineer",
    "chemical engineer and scientist",
    "engineer and scientist",
    "ice drilling engineer",
    "ornithologist and engineer",
    "motorsport engineer",
    "computer engineer and NASA official",
    "mechanical engineer and anti nuclear power",
    "bridge structural engineer",
    "aircraft engineer",
    "soil mechanics engineer",
    "microwave electronics and communications engineer",
    "camera engineer",
    "engineer and rocket scientist",
    "NASA engineer and",
    "engineer and the second Director of Lockheed Skunk Works from to",
    "aeronautics engineer",
    "earthquake engineer",
    "computational engineer",
    "engineer and chief executive",
    "computer engineer and systems scientist",
    "engineer and inventor of the hovercraft",
    "railway signal engineer",
    "and railway engineer",
    "railway engineer",
    "optoelectronic engineer",
    "materials engineer",
    "auto engineer",
    "Hall of Fame computer engineer",
    "flight engineer",
    "airplane designer and engineer",
    "and race chassis engineer",
    "consulting nuclear engineer",
    "engineering construction executive",
    "engineering scientist",
    "systems engineer",
    "aeroelasticity engineer and aircraft designer",
    "engineer and technology executive",
    "biochemical engineer",
    "particle accelerator specialist and engineer",
    "hydrogeologist and environmental engineer",
    "port and harbor engineer",
    "rocket engineer",
    "aerospace engineer and NASA manager",
    "engineer and aircraft designer",
    "chemical engineer and inventor",
    "electrical engineer and inventor",
    "metallurgical engineer",
    "of aerospace engineering",
    "and aerospace engineer",
    "space engineer",
    "electronic engineer",
    "design engineer",
    "NASA engineer",
    "naval engineer",
    "petroleum engineer",
    "motorcycle engineer",
    "satellite engineer",
    "engineer and racing car designer",
    "biomedical engineer",
    "environmental engineer",
    "agricultural engineer",
    "irrigation engineer",
    "engineer and computer scientist",
    "industrial engineer",
    "bioengineer",
    "scientist and engineer",
    "acoustical engineer",
    "mining engineer and",
    "mining engineer",
    "aviation engineer",
    "hydraulic engineer",
    "software engineer",
    "audio engineer and inventor",
    "audio engineer and",
    "audio engineer",
    "telecommunications engineer",
    "electronics engineer and",
    "electronics engineer",
    "nuclear engineer",
    "computer engineer",
    "and automotive engineer",
    "automotive engineer",
    "structural engineer and",
    "structural engineer",
    "mechanical engineer and",
    "mechanical engineering",
    "mechanical engineer",
    "and aeronautical engineer",
    "aeronautical engineer",
    "aerospace engineer",
    "and chemical engineer",
    "chemical engineer and",
    "chemical engineer",
    "civil engineering",
    "and civil engineer",
    "civil engineer and",
    "civil engineer",
    "of electrical engineering",
    "electrical engineer and",
    "electrical engineer",
    "engineer and inventor",
    "and aircraft engineer",
    "engineer and space",
    "and test engineer",
    "and engineer for NASA",
    "transportation engineer and",
    "optical engineer",
    "inventor and engineer",
    "and cycling engineer",
    "broadcast engineer",
    "and aeroplane engineer",
    "Electrical engineering",
    "paper engineer and",
    "of engineering",
    "engineering",
    "and engineer",
    "engineer and",
    "engineer",
]

business_farming = ["co founder of Matchbox Toys"]
academia_humanities = []
law_enf_military_operator = ["Deputy Secretary of Defense"]
spiritual = []
social = []
crime = []
event_record_other = ["taken hostage in"]
other_species = []

<IPython.core.display.Javascript object>

In [448]:
# Hard-coding values to "sound engineer" which will correctly categorize entries in arts
index = df[
    df["link"] == "https://en.wikipedia.org/wiki/Bruce_Jackson_(audio_engineer)"
].index
df.loc[index, "info_2"] = "sound engineer"

index = df[df["link"] == "https://en.wikipedia.org/wiki/Seth_Firkins"].index
df.loc[index, "info_2"] = "sound engineer"

<IPython.core.display.Javascript object>

In [465]:
# Hard-coding values to "audio engineer" which will correctly categorize entries in sciences
index = df[df["link"] == "https://en.wikipedia.org/wiki/Alan_R._Pearlman"].index
df.loc[index, "info_2"] = "audio engineer"

index = df[df["link"] == "https://en.wikipedia.org/wiki/Bruno_Vanryb"].index
df.loc[index, "info_2"] = "audio engineer"

index = df[df["link"] == "https://en.wikipedia.org/wiki/Dave_Smith_(engineer)"].index
df.loc[index, "info_2"] = "audio engineer"

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [478]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "arts": arts,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "sports": sports,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [None]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['sciences'] ==1].sample(2)

#### Checking the Number of Rows without a First Category

In [None]:
# Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

In [69]:
print("dunzo!")

# Sound notification when cell executes
chime.success()

dunzo!


<IPython.core.display.Javascript object>

#### Finding `known_for` Roles in `info_2`

In [None]:
# Obtaining values for column and their counts
roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

In [None]:
# Code to check each value
roles_list.pop()

In [None]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [index for index in df.index if "writer" in df.loc[index, "info"]], "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

In [None]:
# # Code to check each specific value
# specific_roles_list.pop()

In [None]:
# # Example code to quick-screen values that may overlap categories
# df.loc[[index for index in df.index if "and science writer" in df.loc[index, "info"]]]

In [None]:
# # Example code to quick-screen values that may overlap categories
# df.loc[
#     [
#         index
#         for index in df.index
#         if "outlaw country music singer songwriter" in df.loc[index, "info"]
#     ]
# ]

In [None]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "outlaw country music singer songwriter"]

#### Creating Lists for Each `known_for` Category

In [None]:
# Creating lists for each category
politics_govt_law = []

arts = []
sports = [


]
sciences = []

business_farming = []
academia_humanities = []
law_enf_military_operator = []
spiritual = []
social = []
crime = []
event_record_other = []
other_species = []

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [None]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

#### Extracting Category from `info_2`

In [None]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['arts'] ==1].sample(2)

#### Checking the Number of Rows without a First Category

In [None]:
# Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Observations:
- It is time to export our dataframe and start a new notebook.

### Exporting Dataset to SQLite Database [wp_life_expect_clean8.db]()

In [None]:
# # Exporting dataframe

# # Saving dataset in a SQLite database
# conn = sql.connect("wp_life_expect_clean8.db")
# df.to_sql("wp_life_expect_clean8", conn, index=False)

# # Chime notification when cell executes
# chime.success()

# [Proceed to Data Cleaning Part 9 ]()