# Wikipedia Notable Life Expectancies
# [Notebook  : Data Cleaning Part ](https://github.com/teresahanak/wikipedia-life-expectancy/blob/main/wp_life_expect_data_clean10_thanak_2022_08_01.ipynb)
### Context

The
### Objective

The
### Data Dictionary
- Feature: Description

### Importing Libraries

In [1]:
# To structure code automatically
%load_ext nb_black

# To import/export sqlite databases
import sqlite3 as sql

# To save/open python objects in pickle file
import pickle

# To help with reading, cleaning, and manipulating data
import pandas as pd
import numpy as np
import re

# To define maximum number of columns to be displayed in a dataframe
pd.set_option("display.max_columns", None)
# To define the maximum number of rows to be displayed in a dataframe
pd.set_option("display.max_rows", 200)

# To supress warnings
# import warnings

# warnings.filterwarnings("ignore")

# To set some visualization attributes
pd.set_option("max_colwidth", 150)

# To play auditory cue when cell has executed, has warning, or has error and set chime theme
import chime

chime.theme("zelda")

<IPython.core.display.Javascript object>

## Data Overview

### Reading, Sampling, and Checking Data Shape

In [3]:
# Reading the dataset
conn = sql.connect("wp_life_expect_clean9.db")
data = pd.read_sql("SELECT * FROM wp_life_expect_clean9", conn)

# Making a working copy
df = data.copy()

# Checking the shape
print(f"There are {df.shape[0]} rows and {df.shape[1]} columns.")

# Checking first 2 rows of the data
df.head(2)

There are 98057 rows and 38 columns.


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
0,1,William Chappell,", 86, British dancer, ballet designer and director.",https://en.wikipedia.org/wiki/William_Chappell_(dancer),21,1994,January,,,dancer,ballet designer and director,,,,,,,,,86.0,,United Kingdom of Great Britain and Northern Ireland,,,3.091042,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,Raymond Crotty,", 68, Irish economist, writer, and academic.",https://en.wikipedia.org/wiki/Raymond_Crotty,12,1994,January,,,,writer,and academic,,,,,,,,68.0,,Ireland,,,2.564949,0,0,0,0,0,0,0,0,1,0,0,0,1


<IPython.core.display.Javascript object>

In [4]:
# Checking last 2 rows of the data
df.tail(2)

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
98055,9,Aamir Liaquat Hussain,", 50, Pakistani journalist and politician, MNA .",https://en.wikipedia.org/wiki/Aamir_Liaquat_Hussain,99,2022,June,", since",,,MNA,,,,,,,,,50.0,,Pakistan,,"2002 2007, since 2018",4.60517,0,0,0,0,0,1,0,0,1,0,0,0,2
98056,9,Zou Jing,", 86, Chinese engineer, member of the Chinese Academy of Engineering.",https://en.wikipedia.org/wiki/Zou_Jing_(engineer),3,2022,June,,,,member of the Academy of Engineering,,,,,,,,,86.0,,"China, People's Republic of",,,1.386294,1,0,0,0,0,0,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

In [5]:
# Checking a sample of the data
df.sample(5)

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
57744,4,Ranjan Baindoor,", 66, Indian cricketer.",https://en.wikipedia.org/wiki/Ranjan_Baindoor,3,2016,April,,,,,,,,,,,,,66.0,,India,,,1.386294,0,0,0,0,0,0,1,0,0,0,0,0,1
81579,11,Thorkild Grosbøll,", 72, Danish Lutheran clergyman.",https://en.wikipedia.org/wiki/Thorkild_Grosb%C3%B8ll,8,2020,May,,,Lutheran clergyman,,,,,,,,,,72.0,,Denmark,,,2.197225,0,0,0,0,0,0,0,0,0,0,0,0,0
6791,15,Karsten Andersen,", 77, Norwegian conductor.",https://en.wikipedia.org/wiki/Karsten_Andersen,6,1997,December,,,,,,,,,,,,,77.0,,Norway,,,1.94591,0,0,0,0,0,1,0,0,0,0,0,0,1
61565,8,Abdulkadir Kure,", 60, Nigerian politician, Governor of Niger State .",https://en.wikipedia.org/wiki/Abdulkadir_Kure,6,2017,January,,,,Governor of State,,,,,,,,,60.0,,Nigeria,,1999 2007,1.94591,0,0,0,0,0,0,0,0,1,0,0,0,1
95608,24,Stevan K. Pavlowitch,", 88, Serbian-born British historian.",https://en.wikipedia.org/wiki/Stevan_K._Pavlowitch,9,2022,January,,,,,,,,,,,,,88.0,,Serbia,United Kingdom of Great Britain and Northern Ireland,,2.302585,0,0,0,1,0,0,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

### Checking Data Types, Duplicates, and Null Values

In [6]:
# Checking data types and null values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98057 entries, 0 to 98056
Data columns (total 38 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   day                        98057 non-null  object 
 1   name                       98057 non-null  object 
 2   info                       98057 non-null  object 
 3   link                       98057 non-null  object 
 4   num_references             98057 non-null  int64  
 5   year                       98057 non-null  int64  
 6   month                      98057 non-null  object 
 7   info_parenth               36661 non-null  object 
 8   info_1                     22 non-null     object 
 9   info_2                     98025 non-null  object 
 10  info_3                     48897 non-null  object 
 11  info_4                     10264 non-null  object 
 12  info_5                     1265 non-null   object 
 13  info_6                     181 non-null    obj

<IPython.core.display.Javascript object>

#### Observations:
- With our dataset loaded, we can pick up where we left off with extracting known_for values by rebuilding `known_for_dict`.

### Extracting `known_for` Continued

#### Finding `known_for` Roles in `info_2`

In [47]:
# # Obtaining values for column and their counts
# roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [46]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [45]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_2"].notna()].index
#             if "translator" in df.loc[index, "info_2"]
#         ],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [44]:
# # Code to check each specific value
# specific_roles_list.pop()

<IPython.core.display.Javascript object>

In [48]:
# # Example code to quick-screen values that may overlap categories
# df.loc[[index for index in df.index if "and Bible translator" in df.loc[index, "info"]]]

<IPython.core.display.Javascript object>

In [49]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "linguist and bible translator"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [50]:
# Creating lists for each category
politics_govt_law = []

arts = []
sports = []
sciences = []

business_farming = []
academia_humanities = [
    "and translator of philosophy and literature",
    "scholar and translator of literature",
    "translator and literature scholar",
    "translator and literary scholar",
    "translator of modern literature",
    "language scholar and translator",
    "linguist and bible translator",
    "medievalist and translator",
    "litterateur and translator",
    "sinologist and translator",
    "translator of literature",
    "translator and linguist",
    "linguist and translator",
    "teacher and translator",
    "scholar and translator",
    "and Bible translator",
    "literary translator",
    "translator and",
    "and translator",
    "translator",
]
law_enf_military_operator = []
spiritual = []
social = []
crime = []
event_record_other = []
other_species = []

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [51]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [53]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['academia_humanities'] ==1].sample(2)

CPU times: total: 10.2 s
Wall time: 10.2 s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
86445,3,Oyewusi Ibidapo-Obe,", 71, Nigerian academic administrator, vice chancellor of the University of Lagos , COVID-19.",https://en.wikipedia.org/wiki/Oyewusi_Ibidapo-Obe,15,2021,January,,,,vice chancellor of the University of Lagos,COVID,,,,,,,,71.0,,Nigeria,,2000 2007,2.772589,0,0,0,1,0,0,0,0,0,0,0,0,1
59476,6,Don Welch,", 84, American poet and academic.",https://en.wikipedia.org/wiki/Don_Welch,3,2016,August,,,,,,,,,,,,,84.0,,United States of America,,,1.386294,0,0,0,1,0,1,0,0,0,0,0,0,2


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [54]:
# Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

There are 28392 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [237]:
# # Obtaining values for column and their counts
# roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [236]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [235]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_2"].notna()].index
#             if "film" in df.loc[index, "info_2"]
#         ],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [234]:
# # Code to check each specific value
# specific_roles_list.pop()

<IPython.core.display.Javascript object>

In [232]:
# Example code to quick-screen values that may overlap categories
df.loc[[index for index in df.index if "cultural researcher" in df.loc[index, "info"]]]

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
74555,30,Robert R. Spitzer,", 96, American agricultural researcher and educator.",https://en.wikipedia.org/wiki/Robert_R._Spitzer,3,2019,April,,,agricultural researcher,,,,,,,,,,96.0,,United States of America,,,1.386294,0,0,0,1,0,0,0,0,0,0,0,0,1
93001,26,Kirill Razlogov,", 75, Russian film critic and cultural researcher.",https://en.wikipedia.org/wiki/Kirill_Razlogov,6,2021,September,,,film critic and cultural researcher,,,,,,,,,,75.0,,Russia,,,1.94591,0,0,0,0,0,0,0,0,0,0,0,0,0


<IPython.core.display.Javascript object>

In [231]:
# # Example code to quick-screen values that may overlap categories
# df.loc[[index for index in df.index if "censor" in df.loc[index, "info"]]]

<IPython.core.display.Javascript object>

In [230]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "film subject and domestic abuse symbol"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [228]:
# Creating lists for each category
politics_govt_law = [
    "censor",
]

arts = [
    "documentary filmmaker and pioneer of public access television",
    "television documentary director and filmmaker",
    "film and television special effects designer",
    "Bollywood filmmaker and brother of Dev Anand",
    "rock tour organiser and film studio manager",
    "film musical arranger musical orchestrator",
    "film and television editor and director",
    "filmmaker and children book illustrator",
    "film critic and film festival director",
    "experimental filmmaker and glass maker",
    "film studio executive and talent agent",
    "film editor and Academy Award winner",
    "film and television costume designer",
    "BBC Northern broadcaster & filmmaker",
    "film critic and television presenter",
    "director and producer in film and TV",
    "television executive and filmmaker",
    "music director for Bollywood films",
    "film studio executive and producer",
    "filmmaker and television producer",
    "wildlife film maker and producer",
    "director for film and television",
    "underwater documentary filmmaker",
    "film documentarian and producer",
    "filmmaker and festival promoter",
    "graphic designer and filmmaker",
    "film and advertising executive",
    "film critic and radio producer",
    "film distributor and producer",
    "ʼNamgis documentary filmmaker",
    "cinematographer and filmmaker",
    "and film and theater director",
    "television and film executive",
    "film and television executive",
    "film and television producer",
    "Arabian film and TV director",
    "film and stage choreographer",
    "stage director and filmmaker",
    "choreographer and filmmaker",
    "music documentary filmmaker",
    "film critic and researcher",
    "independent film executive",
    "film and television editor",
    "columnist and film critic",
    "Oscar winning film editor",
    "and documentary filmmaker",
    "film and theater director",
    "and documentary filmmaker",
    "documentary filmmaker and",
    "film and theatre producer",
    "film and theater producer",
    "film and theatre director",
    "film production designer",
    "music and film executive",
    "producer of horror films",
    "and underwater filmmaker",
    "film critic and essayist",
    "film television producer",
    "film marketing publicist",
    "film critic and producer",
    "film editor and producer",
    "and aerial film operator",
    "film editor and director",
    "film critic and director",
    "film industry executive",
    "film and opera director",
    "theater and film critic",
    "film and stage director",
    "filmmaker and cameraman",
    "Emmy Award winning film",
    "documentary film editor",
    "Tony Award winning film",
    "film and theatre critic",
    "theatre and film critic",
    "documentary film maker",
    "experimental filmmaker",
    "filmmaker and director",
    "animator and filmmaker",
    "film critic for on ABC",
    "independent film maker",
    "pornographic filmmaker",
    "filmmaker and designer",
    "producer and filmmaker",
    "filmmaker and producer",
    "documentary filmmaker",
    "film studio executive",
    "advertising filmmaker",
    "avant garde filmmaker",
    "music and film critic",
    "film costume designer",
    "independent filmmaker",
    "film  television host",
    "film and TV director",
    "surrealist filmmaker",
    "film and TV producer",
    "filmmaker and editor",
    "film camera operator",
    "film music director",
    "film location scout",
    "film stunt director",
    "wildlife filmmaker",
    "film prop designer",
    "film choreographer",
    "film distributor",
    "of film studies",
    "film programmer",
    "South filmmaker",
    "and film critic",
    "adult film star",
    "film editor and",
    "film critic and",
    "film executive",
    "film trumpeter",
    "and filmmaker",
    "filmmaker and",
    "film lyricist",
    "film pioneer",
    "film editor",
    "film critic",
    "film maker",
    "filmmaker",
    "film star",
    "film and",
    "and film",
    "film",
]
sports = []
sciences = [
    "restorer",
    "virtual reality technology pioneer and",
]

business_farming = []
academia_humanities = []
law_enf_military_operator = []
spiritual = ["Anglican prelate and theologian", "Anglican prelate"]
social = []
crime = []
event_record_other = [
    "ALD patient portrayed in the film",  # before arts
    "filmgoer",
    "film subject and domestic abuse symbol",
]
other_species = []

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [229]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "sports": sports,
    "arts": arts,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [238]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['spiritual'] ==1].sample(2)

CPU times: total: 1min 35s
Wall time: 1min 35s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
88862,1,Rayappu Joseph,", 80, Sri Lankan Roman Catholic prelate, bishop of Mannar .",https://en.wikipedia.org/wiki/Rayappu_Joseph,11,2021,April,,,,bishop of Mannar,,,,,,,,,80.0,,Sri Lanka,Italy,1992 2016,2.484907,0,0,1,0,0,0,0,0,0,0,0,0,1
37277,4,Giovanni Volta,", 83, Italian Roman Catholic prelate, Bishop of Pavia .",https://en.wikipedia.org/wiki/Giovanni_Volta,9,2012,February,,,,Bishop of Pavia,,,,,,,,,83.0,,Italy,Italy,1986 2003,2.302585,0,0,1,0,0,0,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [239]:
# Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

There are 27795 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [465]:
# # Obtaining values for column and their counts
# roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [464]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [463]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_2"].notna()].index
#             if "professor" in df.loc[index, "info_2"]
#         ],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [462]:
# # Code to check each specific value
# specific_roles_list.pop()

<IPython.core.display.Javascript object>

In [466]:
# # Example code to quick-screen values that may overlap categories
# df.loc[
#     [
#         index
#         for index in df.index
#         if "and communication professor" in df.loc[index, "info"]
#     ]
# ]

<IPython.core.display.Javascript object>

In [467]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "professor and World War II researcher"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [468]:
# Creating lists for each category
politics_govt_law = [
    "who helped uncover the Bay of Pigs Invasion plan",
    "child protection expert",
]

arts = []
sports = []
sciences = [
    "who made critical contributions to the development of radar",
]

business_farming = []
academia_humanities = [
    "professor and official pronouncer of the Scripps National Spelling Bee from to",
    "geographer and Alexander von Humboldt professor of geography at UCLA",
    "professor and twice interim president of the University of Missouri",
    "professor of and Islamic Studies at the University of Edinburgh",
    "linguistics professor and Pacific Islands language specialist",
    "professor of education and commentator on education topics",
    "professor at Columbia University and scholar of literature",
    "professor of education at the University of Washington",
    "professor and leading researcher into category theory",
    "professor of aesthetics at University of Strasbourg",
    "and drama professor at the Academy of Theatre Arts",
    "professor of Assyriology and Babylonian literature",
    "professor at Princeton Theological Seminary and",
    "professor of history at Indiana University",
    "professor of History at University College",
    "classical scholar and history professor",
    "ist and professor of ancient languages",
    "and professor at Seton Hall University",
    "professor at the University of Chicago",
    "professor and World War II researcher",
    "professor at Brigham Young University",
    "professor and folklorist of cultures",
    "and professor of clinical psychology",
    "professor of comparative literature",
    "scholar and professor of literature",
    "professor specialized in turbulence",
    "and political philosophy professor",
    "professor of modern Jewish history",
    "professor and daughter of Zhu De",
    "professor at Stanford University",
    "professor at Columbia University",
    "professor of Ancient Philosophy",
    "professor of Early Christianity",
    "professor of Jewish literature",
    "professor of Hebrew Literature",
    "and professor at University of",
    "professor at the University of",
    "Stanford University professor",
    "professor emeritus of history",
    "professor of at University of",
    "anthropologist and professor",
    "professor of and runologist",
    "and communication professor",
    "and professor of philosophy",
    "researcher and professor of",
    "Assyriologist and professor",
    "emeritus professor at Yale",
    "professor at University of",
    "professor of Asian studies",
    "professor of philosophy of",
    "professor of women studies",
    "library science professor",
    "ethnologist and professor",
    "folklorist and professor",
    "and philosophy professor",
    "geographer and professor",
    "A&M University professor",
    "professor and sinologist",
    "and University professor",
    "and university professor",
    "librarian and professor",
    "University of professor",
    "professor of Egyptology",
    "women studies professor",
    "pedagogue and professor",
    "professor of philosophy",
    "professor of literature",
    "linguist and professor",
    "professor of geography",
    "and Emeritus professor",
    "and professor emeritus",
    "assistant professor of",
    "professor of Classics",
    "professor emeritus of",
    "scholar and professor",
    "professor of rhetoric",
    "professor of classics",
    "and college professor",
    "and professor of law",
    "literature professor",
    "professor of Studies",
    "professor of studies",
    "professor of history",
    "philosophy professor",
    "university professor",
    "professor of Hebrew",
    "associate professor",
    "professor emeritus",
    "language professor",
    "and law professor",
    "law professor and",
    "college professor",
    "history professor",
    "and a professor",
    "law professor",
    "MIT professor",
    "and professor",
    "professor and",
    "professor of",
    "professor in",
    "professor",
]
law_enf_military_operator = []
spiritual = [
    "expert on biblical manuscripts",
]
social = []
crime = []
event_record_other = []
other_species = []

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [469]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [None]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['academia_humanities'] ==1].sample(2)

#### Checking the Number of Rows without a First Category

In [None]:
# Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

In [240]:
print("dunzo!")

# Sound notification when cell executes
chime.success()

dunzo!


<IPython.core.display.Javascript object>

#### Finding `known_for` Roles in `info_2`

In [None]:
# Obtaining values for column and their counts
roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

In [None]:
# Code to check each value
roles_list.pop()

In [None]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_2"].notna()].index
#             if "general" in df.loc[index, "info_2"]
#         ],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

In [None]:
# # Code to check each specific value
# specific_roles_list.pop()

In [None]:
# # Example code to quick-screen values that may overlap categories
# df.loc[[index for index in df.index if "and science writer" in df.loc[index, "info"]]]

In [None]:
# # Example code to quick-screen values that may overlap categories
# df.loc[
#     [
#         index
#         for index in df.index
#         if "outlaw country music singer songwriter" in df.loc[index, "info"]
#     ]
# ]

In [None]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "outlaw country music singer songwriter"]

#### Creating Lists for Each `known_for` Category

In [None]:
# Creating lists for each category
politics_govt_law = []

arts = []
sports = [


]
sciences = []

business_farming = []
academia_humanities = []
law_enf_military_operator = []
spiritual = []
social = []
crime = []
event_record_other = []
other_species = []

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [None]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

#### Extracting Category from `info_2`

In [None]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['arts'] ==1].sample(2)

#### Checking the Number of Rows without a First Category

In [None]:
# Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Observations:
- It is time to export our dataframe and start a new notebook.

### Exporting Dataset to SQLite Database [wp_life_expect_clean10.db]()

In [None]:
# # Exporting dataframe

# # Saving dataset in a SQLite database
# conn = sql.connect("wp_life_expect_clean10.db")
# df.to_sql("wp_life_expect_clean10", conn, index=False)

# # Chime notification when cell executes
# chime.success()

# [Proceed to Data Cleaning Part 11]()