# Wikipedia Notable Life Expectancies
# [Notebook 15: Data Cleaning Part 14](https://github.com/teresahanak/wikipedia-life-expectancy/blob/main/wp_life_expect_data_clean14_thanak_2022_07_26.ipynb)
### Context

The
### Objective

The
### Data Dictionary
- Feature: Description

### Importing Libraries

In [1]:
# To structure code automatically
%load_ext nb_black

# To import/export sqlite databases
import sqlite3 as sql

# To save/open python objects in pickle file
import pickle

# To help with reading, cleaning, and manipulating data
import pandas as pd
import numpy as np
import re

# To define maximum number of columns to be displayed in a dataframe
pd.set_option("display.max_columns", None)
# To define the maximum number of rows to be displayed in a dataframe
pd.set_option("display.max_rows", 200)

# To supress warnings
# import warnings

# warnings.filterwarnings("ignore")

# To set some visualization attributes
pd.set_option("max_colwidth", 150)

# To play auditory cue when cell has executed, has warning, or has error and set chime theme
import chime

chime.theme("zelda")

<IPython.core.display.Javascript object>

## Data Overview

### [Reading](), Sampling, and Checking Data Shape

In [2]:
# Reading the dataset
conn = sql.connect("wp_life_expect_clean13.db")
data = pd.read_sql("SELECT * FROM wp_life_expect_clean13", conn)

# Making a working copy
df = data.copy()

# Checking the shape
print(f"There are {df.shape[0]} rows and {df.shape[1]} columns.")

# Checking first 2 rows of the data
df.head(2)

There are 98044 rows and 37 columns.


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
0,1,William Chappell,", 86, British dancer, ballet designer and director.",https://en.wikipedia.org/wiki/William_Chappell_(dancer),21,1994,January,,,ballet designer and director,,,,,,,,,86.0,,United Kingdom of Great Britain and Northern Ireland,,,3.091042,0,0,0,0,0,1,0,0,0,0,0,0,1
1,1,Raymond Crotty,", 68, Irish economist, writer, and academic.",https://en.wikipedia.org/wiki/Raymond_Crotty,12,1994,January,,,writer,and academic,,,,,,,,68.0,,Ireland,,,2.564949,0,0,0,0,0,0,0,0,1,0,0,0,1


<IPython.core.display.Javascript object>

In [3]:
# Checking last 2 rows of the data
df.tail(2)

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
98042,9,Aamir Liaquat Hussain,", 50, Pakistani journalist and politician, MNA .",https://en.wikipedia.org/wiki/Aamir_Liaquat_Hussain,99,2022,June,", since",,MNA,,,,,,,,,50.0,,Pakistan,,"2002 2007, since 2018",4.60517,0,0,0,0,0,1,0,0,1,0,0,0,2
98043,9,Zou Jing,", 86, Chinese engineer, member of the Chinese Academy of Engineering.",https://en.wikipedia.org/wiki/Zou_Jing_(engineer),3,2022,June,,,member of the Academy of Engineering,,,,,,,,,86.0,,"China, People's Republic of",,,1.386294,1,0,0,0,0,0,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

In [4]:
# Checking a sample of the data
df.sample(5)

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
46151,1,Pete DeCoursey,", 52, American political journalist, pancreatic and lung cancer.",https://en.wikipedia.org/wiki/Pete_DeCoursey,15,2014,January,,,pancreatic and lung cancer,,,,,,,,,52.0,,United States of America,,,2.772589,0,0,0,0,0,1,0,0,1,0,0,0,2
3581,19,Don Simpson,", 52, American film producer , heart failure.",https://en.wikipedia.org/wiki/Don_Simpson,33,1996,January,", ,",,heart failure,,,,,,,,,52.0,,United States of America,,", ,",3.526361,0,0,0,0,0,1,0,0,0,0,0,0,1
68327,30,Anna Chennault,", 92, Chinese-born American journalist and businesswoman.",https://en.wikipedia.org/wiki/Anna_Chennault,96,2018,March,,,,,,,,,,,,92.0,,"China, People's Republic of",United States of America,,4.574711,0,0,0,0,1,1,0,0,0,0,0,0,2
81394,3,K. S. Nissar Ahmed,", 84, Indian poet and writer, cancer.",https://en.wikipedia.org/wiki/K._S._Nissar_Ahmed,16,2020,May,,,cancer,,,,,,,,,84.0,,India,,,2.833213,0,0,0,0,0,1,0,0,0,0,0,0,1
65786,23,Iona Opie,", 94, British folklorist.",https://en.wikipedia.org/wiki/Iona_and_Peter_Opie,23,2017,October,,,,,,,,,,,,94.0,,United Kingdom of Great Britain and Northern Ireland,,,3.178054,0,0,0,1,0,0,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

### Checking Data Types, Duplicates, and Null Values

In [5]:
# Checking data types and null values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98044 entries, 0 to 98043
Data columns (total 37 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   day                        98044 non-null  object 
 1   name                       98044 non-null  object 
 2   info                       98044 non-null  object 
 3   link                       98044 non-null  object 
 4   num_references             98044 non-null  int64  
 5   year                       98044 non-null  int64  
 6   month                      98044 non-null  object 
 7   info_parenth               36661 non-null  object 
 8   info_1                     23 non-null     object 
 9   info_3                     48888 non-null  object 
 10  info_4                     10264 non-null  object 
 11  info_5                     1265 non-null   object 
 12  info_6                     181 non-null    object 
 13  info_7                     27 non-null     obj

<IPython.core.display.Javascript object>

#### Observations:
- With our dataset loaded, we can pick up where we left off with extracting known_for values by rebuilding `known_for_dict`.

### Extracting `known_for` Continued

#### Finding `known_for` Roles in `info_1`

In [6]:
# # Obtaining values for column and their counts
# roles_list = df["info_1"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [7]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [8]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_1"].notna()].index
#             if "leader" in df.loc[index, "info_1"]
#         ],
#         "info_1",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [9]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_list, key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [10]:
# # Example code to quick-check a specific entry
# df[df["info_1"] == "Jr"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [11]:
# Creating lists for each category and sorting by decreasing length and removing duplicates

politics_govt_law = ["politician", "King of", "leader"]
politics_govt_law = sorted(
    list(set(politics_govt_law)), key=lambda x: len(x), reverse=True
)

arts = ["artist", "journalist", "choral conductor"]
arts = sorted(list(set(arts)), key=lambda x: len(x), reverse=True)

sports = [
    "basketball player",
    "sports",
    "Olympic sprinter",
    "college basketball coach",
    "Olympic gymnast",
]
sports = sorted(list(set(sports)), key=lambda x: len(x), reverse=True)

sciences = ["social psychologist", "materials scientist"]
sciences = sorted(list(set(sciences)), key=lambda x: len(x), reverse=True)

business_farming = ["businessman"]
business_farming = sorted(
    list(set(business_farming)), key=lambda x: len(x), reverse=True
)

academia_humanities = []
academia_humanities = sorted(
    list(set(academia_humanities)), key=lambda x: len(x), reverse=True
)

law_enf_military_operator = []
law_enf_military_operator = sorted(
    list(set(law_enf_military_operator)), key=lambda x: len(x), reverse=True
)

spiritual = ["priest"]
spiritual = sorted(list(set(spiritual)), key=lambda x: len(x), reverse=True)

social = []
social = sorted(list(set(social)), key=lambda x: len(x), reverse=True)

crime = []
crime = sorted(list(set(crime)), key=lambda x: len(x), reverse=True)

event_record_other = []
event_record_other = sorted(
    list(set(event_record_other)), key=lambda x: len(x), reverse=True
)

other_species = ["common chimpanzee", "Tree of the Year"]
other_species = sorted(list(set(other_species)), key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [12]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_1`

In [13]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_1'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['num_categories']!=0].sample(2)

CPU times: total: 8.62 s
Wall time: 8.63 s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
7346,25,Clive Osborne,", 75, Australian politician.",https://en.wikipedia.org/wiki/Clive_Osborne,8,1998,March,,,,,,,,,,,,75.0,,Australia,,,2.197225,0,0,0,0,0,0,0,0,1,0,0,0,1
84512,6,Bunny Lee,", 79, Jamaican reggae producer.",https://en.wikipedia.org/wiki/Bunny_Lee,15,2020,October,,,,,,,,,,,,79.0,,Jamaica,,,2.772589,0,0,0,0,0,1,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [14]:
# Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

There are 48 entries without any known_for category.


<IPython.core.display.Javascript object>

In [15]:
# Checking remaining values in info_1
df["info_1"].value_counts()

               19
early           2
Jules Engel     1
Jr              1
Name: info_1, dtype: int64

<IPython.core.display.Javascript object>

#### Observations:
- The remaining `info_1` values are extraneous, so we can drop the column, then proceed to examining `info_3`.
- We anticipate `info_3` to contain predominantly `cause_of_death` information, so we will need to keep that in mind in our extraction.
- The approach we will take is:
    1. Search remaining `info_` columns for `known_for` if the entry does not yet have a category.  We will do this by obtaining the values, then extracting information for any entry that contains the value in that `info_` column
    2. Search `info_3` values only if the `info_3` value does not contain an upper case letter.  This decision is a compromise, knowing some additional categories will be lost for some entries, but to avoid extensive manual review of individual values.

#### Dropping `info_1`

In [16]:
# Dropping info_1
df.drop("info_1", axis=1, inplace=True)

<IPython.core.display.Javascript object>

#### Finding `known_for` Roles in `info_3`

In [17]:
# # Obtaining values for column and their counts
# index = df[df["num_categories"] == 0].index
# roles_list = df.loc[index, "info_3"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [18]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [19]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_3"].notna()].index
#             if "futurist" in df.loc[index, "info_3"]
#         ],
#         "info_3",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [20]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_list, key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [21]:
# # Example code to quick-check a specific entry
# df[df["info_3"] == "researcher and politician"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [22]:
# Creating lists for each category and sorting by decreasing length and removing duplicates

politics_govt_law = [
    "Director of the Office of Nuclear Reactor Regulation",
    "anti nuclear and anti war activist",
    "civil rights lawyer and politician",
    "lawyer and politician",
    "nation first indigenous barrister",
    "barrister and economist",
    "diplomat and barrister",
    "and barrister",
    "barrister politician",
    "barrister and peer",
    "barrister and",
    "barrister",
    "elected to Congress",
    "first Ixil woman elected to Congress",
    "advocate for rights of Holocaust survivors",
    "and conspiracy theory researcher",
]
politics_govt_law = sorted(
    list(set(politics_govt_law)), key=lambda x: len(x), reverse=True
)

arts = [
    "announcer and voice actor",
    "actor and leader of comedic group The Drifters",
    "comedian and actor land National Artist",
    "voice actor and television presenter",
    "sound effects artist and voice actor",
    "son of actors Margo and Eddie Albert",
    "film & TV actor; Motown executive",
    "actor and television personality",
    "stage and big screen movie actor",
    "newspaper editor and child actor",
    "benefactor of the Mondavi Center",
    "actor and television presenter",
    "first black actor to appear on",
    "actor and television producer",
    "production designer and actor",
    "Academy Award nominated actor",
    "theatre and television actor",
    "television and theatre actor",
    "voice actor and screenwriter",
    "actor and stand up comedian",
    "voice over artist and actor",
    "actor and radio personality",
    "bass player and voice actor",
    "stage and television actor",
    "theatre director and actor",
    "television and stage actor",
    "actor and theatre director",
    "film and television actor",
    "television and film actor",
    "actor and theatre manager",
    "filmmaker and voice actor",
    "actor and record producer",
    "actor and television host",
    "instrumentalist and actor",
    "voice actor and comedian",
    "make up artist and actor",
    "voice actor and director",
    "theatre actor and writer",
    "game show host and actor",
    "voice actor and narrator",
    "actor and acting",
    "actor and film director",
    "choreographer and actor",
    "film director and actor",
    "internet meme and actor",
    "actor and choreographer",
    "actor and jazz musician",
    "screenwriter and actor",
    "actor and screenwriter",
    "actor and TV presenter",
    "artist and child actor",
    "singer and voice actor",
    "voice actor and tenor",
    "actor and film critic",
    "songwriter and actor",
    "journalist and actor",
    "playwright and actor",
    "actor and politician",
    "actor and playwright",
    "politician and actor",
    "actor and radio host",
    "film and voice actor",
    "actor and songwriter",
    "actor and librettist",
    "puppeteer and actor",
    "guitarist and actor",
    "actor and Urdu poet",
    "actor and announcer",
    "sportsman and actor",
    "director and actor",
    "actor and director",
    "composer and actor",
    "actor and producer",
    "producer and actor",
    "comedian and actor",
    "musician and actor",
    "actor and stuntman",
    "actor and comedian",
    "actor and activist",
    "narrator and actor",
    "parodist and actor",
    "actor and baritone",
    "actor and sculptor",
    "actor and radio DJ",
    "actor and novelist",
    "actor and reporter",
    "actor and pianist",
    "actor and teacher",
    "TV host and actor",
    "writer and actor",
    "actor and singer",
    "actor and writer",
    "singer and actor",
    "author and actor",
    "actor and author",
    "dancer and actor",
    "actor and artist",
    "coach and actor",
    "poet and actor",
    "voice actor",
    "comic actor",
    "film actor",
    "and actor",
    "actor and",
    "leader of the Blue Note Orchestra and musician on early Tamla sessions",
    "percussionist and multi instrumental session musician",
    "patriarch of family of jazz musicians",
    "musician and television producer",
    "musician and performance artist",
    "musician and stand up comedian",
    "musicologist and jazz musician",
    "musician and radio personality",
    "concert promoter and musician",
    "musician and record producer",
    "musician and",
    "sound engineer and musician",
    "musician and music educator",
    "musician and blues producer",
    "musician and visual artist",
    "studio owner and musician",
    "musician and music critic",
    "musician and screenwriter",
    "writer and jazz musician",
    "zydeco and soul musician",
    "songwriter and musician",
    "actor and jazz musician",
    "and musician",
    "cartoonist and musician",
    "composer and a musician",
    "executive and musician",
    "musician and publisher",
    "musician and archivist",
    "presenter and musician",
    "blues and R&B musician",
    "musician and conductor",
    "composer and musician",
    "musician and composer",
    "musician and academic",
    "arranger and musician",
    "producer and musician",
    "and musician",
    "musician and producer",
    "musician and",
    "hype man and musician",
    "musician and",
    "musician and maestro",
    "musician and blogger",
    "musician and painter",
    "painter and musician",
    "writer and musician",
    "musician and writer",
    "singer and musician",
    "author and musician",
    "musician and singer",
    "musician and author",
    "and musician",
    "electronic musician",
    "musician and artist",
    "artist and musician",
    "musician and actor",
    "poet and musician",
    "session musician",
    "folk musician",
    "musician",
    "detective story writer and",
    "electronic musician",
    "nationalized writer and",
    "children book illustrator and painter",
    "painter and",
    "painter and magazine cartoonist",
    "painter and comic strip artist",
    "painter and graphic artist",
    "choreographer and painter",
    "painter and art collector",
    "photographer and painter",
    "calligrapher and painter",
    "painter and photographer",
    "illustrator and painter",
    "painter and illustrator",
    "painter and philosopher",
    "painter and art critic",
    "painter and printmaker",
    "painter and architect",
    "medallist and painter",
    "painter and sculptor",
    "composer and painter",
    "painter and",
    "musician and painter",
    "painter and stuntman",
    "painter and musician",
    "and painter",
    "painter and author",
    "author and painter",
    "poet and painter",
    "painter",
    "duettist with pianist Renée Morisset",
    "blues and jazz pianist and singer",
    "pianist and harpsichordist",
    "pianist and music teacher",
    "pianist and visual artist",
    "pianist and fashion model",
    "pianist and vocal coach",
    "pianist and songwriter",
    "songwriter and pianist",
    "pianist and conductor",
    "conductor and pianist",
    "pianist and",
    "pianist and composer",
    "composer and pianist",
    "organist and pianist",
    "pianist and",
    "arranger and pianist",
    "pianist and vocalist",
    "singer and pianist",
    "actor and pianist",
    "poet and pianist",
    "concert pianist",
    "jazz pianist",
    "pianist",
]
arts = sorted(list(set(arts)), key=lambda x: len(x), reverse=True)

sports = []
sports = sorted(list(set(sports)), key=lambda x: len(x), reverse=True)

sciences = [
    "social psychologist and behavioral",
    "and clinical psychologist",
    "child psychologist and",
    "parapsychologist and",
    "psychologist and parapsychologist",
    "psychologist and sexologist",
    "psychologist and",
    "psychologist",
    "a leading researcher on the links between lipids and fats and heart disease",
    "researcher into the use of organochloride pesticides",
    "neuroscience researcher and ufologist",
    "microbiologist and medical researcher",
    "HIV AIDS researcher and",
    "Fundação Oswaldo Cruz researcher",
    "medical researcher and",
    "known as a Bigfoot researcher",
    "researcher and haematologist",
    "researcher of nuclear fusion",
    "scientist and researcher",
    "researcher and inventor",
    "researcher and",
    "and researcher",
    "Big Bang researcher",
    "futurist",
]
sciences = sorted(list(set(sciences)), key=lambda x: len(x), reverse=True)

business_farming = [
    "founder of Tanger Factory Outlet Centers",
]
business_farming = sorted(
    list(set(business_farming)), key=lambda x: len(x), reverse=True
)

academia_humanities = []
academia_humanities = sorted(
    list(set(academia_humanities)), key=lambda x: len(x), reverse=True
)

law_enf_military_operator = [
    "and later General Lieutenant",
    "Colditz prisoner",
    "paramilitary leader",
]
law_enf_military_operator = sorted(
    list(set(law_enf_military_operator)), key=lambda x: len(x), reverse=True
)

spiritual = []
spiritual = sorted(list(set(spiritual)), key=lambda x: len(x), reverse=True)

social = ["supporter of Holocaust survivors"]
social = sorted(list(set(social)), key=lambda x: len(x), reverse=True)

crime = ["mobster and convicted murderer", "mobster and"]
crime = sorted(list(set(crime)), key=lambda x: len(x), reverse=True)

event_record_other = [
    "USAID contractor kidnapped by al Qaeda",
    "only Holocaust survivor",
    "Holocaust survivor in Schindler List",
    "world oldest Holocaust survivor",
    "Holocaust survivor and",
    "and Holocaust survivor",
]
event_record_other = sorted(
    list(set(event_record_other)), key=lambda x: len(x), reverse=True
)

other_species = [
    "trained Thoroughbred racehorse and broodmare",
    "trained Thoroughbred racehorse and sire",
    "trained Thoroughbred racehorse",
]
other_species = sorted(list(set(other_species)), key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [23]:
# Hard-coding info_4 for entry to capture known_for category
index = df[df["link"] == "https://en.wikipedia.org/wiki/Kung_Fu_(wrestler)"].index
df.loc[index, "info_4"] = "wrestler"

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [24]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_3`

In [25]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_3'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['num_categories']!=0].sample(2)

CPU times: total: 2min 39s
Wall time: 2min 39s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
52663,1,Peter Diamandopoulos,", 86, Greek-born American academic, President of Adelphi University .",https://en.wikipedia.org/wiki/Peter_Diamandopoulos,11,2015,April,,President of Adelphi University,,,,,,,,,86.0,,Greece,United States of America,1985 1997,2.484907,0,0,0,1,0,0,0,0,0,0,0,0,1
93247,9,Tim Johnston,", 80, British Olympic long distance runner .",https://en.wikipedia.org/wiki/Tim_Johnston_(runner),12,2021,October,,,,,,,,,,,80.0,,United Kingdom of Great Britain and Northern Ireland,,1968,2.564949,0,0,0,0,0,0,1,0,0,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [26]:
# Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

There are 27 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_3`

In [27]:
# # Obtaining values for column and their counts
# index = df[df["num_categories"] == 0].index
# roles_list = df.loc[index, "info_3"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [28]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [29]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_3"].notna()].index
#             if "brain cancer" in df.loc[index, "info_3"]
#         ],
#         "info_3",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [30]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_list, key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [31]:
# # Example code to quick-check a specific entry
# df[(df["info_3"] == "brain cancer") & (df["num_categories"] == 0)]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [32]:
# Creating lists for each category and sorting by decreasing length and removing duplicates

politics_govt_law = [
    "later pro life activist and",
    "later Sinn Féin activist",
    "Gerakan Party founding president",
    "geopolitical energy specialist",
    "nationalist and",
    "nationalist",
]
politics_govt_law = sorted(
    list(set(politics_govt_law)), key=lambda x: len(x), reverse=True
)

arts = [
    "actor",
    "researcher",
    "later known as Katherine Weld",
    "and later mezzo soprano",
    "first female writer awarded full press credentials at",
    "soul and funk singer songwriter and record producer",
    "member of the Nashville Songwriters Hall of Fame",
    "singer and songwriter in the country music genre",
    "songwriter and popularizer of the Chicken Dance",
    "food writer and broadcaster on public media",
    "writer and son of novelist Ernest Hemingway",
    "co writer of the Endangered Species Act of",
    "theatre and film director and screenwriter",
    "songwriter and Emmy winning documentarian",
    "radio and television presenter and writer",
    "travel writer and television personality",
    "screenwriter and",
    "and television screenwriter and producer",
    "a prolific writer of language text books",
    "non‑fiction writer and literary",
    "translator and writer of children books",
    "songwriter and radio and TV personality",
    "award winning Ontarian writer and poet",
    "songwriter and recording studio owner",
    "writer and television quiz contestant",
    "singer songwriter and record producer",
    "documentary film director and writer",
    "television director and screenwriter",
    "screenwriter and television producer",
    "songwriter and country music manager",
    "short story writer and screenwriter",
    "and science fiction writer",
    "singer songwriter and",
    "cookbook writer and television host",
    "and rules writer",
    "dialogue writer and music composer",
    "writer and multimedia fine artist",
    "classical music critic and writer",
    "script writer and cinematographer",
    "short story writer and playwright",
    "playwright and television writer",
    "commentator and writer",
    "scholar and writer of literature",
    "cookbook writer and",
    "esotericism writer and",
    "music writer and record producer",
    "journalist and television writer",
    "cinematographer and screenwriter",
    "writer and Pulitzer Prize winner",
    "children writer and illustrator",
    "short story writer and novelist",
    "harmonica player and songwriter",
    "songwriter and book illustrator",
    "cartoonist and writer",
    "and screenwriter",
    "guitarist and singer songwriter",
    "songwriter and father of Prince",
    "widow of science fiction writer",
    "writer and nightclub proprietor",
    "short story writer and essayist",
    "songwriter and record producer",
    "screenwriter and film director",
    "film director and screenwriter",
    "television writer and producer",
    "record producer and songwriter",
    "writer and newspaper publisher",
    "satirist and screenplay writer",
    "screenwriter and stage manager",
    "songwriter and literary critic",
    "scriptwriter and comics writer",
    "film producer and screenwriter",
    "documentarist and screenwriter",
    "film director and scriptwriter",
    "scriptwriter and film director",
    "singer songwriter and",
    "journalist and children writer",
    "travel writer and photographer",
    "screenwriter and drama teacher",
    "novelist and nonfiction writer",
    "and screenwriter",
    "and writer",
    "country singer and songwriter",
    "writer and stage practitioner",
    "songwriter and music producer",
    "music producer and songwriter",
    "and fiction writer",
    "comedy writer and playwright",
    "food technologist and writer",
    "singer songwriter and artist",
    "songwriter and film composer",
    "artist and television writer",
    "master of novices and writer",
    "war correspondent and writer",
    "writer and radio broadcaster",
    "dubber and singer songwriter",
    "journalist artist and writer",
    "and travel writer",
    "screenwriter and journalist",
    "screenwriter and",
    "playwright and screenwriter",
    "journalist and screenwriter",
    "writer and cultural manager",
    "writer and editor",
    "screenwriter and playwright",
    "screenwriter and impresario",
    "and screenwriter",
    "librettist and screenwriter",
    "stage and television writer",
    "screenwriter and dramaturge",
    "piano player and songwriter",
    "book illustrator and writer",
    "songwriter and entertainer",
    "writer and television host",
    "folk singer and songwriter",
    "historian and screenwriter",
    "screenwriter and",
    "script writer and director",
    "harmonicist and songwriter",
    "writer and literary critic",
    "television chef and writer",
    "writer and record producer",
    "script writer and producer",
    "magazine editor and writer",
    "writer and cinematographer",
    "screenwriter and producer",
    "producer and screenwriter",
    "director and screenwriter",
    "screenwriter and director",
    "screenwriter and novelist",
    "screenwriter and composer",
    "scriptwriter and lyricist",
    "novelist and screenwriter",
    "screenwriter and lyricist",
    "writer and film historian",
    "horror and fantasy writer",
    "public servant and writer",
    "writer and philanthropist",
    "bandleader and songwriter",
    "screenwriter and essayist",
    "graphic artist and writer",
    "screenwriter and feminist",
    "TV personality and writer",
    "songwriter and",
    "lyricist and screenwriter",
    "songwriter and journalist",
    "songwriter and bandleader",
    "songwriter and guitarist",
    "guitarist and songwriter",
    "screenwriter and actress",
    "film director and writer",
    "art collector and writer",
    "art gallerist and writer",
    "comics writer and artist",
    "writer and film producer",
    "writer and comics artist",
    "visual artist and writer",
    "arranger and songwriter",
    "songwriter and composer",
    "composer and songwriter",
    "songwriter and arranger",
    "editor and screenwriter",
    "writer and scriptwriter",
    "speech writer",
    "screenwriter and critic",
    "author and screenwriter",
    "screenwriter and editor",
    "music critic and writer",
    "songwriter and producer",
    "screenwriter and writer",
    "songwriter and",
    "producer and songwriter",
    "set designer and writer",
    "songwriter and promoter",
    "screenwriter and author",
    "songwriter and radio DJ",
    "vocalist and songwriter",
    "writer and photographer",
    "songwriter and comedian",
    "writer and broadcaster",
    "writer and commentator",
    "songwriter and actress",
    "writer and illustrator",
    "detective story writer",
    "typographer and writer",
    "numismatist and writer",
    "broadcaster and writer",
    "illustrator and writer",
    "writer and TV producer",
    "singer and songwriter",
    "writer and journalist",
    "journalist and writer",
    "songwriter and singer",
    "songwriter and author",
    "writer and cartoonist",
    "writer and biographer",
    "auctioneer and writer",
    "cartoonist and writer",
    "editor and songwriter",
    "story writer and poet",
    "poet and screenwriter",
    "historian and writer",
    "columnist and writer",
    "writer and publisher",
    "executive and writer",
    "publisher and writer",
    "cameraman and writer",
    "writer and filmmaker",
    "writer and publicist",
    "writer and editor of",
    "announcer and writer",
    "writer and performer",
    "writer and director",
    "writer and producer",
    "producer and writer",
    "director and writer",
    "comedian and writer",
    "designer and writer",
    "writer and sculptor",
    "writer and academic",
    "sculptor and writer",
    "poet and songwriter",
    "writer and essayist",
    "essayist and writer",
    "writer and comedian",
    "writer and novelist",
    "songwriter arranger",
    "composer and writer",
    "songwriter and poet",
    "writer and humorist",
    "lyricist and writer",
    "writer and satirist",
    "writer and reporter",
    "nationalized writer",
    "writer and composer",
    "short story writer",
    "writer and actress",
    "writer and curator",
    "actress and writer",
    "hostess and writer",
    "writer and editor",
    "singer and writer",
    "artist and writer",
    "songwriter winner",
    "writer and critic",
    "writer and artist",
    "dancer and writer",
    "television writer",
    "writer and author",
    "editor and writer",
    "singer songwriter",
    "potter and writer",
    "writer and model",
    "poet and writer",
    "writer and poet",
    "writer and cook",
    "writer and chef",
    "fiction writer",
    "and songwriter",
    "travel writer",
    "screenwriter",
    "scriptwriter",
    "chess writer",
    "songwriter",
    "and writer",
    "writer and",
    "art writer",
    "writer",
    "photographer and underwater filmmaker",
    "first female filmmaker from Odisha",
    "cinematographer and filmmaker",
    "illustrator and filmmaker",
    "broadcaster and filmmaker",
    "journalist and filmmaker",
    "publisher and filmmaker",
    "producer and filmmaker",
    "filmmaker and producer",
    "actress and filmmaker",
    "author and filmmaker",
    "filmmaker and author",
    "writer and filmmaker",
    "filmmaker and critic",
    "filmmaker and",
    "and filmmaker",
    "filmmaker",
    "adult film performer",
    "and documentary film director",
    "theatre and film director and screenwriter",
    "action choreographer and film director",
    "documentary film director and writer",
    "cinematographer and film director",
    "film director and film producer",
    "choreographer and film director",
    "screenwriter and film director",
    "film director and screenwriter",
    "art director and film director",
    "and film director and producer",
    "scriptwriter and film director",
    "film director and photographer",
    "film director and scriptwriter",
    "film director and film critic",
    "TV producer and film director",
    "film director and architect",
    "and film director",
    "film director and producer",
    "novelist and film director",
    "comedian and film director",
    "actress and film director",
    "film director and actress",
    "film director and writer",
    "film director and singer",
    "author and film director",
    "opera and film director",
    "film director and poet",
    "film director",
    "wife of ventriloquist Edgar Bergen and mother of actress Candice Bergen",
    "mother and manager of actress Brooke Shields",
    'star of Ice Capades and "B" actress in the s',
    "television and Oscar winning film actress",
    "musical theatre and television actress",
    "game show panelist and voice actress",
    "actress and television personality",
    "husband of actress Anjelica Huston",
    "television presenter and actress",
    "actress and television presenter",
    "voice actress and",
    "television and stage actress",
    "actress and theatre producer",
    "film and television actress",
    "and stage actress",
    "television and film actress",
    "voice actress and comedian",
    "actress and film producer",
    "film director and actress",
    "actress and film director",
    "actress and choreographer",
    "actress and drama teacher",
    "former adult film actress",
    "screenwriter and actress",
    "voice actress and singer",
    "pin up girl and actress",
    "actress and politician",
    "actress and journalist",
    "songwriter and actress",
    "journalist and actress",
    "and actress",
    "actress and filmmaker",
    "director and actress",
    "actress and composer",
    "actress and singer",
    "writer and actress",
    "dancer and actress",
    "singer and actress",
    "actress and writer",
    "author and actress",
    "actress and author",
    "poet and actress",
    "voice actress",
    "actress and",
    "graphic designer and sculptor",
    "sculptor and costume designer",
    "sculptor and photographer",
    "sculptor and light artist",
    "sculptor and art",
    "sculptor and illustrator",
    "sculptor and printmaker",
    "printmaker and sculptor",
    "sculptor and",
    "architect and sculptor",
    "and sculptor",
    "sculptor and",
    "engraver and sculptor",
    "designer and sculptor",
    "sculptor and",
    "writer and sculptor",
    "sculptor and writer",
    "singer and sculptor",
    "author and sculptor",
    "sculptor and poet",
    "sculptor",
]
arts = sorted(list(set(arts)), key=lambda x: len(x), reverse=True)

sports = ["sumo wrestler"]
sports = sorted(list(set(sports)), key=lambda x: len(x), reverse=True)

sciences = [
    "considered one of the world pre eminent applied mathematicians",
    "theoretical physicist and mathematician",
    "and mathematician",
    "mathematician and computer scientist",
    "mathematician and",
    "mathematician and Nobel laureate",
    "statistician and mathematician",
    "mathematician",
    "helicopter designer",
]
sciences = sorted(list(set(sciences)), key=lambda x: len(x), reverse=True)

business_farming = []
business_farming = sorted(
    list(set(business_farming)), key=lambda x: len(x), reverse=True
)

academia_humanities = [
    "logician and philosopher",
    "professor at George Washington University",
    "Kildin Sámi teacher",
]
academia_humanities = sorted(
    list(set(academia_humanities)), key=lambda x: len(x), reverse=True
)

law_enf_military_operator = [
    "last known surviving fighter of the Warsaw Ghetto Uprising",
    "first trainer of the Special Weapons and Tactics and first husband of actress Marilyn Monroe",
]
law_enf_military_operator = sorted(
    list(set(law_enf_military_operator)), key=lambda x: len(x), reverse=True
)

spiritual = ["Catholic pope"]
spiritual = sorted(list(set(spiritual)), key=lambda x: len(x), reverse=True)

social = []
social = sorted(list(set(social)), key=lambda x: len(x), reverse=True)

crime = []
crime = sorted(list(set(crime)), key=lambda x: len(x), reverse=True)

event_record_other = [
    "victim of possible police interference and brutality",
    "oldest worker in the",
    "last living survivor of the Hindenburg disaster",
]
event_record_other = sorted(
    list(set(event_record_other)), key=lambda x: len(x), reverse=True
)

other_species = []
other_species = sorted(list(set(other_species)), key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [33]:
# Hard-coding info_3 value for entry to avoid incorrect categorization
index = df[df["link"] == "https://en.wikipedia.org/wiki/Arthur_Gish"].index
df.loc[index, "info_3"] = "accident"

# Hard-coding info_3 value for entry to avoid incorrect categorization
index = df[df["link"] == "https://en.wikipedia.org/wiki/Stephanie_Booth"].index
df.loc[index, "info_3"] = "accident"

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [34]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_3`

In [35]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_3'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['num_categories']!=0].sample(2)

CPU times: total: 3min 29s
Wall time: 3min 29s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
51649,19,Arthit Kamlang-ek,", 89, Thai general, Supreme Commander of the Royal Thai Armed Forces .",https://en.wikipedia.org/wiki/Arthit_Kamlang-ek,4,2015,January,,Supreme Commander of the Royal Armed Forces,,,,,,,,,89.0,,Thailand,,1983 1986,1.609438,0,0,0,0,0,0,0,1,0,0,0,0,1
14877,5,Alex Watson,", 70, Australian rugby league player.",https://en.wikipedia.org/wiki/Alex_Watson_(rugby_league),5,2002,June,,,,,,,,,,,70.0,,Australia,,,1.791759,0,0,0,0,0,0,1,0,0,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [36]:
# Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

There are 8 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_3`

In [39]:
# # Obtaining values for column and their counts
# index = df[df["num_categories"] == 0].index
# roles_list = df.loc[index, "info_3"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [50]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [41]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_3"].notna()].index
#             if "actress" in df.loc[index, "info_3"]
#         ],
#         "info_3",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [49]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_list, key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [48]:
# # Example code to quick-check a specific entry
# df[df["info_3"] == "actress"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [51]:
# Creating lists for each category and sorting by decreasing length and removing duplicates

politics_govt_law = []
politics_govt_law = sorted(
    list(set(politics_govt_law)), key=lambda x: len(x), reverse=True
)

arts = ["actress"]
arts = sorted(list(set(arts)), key=lambda x: len(x), reverse=True)

sports = []
sports = sorted(list(set(sports)), key=lambda x: len(x), reverse=True)

sciences = []
sciences = sorted(list(set(sciences)), key=lambda x: len(x), reverse=True)

business_farming = []
business_farming = sorted(
    list(set(business_farming)), key=lambda x: len(x), reverse=True
)

academia_humanities = []
academia_humanities = sorted(
    list(set(academia_humanities)), key=lambda x: len(x), reverse=True
)

law_enf_military_operator = []
law_enf_military_operator = sorted(
    list(set(law_enf_military_operator)), key=lambda x: len(x), reverse=True
)

spiritual = []
spiritual = sorted(list(set(spiritual)), key=lambda x: len(x), reverse=True)

social = []
social = sorted(list(set(social)), key=lambda x: len(x), reverse=True)

crime = []
crime = sorted(list(set(crime)), key=lambda x: len(x), reverse=True)

event_record_other = []
event_record_other = sorted(
    list(set(event_record_other)), key=lambda x: len(x), reverse=True
)

other_species = []
other_species = sorted(list(set(other_species)), key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [52]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_3`

In [53]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_3'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['num_categories']!=0].sample(2)

CPU times: total: 594 ms
Wall time: 578 ms


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
21445,23,Ian Copeland,", 57, American music promoter and agent, older brother of Stewart Copeland of The Police, melanoma.",https://en.wikipedia.org/wiki/Ian_Copeland,11,2006,May,,older brother of Stewart Copeland of The Police,melanoma,,,,,,,,57.0,,United States of America,,,2.484907,0,0,0,0,0,1,0,0,0,0,0,0,1
43832,28,Thérèse Blondeau,", 99, French Olympic swimmer .",https://en.wikipedia.org/wiki/Th%C3%A9r%C3%A8se_Blondeau,3,2013,June,,,,,,,,,,,99.0,,France,,1936.0,1.386294,0,0,0,0,0,0,1,0,0,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [54]:
# Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

There are 7 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We have found all of the first categories in `info_3`.
- We will proceed to rebuild `known_for_dict` for the next iteration searching `info_3` for additional categories for entries, if the value in `info_3` does not contain capitol letters.

#### Finding `known_for` Roles in `info_3`

In [68]:
# Obtaining values for column and their counts
roles_list = df["info_3"].value_counts(ascending=True).index.tolist()
roles_list = [item for item in roles_list if item.islower()]

<IPython.core.display.Javascript object>

In [106]:
# Code to check each value
roles_list.pop()

'throat cancer'

<IPython.core.display.Javascript object>

In [107]:
# Create specific_roles_list for above popped value
specific_roles_list = (
    df.loc[
        [
            index
            for index in df[df["info_3"].notna()].index
            if "author" in df.loc[index, "info_3"]
        ],
        "info_3",
    ]
    .value_counts()
    .index.tolist()
)

<IPython.core.display.Javascript object>

In [109]:
# Viewing list sorted by descending length to copy to dictionary below and screen values
sorted(specific_roles_list, key=lambda x: len(x), reverse=True)

['first Native to become a general authority of The Church of Jesus Christ of Latter day Saints',
 'wrote a political column for The Times and was the author of two books on Parliament',
 'author and two term president of the Southern Baptist Convention from to',
 'general authority of The Church of Jesus Christ of Latter day Saints',
 'known as one of the world leading authorities on cancer research',
 'BYU professor and prominent Latter day Saint author and lecturer',
 'music arranger and author of more than music instruction books',
 'co author of the Constitution of the Islamic Republic of',
 'inadvertently alerted authorities to the Profumo affair',
 'Deputy and author of the Declaration of Independence of',
 'author of "The Cone Gatherers" and "Fergus Lamont"',
 'prize winning author and historian of medieval and',
 'member of the House of Representatives and author',
 'evolutionary biologist and popular science author',
 'author and project designer in the Boy Scouts of',
 'and 

<IPython.core.display.Javascript object>

In [119]:
# Example code to quick-check a specific entry
df[df["info_3"] == "consultant and author"]

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
15259,23,Stafford Beer,", 75, British theorist, consultant and author, known for his work in operational research and management cybernetics.",https://en.wikipedia.org/wiki/Stafford_Beer,17,2002,August,,consultant and author,known for his work in operational research and management cybernetics,,,,,,,,75.0,,United Kingdom of Great Britain and Northern Ireland,,,2.890372,0,0,0,1,0,0,0,0,0,0,0,0,1
70050,25,Judith Appelbaum,", 78, American editor, consultant and author, ovarian cancer.",https://en.wikipedia.org/wiki/Judith_Appelbaum,14,2018,July,,consultant and author,ovarian cancer,,,,,,,,78.0,,United States of America,,,2.70805,0,0,0,0,0,1,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [110]:
# Creating lists for each category and sorting by decreasing length and removing duplicates

politics_govt_law = [
        "co author of the Constitution of the Islamic Republic of",
        "Deputy and author of the Declaration of Independence of",
        "co author of the Short Doyle Mental Health Act",
        "leading authority on the Constitution of",
        "a leading authority on the Union and",





]
politics_govt_law = sorted(
    list(set(politics_govt_law)), key=lambda x: len(x), reverse=True
)

arts = [
    "wrote a political column for The Times and was the author of two books on Parliament",
    "author and lecturer",
    "music arranger and author of more than music instruction books",
    'author of "The Cone Gatherers" and "Fergus Lamont"',
    "prize winning author and",
    "and popular science author",
    "author and",
    'and author of the enduring song "Bésame mucho"',
    'author of the "Surudi Milli" national anthem',
    "audiobook reader and children book author",
    
    "television show host and cookbook author",
    "book author",
    
    "author of definitive work on Pride Purge",
    
    "author and television hostess on cuisine",
    
    
    "oral historian and best selling author",
    "author of atlases of fictional worlds",
    "presidential biographer and author of",
    
    "author of book about",
    
    "author on",
    "non fiction author and broadcaster",
    "and relationship author",
    "investigative reporter and author",
    "author and supporter of",
    'author of "Last Exit to Brooklyn"',
    "commentator and author",
    
    "illustrator and children author",
    "author and television presenter",
    
    "novelist and short story author",
    "author and television producer",
    "game author and font designer",
    "author of over children books",
    "radio program host and author",
    "radio personality and author",
    "author and artistic director",
    "author and radio broadcaster",
    "and children author",
    "author and harmonica player",
    "theater director and author",
    "costume designer and author",
    "author and television host",
    "record producer and author",
    "ballroom dancer and author",
    "literary critic and author",
    "author and garden designer",
    "author and public speaker",
    
    "author and rose authority",
    "author and talk show host",
    "horticulturist and author",
    "choreographer and author",
    "author and art collector",
    "author and film producer",
    "author and art",
    
    "author and radio amateur",
    
    "photographer and author",
    "author and illustrator",
    "author and news anchor",
    "news editor and author",
    "broadcaster and author",
    "author and broadcaster",
    "illustrator and author",
    "author and journalist",
    "author and playwright",
    "journalist and author",
    
    "consultant and author",
    "author and art critic",
    "cartoonist and author",
    "author and biographer",
    "playwright and author",
    "author and publisher",
    "publisher and author",
    "columnist and author",
    "author and memoirist",
    
    "author and dramatist",
    "author and columnist",
    "artist and author of",
    "academic and author",
    "educator and author",
    "author and academic",
    "author and lecturer",
    "diplomat and author",
    "author and educator",
    "feminist and author",
    "composer and author",
    "author and attorney",
    "author and explorer",
    "attorney and author",
    "author and humorist",
    "lecturer and author",
    "organist and author",
    "activist and author",
    "author and inventor",
    "author and novelist",
    "theorist and author",
    "director and author",
    "author and composer",
    "author and activist",
    "scholar and author",
    "author and speaker",
    "author and scholar",
    "author and teacher",
    "teacher and author",
    "artist and author",
    "lawyer and author",
    "author and singer",
    "priest and author",
    "author and lawyer",
    "critic and author",
    "author and editor",
    "authority on art",
    "author and model",
    "tutor and author",
    "author and judge",
    "poet and author",
    "aide and author",
    "spy and author",
    "co author of",
    "chess author",
]
arts = sorted(list(set(arts)), key=lambda x: len(x), reverse=True)

sports = []
sports = sorted(list(set(sports)), key=lambda x: len(x), reverse=True)

sciences = [
        "known as one of the world leading authorities on cancer research",
        "one of the world foremost authorities on",
        "co author of the broken windows theory",



]
sciences = sorted(list(set(sciences)), key=lambda x: len(x), reverse=True)

business_farming = []
business_farming = sorted(
    list(set(business_farming)), key=lambda x: len(x), reverse=True
)

academia_humanities = [
        "leading authority on the art of medieval",
        "authority on secret societies",
    "logician and textbook author",
        "authority on Native culture",
        "scholar and authority on",
        "authority on literature",



    


]
academia_humanities = sorted(
    list(set(academia_humanities)), key=lambda x: len(x), reverse=True
)

law_enf_military_operator = []
law_enf_military_operator = sorted(
    list(set(law_enf_military_operator)), key=lambda x: len(x), reverse=True
)

spiritual = [
        "first Native to become a general authority of The Church of Jesus Christ of Latter day Saints",
    'two term president of the Southern Baptist Convention from to',
        "general authority of The Church of Jesus Christ of Latter day Saints",
        "general authority of the LDS Church",



]
spiritual = sorted(list(set(spiritual)), key=lambda x: len(x), reverse=True)

social = [
    'project designer in the Boy Scouts of'
]
social = sorted(list(set(social)), key=lambda x: len(x), reverse=True)

crime = []
crime = sorted(list(set(crime)), key=lambda x: len(x), reverse=True)

event_record_other = [
        "inadvertently alerted authorities to the Profumo affair",

]
event_record_other = sorted(
    list(set(event_record_other)), key=lambda x: len(x), reverse=True
)

other_species = []
other_species = sorted(list(set(other_species)), key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [None]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

#### Extracting Category from `info_3`

In [None]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_3'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['num_categories']!=0].sample(2)

#### Checking the Number of Rows without a First Category

In [None]:
# Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

In [37]:
print("dunzo!")

# Sound notification when cell executes
chime.success()

dunzo!


<IPython.core.display.Javascript object>

#### Finding `known_for` Roles in `info_3`

In [None]:
# # Obtaining values for column and their counts
# index = df[df["num_categories"] == 0].index
# roles_list = df.loc[index, "info_3"].value_counts(ascending=True).index.tolist()

In [None]:
# # Code to check each value
# roles_list.pop()

In [None]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_3"].notna()].index
#             if "Teaching Hospital" in df.loc[index, "info_3"]
#         ],
#         "info_3",
#     ]
#     .value_counts()
#     .index.tolist()
# )

In [None]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_list, key=lambda x: len(x), reverse=True)

In [None]:
# # Example code to quick-check a specific entry
# df[df["info_3"] == "MPL"]

#### Creating Lists for Each `known_for` Category

In [None]:
# Creating lists for each category and sorting by decreasing length and removing duplicates

politics_govt_law = []
politics_govt_law = sorted(
    list(set(politics_govt_law)), key=lambda x: len(x), reverse=True
)

arts = []
arts = sorted(list(set(arts)), key=lambda x: len(x), reverse=True)

sports = []
sports = sorted(list(set(sports)), key=lambda x: len(x), reverse=True)

sciences = []
sciences = sorted(list(set(sciences)), key=lambda x: len(x), reverse=True)

business_farming = []
business_farming = sorted(
    list(set(business_farming)), key=lambda x: len(x), reverse=True
)

academia_humanities = []
academia_humanities = sorted(
    list(set(academia_humanities)), key=lambda x: len(x), reverse=True
)

law_enf_military_operator = []
law_enf_military_operator = sorted(
    list(set(law_enf_military_operator)), key=lambda x: len(x), reverse=True
)

spiritual = []
spiritual = sorted(list(set(spiritual)), key=lambda x: len(x), reverse=True)

social = []
social = sorted(list(set(social)), key=lambda x: len(x), reverse=True)

crime = []
crime = sorted(list(set(crime)), key=lambda x: len(x), reverse=True)

event_record_other = []
event_record_other = sorted(
    list(set(event_record_other)), key=lambda x: len(x), reverse=True
)

other_species = []
other_species = sorted(list(set(other_species)), key=lambda x: len(x), reverse=True)

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [None]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

#### Extracting Category from `info_3`

In [None]:
%%time

# Dictionary version
search_dict = known_for_dict

# Column to check
column = 'info_3'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Updating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking a sample of rows
df[df['num_categories']!=0].sample(2)

#### Checking the Number of Rows without a First Category

In [None]:
# Checking the number of rows without a first category
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Observations:
- It is time to export our dataframe and start a new notebook.

### Exporting Dataset to SQLite Database [wp_life_expect_clean14.db]()

In [None]:
# # Exporting dataframe

# # Saving dataset in a SQLite database
# conn = sql.connect("wp_life_expect_clean14.db")
# df.to_sql("wp_life_expect_clean14", conn, index=False)

# # Chime notification when cell executes
# chime.success()

# [Proceed to Data Cleaning Part 15]()