# Wikipedia Notable Life Expectancies
# [Notebook 6 : Data Cleaning Part 5](https://github.com/teresahanak/wikipedia-life-expectancy/blob/main/wp_life_expect_data_clean5_thanak_2022_07_17.ipynb)
### Context

The
### Objective

The
### Data Dictionary
- Feature: Description

### Importing Libraries

In [1]:
# To structure code automatically
%load_ext nb_black

# To import/export sqlite databases
import sqlite3 as sql

# To save/open python objects in pickle file
import pickle

# To help with reading, cleaning, and manipulating data
import pandas as pd
import numpy as np
import re

# To define maximum number of columns to be displayed in a dataframe (None = no limit)
pd.set_option("display.max_columns", None)
# To define the maximum number of rows to be displayed in a dataframe
pd.set_option("display.max_rows", 200)

# To suppress warnings
# import warnings

# warnings.filterwarnings("ignore")

# To define the maximum displayed width (in characters) of a column's contents;
# the fully qualified option name is used, matching the options above, because
# pandas warns that abbreviated option names can break if similarly named
# options are added in future versions
pd.set_option("display.max_colwidth", 150)

# To play auditory cue when cell has executed, has warning, or has error
import chime

# To set the chime sound theme
chime.theme("zelda")

<IPython.core.display.Javascript object>

## Data Overview

### Reading, Sampling, and Checking Data Shape

In [2]:
# Opening the sqlite database and loading the full table into a dataframe
conn = sql.connect("wp_life_expect_clean4.db")
data = pd.read_sql(sql="SELECT * FROM wp_life_expect_clean4", con=conn)

# Keeping the loaded frame untouched by working on a copy
df = data.copy()

# Reporting the dimensions of the working copy
print(f"There are {df.shape[0]} rows and {df.shape[1]} columns.")

# Previewing the first 2 rows of the data
df.head(n=2)

There are 98060 rows and 38 columns.


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
0,1,William Chappell,", 86, British dancer, ballet designer and director.",https://en.wikipedia.org/wiki/William_Chappell_(dancer),21,1994,January,,,dancer,ballet designer and director,,,,,,,,,86.0,,United Kingdom of Great Britain and Northern Ireland,,,3.091042,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,Raymond Crotty,", 68, Irish economist, writer, and academic.",https://en.wikipedia.org/wiki/Raymond_Crotty,12,1994,January,,,economist,writer,and academic,,,,,,,,68.0,,Ireland,,,2.564949,0,0,0,0,0,0,0,0,0,0,0,0,0


<IPython.core.display.Javascript object>

In [3]:
# Previewing the final 2 rows of the data
df.tail(n=2)

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
98058,9,Aamir Liaquat Hussain,", 50, Pakistani journalist and politician, MNA .",https://en.wikipedia.org/wiki/Aamir_Liaquat_Hussain,99,2022,June,", since",,journalist,MNA,,,,,,,,,50.0,,Pakistan,,"2002 2007, since 2018",4.60517,0,0,0,0,0,0,0,0,1,0,0,0,1
98059,9,Zou Jing,", 86, Chinese engineer, member of the Chinese Academy of Engineering.",https://en.wikipedia.org/wiki/Zou_Jing_(engineer),3,2022,June,,,engineer,member of the Academy of Engineering,,,,,,,,,86.0,,"China, People's Republic of",,,1.386294,0,0,0,0,0,0,0,0,0,0,0,0,0


<IPython.core.display.Javascript object>

In [4]:
# Previewing 5 randomly drawn rows (unseeded, so the rows differ between runs)
df.sample(n=5)

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
75719,12,Fernando J. Corbató,", 93, American computer scientist, developer of Multics, complications from diabetes.",https://en.wikipedia.org/wiki/Fernando_J._Corbat%C3%B3,8,2019,July,,,computer scientist,developer of Multics,complications from diabetes,,,,,,,,93.0,,United States of America,,,2.197225,0,0,0,0,0,0,0,0,0,0,0,0,0
22392,21,Hassan Gouled Aptidon,", 90, Dijiboutian first President, natural causes.",https://en.wikipedia.org/wiki/Hassan_Gouled_Aptidon,10,2006,November,,,first President,natural causes,,,,,,,,,90.0,,Djibouti,,,2.397895,0,0,0,0,0,0,0,0,0,0,0,0,0
97284,18,Barbara Hall,", 99, British crossword compiler and advice columnist.",https://en.wikipedia.org/wiki/Barbara_Hall_(editor),10,2022,April,,,crossword compiler and advice columnist,,,,,,,,,,99.0,,United Kingdom of Great Britain and Northern Ireland,,,2.397895,0,0,0,0,0,0,0,0,0,0,0,0,0
88324,9,Shuichi Murakami,", 70, Japanese jazz drummer.",https://en.wikipedia.org/wiki/Shuichi_Murakami,3,2021,March,,,jazz drummer,,,,,,,,,,70.0,,Japan,,,1.386294,0,0,0,0,0,0,0,0,0,0,0,0,0
14705,30,Goo Arlooktoo,", 38, Canadian politician, heart attack.",https://en.wikipedia.org/wiki/Goo_Arlooktoo,7,2002,April,,,,heart attack,,,,,,,,,38.0,,Canada,,,2.079442,0,0,0,0,0,0,0,0,1,0,0,0,1


<IPython.core.display.Javascript object>

### Checking Data Types, Duplicates, and Null Values

In [5]:
# Checking data types and null values (dtype and non-null count per column)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98060 entries, 0 to 98059
Data columns (total 38 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   day                        98060 non-null  object 
 1   name                       98060 non-null  object 
 2   info                       98060 non-null  object 
 3   link                       98060 non-null  object 
 4   num_references             98060 non-null  int64  
 5   year                       98060 non-null  int64  
 6   month                      98060 non-null  object 
 7   info_parenth               36661 non-null  object 
 8   info_1                     22 non-null     object 
 9   info_2                     98028 non-null  object 
 10  info_3                     48895 non-null  object 
 11  info_4                     10264 non-null  object 
 12  info_5                     1265 non-null   object 
 13  info_6                     181 non-null    obj

<IPython.core.display.Javascript object>

#### Observations:
- With our dataset loaded, we can pick up where we left off with extracting `known_for` values by creating `known_for_dict_7`

### Extracting `known_for` Continued

#### Finding `known_for` Roles in `info_2`

In [6]:
# Listing the distinct info_2 values, ordered from least to most frequent
roles_list = (
    df["info_2"]
    .value_counts(ascending=True)
    .index.tolist()
)

<IPython.core.display.Javascript object>

In [7]:
# # Code to check each value
# roles_list.pop()

''

<IPython.core.display.Javascript object>

In [8]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [index for index in df.index if "writer" in df.loc[index, "info"]], "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [842]:
# # Code to check each specific value
# specific_roles_list.pop()

<IPython.core.display.Javascript object>

In [843]:
# # Example code to quick-screen values that may overlap categories
# df.loc[[index for index in df.index if "and science writer" in df.loc[index, "info"]]]

<IPython.core.display.Javascript object>

In [844]:
# # Example code to quick-screen values that may overlap categories
# df.loc[
#     [
#         index
#         for index in df.index
#         if "outlaw country music singer songwriter" in df.loc[index, "info"]
#     ]
# ]

<IPython.core.display.Javascript object>

In [845]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "outlaw country music singer songwriter"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [848]:
# Creating lists for each category
# Each list holds role phrases assigned to one `known_for` category.
# NOTE(review): within a list, longer phrases appear to be placed ahead of the
# shorter phrases they contain (e.g. "and speech" before "speech"), presumably
# so the most specific phrase is matched first by the code that consumes these
# lists -- confirm against that cell before reordering.
politics_govt_law = [
    "former executive editor of Richard Nixon and speech",
    "and free speech advocate",
    "freedom of speech advocate",
    "lawyer and free speech activist",
    "civil rights activist and speech",
    "and speech",
    "political speech",
    "and free speech activist",
    "speech",
    "Sestigers",
]

# Role phrases assigned to the arts category.
# NOTE(review): more specific phrases appear to be listed ahead of the shorter
# phrases they contain (the bare "writer" is the final entry), presumably so a
# substring search hits the most specific phrase first -- keep the order intact
# and confirm against the cell that consumes this list.
arts = [
    "rock and roll singer songwriter",
    "writer and radio dramatist",
    "writer of Berserker series",
    "writer and fashion editor",
    "comedy writer and lyricist",
    "writer and novelist in the Kannada language",
    "television writer for and",
    "writer of books and modules",
    "science fiction and western writer",
    "folk blues singer songwriter",
    "songwriter and columnist",
    "children writer and comic strip cartoonist",
    "spy fiction writer",
    "Emmy winning television comedy writer",
    "political journalist and food writer",
    "writer of all episodes of the long running radio serial",
    "writer of descent",
    "artist and screenwriter",
    "wine writer",
    "thriller writer and James Bond continuation novelist",
    "jazz writer",
    "writer and beer expert",
    "jazz musician and comedy writer",
    "television writer blacklisted by the House Un Activities Committee",
    "Motown songwriter and record producer",
    "writer and radio producer",
    "comic book writer and editor for DC Comics",
    "Chukchi language writer",
    "writer and script editor",
    "Christian rock singer songwriter",
    "writer of detective fiction",
    "novelist and non fiction writer",
    "cookbook writer",
    "playwright and scriptwriter",
    "motion picture writer director",
    "Broadway producer and writer",
    "film and television producer and screenwriter",
    "vocalist and songwriter",
    "country music manager and songwriter",
    "rock musician and songwriter",
    "classical guitarist and writer",
    "techno singer songwriter",
    "songwriter and drummer",
    "travel writer and broadcaster",
    "writer and national artist",
    "Emmy nominated television writer",
    "nonfiction writer",
    "copywriter and executive",
    "comics writer and newspaper editor",
    "film director and script writer",
    "literary agent and writer",
    "writer of screenplays",
    "Braziliam writer and journalist",
    "co writer of the s national anthem",
    "songwriter and folksinger",
    "writer and radio broadcaster",
    "writer and journalist on",
    "novelist and soap opera writer",
    "country western songwriter",
    "libertarian writer and",
    "TV writer",
    "gardening writer",
    "architectural critic and writer",
    "animation writer producer",
    "and chess writer",
    "producer and writer of TV comedies",
    "lead singer songwriter musician of rock band God Lives Underwater",
    "resident writer and film critic",
    "writer and puppeteer for",
    "long time Associated Press cuisine writer",
    '"New Wave" vallenato singer and songwriter',
    "jazz pianist and film music writer",
    "R&B songwriter and record producer",
    "dramatist and writer",
    "writer editor publisher",
    "writer and Nobel Prize winner",
    "mystery novel writer",
    "Cree writer",
    "film producer and writer",
    "writer and architectural conservationist",
    "pop singer and writer",
    "poet writer",
    "writer and art critic",
    "experimental writer",
    "writer and wargame designer",
    "writer and news presenter",
    "songwriter and harpist",
    "writer and sound",
    "writer and music journalist",
    "potter and writer",
    "silent film screenwriter",
    "illustrator and writer",
    "sitcom writer",
    "percussionist and songwriter",
    "rock keyboardist and songwriter",
    "writer and speaker",
    "rhythm and blues and jazz musician and songwriter",
    "radio writer",
    "writer and television director",
    "writer and television personality",
    "and technical writer",
    "television writer and director",
    "short story and travel writer",
    "Motown songwriter",
    "psychedelic rock songwriter and musician",
    "Lakota writer and",
    "Romani writer",
    "BBC Radio writer",
    "manga artist and writer",
    "musician and score writer",
    "game show writer",
    "film producer and television screenwriter",
    "poet and writer in Gujarati language",
    "comedian and television writer",
    "writer and book editor",
    "television comedy writer and producer",
    "comic book writer and music critic",
    "writer and magician",
    "anime writer and producer",
    "writer and lyricist",
    "detective writer",
    "television and radio writer",
    "fiddler and singer songwriter",
    "Emmy Award winning screenwriter",
    "cook and food writer",
    "writer and investigative journalist",
    "singer songwriter and dancer",
    "Academy Award nominated playwright and screenwriter",
    "First Nations singer songwriter",
    "cookery and travel writer",
    "songwriter and radio talk show host",
    "television host and writer",
    "Emmy Award winning television writer",
    "rockabilly musician and songwriter",
    "film and TV producer and writer",
    "Grammy winning songwriter",
    "jazz organist and songwriter",
    "copywriter and product spokesperson",
    "speaker and writer",
    "painter and television screenplay writer",
    "comic strip writer and editor",
    "screenplay writer",
    "writer and editor of science fiction and fantasy",
    "writer and television journalist",
    "TV producer and writer",
    "Academy Award nominated screenwriter",
    "script writer and producer",
    "Urdu language writer",
    "musician and songwriter for Guns N' Roses",
    "songwriter and music promoter",
    "mystery writer and novelist",
    "writer and small press printer",
    "writer and theatre critic",
    "and poet and screenwriter",
    "writer and music executive",
    "playwright and writer",
    "poet and prose writer",
    "songwriter and television producer",
    "author and ghostwriter",
    "folksinger and songwriter",
    "writer under the pseudonym",
    "comic books illustrator and writer",
    "writer and Pulitzer Prize winner",
    "folk music songwriter and musician",
    "television scriptwriter",
    "songwriter and music manager",
    "writer and surrealist poet",
    "R&B and blues singer and songwriter",
    "short story writer and scriptwriter",
    "writer and Nobel Prize laureate",
    "comedy writer and radio and television personality",
    "writer and expert on country houses",
    "Pulitzer Prize winning war correspondent and writer",
    "writer and science fiction author",
    "novelist and a mystery writer",
    "Western writer",
    "writer and raconteur",
    "tenor and songwriter",
    "singer poet and songwriter",
    "folk music singer songwriter",
    "writer of Westerns",
    "ballet and dance writer",
    "author and the wife of writer Dylan Thomas",
    "and film director and screenwriter",
    "writer and humorist",
    "composer and writer on music",
    "lyricist and songwriter",
    "rockabilly and country singer and songwriter",
    "writer and only child author Jack Kerouac and Joan Haverty Kerouac",
    "Poet; writer",
    "humorist and writer",
    "Urdu poet and screenwriter",
    "director and scriptwriter",
    "playwright and screenwriter specialising in comedies",
    "writer of crime fiction",
    "film critic and travel writer",
    "writer who won the Newbery Medal",
    "writer of short stories and poetry",
    "editor and writer on music and dance",
    "songwriter and impresario",
    "journalist and writer on music and",
    "writer and film",
    "songwriter and session musician",
    "screenwriter who won the Academy Award for Best Story for",
    "and writer on the language",
    "gospel bass singer and songwriter",
    "folk rock singer songwriter and film producer",
    "producer and film and television writer",
    "screenwriter and World War II foreign correspondent",
    "writer of fiction and screenplays",
    "songwriter and jazz pianist",
    "architectural photographer and writer",
    "writer of literature",
    "; also a world renowned poet and writer",
    "singer songwriter and guitarist of Big Country and The Raphaels",
    "comic strip and comic book writer and cartoonist",
    "Sami writer",
    "country and rockabilly singer",
    "born singer songwriter",
    "country music songwriter and record producer",
    "writer of the Old West",
    "Arabian writer of Arab descent",
    "rockabilly songwriter and singer",
    "and writer on Old Testament issues",
    "Blackfeet and Gros Ventre writer and poet",
    "newspaper reporter and writer",
    "comic book writer artist",
    "R&B musician songwriter",
    "agent for writers and cartoonists",
    "writer of stories and news articles",
    "singer songwriter and playwright",
    "writer of more than children books",
    "classical music broadcaster and writer",
    "entertainer and songwriter",
    "writer and illustrator of children books",
    "bebop and jazz pianist and songwriter",
    "television and film producer and screenwriter",
    "popular singer and songwriter",
    "screenwriter and producer for television",
    "Creole zydeco musician and songwriter",
    "writer of drama and short stories",
    "juvenile literature writer",
    "short story writer",
    "novelist and writer for children",
    "writer of comic books and strips",
    "editorial writer",
    "rhythm and blues and jazz singer and songwriter",
    "R&B musician and songwriter",
    "writer and director of plays and films",
    "short story writer and essayist",
    "jazz writer and record producer",
    'writer also known as ""',
    "reggae singer songwriter",
    "screenwriter and novelist",
    "and freethought writer",
    "comic book writer for DC Comics",
    "writer and movie director",
    "writer of pulp fiction",
    "media personality and writer",
    "songwriter and bass guitarist",
    "jazz and calypso musician and songwriter",
    "fiction writer and essayist",
    "poet and short story writer",
    "screenwriter and filmmaker",
    "novelist and story writer",
    "children author and writer",
    "comic strip writer and artist",
    "author and screenplay writer",
    "novelist and radio script writer",
    "writer from Nadu",
    "writer and music producer",
    "rock and roll singer and songwriter",
    "telenovela screenwriter",
    "comedy writer and theater owner",
    "cartoonist and comics writer and artist",
    "food writer and broadcaster",
    "documentary film director and screenwriter",
    "cartoonist and comic book writer",
    "journalist and culture writer",
    "rock singer and songwriter",
    "TV writer and producer",
    "comics writer and artist",
    "film director screenwriter and producer",
    "writer and masters",
    "jazz guitarist and songwriter",
    "nomadic lifestyle artist and writer",
    "writer and artist model",
    "bookseller and writer",
    "singer songwriter and keyboardist",
    "chef and cookbook writer",
    "poet and fiction writer",
    "music producer and songwriter",
    "Inuktitut pop singer and songwriter",
    "travel writer and journalist",
    "Guianese writer",
    "writer and golf historian",
    "film director and short story writer",
    "reggae singer and songwriter",
    "Hall of Fame country singer songwriter and musician",
    "troubadour and dialect writer",
    "designer and writer",
    "writer and dramaturge",
    "TV writer and novelist",
    "folk rock singer and songwriter",
    "and screenwriter and playwright",
    "writer and performer",
    "novelist and children writer",
    "Inuk writer",
    "songwriter and music publisher",
    "Hall of Fame blues rock singer songwriter",
    "writer and existentialist poet",
    "comedy screenwriter",
    "children illustrator and writer",
    "photographer and writer",
    "Hall of Fame science fiction writer",
    "soul singer songwriter",
    "novelist and television writer",
    "Hall of Fame guitarist and songwriter",
    "Hall of Fame country music singer songwriter",
    "jazz singer and songwriter",
    "singer songwriter and radio host",
    "blues singer songwriter and harmonicist",
    "youth writer",
    "musical theatre producer and writer",
    "red dirt singer songwriter",
    "biographer and children writer",
    "songwriter and television presenter",
    "writer and language",
    "radio presenter and writer",
    "theatrical and film critic",
    "newspaper editor and writer",
    "author and television writer",
    "house and gospel singer songwriter",
    "screenwriter and lyricist",
    "travel and food writer",
    "Gujarati writer and journalist",
    "prankster and writer",
    "pasillo and pasacalle singer songwriter",
    "comic book writer and publisher",
    "Hall of Fame pianist and singer songwriter",
    "journalist and writer publicist",
    "trumpeter and songwriter",
    "graphic artist and writer",
    "horn player and writer",
    "writer and singer songwriter",
    "singer songwriter and painter",
    "writer and humourist",
    "bassist and singer songwriter",
    "investigative journalist and writer",
    "Gujarati language writer and",
    "comics writer and illustrator",
    "film director and scriptwriter",
    "folk singer songwriter and journalist",
    "visual artist and writer",
    "documentary film director and writer",
    "writer and cabaretist",
    "television writer and playwright",
    "First Nations writer",
    "comic book writer and novelist",
    "photojournalist and writer",
    "R&B singer songwriter and musician",
    "media critic and writer",
    "writer and bel canto singer",
    "comic book editor and writer",
    "jazz pianist and songwriter",
    "Native writer and",
    "writer and journalism",
    "radio broadcaster and writer",
    "songwriter and poet",
    "writer and lay",
    "gardening writer and television broadcaster",
    "singer songwriter and writer",
    "death metal singer songwriter and guitarist",
    "telenovela writer",
    "writer and orator",
    "children writer and book editor",
    "cinematic historian and writer",
    "country singer songwriter and music executive",
    "singer songwriter and media director",
    "art collector and writer",
    "writer and visual artist",
    "southern soul singer songwriter",
    "singer songwriter and reality show contestant",
    "screenwriter and essayist",
    "comedian and screenwriter",
    "singer songwriter and producer",
    "sitarist and writer",
    "writer and chef",
    "music producer and singer songwriter",
    "rock vocalist and theme songwriter",
    "music writer and radio commentator",
    "songwriter and melodist",
    "songwriter and filmmaker",
    "singer songwriter and filmmaker",
    "writer and creative director",
    "harmonica player and singer songwriter",
    "novelist and travel writer",
    "media theorist and writer",
    "Beat writer",
    "folk rock singer songwriter",
    "songwriter and television author",
    "writer and aphorist",
    "Aromanian composer and songwriter",
    "radio producer and writer",
    "blues and gospel singer and songwriter",
    "writer and comics screenwriter",
    "humorist and television writer",
    "sculptor and songwriter",
    "Vepsian writer and",
    "blues guitarist and singer songwriter",
    "singer songwriter and multi instrumentalist",
    "Gujarati writer",
    "Canarian writer",
    "National Film Award winning filmmaker",
    "conductor and writer",
    "country folk singer songwriter",
    "doo wop singer and songwriter",
    "comic writer and artist",
    "writer and comic book artist",
    "modernist artist and writer",
    "songwriter and doo wop singer",
    "spy writer",
    "folk writer",
    "writer and war photographer",
    "science fiction writer and musician",
    "folk musician and singer songwriter",
    "writer and commentator",
    "light novel writer",
    "comic book writer and illustrator",
    "author and songwriter",
    "songwriter and pianist",
    "Oriya language writer",
    "writer and television broadcaster",
    "outlaw country music singer songwriter",
    "jazz musician and writer",
    "anime screenwriter",
    "Chicago blues guitarist",
    "screenwriter and writer",
    "writer and play theorist",
    "gospel music singer songwriter",
    "Pueblo writer",
    "children author and screenwriter",
    "Pulitzer Prize winning writer",
    "screenwriter and programme director",
    "writer and television presenter",
    "Marathi poet and writer",
    "blues singer songwriter and musician",
    "award winning children writer",
    "comic artist and writer",
    "novelist and comedy writer",
    "dramatist and scriptwriter",
    "scriptwriter and songwriter",
    "Urdu poet and songwriter",
    "cookery writer",
    "writer and newspaper publisher",
    "writer and documentary filmmaker",
    "playwright and short story writer",
    "writer and children books illustrator",
    "blues singer songwriter",
    "guitarist and writer",
    "producer and songwriter",
    "Marathi folk singer songwriter",
    "singer songwriter and television presenter",
    "Urdu short story writer",
    "writer and literature",
    "crime writer and critic",
    "beat writer",
    "science fiction and horror writer",
    "writer and television producer",
    "cartoonist and screenwriter",
    "Grammy Award winning singer songwriter and musician",
    "film and television writer and director",
    "media executive and writer",
    "TV presenter and writer",
    "theatre writer and director",
    "songwriter and editor",
    "underground comics artist",
    "movie director and comics writer",
    "Mi'kmaq folk singer",
    "newspaper columnist and writer",
    "science fiction writer and editor",
    "record and film producer",
    "television and radio writer and presenter",
    "writer and documentarist",
    "Hall of Fame country music singer and songwriter",
    "fake news writer and comedian",
    "Aranese writer",
    "science fiction and mystery writer",
    "and popular science writer",
    "and jingle writer",
    "satirist and writer",
    "food writer and chef",
    "Marathi writer",
    "honky tonk singer songwriter",
    "television director and screenwriter",
    "writer and restaurant critic",
    "costume designer and fashion writer",
    "thriller writer",
    "poet and children writer",
    "comedy writer and producer",
    "writer and editor of comic books",
    "film writer and director",
    "publisher and writer",
    "writer and television host",
    "painter and children writer",
    "writer and magazine editor",
    "filmmaker and screenwriter",
    "singer songwriter and pianist",
    "rock guitarist and songwriter",
    "poet and singer songwriter",
    "drummer and songwriter",
    "bassist and songwriter",
    "rapper and songwriter",
    "television comedy writer",
    "illustrator and children writer",
    "fantasy writer",
    "manga writer",
    "writer and columnist",
    "blues musician and songwriter",
    "television and film writer",
    "singer songwriter and composer",
    "rock singer songwriter",
    "singer songwriter and author",
    "model and singer songwriter",
    "record producer and songwriter",
    "soap opera writer",
    "writer of children books",
    "writer and photographer",
    "writer and television executive",
    "songwriter and music arranger",
    "children writer and illustrator",
    "author and scriptwriter",
    "pop singer songwriter",
    "dancer and writer",
    "journalist and screenwriter",
    "journalist and songwriter",
    "romance writer",
    "writer of novels",
    "fashion writer",
    "songwriter and music producer",
    "songwriter and recording artist",
    "film and television screenwriter",
    "journalist and children writer",
    "animator and screenwriter",
    "writer and singer",
    "artist and songwriter",
    "fantasy and science fiction writer",
    "songwriter and composer",
    "singer songwriter and visual artist",
    "children book writer and illustrator",
    "music writer",
    "screenwriter and television producer",
    "blues guitarist and songwriter",
    "writer of short stories",
    "rock musician and singer songwriter",
    "poet and screenwriter",
    "gardener and writer",
    "comedy scriptwriter",
    "comics artist and writer",
    "writer and film director",
    "musician and singer songwriter",
    "dramatist and screenwriter",
    "television director",
    "film critic and writer",
    "comedian and writer",
    "writer and dramatist",
    "film writer",
    "pop singer and songwriter",
    "television producer and screenwriter",
    "singer songwriter and record producer",
    "writer of children literature",
    "writer and composer",
    "songwriter and producer",
    "comic book writer and editor",
    "writer and essayist",
    "film and television writer",
    "lyricist and screenwriter",
    "television screenwriter and producer",
    "architect and writer",
    "model and writer",
    "writer and theatre director",
    "blues singer and songwriter",
    "songwriter and lyricist",
    "television screenwriter",
    "writer and newspaper editor",
    "horror writer",
    "keyboardist and songwriter",
    "film producer and screenwriter",
    "writer and satirist",
    "composer and songwriter",
    "theatre director and writer",
    "short story writer and novelist",
    "country music songwriter",
    "Hall of Fame songwriter",
    "broadcaster and writer",
    "novelist and writer",
    "non fiction writer and",
    "and non fiction writer",
    "non fiction writer",
    "writer and cartoonist",
    "writer and publicist",
    "country songwriter",
    "country musician and songwriter",
    "R&B singer and songwriter",
    "R&B singer songwriter",
    "children book writer",
    "poet and songwriter",
    "writer and comedian",
    "Hall of Fame singer songwriter",
    "country singer and songwriter",
    "writer and novelist",
    "cartoonist and writer",
    "language writer",
    "science fiction and fantasy writer",
    "writer and filmmaker",
    "composer and writer",
    "songwriter and singer",
    "guitarist and singer songwriter",
    "and food writer",
    "food writer and",
    "food writer",
    "producer and screenwriter",
    "writer and painter",
    "writer and biographer",
    "comic book artist and writer",
    "musician and writer",
    "screenwriter and film producer",
    "writer and film producer",
    "soul singer and songwriter",
    "art critic and writer",
    "crime fiction writer",
    "editor and writer",
    "comic book writer and artist",
    "writer and screenwriter",
    "folk singer and songwriter",
    "country music singer and songwriter",
    "film director and writer",
    "screenwriter and playwright",
    "writer and playwright",
    "screenwriter and author",
    "writer and publisher",
    "and comics writer",
    "comics writer",
    "singer songwriter and musician",
    "songwriter and musician",
    "literary critic and writer",
    "mystery writer",
    "and crime writer",
    "crime writer",
    "singer songwriter and guitarist",
    "songwriter and guitarist",
    "screenwriter and producer",
    "scriptwriter",
    "painter and writer",
    "writer and literary critic",
    "writer and literary",
    "folk singer songwriter and",
    "folk singer songwriter",
    "television producer and writer",
    "producer and writer",
    "writer and illustrator",
    "director and screenwriter",
    "artist and writer",
    "writer and critic",
    "writer and broadcaster",
    "writer and artist",
    "comedy writer",
    "writer and editor",
    "guitarist and songwriter",
    "novelist and short story writer",
    "songwriter and record producer",
    "writer and record producer",
    "country music singer songwriter",
    "screenwriter and director",
    "writer and director",
    "country singer songwriter",
    "author and screenwriter",
    "novelist and screenwriter",
    "screenwriter and film director",
    "playwright and screenwriter",
    "television writer and producer",
    "writer and producer",
    "comic book writer",
    "musician and songwriter",
    "children writer and",
    "children writer",
    "television writer for",
    "television writer",
    "and science fiction writer",
    "science fiction writer and",
    "science fiction writer",
    "journalist and writer",
    "writer and journalist",
    "writer and poet",
    "poet and writer",
    "singer and songwriter",
    "film director and screenwriter",
    "film director",
    "and singer songwriter",
    "singer songwriter and",
    "singer songwriter",
    "songwriter and vocalist",
    "songwriter and",
    "songwriter",
    "screenwriter and",
    "and screenwriter",
    "screenwriter",
    "writer and musician",
    "director and writer",
    "writer and author",
    "and copywriter",
    "copywriter",
    "and travel writer",
    "singer and writer",
    "travel writer and",
    "travel writer",
    "author and writer",
    "publicist and writer",
    "and hymn writer",
    "hymn writer",
    "writer in",
    "of and writer",
    "based writer",
    "Arabian writer and",
    "jazz writer and",
    "fiction writer and",
    "fiction writer",
    "head writer for",
    "and science writer",
    "and technology writer",
    "and writer",
    "writer and a",
    "writer and",
    "writer on",
    "writer",
]
# Role lists for the remaining categories of this iteration.
# The trailing notes on individual roles are the author's ordering reminders.
sports = ["hiking"]  # after arts

sciences = [
    "manual typewriter expert",  # before arts
    "speech therapist",  # before politics_govt_law
    "speech pathologist",
    "speech recognition researcher",
]

business_farming = ["insurance underwriter"]  # before arts

academia_humanities = ["Yiddish language preservationist"]

spiritual = [
    "ialorixá",
]
social = [
    "society hostess",
]

# Categories with no new roles in this iteration
law_enf_military_operator = []
crime = []
event_record_other = []
other_species = []

<IPython.core.display.Javascript object>

#### Creating `known_for_dict_7` Dictionary of Category Keys and Specific Role Lists of Values

In [849]:
# Mapping each category column name to its list of roles.
# Key order is the order in which the extraction cell processes the categories.
known_for_dict_7 = dict(
    social=social,
    spiritual=spiritual,
    academia_humanities=academia_humanities,
    business_farming=business_farming,
    sciences=sciences,
    politics_govt_law=politics_govt_law,
    law_enf_military_operator=law_enf_military_operator,
    crime=crime,
    event_record_other=event_record_other,
    other_species=other_species,
    arts=arts,
    sports=sports,
)

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [850]:
%%time

# Dictionary version
search_dict = known_for_dict_7

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Checking a sample of rows
df[df['arts'] ==1].sample(2)

CPU times: total: 8min 12s
Wall time: 8min 12s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
12045,1,Ray Walston,", 86, American actor .",https://en.wikipedia.org/wiki/Ray_Walston,19,2001,January,"Emmy, , , winner ,",,,,,,,,,,,,86.0,,United States of America,,"Emmy, 1995, 1996, winner ,",2.995732,0,0,0,0,0,1,0,0,0,0,0,0,1
49635,1,Elena Varzi,", 87, Italian actress , cardiac arrest.",https://en.wikipedia.org/wiki/Elena_Varzi,3,2014,September,,,,cardiac arrest,,,,,,,,,87.0,,Italy,,,1.386294,0,0,0,0,0,1,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [851]:
# Recomputing the number of categories per entry from the category columns
category_columns = list(known_for_dict_7)
df["num_categories"] = df[category_columns].sum(axis=1)

# Counting the entries that still have no category
uncategorized = df["num_categories"].eq(0).sum()
print(f"There are {uncategorized} entries without any known_for category.")

There are 64866 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to build `known_for_dict_8` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [853]:
# Distinct info_2 values ordered from least to most frequent,
# so the most frequent value sits at the end of the list
roles_list = (
    df["info_2"]
    .value_counts(ascending=True)
    .index
    .to_list()
)

<IPython.core.display.Javascript object>

In [855]:
# # Code to check each value
# roles_list.pop()

'journalist'

<IPython.core.display.Javascript object>

In [1478]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [index for index in df.index if "journalist" in df.loc[index, "info"]],
#         "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [1475]:
# # Code to check each specific value
# specific_roles_list.pop()

<IPython.core.display.Javascript object>

In [843]:
# # Example code to quick-screen values that may overlap categories
# df.loc[[index for index in df.index if "and science writer" in df.loc[index, "info"]]]

<IPython.core.display.Javascript object>

In [1476]:
# # Example code to quick-screen values that may overlap categories
# df.loc[[index for index in df.index if "antivirus pioneer" in df.loc[index, "info"]]]

<IPython.core.display.Javascript object>

In [1477]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "photographer and journalist murdered"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [1481]:
# Creating lists for each category
#
# The extraction cell tests `role in item` for every role in list order and
# removes each match from info_2.  A phrase therefore has to be listed BEFORE
# any shorter phrase it contains, otherwise the shorter phrase consumes it
# first and leaves a fragment behind.
politics_govt_law = [
    "and publisher of Hitler",
]

arts = [
    "political cartoonist and journalist",
    "radio and TV journalist",  # must precede "TV journalist"
    "TV journalist",
    "arts journalist and television presenter",
    "journalist and Correspondent",
    "television critic and journalist",
    "photojournalist for magazine",
    "journalist responsible for North Caucasus news on Channel One",
    "journalist and TF",
    "photojournalist and cameraman",
    "CBS journalist",
    "jazz musician and journalist",
    "journalist editor",
    "journalist tried for sedition",
    "journalist and work",
    "photojournalist and documentary filmmaker",
    "journalist and owner of the opposition news website",
    "journalist https: web archive org web http: wcco com local local_story_ htmlurl= |archive date= October }}< ref>",
    "film critic and journalist",
    "journalist for al Arabiya",
    "jazz radio journalist and presenter",
    "journalist who helped return paintings looted by the Nazis",
    "journalist Shot by unknown assailant in Mogadishu",
    "film reviewer and journalist",
    "photojournalist known for her coverage of the War in",
    "journalist who reported on the Revolution",
    "CBS News journalist and correspondent",
    "video journalist",
    "photojournalist and member of the Magnum Photography Collective",
    "journalist and organiser",
    "journalist and father of Mette Marit",
    "fashion journalist and stylist",
    "journalist and documentarian of race issues",
    "talk show host and journalist",
    "BBC presenter and journalist",
    "essayist and journalist",
    "journalist and commentator",
    # must precede "journalist and Pulitzer Prize winning"
    "journalist and Pulitzer Prize winning foreign correspondent",
    "journalist and Pulitzer Prize winning",
    "journalist and landscape architect",
    "radio broadcaster and literary journalist",
    "singer and fashion journalist",
    "journalist and music critic",
    "journalist and blogger",
    "journalists'",
    "journalist and student",
    "country music journalist",
    "film and music journalist",
    "broadcaster and food journalist",
    "fashion journalist and magazine editor",
    "journalist and wine critic",
    "journalist and television news director",
    "journalist and copy editor",
    "radio presenter and music journalist",
    "biographer and journalist",
    "television journalist and host",
    "journalist and syndicated columnist",
    "journalist and opinion researcher",
    "television journalist and announcer",
    "photojournalist and filmmaker",
    "wine journalist",
    "born journalist",
    "student and journalist",
    "journalist and newspaper columnist",
    "journalist and television anchor",
    "journalist and humorist",
    "journalist and father of Neil Young",
    "fashion journalist and author",
    "journalist and Pullitzer Prize winner",
    "television journalist and executive",
    "art critic and journalist",
    "journalist and war correspondent during World War II",
    "journalist and editor of the Daily Express",
    "film critic and television journalist",
    "journalist and radio television personality",
    "photojournalist and art photographer",
    "pioneering broadcast journalist",
    "journalist and literature",
    "journalist pamphleteer",
    "journalist and columnist for the San Francisco Chronicle",
    "publisher and journalist",
    "photographer and photo journalist",
    "photojournalist and Pulitzer Prize winner",
    "journalist and television executive",
    "newspaper journalist and columnist",
    "journalist and essayist",
    "journalist Pulitzer Prize winner",
    "television news reporter and journalist",
    "journalist and magazine editor of GQ",
    "journalist who worked for from through",
    "broadcaster and journalist for the BBC",
    "poet and literary journalist",  # must precede "literary journalist"
    "literary journalist",
    "broadcast journalist for NBC News who was  of",
    "journalist and obituarist",
    "World War II journalist",
    "journalist for Al Jazeera",
    "th century journalist",
    "jazz journalist and artistic director of the Brecon Jazz Festival",
    "freelance cameraman journalist",
    "maverick journalist",
    "soul music promoter and journalist",
    "WWII photojournalist",
    "journalist liver cancer columnist and contributing editor for",
    "journalist for the Daily Mail and award winning foreign correspondent",
    "freelance photo journalist",
    "journalist and newspaper executive",
    "journalist and media magnate",
    "journalist and public relations specialist",
    "photojournalist and photographer",
    "journalist and director of the Tour de",
    "journalist and newspaper director",
    "journalist and cooking books author",
    "radio journalist and news editor",
    "television personality and journalist",
    "journalist and non fiction author",
    "journalist and author of children books",
    "investigative journalist and television producer",
    "rock music journalist and editor",
    "radio disc jockey and rock music journalist",
    "Gujarati author and journalist",
    "jazz trombonist and journalist",
    "investigative journalist and news editor",
    "journalist and news editor",
    "fashion model and journalist",
    "freelance journalist and local",
    "journalist and TV host",
    "television host and journalist",
    "Balochi journalist",
    "literary agent and journalist",
    "radio producer and journalist",
    "publicist and journalist",
    "journalist and composer",
    "model and journalist",
    "journalist and founding editor of",
    "journalist and presenter",
    "singer and journalist",
    "Arabian journalist",
    "journalist and interior designer",
    "journalist and documentarist",
    "journalist and playwright",
    "consumer reporter and journalist",
    "filmmaker and journalist",
    "television and radio presenter",
    "cartoonist and journalist",
    "journalist and publicist",
    "journalist and TV presenter",
    "investigative journalist and crime reporter",
    "journalist and media director",
    "photojournalist and visual artist",
    "journalist and theater critic",
    "music journalist and music executive",
    "war photojournalist",
    "journalist and Olympic official",
    "East journalist",
    "comedian and journalist",
    "radio presenter and journalist",
    "journalist and correspondent",
    "journalist and lyricist",
    "press agent and journalist",
    "journalist and literary critic",
    "journalist and talk show host",
    "news journalist and executive",
    "journalist and news director",
    "television journalist and newsreader",
    "music journalist and biographer",
    "journalist and winner of the Pulitzer Prize",
    "journalist and food critic",
    "journalist and film producer",
    "music journalist and radio host",
    "journalist and television news executive",
    "Serb journalist and",
    "investigative journalist and filmmaker",
    "music radio journalist",
    "music journalist and singer",
    "journalist and singer",
    "show business journalist",
    "television news executive and journalist",
    "classical music journalist",
    "journalist and beauty queen",
    "literary critic and journalist",
    "cookery journalist",
    "broadcast executive and journalist",
    "painter and journalist",
    "photographer and journalist murdered",
    "Druze poet and journalist",
    "music journalist and jazz pianist",
    "newspaper journalist and journalism",
    "journalist and radio director",
    "newspaper journalist and author",
    "journalist and news reporter",
    "music journalist and promoter",
    "fashion journalist and magazine executive",
    "television journalist and editor",
    "arts journalist",
    "journalist and media executive",
    "journalist and entertainer",
    "entertainment journalist",
    "journalist and cameraman",
    "columnist and journalist",
    "journalist and Pulitzer Prize winner",
    "commentator and journalist",
    "journalist and media personality",
    "DJ and journalist",
    "music journalist and author",
    "journalist and documentary filmmaker",
    "television producer and journalist",
    "producer and journalist",
    "journalist and newscaster",
    "war correspondent and journalist",
    "music journalist and editor",
    "artist and journalist",
    "journalist and television personality",
    "journalist and radio producer",
    "journalist and satirist",
    "radio broadcaster and journalist",
    "journalist and foreign correspondent",
    "Pulitzer Prize winning journalist and author",
    "blogger and journalist",
    "investigative journalist and author",
    "newspaper journalist and editor",  # must precede "newspaper journalist and"
    "newspaper journalist and",
    "newspaper journalist",
    "fashion journalist",
    "journalist and radio presenter",
    "journalist and columnist",
    "journalist and newspaper publisher",
    "journalist and poet",
    "journalist and radio host",
    "journalist and comedian",
    "musician and journalist",
    "music critic and journalist",
    "journalist and photographer",
    "journalist and television producer",
    "journalist and news presenter",
    "newspaper editor and journalist",
    "Pulitzer Prize winning journalist",
    "film journalist",
    "journalist and magazine editor",
    "broadcaster and journalist",
    "journalist and reporter",
    "radio and television journalist",
    "journalist and biographer",
    "television presenter and journalist",
    "journalist and film critic",
    "journalist and filmmaker",
    "journalist and war correspondent",
    "novelist and journalist",
    "journalist and television host",
    "photographer and photojournalist",
    "broadcast journalist",
    "poet and journalist",
    "and music journalist",
    "music journalist",
    "radio journalist",
    "journalist and novelist",
    "journalist and newspaper editor",
    "journalist and publisher",
    "investigative journalist",
    "journalist and broadcaster",
    "journalist and television presenter",
    "television journalist and",
    "television journalist",
    "author and journalist",
    "photojournalist and",
    "and photojournalist",
    "photojournalist",
    "journalist and author",
    "photo journalist",
    "editor and journalist",
    "online journalist and",
    "journalist and television",
    "journalist and radio",
    "journalist and executive",
    "caster and journalist",
    "jazz journalist",
    "journalist and critic",
    "journalist and editor of the",
    "journalist and editor of",
    "journalist and editor",
    "journalist of",
    "and journalist",
    "journalist and",
    "journalist",
]
sports = []
sciences = ["antivirus pioneer"]

business_farming = []
academia_humanities = []
law_enf_military_operator = []
spiritual = []
social = []
crime = []
event_record_other = ["and AQAP hostage"]
other_species = []

<IPython.core.display.Javascript object>

#### Hard-coding `cause_of_death` for Entries with Value in `info_2`

In [1482]:
# Hard-coding cause of death present for entry in info_2
# Each Wikipedia link identifies the entry; the value is the cause to record.
causes_by_link = {
    "https://en.wikipedia.org/wiki/Martin_Adler": "shot by unknown assailant",
    "https://en.wikipedia.org/wiki/Marjorie_Williams": "liver cancer",
    "https://en.wikipedia.org/wiki/Rub%C3%A9n_Espinosa": "murdered",
}

for link, cause in causes_by_link.items():
    index = df[df["link"] == link].index
    df.loc[index, "cause_of_death"] = cause

<IPython.core.display.Javascript object>

#### Creating `known_for_dict_8` Dictionary of Category Keys and Specific Role Lists of Values

In [1483]:
# Mapping each category column name to its list of roles.
# Key order is the order in which the extraction cell processes the categories.
known_for_dict_8 = dict(
    social=social,
    spiritual=spiritual,
    academia_humanities=academia_humanities,
    business_farming=business_farming,
    sciences=sciences,
    politics_govt_law=politics_govt_law,
    law_enf_military_operator=law_enf_military_operator,
    crime=crime,
    event_record_other=event_record_other,
    other_species=other_species,
    arts=arts,
    sports=sports,
)

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [1484]:
%%time

# Dictionary version
search_dict = known_for_dict_8

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Checking a sample of rows
df[df['arts'] ==1].sample(2)

CPU times: total: 2min 29s
Wall time: 2min 29s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
10120,14,Peter Wildeblood,", 76, Anglo-Canadian journalist, novelist, playwright and gay rights campaigner.",https://en.wikipedia.org/wiki/Peter_Wildeblood,17,1999,November,,,,novelist,playwright and gay rights campaigner,,,,,,,,76.0,,Europe,Canada,,2.890372,0,0,0,0,0,1,0,0,0,0,0,0,0
69286,1,William Edward Phipps,", 96, American actor , complications from lung cancer.",https://en.wikipedia.org/wiki/William_Edward_Phipps,8,2018,June,", ,",,,complications from lung cancer,,,,,,,,,96.0,,United States of America,,", ,",2.197225,0,0,0,0,0,1,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [1485]:
# Recomputing the number of categories per entry from the category columns
category_columns = list(known_for_dict_8)
df["num_categories"] = df[category_columns].sum(axis=1)

# Counting the entries that still have no category
uncategorized = df["num_categories"].eq(0).sum()
print(f"There are {uncategorized} entries without any known_for category.")

There are 62851 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to build `known_for_dict_9` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [1487]:
# Distinct info_2 values ordered from least to most frequent,
# so the most frequent value sits at the end of the list
roles_list = (
    df["info_2"]
    .value_counts(ascending=True)
    .index
    .to_list()
)

<IPython.core.display.Javascript object>

In [1830]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [1490]:
# Create specific_roles_list for above popped value
# Boolean mask of the entries whose raw `info` text mentions the role; a missing
# `info` counts as no match instead of raising.
mentions_role = df["info"].str.contains("businessman", regex=False, na=False)

# Distinct info_2 values of those entries, most frequent first
specific_roles_list = df.loc[mentions_role, "info_2"].value_counts().index.tolist()

<IPython.core.display.Javascript object>

In [1827]:
# # Code to check each specific value
# specific_roles_list.pop()

<IPython.core.display.Javascript object>

In [1828]:
# # Example code to quick-screen values that may overlap categories
# df.loc[
#     [
#         index
#         for index in df.index
#         if "part owner of the Atlanta Braves" in df.loc[index, "info"]
#     ]
# ]

<IPython.core.display.Javascript object>

In [844]:
# # Example code to quick-screen values that may overlap categories
# df.loc[
#     [
#         index
#         for index in df.index
#         if "outlaw country music singer songwriter" in df.loc[index, "info"]
#     ]
# ]

<IPython.core.display.Javascript object>

In [1829]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "businessman and corporate raider"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [1832]:
# Creating lists for each category
# The extraction cell tests `role in item` for each role in list order and
# removes the match from info_2, so the order of entries within a list matters:
# specific phrases are listed ahead of the generic ones they contain.
politics_govt_law = []

arts = []
sports = [
    "former owner of the Cleveland Cavaliers basketball team",
]
sciences = []

# Phrases built around "businessman", ending with the bare fallbacks
# ("and businessman", "businessman and", "businessman")
business_farming = [
    "businessman and hedge fund pioneer",
    "businessman and catering executive",
    "businessman and fy co owner involved in Fisher Price toy recall",
    "businessman and road transport magnate",
    "businessman who founded Pottery Barn",
    "businessman and founder of SeaWorld",
    "businessman and corporate director",
    "descent millionaire businessman",
    "money manager and businessman",
    "energy industrialist and businessman",
    "Gibraltarian businessman",
    "businessman and the one time head of the Gucci fashion house",
    "Arabian businessman",
    "businessman and manager",
    "investment banker and businessman",
    "businessman oil company executive and billionaire",
    "businessman and multimillionaire",
    "businessman and founder of the Lin Yuan Group",
    "businessman who founded Taylor Woodrow",
    "businessman and member of the Agnelli family",
    "businessman and CEO of The Coca Cola Company",
    "businessman and international financier",
    "businessman and co founder of DHL",
    "billionaire heir",
    "businessman and shopping center development pioneer",
    "businessman and member of the supermarket Sainsbury family",
    "businessman and CEO of Royal Shell",
    "businessman and farmer",
    "businessman and founder of The Rouse Company",
    "businessman and playboy",
    "businessman and industrialist",
    "businessman responsible for marketing the hula hoop and frisbee",
    "businessman and shipper",
    "businessman and candy magnate",
    "businessman and co founder of Sony along with Masaru Ibuka",
    "heir and businessman",
    "Born businessman",
    "businessman and president of The Walt Disney Company",
    "businessman and nightclub owner",
    "businessman and crop forecaster",
    "businessman and union leader",
    "billionaire and businessman",
    "billionaire businessman and",
    "businessman and vintner",
    "agricultural businessman",
    "cork businessman",
    "businessman and promoter",
    "internet entrepreneur and businessman",
    "businessman and investor",
    "businessman and uniform supplier to the International Ice Hockey Federation",
    "businessman and space tourist",
    "businessman and trade unionist",
    "businessman and founder of Café Coffee Day",
    "Arabian cosmetics businessman",
    "businessman and co founder of Walmart",
    "businessman and real estate mogul",
    "businessman and retail executive",
    "retail businessman",
    "emerald businessman",
    "transport businessman",
    "businessman and billionaire",
    "mixologist and businessman",
    "grocery businessman",
    "businessman and venture capitalist",
    "restaurateur and businessman",
    "businessman and investment banker",
    "businessman and winemaker",
    "beekeeper and businessman",
    "food production businessman",
    "businessman and company director",
    "millionaire businessman",
    "equine pharmaceutical businessman and",
    "quality control expert and businessman",
    "accountant and businessman",
    "real estate businessman",
    "industrialist and businessman",
    "businessman and corporate raider",
    "banker and businessman",
    "businessman and entrepreneur",
    "entrepreneur and businessman",
    "billionaire businessman",
    "n businessman and",
    "and businessman",
    "businessman and",
    "businessman",
]
academia_humanities = ["scholastic test preparation pioneer"]
law_enf_military_operator = []
spiritual = []
social = ["philanthropist known for his contribution to Central Park"]
crime = []
event_record_other = ["best known for rescuing Howard Hughes in plane crash"]
other_species = []

<IPython.core.display.Javascript object>

#### Creating the Ninth-Iteration Dictionary (Reusing the Name `known_for_dict_8`) of Category Keys and Specific Role Lists of Values

In [1833]:
# Mapping each category column name to its list of roles.
# NOTE: this rebinds `known_for_dict_8`, the name already used by the previous
# iteration; the extraction and count cells that follow read this name.
# Key order is the order in which the extraction cell processes the categories.
known_for_dict_8 = dict(
    social=social,
    spiritual=spiritual,
    academia_humanities=academia_humanities,
    business_farming=business_farming,
    sciences=sciences,
    politics_govt_law=politics_govt_law,
    law_enf_military_operator=law_enf_military_operator,
    crime=crime,
    event_record_other=event_record_other,
    other_species=other_species,
    arts=arts,
    sports=sports,
)

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [1839]:
%%time

# Dictionary version
search_dict = known_for_dict_8

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Checking a sample of rows
df[df['business_farming'] ==1].sample(2)

CPU times: total: 46 s
Wall time: 46.1 s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
37645,6,William Denis Brown III,", 80, American lawyer and businessman, Alzheimer's disease.",https://en.wikipedia.org/wiki/William_Denis_Brown_III,7,2012,March,,,lawyer,Alzheimer disease,,,,,,,,,80.0,,United States of America,,,2.079442,0,0,0,0,1,0,0,0,0,0,0,0,1
89456,23,Hans Rasmus Astrup,", 82, Norwegian businessman and art collector, founder of Astrup Fearnley Museum of Modern Art.",https://en.wikipedia.org/wiki/Hans_Rasmus_Astrup_(born_1939),7,2021,April,,,art collector,founder of Astrup Fearnley Museum of Modern Art,,,,,,,,,82.0,,Norway,,,2.079442,0,0,0,0,1,0,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [1840]:
# Recomputing the number of categories per entry from the category columns
category_columns = list(known_for_dict_8)
df["num_categories"] = df[category_columns].sum(axis=1)

# Counting the entries that still have no category
uncategorized = df["num_categories"].eq(0).sum()
print(f"There are {uncategorized} entries without any known_for category.")

There are 61646 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to build `known_for_dict_10` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [1843]:
# Distinct info_2 values ordered from least to most frequent,
# so the most frequent value sits at the end of the list
roles_list = (
    df["info_2"]
    .value_counts(ascending=True)
    .index
    .to_list()
)

<IPython.core.display.Javascript object>

In [1845]:
# Removing and displaying the last value left in roles_list;
# every run of this cell moves on to the next value
roles_list.pop(-1)

'author'

<IPython.core.display.Javascript object>

In [1846]:
# Create specific_roles_list for above popped value
# Boolean mask of the entries whose raw `info` text mentions the role; a missing
# `info` counts as no match instead of raising.
mentions_role = df["info"].str.contains("author", regex=False, na=False)

# Distinct info_2 values of those entries, most frequent first
specific_roles_list = df.loc[mentions_role, "info_2"].value_counts().index.tolist()

<IPython.core.display.Javascript object>

In [1847]:
# Removing and displaying the last value left in specific_roles_list;
# every run of this cell moves on to the next value
specific_roles_list.pop(-1)

'anarchist author and poet'

<IPython.core.display.Javascript object>

In [843]:
# # Example code to quick-screen values that may overlap categories
# df.loc[[index for index in df.index if "and science writer" in df.loc[index, "info"]]]

<IPython.core.display.Javascript object>

In [844]:
# # Example code to quick-screen values that may overlap categories
# df.loc[
#     [
#         index
#         for index in df.index
#         if "outlaw country music singer songwriter" in df.loc[index, "info"]
#     ]
# ]

<IPython.core.display.Javascript object>

In [845]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "outlaw country music singer songwriter"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [None]:
# Creating lists for each category (empty template for the next iteration)
politics_govt_law = []
arts = []
sports = []
sciences = []
business_farming = []
academia_humanities = []
law_enf_military_operator = []
spiritual = []
social = []
crime = []
event_record_other = []
other_species = []

#### Creating `known_for_dict_10` Dictionary of Category Keys and Specific Role Lists of Values

In [849]:
# Mapping each category column name to its list of roles.
# Key order is the order in which the extraction cell processes the categories.
known_for_dict_10 = dict(
    social=social,
    spiritual=spiritual,
    academia_humanities=academia_humanities,
    business_farming=business_farming,
    sciences=sciences,
    politics_govt_law=politics_govt_law,
    law_enf_military_operator=law_enf_military_operator,
    crime=crime,
    event_record_other=event_record_other,
    other_species=other_species,
    arts=arts,
    sports=sports,
)

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [None]:
%%time

# Dictionary version
search_dict = known_for_dict_10

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Checking a sample of rows
df[df['arts'] ==1].sample(2)

#### Checking the Number of Rows without a First Category

In [None]:
# Recomputing the number of categories per entry from the category columns
category_columns = list(known_for_dict_10)
df["num_categories"] = df[category_columns].sum(axis=1)

# Counting the entries that still have no category
uncategorized = df["num_categories"].eq(0).sum()
print(f"There are {uncategorized} entries without any known_for category.")

#### Observations:
- We will proceed to build `known_for_dict_11` for the next iteration.

In [1841]:
# Printing a marker so the output shows that execution reached this cell
print("dunzo!")

# Sound notification when cell executes
chime.success()

dunzo!


<IPython.core.display.Javascript object>

#### Finding `known_for` Roles in `info_2`

In [6]:
# Counting each distinct info_2 value, ordered from least to most frequent
info_2_counts = df["info_2"].value_counts(ascending=True)

# Keeping only the values themselves, in that same order
roles_list = info_2_counts.index.tolist()

<IPython.core.display.Javascript object>

In [7]:
# Code to check each value: removes the last entry of roles_list and displays it.
# roles_list was built in ascending count order, so this is the most frequent
# value still in the list; re-running the cell steps through the values one by one.
roles_list.pop()

''

<IPython.core.display.Javascript object>

In [8]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [index for index in df.index if "writer" in df.loc[index, "info"]], "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [842]:
# # Code to check each specific value
# specific_roles_list.pop()

<IPython.core.display.Javascript object>

In [843]:
# # Example code to quick-screen values that may overlap categories
# df.loc[[index for index in df.index if "and science writer" in df.loc[index, "info"]]]

<IPython.core.display.Javascript object>

In [844]:
# # Example code to quick-screen values that may overlap categories
# df.loc[
#     [
#         index
#         for index in df.index
#         if "outlaw country music singer songwriter" in df.loc[index, "info"]
#     ]
# ]

<IPython.core.display.Javascript object>

In [845]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "outlaw country music singer songwriter"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [None]:
# Creating one role list per known_for category (all start out empty)
social = []
spiritual = []
academia_humanities = []
business_farming = []
sciences = []
politics_govt_law = []
law_enf_military_operator = []
crime = []
event_record_other = []
other_species = []
arts = []
sports = []

#### Creating `known_for_dict_7` Dictionary of Category Keys and Specific Role Lists of Values

In [849]:
# Mapping each category name to its role list.
# The keyword order below is the order in which categories are searched.
known_for_dict_7 = dict(
    social=social,
    spiritual=spiritual,
    academia_humanities=academia_humanities,
    business_farming=business_farming,
    sciences=sciences,
    politics_govt_law=politics_govt_law,
    law_enf_military_operator=law_enf_military_operator,
    crime=crime,
    event_record_other=event_record_other,
    other_species=other_species,
    arts=arts,
    sports=sports,
)

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [None]:
%%time

# Dictionary version
search_dict = known_for_dict_7

# Column to check
column = "info_2"

# Boolean mask (same index as df) of the rows that have a value in the column
dataframe = df[column].notna()

# For each role, flag its category and remove the role text from the column.
# Roles are applied one at a time, in dictionary order, because removing a role
# changes the text that the following roles are matched against.
for category, category_lst in search_dict.items():
    for role in category_lst:
        # An empty role is a substring of every entry, so it cannot identify a category
        if not role:
            continue

        # Literal (non-regex) substring match; rows with no text never match
        has_role = dataframe & df[column].str.contains(role, regex=False, na=False)
        if not has_role.any():
            continue

        # Flag the category, then strip the matched role out of the remaining text
        df.loc[has_role, category] = 1
        df.loc[has_role, column] = (
            df.loc[has_role, column].str.replace(role, "", regex=False).str.strip()
        )

# Checking a sample of rows
df[df["arts"] == 1].sample(2)

#### Checking the Number of Rows without a First Category

In [None]:
# Summing the category indicator columns for each row
category_columns = list(known_for_dict_7)
df["num_categories"] = df[category_columns].sum(axis=1)

# Reporting how many rows still have no category
uncategorized = df[df["num_categories"] == 0]
print(f"There are {len(uncategorized)} entries without any known_for category.")

#### Observations:
- We will proceed to build `known_for_dict_8` for the next iteration.