# Wikipedia Notable Life Expectancies
# [Notebook 6 : Data Cleaning Part 5](https://github.com/teresahanak/wikipedia-life-expectancy/blob/main/wp_life_expect_data_clean5_thanak_2022_07_17.ipynb)
### Context

The
### Objective

The
### Data Dictionary
- Feature: Description

### Importing Libraries

In [1]:
# To structure code automatically
%load_ext nb_black

# To import/export sqlite databases
import sqlite3 as sql

# To save/open python objects in pickle file
import pickle

# To help with reading, cleaning, and manipulating data
import pandas as pd
import numpy as np
import re

# To define maximum number of columns to be displayed in a dataframe
pd.set_option("display.max_columns", None)
# To define the maximum number of rows to be displayed in a dataframe
pd.set_option("display.max_rows", 200)

# To suppress warnings
# import warnings

# warnings.filterwarnings("ignore")

# To set some visualization attributes
pd.set_option("max_colwidth", 150)

# To play auditory cue when cell has executed, has warning, or has error and set chime theme
import chime

chime.theme("zelda")

<IPython.core.display.Javascript object>

## Data Overview

### Reading, Sampling, and Checking Data Shape

In [2]:
# Opening the SQLite database and loading the whole table into pandas
conn = sql.connect("wp_life_expect_clean4.db")
data = pd.read_sql("SELECT * FROM wp_life_expect_clean4", conn)

# Keeping `data` untouched and working on a separate copy
df = data.copy()

# Reporting the dimensions of the working copy
n_rows, n_cols = df.shape
print(f"There are {n_rows} rows and {n_cols} columns.")

# Previewing the first 2 rows
df.head(2)

There are 98060 rows and 38 columns.


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
0,1,William Chappell,", 86, British dancer, ballet designer and director.",https://en.wikipedia.org/wiki/William_Chappell_(dancer),21,1994,January,,,dancer,ballet designer and director,,,,,,,,,86.0,,United Kingdom of Great Britain and Northern Ireland,,,3.091042,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,Raymond Crotty,", 68, Irish economist, writer, and academic.",https://en.wikipedia.org/wiki/Raymond_Crotty,12,1994,January,,,economist,writer,and academic,,,,,,,,68.0,,Ireland,,,2.564949,0,0,0,0,0,0,0,0,0,0,0,0,0


<IPython.core.display.Javascript object>

In [3]:
# Checking last 2 rows of the data
df.tail(2)

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
98058,9,Aamir Liaquat Hussain,", 50, Pakistani journalist and politician, MNA .",https://en.wikipedia.org/wiki/Aamir_Liaquat_Hussain,99,2022,June,", since",,journalist,MNA,,,,,,,,,50.0,,Pakistan,,"2002 2007, since 2018",4.60517,0,0,0,0,0,0,0,0,1,0,0,0,1
98059,9,Zou Jing,", 86, Chinese engineer, member of the Chinese Academy of Engineering.",https://en.wikipedia.org/wiki/Zou_Jing_(engineer),3,2022,June,,,engineer,member of the Academy of Engineering,,,,,,,,,86.0,,"China, People's Republic of",,,1.386294,0,0,0,0,0,0,0,0,0,0,0,0,0


<IPython.core.display.Javascript object>

In [4]:
# Checking a sample of the data
df.sample(5)

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
75719,12,Fernando J. Corbató,", 93, American computer scientist, developer of Multics, complications from diabetes.",https://en.wikipedia.org/wiki/Fernando_J._Corbat%C3%B3,8,2019,July,,,computer scientist,developer of Multics,complications from diabetes,,,,,,,,93.0,,United States of America,,,2.197225,0,0,0,0,0,0,0,0,0,0,0,0,0
22392,21,Hassan Gouled Aptidon,", 90, Dijiboutian first President, natural causes.",https://en.wikipedia.org/wiki/Hassan_Gouled_Aptidon,10,2006,November,,,first President,natural causes,,,,,,,,,90.0,,Djibouti,,,2.397895,0,0,0,0,0,0,0,0,0,0,0,0,0
97284,18,Barbara Hall,", 99, British crossword compiler and advice columnist.",https://en.wikipedia.org/wiki/Barbara_Hall_(editor),10,2022,April,,,crossword compiler and advice columnist,,,,,,,,,,99.0,,United Kingdom of Great Britain and Northern Ireland,,,2.397895,0,0,0,0,0,0,0,0,0,0,0,0,0
88324,9,Shuichi Murakami,", 70, Japanese jazz drummer.",https://en.wikipedia.org/wiki/Shuichi_Murakami,3,2021,March,,,jazz drummer,,,,,,,,,,70.0,,Japan,,,1.386294,0,0,0,0,0,0,0,0,0,0,0,0,0
14705,30,Goo Arlooktoo,", 38, Canadian politician, heart attack.",https://en.wikipedia.org/wiki/Goo_Arlooktoo,7,2002,April,,,,heart attack,,,,,,,,,38.0,,Canada,,,2.079442,0,0,0,0,0,0,0,0,1,0,0,0,1


<IPython.core.display.Javascript object>

### Checking Data Types, Duplicates, and Null Values

In [5]:
# Checking data types and null values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98060 entries, 0 to 98059
Data columns (total 38 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   day                        98060 non-null  object 
 1   name                       98060 non-null  object 
 2   info                       98060 non-null  object 
 3   link                       98060 non-null  object 
 4   num_references             98060 non-null  int64  
 5   year                       98060 non-null  int64  
 6   month                      98060 non-null  object 
 7   info_parenth               36661 non-null  object 
 8   info_1                     22 non-null     object 
 9   info_2                     98028 non-null  object 
 10  info_3                     48895 non-null  object 
 11  info_4                     10264 non-null  object 
 12  info_5                     1265 non-null   object 
 13  info_6                     181 non-null    obj

<IPython.core.display.Javascript object>

#### Observations:
- With our dataset loaded, we can pick up where we left off with extracting `known_for` values by creating `known_for_dict_7`

### Extracting `known_for` Continued

#### Finding `known_for` Roles in `info_2`

In [6]:
# Obtaining values for column and their counts
roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [7]:
# Code to check each value
roles_list.pop()

''

<IPython.core.display.Javascript object>

In [8]:
# Create specific_roles_list for above popped value
# Vectorized, literal (non-regex) substring test on `info`; this selects the same
# rows, in the same order, as checking `"writer" in df.loc[index, "info"]` row by row,
# but without one scalar .loc lookup per row. `na=False` leaves out missing values
# instead of raising on them.
contains_writer = df["info"].str.contains("writer", regex=False, na=False)

# Distinct `info_2` values of those rows, most frequent first
specific_roles_list = df.loc[contains_writer, "info_2"].value_counts().index.tolist()

<IPython.core.display.Javascript object>

In [246]:
# specific_roles_list.index("reggae singer and songwriter")

<IPython.core.display.Javascript object>

In [16]:
specific_roles_list = specific_roles_list[:810]

<IPython.core.display.Javascript object>

In [294]:
# Code to check each specific value
specific_roles_list.pop()

'National Film Award winning filmmaker'

<IPython.core.display.Javascript object>

In [86]:
# Example code to quick-screen values that may overlap categories
# Literal substring match on `info` (regex=False); rows with a missing `info` are excluded
df[df["info"].str.contains("ialorixá", regex=False, na=False)]

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
72281,27,Mãe Stella de Oxóssi,", 93, Brazilian ialorixá and writer, stroke.",https://en.wikipedia.org/wiki/M%C3%A3e_Stella_de_Ox%C3%B3ssi,4,2018,December,,,ialorixá and writer,stroke,,,,,,,,,93.0,,Brazil,,,1.609438,0,0,0,0,0,0,0,0,0,0,0,0,0


<IPython.core.display.Javascript object>

In [189]:
# Example code to quick-screen values that may overlap categories
# Literal substring match on `info` (regex=False); rows with a missing `info` are excluded
df[df["info"].str.contains("outdoor", regex=False, na=False)]

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
3369,9,Tom Cole,", 88-89, Australian labourer, outdoorsman and author.",https://en.wikipedia.org/wiki/Tom_Cole_(stockman),5,1995,December,,,labourer,outdoorsman and author,,,,,,,,,88.5,,Australia,,,1.791759,0,0,0,0,0,0,0,0,0,0,0,0,0
5029,25,Tony Dauksza,", 84, American football player, film-maker, and outdoorsman.",https://en.wikipedia.org/wiki/Tony_Dauksza,14,1996,December,,,,film maker,and outdoorsman,,,,,,,,84.0,,United States of America,,,2.70805,0,0,0,0,0,0,1,0,0,0,0,0,1
8409,13,Michel Trudeau,", 23, Canadian outdoorsman, son of Prime Minister Pierre Trudeau and brother of PM Justin Trudeau, avalanche.",https://en.wikipedia.org/wiki/Michel_Trudeau,14,1998,November,,,outdoorsman,son of Prime Minister Pierre Trudeau and brother of PM Justin Trudeau,avalanche,,,,,,,,23.0,,Canada,,,2.70805,0,0,0,0,0,0,0,0,0,0,0,0,0
13367,31,Rex Forrester,", 72, New Zealand hunting and fishing specialist and outdoor sports author.",https://en.wikipedia.org/wiki/Rex_Forrester,5,2001,August,,,hunting and fishing specialist and outdoor sports author,,,,,,,,,,72.0,,New Zealand,,,1.791759,0,0,0,0,0,0,0,0,0,0,0,0,0
14737,5,Earl Shaffer,", 83, American outdoorsman and author.",https://en.wikipedia.org/wiki/Earl_Shaffer,11,2002,May,,,outdoorsman and author,,,,,,,,,,83.0,,United States of America,,,2.484907,0,0,0,0,0,0,0,0,0,0,0,0,0
26699,19,Tony Dean,", 67, American outdoors broadcaster, complications from appendectomy.",https://en.wikipedia.org/wiki/Tony_Dean_(conservationist),3,2008,October,,,outdoors broadcaster,complications from appendectomy,,,,,,,,,67.0,,United States of America,,,1.386294,0,0,0,0,0,0,0,0,0,0,0,0,0
40781,23,Adolph Peschke,", 98, American outdoorsman, author and project designer in the Boy Scouts of America.",https://en.wikipedia.org/wiki/Adolph_Peschke,10,2012,November,,,outdoorsman,author and project designer in the Boy Scouts of,,,,,,,,,98.0,,United States of America,,,2.397895,0,0,0,0,0,0,0,0,0,0,0,0,0
46093,27,Åke Nordin,", 77, Swedish entrepreneur, founder of Fjällräven outdoor equipment.",https://en.wikipedia.org/wiki/%C3%85ke_Nordin,4,2013,December,,,entrepreneur,founder of Fjällräven outdoor equipment,,,,,,,,,77.0,,Sweden,,,1.609438,0,0,0,0,0,0,0,0,0,0,0,0,0
54120,17,Duff Holbrook,", 92, American biologist and outdoorsman, designer of rocket net for use in hunting.",https://en.wikipedia.org/wiki/Duff_Holbrook,4,2015,July,,,biologist and outdoorsman,designer of rocket net for use in hunting,,,,,,,,,92.0,,United States of America,,,1.609438,0,0,0,0,0,0,0,0,0,0,0,0,0
94552,10,Kev Reynolds,", 78, English outdoor writer.",https://en.wikipedia.org/wiki/Kev_Reynolds,9,2021,December,,,outdoor writer,,,,,,,,,,78.0,,United Kingdom of Great Britain and Northern Ireland,,,2.302585,0,0,0,0,0,0,0,0,0,0,0,0,0


<IPython.core.display.Javascript object>

In [214]:
# Example code to quick-check a specific entry
df[df["info_2"] == "food analyst and writer"]

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
86959,20,Wayne Roberts,", 76, Canadian food analyst and writer.",https://en.wikipedia.org/wiki/Wayne_Roberts_(activist),16,2021,January,,,food analyst and writer,,,,,,,,,,76.0,,Canada,,,2.833213,0,0,0,0,0,0,0,0,0,0,0,0,0


<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [None]:
# Creating lists for each category
# Entries are kept in their original order
politics_govt_law = [
    "former executive editor of Richard Nixon and speech",
    "and free speech advocate",
    "freedom of speech advocate",
    "lawyer and free speech activist",
    "civil rights activist and speech",
    "and speech",
    "political speech",
    "and free speech activist",
    "speech",
    "Sestigers",
]

# Roles assigned to the `arts` category.
# The extraction cell tests each entry as a substring of the column value, in list
# order, and removes it on a match, so the longer, specific phrases must stay ahead
# of the short generic patterns at the end of the list.
arts = [
    'rock and roll singer songwriter',
    'writer and radio dramatist',
    'writer of Berserker series',
    'writer and fashion editor',
    'comedy writer and lyricist',
    'writer and novelist in the Kannada language',
    'television writer for and',
    'writer of books and modules',
    'science fiction and western writer',
    'folk blues singer songwriter',
    'songwriter and columnist',
    'children writer and comic strip cartoonist',
    'spy fiction writer',
    'Emmy winning television comedy writer',
    'political journalist and food writer',
    'writer of all episodes of the long running radio serial',
    'writer of descent',
    'artist and screenwriter',
    'wine writer',
    'thriller writer and James Bond continuation novelist',
    'jazz writer',
    'writer and beer expert',
    'jazz musician and comedy writer',
    'television writer blacklisted by the House Un Activities Committee',
    'Motown songwriter and record producer',
    'writer and radio producer',
    'comic book writer and editor for DC Comics',
    'Chukchi language writer',
    'writer and script editor',
    'Christian rock singer songwriter',
    'writer of detective fiction',
    'novelist and non fiction writer',
    'cookbook writer',
    'playwright and scriptwriter',
    'motion picture writer director',
    'Broadway producer and writer',
    'film and television producer and screenwriter',
    'vocalist and songwriter',
    'country music manager and songwriter',
    'rock musician and songwriter',
    'classical guitarist and writer',
    'techno singer songwriter',
    'songwriter and drummer',
    'travel writer and broadcaster',
    'writer and editor',
    'writer and national artist',
    'Emmy nominated television writer',
    'nonfiction writer',
    'copywriter and executive',
    'comics writer and newspaper editor',
    'film director and script writer',
    'literary agent and writer',
    'writer of screenplays',
    'Braziliam writer and journalist',
    'co writer of the s national anthem',
    'songwriter and folksinger',
    'writer and radio broadcaster',
    'writer and journalist on',
    'novelist and soap opera writer',
    'country western songwriter',
    'libertarian writer and',
    'TV writer',
    'gardening writer',
    'architectural critic and writer',
    'animation writer producer',
    'and chess writer',
    'producer and writer of TV comedies',
    'lead singer songwriter musician of rock band God Lives Underwater',
    'resident writer and film critic',
    'writer and puppeteer for',
    'long time Associated Press cuisine writer',
    '"New Wave" vallenato singer and songwriter',
    'jazz pianist and film music writer',
    'R&B songwriter and record producer',
    'dramatist and writer',
    'writer editor publisher',
    'writer and Nobel Prize winner',
    'mystery novel writer',
    'Cree writer',
    'film producer and writer',
    'writer and architectural conservationist',
    'pop singer and writer',
    'poet writer',
    'writer and art critic',
    'experimental writer',
    'writer and wargame designer',
    'writer and news presenter',
    'songwriter and harpist',
    'writer and sound',
    'writer and music journalist',
    'potter and writer',
    'silent film screenwriter',
    'illustrator and writer',
    'sitcom writer',
    'percussionist and songwriter',
    'rock keyboardist and songwriter',
    'writer and speaker',
    'rhythm and blues and jazz musician and songwriter',
    'radio writer',
    'writer and television director',
    'writer and television personality',
    'and technical writer',
    'television writer and director',
    'short story and travel writer',
    'Motown songwriter',
    'psychedelic rock songwriter and musician',
    'Lakota writer and',
    'Romani writer',
    'BBC Radio writer',
    'manga artist and writer',
    'musician and score writer',
    'game show writer',
    'film producer and television screenwriter',
    'poet and writer in Gujarati language',
    'comedian and television writer',
    'writer and book editor',
    'television comedy writer and producer',
    'comic book writer and music critic',
    'writer and magician',
    'anime writer and producer',
    'writer and lyricist',
    'detective writer',
    'television and radio writer',
    'fiddler and singer songwriter',
    'Emmy Award winning screenwriter',
    'cook and food writer',
    'writer and investigative journalist',
    'singer songwriter and dancer',
    'songwriter and guitarist',
    'Academy Award nominated playwright and screenwriter',
    'First Nations singer songwriter',
    'cookery and travel writer',
    'songwriter and radio talk show host',
    'television host and writer',
    'Emmy Award winning television writer',
    'rockabilly musician and songwriter',
    'film and TV producer and writer',
    'Grammy winning songwriter',
    'jazz organist and songwriter',
    'copywriter and product spokesperson',
    'writer and newspaper editor',
    'speaker and writer',
    'painter and television screenplay writer',
    'comic strip writer and editor',
    'screenplay writer',
    'writer and editor of science fiction and fantasy',
    'writer and television journalist',
    'TV producer and writer',
    'Academy Award nominated screenwriter',
    'script writer and producer',
    'Urdu language writer',
    "musician and songwriter for Guns N' Roses",
    'songwriter and music promoter',
    'mystery writer and novelist',
    'writer and small press printer',
    'writer and theatre critic',
    'and poet and screenwriter',
    'writer and music executive',
    'playwright and writer',
    'poet and prose writer',
    'songwriter and television producer',
    'author and ghostwriter',
    'folksinger and songwriter',
    'writer under the pseudonym',
    'comic books illustrator and writer',
    'writer and Pulitzer Prize winner',
    'folk music songwriter and musician',
    'television scriptwriter',
    'songwriter and music manager',
    'writer and surrealist poet',
    'R&B and blues singer and songwriter',
    'short story writer and scriptwriter',
    'writer and Nobel Prize laureate',
    'comedy writer and radio and television personality',
    'writer and expert on country houses',
    'Pulitzer Prize winning war correspondent and writer',
    'writer and science fiction author',
    'novelist and a mystery writer',
    'Western writer',
    'writer and raconteur',
    'tenor and songwriter',
    'singer poet and songwriter',
    'folk music singer songwriter',
    'writer of Westerns',
    'ballet and dance writer',
    'author and the wife of writer Dylan Thomas',
    'literary critic and writer',
    'and film director and screenwriter',
    'writer and humorist',
    'composer and writer on music',
    'lyricist and songwriter',
    'writer and author',
    'rockabilly and country singer and songwriter',
    'writer and only child author Jack Kerouac and Joan Haverty Kerouac',
    'Poet; writer',
    'humorist and writer',
    'Urdu poet and screenwriter',
    'director and scriptwriter',
    'playwright and screenwriter specialising in comedies',
    'writer of crime fiction',
    'film critic and travel writer',
    'writer who won the Newbery Medal',
    'writer of short stories and poetry',
    'editor and writer on music and dance',
    'songwriter and impresario',
    'journalist and writer on music and',
    'writer and film',
    'songwriter and session musician',
    'screenwriter who won the Academy Award for Best Story for',
    'and writer on the language',
    'gospel bass singer and songwriter',
    'folk rock singer songwriter and film producer',
    'producer and film and television writer',
    'screenwriter and World War II foreign correspondent',
    'writer of fiction and screenplays',
    'songwriter and jazz pianist',
    'architectural photographer and writer',
    'writer of literature',
    '; also a world renowned poet and writer',
    'singer songwriter and guitarist of Big Country and The Raphaels',
    'comic strip and comic book writer and cartoonist',
    'Sami writer',
    'country and rockabilly singer',
    'born singer songwriter',
    'country music songwriter and record producer',
    'writer of the Old West',
    'Arabian writer of Arab descent',
    'rockabilly songwriter and singer',
    'and writer on Old Testament issues',
    'Blackfeet and Gros Ventre writer and poet',
    'newspaper reporter and writer',
    'comic book writer artist',
    'R&B musician songwriter',
    'agent for writers and cartoonists',
    'writer of stories and news articles',
    'singer songwriter and playwright',
    'writer of more than children books',
    'classical music broadcaster and writer',
    'entertainer and songwriter',
    'writer and illustrator of children books',
    'bebop and jazz pianist and songwriter',
    'television and film producer and screenwriter',
    'popular singer and songwriter',
    'screenwriter and producer for television',
    'Creole zydeco musician and songwriter',
    'writer of drama and short stories',
    'juvenile literature writer',
    'short story writer',
    'novelist and writer for children',
    'writer of comic books and strips',
    'editorial writer',
    'rhythm and blues and jazz singer and songwriter',
    'R&B musician and songwriter',
    'writer and director of plays and films',
    'short story writer and essayist',
    'jazz writer and record producer',
    'writer also known as ""',
    'reggae singer songwriter',
    'screenwriter and novelist',
    'and freethought writer',
    'comic book writer for DC Comics',
    'writer and movie director',
    'writer of pulp fiction',
    'media personality and writer',
    'songwriter and bass guitarist',
    'jazz and calypso musician and songwriter',
    'fiction writer and essayist',
    'poet and short story writer',
    'screenwriter and filmmaker',
    'novelist and story writer',
    'children author and writer',
    'comic strip writer and artist',
    'author and screenplay writer',
    'novelist and radio script writer',
    'writer from Nadu',
    'writer and music producer',
    'rock and roll singer and songwriter',
    'telenovela screenwriter',
    'comedy writer and theater owner',
    'cartoonist and comics writer and artist',
    'food writer and broadcaster',
    'documentary film director and screenwriter',
    'cartoonist and comic book writer',
    'journalist and culture writer',
    'rock singer and songwriter',
    'TV writer and producer',
    'comics writer and artist',
    'film director screenwriter and producer',
    'writer and masters',
    'jazz guitarist and songwriter',
    'nomadic lifestyle artist and writer',
    'writer and artist model',
    'bookseller and writer',
    'writer and art curator',
    'singer songwriter and keyboardist',
    'chef and cookbook writer',
    'poet and fiction writer',
    'music producer and songwriter',
    'Inuktitut pop singer and songwriter',
    'travel writer and journalist',
    'Guianese writer',
    'writer and golf historian',
    'writer and film producer',
    'film director and short story writer',
    'reggae singer and songwriter',
    'Hall of Fame country singer songwriter and musician',
    'troubadour and dialect writer',
    'designer and writer',
    'writer and dramaturge',
    'TV writer and novelist',
    'folk rock singer and songwriter',
    'and screenwriter and playwright',
    'writer and performer',
    'novelist and children writer',
    'Inuk writer',
    'songwriter and music publisher',
    'writer and literary',
    'Hall of Fame blues rock singer songwriter',
    'writer and existentialist poet',
    'comedy screenwriter',
    'children illustrator and writer',
    'photographer and writer',
    'Hall of Fame science fiction writer',
    'soul singer songwriter',
    'novelist and television writer',
    'country music journalist',
    'Hall of Fame guitarist and songwriter',
    'Hall of Fame country music singer songwriter',
    'jazz singer and songwriter',
    'singer songwriter and radio host',
    'blues singer songwriter and harmonicist',
    'youth writer',
    'musical theatre producer and writer',
    'red dirt singer songwriter',
    'biographer and children writer',
    'songwriter and television presenter',
    'writer and language',
    'radio presenter and writer',
    'theatrical and film critic',
    'newspaper editor and writer',
    'author and television writer',
    'house and gospel singer songwriter',
    'screenwriter and lyricist',
    'travel and food writer',
    'Gujarati writer and journalist',
    'author and screenwriter',
    'prankster and writer',
    'pasillo and pasacalle singer songwriter',
    'comic book writer and publisher',
    'Hall of Fame pianist and singer songwriter',
    'journalist and writer publicist',
    'trumpeter and songwriter',
    'graphic artist and writer',
    'horn player and writer',
    'writer and singer songwriter',
    'singer songwriter and painter',
    'writer and humourist',
    'bassist and singer songwriter',
    'investigative journalist and writer',
    'Gujarati language writer and',
    'comics writer and illustrator',
    'film director and scriptwriter',
    'folk singer songwriter and journalist',
    'visual artist and writer',
    'documentary film director and writer',
    'writer and cabaretist',
    'television writer and playwright',
    'First Nations writer',
    'comic book writer and novelist',
    'photojournalist and writer',
    'R&B singer songwriter and musician',
    'media critic and writer',
    'writer and bel canto singer',
    'comic book editor and writer',
    'jazz pianist and songwriter',
    'Native writer and',
    'writer and journalism',
    'radio broadcaster and writer',
    'songwriter and poet',
    'writer and lay',
    'gardening writer and television broadcaster',
    'singer songwriter and writer',
    'death metal singer songwriter and guitarist',
    'telenovela writer',
    'writer and orator',
    'children writer and book editor',
    'cinematic historian and writer',
    'country singer songwriter and music executive',
    'singer songwriter and media director',
    'art collector and writer',
    'writer and visual artist',
    'southern soul singer songwriter',
    'singer songwriter and reality show contestant',
    'screenwriter and essayist',
    'comedian and screenwriter',
    'singer songwriter and producer',
    'sitarist and writer',
    'writer and chef',
    'music producer and singer songwriter',
    'rock vocalist and theme songwriter',
    'music writer and radio commentator',
    'songwriter and melodist',
    'songwriter and filmmaker',
    'singer songwriter and filmmaker',
    'writer and creative director',
    'harmonica player and singer songwriter',
    'novelist and travel writer',
    'media theorist and writer',
    'Beat writer',
    'folk rock singer songwriter',
    'songwriter and television author',
    'writer and aphorist',
    'Aromanian composer and songwriter',
    'radio producer and writer',
    'blues and gospel singer and songwriter',
    'writer and comics screenwriter',
    'humorist and television writer',
    'sculptor and songwriter',
    'Vepsian writer and',
    'blues guitarist and singer songwriter',
    'singer songwriter and multi instrumentalist',
    'Gujarati writer',
    'Canarian writer',
    # The trailing comma below is required: without it Python silently joins this
    # literal with the next one into a single, never-matching string.
    'National Film Award winning filmmaker',

    # Shorter, generic patterns; keep these after the specific phrases above
    'singer and songwriter',
    'and singer songwriter',
    'and travel writer',
    'singer and writer',
    'writer and poet',
    'travel writer and',
    'songwriter and vocalist',
    'author and writer',
    'publicist and writer',
    'folk singer songwriter and',
    'singer songwriter and',
    'singer songwriter',
    'screenwriter and',
    'and comics writer',
    'writer and journalist',
    'and hymn writer',
    'hymn writer',
    'writer in',
    'of and writer',
    'and science fiction writer',
    'based writer',
    'and crime writer',
    'and non fiction writer',
    'Arabian writer and',
    'children writer and',
    'and food writer',
    'jazz writer and',
    'science fiction writer and',
    'non fiction writer and',
    'fiction writer and',
    'food writer and',
    'head writer for',
    'and screenwriter',
    'and science writer',
    'and technology writer',
    'television writer for',
    'songwriter and',
    'and writer',
    'writer and a',
    'writer and',
    'writer on',
]
# Remaining category lists; categories with no roles yet stay as empty lists
sports = ["hiking"]  # after arts
sciences = [
    "manual typewriter expert",  # before arts
    "speech therapist",  # before politics_govt_law
    "speech pathologist",
    "speech recognition researcher",
]

business_farming = []
academia_humanities = ["Yiddish language preservationist"]
law_enf_military_operator = []
spiritual = ["ialorixá"]
social = ["society hostess"]
crime = []
event_record_other = []
other_species = []

#### Creating `known_for_dict_1` Dictionary of Category Keys and Specific Role Lists of Values

In [None]:
# Combining separate lists into one dictionary
# Keyword order sets the key order, which is the order the categories are searched in
known_for_dict_1 = dict(
    social=social,
    spiritual=spiritual,
    academia_humanities=academia_humanities,
    business_farming=business_farming,
    sciences=sciences,
    arts=arts,
    sports=sports,
    law_enf_military_operator=law_enf_military_operator,
    politics_govt_law=politics_govt_law,
    crime=crime,
    event_record_other=event_record_other,
    other_species=other_species,
)

#### Extracting Category from `info_2`

In [None]:
%%time

# Dictionary version
search_dict = known_for_dict_1

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Checking a sample of rows
df[df['arts'] ==1].sample(2)

#### Checking the Number of Rows without a First Category

In [None]:
#### Checking the number of rows without a first category
# Summing the category flag columns row-wise
category_columns = list(known_for_dict_1)
df["num_categories"] = df[category_columns].sum(axis=1)

# Counting the rows with no category flagged
uncategorized = len(df[df["num_categories"] == 0])
print(f"There are {uncategorized} entries without any known_for category.")

In [None]:
#### Observations:
- We will proceed to build `known_for_dict_6` for the next iteration.