# Wikipedia Notable Life Expectancies
# [Notebook 7 : Data Cleaning Part 6](https://github.com/teresahanak/wikipedia-life-expectancy/blob/main/wp_life_expect_data_clean6_thanak_2022_07_24.ipynb)
### Context

The
### Objective

The
### Data Dictionary
- Feature: Description

### Importing Libraries

In [1]:
# To structure code automatically
%load_ext nb_black

# To import/export sqlite databases
import sqlite3 as sql

# To save/open python objects in pickle file
import pickle

# To help with reading, cleaning, and manipulating data
import pandas as pd
import numpy as np
import re

# To define maximum number of columns to be displayed in a dataframe
pd.set_option("display.max_columns", None)
# To define the maximum number of rows to be displayed in a dataframe
pd.set_option("display.max_rows", 200)

# To suppress warnings
# import warnings

# warnings.filterwarnings("ignore")

# To set some visualization attributes
pd.set_option("max_colwidth", 150)

# To play auditory cue when cell has executed, has warning, or has error and set chime theme
import chime

chime.theme("zelda")

<IPython.core.display.Javascript object>

## Data Overview

### Reading, Sampling, and Checking Data Shape

In [2]:
# Connecting to the SQLite database file
conn = sql.connect("wp_life_expect_clean5.db")

# Loading the full table; `data` is kept as loaded
data = pd.read_sql("SELECT * FROM wp_life_expect_clean5", conn)

# Working copy that the rest of the notebook modifies
df = data.copy()

# Reporting the dimensions
n_rows, n_cols = df.shape
print(f"There are {n_rows} rows and {n_cols} columns.")

# Previewing the first 2 rows of the data
df.head(2)

There are 98060 rows and 38 columns.


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
0,1,William Chappell,", 86, British dancer, ballet designer and director.",https://en.wikipedia.org/wiki/William_Chappell_(dancer),21,1994,January,,,dancer,ballet designer and director,,,,,,,,,86.0,,United Kingdom of Great Britain and Northern Ireland,,,3.091042,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,Raymond Crotty,", 68, Irish economist, writer, and academic.",https://en.wikipedia.org/wiki/Raymond_Crotty,12,1994,January,,,economist,writer,and academic,,,,,,,,68.0,,Ireland,,,2.564949,0,0,0,0,0,0,0,0,0,0,0,0,0


<IPython.core.display.Javascript object>

In [3]:
# Checking last 2 rows of the data
df.tail(2)

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
98058,9,Aamir Liaquat Hussain,", 50, Pakistani journalist and politician, MNA .",https://en.wikipedia.org/wiki/Aamir_Liaquat_Hussain,99,2022,June,", since",,,MNA,,,,,,,,,50.0,,Pakistan,,"2002 2007, since 2018",4.60517,0,0,0,0,0,1,0,0,1,0,0,0,2
98059,9,Zou Jing,", 86, Chinese engineer, member of the Chinese Academy of Engineering.",https://en.wikipedia.org/wiki/Zou_Jing_(engineer),3,2022,June,,,engineer,member of the Academy of Engineering,,,,,,,,,86.0,,"China, People's Republic of",,,1.386294,0,0,0,0,0,0,0,0,0,0,0,0,0


<IPython.core.display.Javascript object>

In [4]:
# Checking a sample of the data
df.sample(5)

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
84934,29,Valeriy Babych,", 67, Ukrainian politician, Deputy , economist and businessman, COVID-19.",https://en.wikipedia.org/wiki/Valeriy_Babych,16,2020,October,,,,Deputy,economist and businessman,COVID,,,,,,,67.0,,Ukraine,,1994 2002,2.833213,0,0,0,0,0,0,0,0,1,0,0,0,1
40911,3,"Peter Johnson, Sr.",", 91, American trial lawyer and political power broker, pulmonary fibrosis.","https://en.wikipedia.org/wiki/Peter_Johnson,_Sr.",3,2012,December,,,trial lawyer and political power broker,pulmonary fibrosis,,,,,,,,,91.0,,United States of America,,,1.386294,0,0,0,0,0,0,0,0,0,0,0,0,0
29951,10,Dilip Chitre,", 70, Indian poet, cancer.",https://en.wikipedia.org/wiki/Dilip_Chitre,3,2009,December,,,poet,cancer,,,,,,,,,70.0,,India,,,1.386294,0,0,0,0,0,0,0,0,0,0,0,0,0
65585,6,Gao Mang,", 90–91, Chinese translator.",https://en.wikipedia.org/wiki/Gao_Mang,12,2017,October,,,translator,,,,,,,,,,90.5,,"China, People's Republic of",,,2.564949,0,0,0,0,0,0,0,0,0,0,0,0,0
44442,13,Damon Intrabartolo,", 39, American playwright .",https://en.wikipedia.org/wiki/Damon_Intrabartolo,7,2013,August,"and orchestrator ,",,playwright,,,,,,,,,,39.0,,United States of America,,"and orchestrator ,",2.079442,0,0,0,0,0,0,0,0,0,0,0,0,0


<IPython.core.display.Javascript object>

### Checking Data Types, Duplicates, and Null Values

In [5]:
# Checking data types and null values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98060 entries, 0 to 98059
Data columns (total 38 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   day                        98060 non-null  object 
 1   name                       98060 non-null  object 
 2   info                       98060 non-null  object 
 3   link                       98060 non-null  object 
 4   num_references             98060 non-null  int64  
 5   year                       98060 non-null  int64  
 6   month                      98060 non-null  object 
 7   info_parenth               36661 non-null  object 
 8   info_1                     22 non-null     object 
 9   info_2                     98028 non-null  object 
 10  info_3                     48896 non-null  object 
 11  info_4                     10264 non-null  object 
 12  info_5                     1265 non-null   object 
 13  info_6                     181 non-null    obj

<IPython.core.display.Javascript object>

#### Observations:
- With our dataset loaded, we can pick up where we left off with extracting known_for values by creating `known_for_dict_14`.

### Extracting `known_for` Continued

#### Finding `known_for` Roles in `info_2`

In [6]:
# Obtaining values for column and their counts
# Sorted by ascending count so the most frequent value sits at the end of the list,
# which is the end that `roles_list.pop()` takes from
roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [359]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [265]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[[index for index in df.index if "poet" in df.loc[index, "info"]], "info_2",]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [361]:
# # Code to check each specific value
# specific_roles_list.pop()

<IPython.core.display.Javascript object>

In [263]:
# # Example code to quick-screen values that may overlap categories
# df.loc[[index for index in df.index if "poet and poetry" in df.loc[index, "info"]]]

<IPython.core.display.Javascript object>

In [None]:
# # Example code to quick-screen values that may overlap categories
# df.loc[
#     [
#         index
#         for index in df.index
#         if "outlaw country music singer songwriter" in df.loc[index, "info"]
#     ]
# ]

In [262]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "poet and fy worker"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [261]:
# Creating lists for each category
# The extraction loop tests these roles as substrings, in list order, and removes
# the first one that matches.  A longer phrase therefore has to come BEFORE any
# shorter phrase it contains, otherwise the longer one can never match.
politics_govt_law = []

arts = [
    "Pulitzer Prize winning poet and former poet laureate",
    "poet of the School",
    "poet and dramatist",
    "Nuyorican poet and playwright",
    "poet and memoirist",
    "poet and BBC producer",
    "Mi'kmaq poet",
    "poet and Pulitzer Prize winner",
    "poet and arts critic",
    "poet and radio host",
    "novelist and poetry promoter",
    "poet and diarist",
    "poet and broadcaster",
    "surrealist poet and art critic",  # before "surrealist poet"
    "surrealist poet",
    "Chicano poet",
    "Martiniquan poet",
    "Māori poet",
    "poet who wrote about the Dust Bowl",
    "vocalist and poet",
    "satirist and humorist poet of",
    "Movement poet",
    "avant garde poet and visual artist",
    "poet and literary book publisher",
    "poet of the Beat Generation",
    "Latino poet",
    "contemporary poet",
    "film maker and poet",
    "beat poet",
    "biographer and poet",
    "poet and jazz musician",
    "poet and architecture critic",
    "K'iche' Maya poet",
    "visual artist and poet",
    "poet and biographer",
    "jazz pianist and poet",
    "rhythm poet and musician",
    "dub poet",
    "percussionist and poet",
    "poet and jazz pianist",
    "poet and spoken word musician",
    "poet and sculptor",
    "experimental poet",
    "poet and radio broadcaster",
    "Marathi ghazal poet",
    "poet and co founder of interstitial lung disease",
    "avant garde composer and poet",
    "magazine publisher and poet",
    "literary critic and poet",
    "sculptor and poet",
    "poet and proofreader",
    "Odia poet",
    "folk musician and poet",
    "Ulster Scots poet",
    "vernacular poet",
    "Pashto poet",
    "poet and recording artist",
    "poet and disc jockey",
    "Kannada language poet",
    "and spoken word poet",
    "poet and film producer",
    "Nuyorican poet",
    "Kannada poet",
    "director and poet",
    "Kashubian poet",
    "poet and poetry",
    "art critic and poet",
    "poet and art critic",
    "jazz musician and poet",
    "Native poet",
    "poet and composer",
    "poet and cartoonist",
    "poet and filmmaker",
    "Pulitzer Prize winning poet",
    "photographer and poet",
    "poet and visual artist",
    "musician and poet",
    "poet and performance artist",
    "poet and musician",
    "lyricist and poet",
    "painter and poet",
    "poet and artist",
    "poet and publisher",
    "playwright and poet",
    "poet and painter",
    "poet and lyricist",
    "Urdu poet and",
    "Urdu poet",
    "poet and essayist",
    "artist and poet",
    "poet and critic",
    "poet and editor",
    "poet and literary critic",
    "novelist and poet",
    "poet and playwright",
    "poet and novelist",
    "Occitan language poet and",
    "language poet",
    "Arabian poet and",
    "poetess",
    "poet and literary",
    "Beat generation poet and",
    "Beat Generation poet",
    "Beat poet",
    "prize winning poet and",
    "n poet",
    "poet laureate",
    "poet and",
    "and poet",
    "poets",
    "poet",
]
sports = []
sciences = []

business_farming = []
academia_humanities = []
law_enf_military_operator = []
spiritual = []
social = []
crime = []
event_record_other = []
other_species = []

<IPython.core.display.Javascript object>

In [362]:
# Hard-coding cause_of_death value found in info_2
is_ferlinghetti = df["link"] == "https://en.wikipedia.org/wiki/Lawrence_Ferlinghetti"
index = df.index[is_ferlinghetti]
df.loc[index, "cause_of_death"] = "interstitial lung disease"

<IPython.core.display.Javascript object>

#### Creating `known_for_dict_14` Dictionary of Category Keys and Specific Role Lists of Values

In [363]:
# Combining separate lists into one dictionary
# (keyword order below is the order in which the categories are searched)
known_for_dict_14 = dict(
    social=social,
    spiritual=spiritual,
    academia_humanities=academia_humanities,
    business_farming=business_farming,
    sciences=sciences,
    politics_govt_law=politics_govt_law,
    law_enf_military_operator=law_enf_military_operator,
    crime=crime,
    event_record_other=event_record_other,
    other_species=other_species,
    arts=arts,
    sports=sports,
)

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2`

In [364]:
%%time

# Dictionary version
search_dict = known_for_dict_14

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Checking a sample of rows
df[df['arts'] ==1].sample(2)

CPU times: total: 1min 3s
Wall time: 1min 3s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
95191,9,Akira Inoue,", 93, Japanese film director , stroke and pneumonia.",https://en.wikipedia.org/wiki/Akira_Inoue_(film_director),8,2022,January,", ,",,,stroke and pneumonia,,,,,,,,,93.0,,Japan,,", ,",2.197225,0,0,0,0,0,1,0,0,0,0,0,0,1
20358,27,Ronald Pearsall,", 77, English author.",https://en.wikipedia.org/wiki/Ronald_Pearsall,4,2005,September,,,,,,,,,,,,,77.0,,United Kingdom of Great Britain and Northern Ireland,,,1.609438,0,0,0,0,0,1,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking the Number of Rows without a First Category

In [365]:
# Checking the number of rows without a first category
category_columns = list(known_for_dict_14.keys())
df["num_categories"] = df[category_columns].sum(axis=1)

uncategorized = df[df["num_categories"] == 0]
print(
    f'There are {len(uncategorized)} entries without any known_for category.'
)

There are 53857 entries without any known_for category.


<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to build `known_for_dict_15` for the next iteration.

#### Finding `known_for` Roles in `info_2`

In [367]:
# Obtaining values for column and their counts
# Sorted by ascending count so the most frequent value sits at the end of the list,
# which is the end that `roles_list.pop()` takes from
roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [369]:
# Code to check each value
roles_list.pop()

'artist'

<IPython.core.display.Javascript object>

In [370]:
# Create specific_roles_list for above popped value
# Rows whose `info` text contains the popped value (literal match, not a regex);
# their `info_2` values are listed from most to least frequent, so `.pop()`
# hands back the rarest remaining value first
specific_roles_list = (
    df.loc[df["info"].str.contains("artist", regex=False, na=False), "info_2"]
    .value_counts()
    .index.tolist()
)

<IPython.core.display.Javascript object>

In [735]:
# Code to check each specific value
specific_roles_list.pop()

'electronic music artist and MC'

<IPython.core.display.Javascript object>

In [721]:
# Example code to quick-screen values that may overlap categories
# Literal (non-regex) substring match on the original `info` text
df.loc[df["info"].str.contains("motorsports artist", regex=False, na=False)]

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
73288,16,Sam Bass,", 57, American motorsports artist, sepsis.",https://en.wikipedia.org/wiki/Sam_Bass_(artist),11,2019,February,,,motorsports artist,sepsis,,,,,,,,,57.0,,United States of America,,,2.484907,0,0,0,0,0,0,0,0,0,0,0,0,0


<IPython.core.display.Javascript object>

In [None]:
# # Example code to quick-screen values that may overlap categories
# df.loc[
#     [
#         index
#         for index in df.index
#         if "outlaw country music singer songwriter" in df.loc[index, "info"]
#     ]
# ]

In [606]:
# Example code to quick-check a specific entry
target_role = (
    "make up artist and first civilian to receive the Intelligence Medal of Merit"
)
df.loc[df["info_2"].eq(target_role)]

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
13331,25,John Chambers,", 78, American make-up artist and first civilian to receive the Intelligence Medal of Merit.",https://en.wikipedia.org/wiki/John_Chambers_(make-up_artist),17,2001,August,,,make up artist and first civilian to receive the Intelligence Medal of Merit,,,,,,,,,,78.0,,United States of America,,,2.890372,0,0,0,0,0,0,0,0,0,0,0,0,0


<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [None]:
# Creating lists for each category
# The extraction loop tests these roles as substrings, in list order, and removes
# the first one that matches.  A longer phrase therefore has to come BEFORE any
# shorter phrase it contains, otherwise the longer one can never match.
# Every entry needs its trailing comma: two adjacent string literals without one
# are silently concatenated into a single (never matching) role.
politics_govt_law = []

arts = [
    'based graffiti artist whose works were included in the Venice Biennale',
    'game artist',
    'Oscar winning make up artist',
    'choreographer appointed as Sydney Dance Company artistic director',
    'based pop artist',
    'artist and wife of Joaquín Torres García',
    'botanical artist and art critic',
    'botanical artist',
    'television make up artist',
    'artist and industrial designer',
    'movie poster artist',
    'courtroom artist',
    'artist and engraver',
    'graphic designer and psychedelic artist',
    'special effects and make up artist',
    'comic book artist and publisher',
    'plastic artist and illustrator',
    'plastic artist',
    'artist and theatre designer',
    'pop artist and director',
    'artistic draughtsman',
    'artist known for his role in the Conceptualism and Minimalism movements',
    'aboriginal artist',
    'native artist',
    'blues musician and artist',
    'rock musician and artist',
    'musician and artist',
    'ballet dancer and artistic director of Ballet',
    'master potter and artist',
    'drag queen music artist',
    'internationally recognized graphic artist',
    'fantasy and science fiction artist and illustrator',
    'internationally exhibited Navajo artist',
    'comic book artist and co founder of',
    'artist who was a member of the Regina Five',
    'installation artist and assemblage sculptor',
    'sound installation artist and musician',
    'avant garde installation artist and sculptor',
    'installation artist',
    'animator and layout artist',
    'colorist and cover production artist for DC Comics',
    'special effects artist and pedal steel guitarist',
    'comic book colourist and artist',
    'based architect and artist',
    'artist and television presenter',
    'movie artist and illustrator',
    'artist and watercolourist',
    'naïve artist',
    'transgender artist',
    'special effects artist and producer',
    'artist and musical performer',
    'and butter sculpture artist',
    'artist and print maker',
    'pop artist and sculptor',
    'experimental music artist',
    'comic book artist and reputed creator of',
    'artist and composer',
    'ceramic artist and designer',
    'video game concept artist',
    'artist and reporter',
    'performance artist and playwright',
    'artist and muralist',
    'animation artist and character designer',
    'blues artist',
    'Academy Award winning visual effects artist',
    'vocalist and bassist and solo artist',
    'artist of origins',
    'artist of origin',
    'neo mannerist artist',
    'shadow play artist',
    'sculptor and conceptual artist',
    'film make up artist',
    'tapestry and textile artist',
    'Golden Age comic book artist',
    'comic book and comic strip artist',
    'comic strip artist and editor',
    'strip artist',
    'wood carving artist',
    'graphic artist and banknote designer',
    'artist and banknote designer',
    'textile artist and printmaker',
    'fantasy and science fiction artist',
    'figurative expressionist artist',
    'hip hop musician and graffiti artist',
    'Navajo artist',
    'wet folding origami artist',
    'Route artist',
    'animator and comic book artist',
    'R&B artist',
    'mural artist',
    'makeup artist and tenor',
    'artist and lecturer',
    'artist and ceramicist',
    'experimental visual artist',
    'gospel music artist',
    'vocalist and session artist',
    "artists' model and memoirist",
    'Native artist and potter',
    'West Coast artist',
    'artist and comic book creator',
    'graphic artist and sculptor',
    'painter and visual artist',
    'New Realist artist',
    'motion picture matte artist',
    'artist and novelist',
    'abstract and representational artist',
    'broadcaster and comic book artist',
    'neo conceptual artist',
    'painter and pioneering manhua artist',
    'hillbilly and bluegrass artist',
    'artist and member of the Fluxus movement',
    'artist and member of the Ultra Lettrist movement',
    'graphic artist and printmaker',
    'musician and recording artist',
    'textile artist who specialized in embroidery',
    'ink artist and wife of Walt Disney',
    'artist and art collector',
    'n artist from Utopia',
    'Chicano artist',
    'artist and doll maker',
    'conceptual and performance artist',
    'studio potter and ceramic artist',
    'tenor and artist',
    'country music artist',
    'visual artist known for her still lives and landscapes',
    'film and video artist',
    'artist and watercolor master',
    'painter and comics artist',
    'vocalist and recording artist',
    'comic book artist born',
    'science fiction and fantasy artist',
    'novelist and artist',
    'conductor and recording artist',
    'carving artist',
    'visual artist and conceptual sculptor',
    'contemperary artist',
    'sculptor and "one of the nation most accomplished medallic artists"',
    'cartoonist and comic artist',
    'rock artist',
    'psychedelic artist',
    'animation and comic book artist',
    'artist and fashion designer',
    'textile artist and embroiderer',
    'artist and experimental photographer',
    'cartoonist and comic book artist',
    'potter and ceramic artist',
    'artist anddesigner',
    'born artist',
    'sculptor and stained glass artist',
    'equine artist',
    'typographer and graphic artist',
    'visual artist and jewelry and fashion designer',
    'visual artist and protégé of Salvador Dalí',
    'reggae artist and comedian',
    'reggae artist',
    'artist and Army art correspondent',
    'film special effects artist',
    'vocalist and artist',
    'experimental filmmaker and artist',
    'fine artist and art editor',
    'comic book artist and book illustrator',
    'Les Automatistes artist and a member of',
    'comic book artist and co creator of Jonah Hex and Black Orchid',
    'Papunya Tula artist',
    'graphic artist and postage stamp designer',
    'artist and metalsmith',
    'manhua artist',
    'illustrator and storyboard artist',
    'Western artist',
    'artist and architectural designer',
    'artist in wood',
    'Yakshagana artist',
    'trapeze artist',
    'newspaper artist and cartoonist',
    'artist and puppeteer',
    'ceramic artist and sculptor',
    'furniture designer and artist',
    'rock album cover artist',
    'stateless auto destructive artist',
    'indigenous artist and printmaker',
    'computer artist',
    'watercolour artist',
    'media artist and designer',
    'artist and landscape architect',
    'Eurodance artist',
    'musician and comic book artist',
    'graphic artist and game designer',
    'comics artist and graphic novelist',
    'hologram artist',
    'cartoonist and comics artist',
    'comics artist and animator',
    'artist and potter',
    'artist and film production illustrator',
    'multimedia artist and painter',
    'film animator',
    'artist photographer',
    'sound artist',
    'experimental filmmaker and visual artist',
    'artist and jewelry designer',
    'graphic designer and film poster artist',
    'fetish artist',
    'courtroom sketch artist',
    'sketch artist',
    'caricaturist and comics artist',
    'Iñupiat artist',
    'hand shadow artist',
    'Indigenous artist',
    'musical Thavil artist',
    'ceramist and textile artist',
    'radio presenter and artist',
    'comics artist and cartoonist',
    'artist and cartoonist',
    'lianhuanhua artist',
    'multi media artist',
    'animator and comics artist',
    'artist and cultural figure',
    'typographer and visual artist',
    'graphic designer and album artist',
    'pencil artist',
    'theatre artist and playwright',
    'comic artist and illustrator',
    'motorsports artist',
    'voice over and recording artist',
    'bassist and artist',
    'comic creator and cover artist',
    'tattoo artist and reality show personality',
    'cartoonist and street artist',
    'sand artist',
    'architect and light artist',
    'media producer and makeup artist',
    'Kiowa artist',
    'Peking opera artist',
    'electronic music artist and MC',

    # Generic fragments: kept last so the specific phrases above match first
    'comic book artist and',
    'visual effects artist and',
    'performance artist and',
    'and performance artist',
    'poster artist and',
    'poster artist',
    'comics artist and',
    'visual artist and',
    'and landscape artist',
    'ceramic artist and',
    'and comics artist',
    'and storyboard artist',
    'storyboard artist',
    'make up artist and',
    'abstract expressionist artist and',
    'modern artist and',
    'and graphic artist',
    'graphic artist and',
    'and fish skin artist',
    'and visual artist',
    'and artist',
    'artist and',
    'n artist'
]
sports = [
    'Muay martial artist', # before arts
    'martial artist and Isshinryu karate pioneer',
    'professional wrestler and mixed martial artist',
    'martial artist and founder of Modern Arnis',
    'Olympic artistic gymnast',
    'professional wrestler and martial artist',
    'artistic gymnastics coach',

    # Generic fragments: kept last so the specific phrases above match first
    'martial artist and',
    'and martial artist',
    'artistic gymnast and Olympian'
]
sciences = []

business_farming = []
academia_humanities = []
law_enf_military_operator = [
    'first civilian to receive the Intelligence Medal of Merit'
]
spiritual = [
    'modern primitive proponent'
]
social = []
crime = [
    'scam artist', # before arts
    'con artist and',
]
event_record_other = []
other_species = []

#### Creating `known_for_dict_15` Dictionary of Category Keys and Specific Role Lists of Values

In [None]:
# Combining separate lists into one dictionary
# Key order is the order in which the extraction loop walks the categories.
# `sports` is placed ahead of `arts` for this iteration: the sports list holds
# "martial artist ..." roles (marked "# before arts") that the generic arts
# fragment 'artist and' would otherwise consume first and flag as arts.
known_for_dict_15 = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "sports": sports,
    "arts": arts,
}

#### Extracting Category from `info_2`

In [None]:
%%time

# Dictionary version
search_dict = known_for_dict_15

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Checking a sample of rows
df[df['arts'] ==1].sample(2)

#### Checking the Number of Rows without a First Category

In [None]:
# Checking the number of rows without a first category
category_columns = list(known_for_dict_15.keys())
df["num_categories"] = df[category_columns].sum(axis=1)

uncategorized = df[df["num_categories"] == 0]
print(
    f'There are {len(uncategorized)} entries without any known_for category.'
)

#### Observations:
- We will proceed to build `known_for_dict_16` for the next iteration.

In [366]:
print("dunzo!")

# Sound notification when cell executes
chime.success()

dunzo!


<IPython.core.display.Javascript object>

#### Finding `known_for` Roles in `info_2`

In [None]:
# Obtaining values for column and their counts
# Sorted by ascending count so the most frequent value sits at the end of the list,
# which is the end that `roles_list.pop()` takes from
roles_list = df["info_2"].value_counts(ascending=True).index.tolist()

In [None]:
# Code to check each value
roles_list.pop()

In [None]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [index for index in df.index if "writer" in df.loc[index, "info"]], "info_2",
#     ]
#     .value_counts()
#     .index.tolist()
# )

In [None]:
# # Code to check each specific value
# specific_roles_list.pop()

In [None]:
# # Example code to quick-screen values that may overlap categories
# df.loc[[index for index in df.index if "and science writer" in df.loc[index, "info"]]]

In [None]:
# # Example code to quick-screen values that may overlap categories
# df.loc[
#     [
#         index
#         for index in df.index
#         if "outlaw country music singer songwriter" in df.loc[index, "info"]
#     ]
# ]

In [None]:
# # Example code to quick-check a specific entry
# df[df["info_2"] == "outlaw country music singer songwriter"]

#### Creating Lists for Each `known_for` Category

In [None]:
# Creating lists for each category
# (empty template: roles get filled in while working through `roles_list`)
politics_govt_law = []

arts = []
sports = []
sciences = []

business_farming = []
academia_humanities = []
law_enf_military_operator = []
spiritual = []
social = []
crime = []
event_record_other = []
other_species = []

#### Creating `known_for_dict_7` Dictionary of Category Keys and Specific Role Lists of Values

In [None]:
# Combining separate lists into one dictionary
# Keys are the category column names in `df`; values are the role lists from the
# cell above.  Key order is the order in which the extraction loop walks the
# categories.
# NOTE(review): the name `known_for_dict_7` looks like a leftover from an earlier
# template -- the preceding observation announces `known_for_dict_16`.  Confirm
# before renaming; the extraction and num_categories cells below use this name too.
known_for_dict_7 = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

#### Extracting Category from `info_2`

In [None]:
%%time

# Dictionary version
search_dict = known_for_dict_7

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_lst in search_dict.items():
    for role in category_lst:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Checking a sample of rows
df[df['arts'] ==1].sample(2)

#### Checking the Number of Rows without a First Category

In [None]:
# Checking the number of rows without a first category
category_columns = list(known_for_dict_7.keys())
df["num_categories"] = df[category_columns].sum(axis=1)

uncategorized = df[df["num_categories"] == 0]
print(
    f'There are {len(uncategorized)} entries without any known_for category.'
)

#### Observations:
- We will proceed to build `known_for_dict_8` for the next iteration.