# Wikipedia Notable Life Expectancies
# [Notebook 6: Data Cleaning Part 5](https://github.com/teresahanak/wikipedia-life-expectancy/blob/main/wp_life_expect_data_clean5_thanak_2022_07_26.ipynb)
### Context

The
### Objective

The
### Data Dictionary
- Feature: Description

### Importing Libraries

In [1]:
# To structure code automatically
%load_ext nb_black

# To import/export sqlite databases
import sqlite3 as sql

# To save/open python objects in pickle file
import pickle

# To help with reading, cleaning, and manipulating data
import pandas as pd
import numpy as np
import re

# To define maximum number of columns to be displayed in a dataframe
pd.set_option("display.max_columns", None)
# To define the maximum number of rows to be displayed in a dataframe
pd.set_option("display.max_rows", 200)

# To supress warnings
# import warnings

# warnings.filterwarnings("ignore")

# To set some visualization attributes
pd.set_option("max_colwidth", 150)

# To play auditory cue when cell has executed, has warning, or has error and set chime theme
import chime

chime.theme("zelda")

<IPython.core.display.Javascript object>

## Data Overview

### [Reading](), Sampling, and Checking Data Shape

In [2]:
# Reading the dataset
conn = sql.connect("wp_life_expect_clean4.db")
data = pd.read_sql("SELECT * FROM wp_life_expect_clean4", conn)

# Making a working copy
df = data.copy()

# Checking the shape
print(f"There are {df.shape[0]} rows and {df.shape[1]} columns.")

# Checking first 2 rows of the data
df.head(2)

There are 98045 rows and 48 columns.


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,info_1_0,info_2_1,info_2_2,info_2_3,info_3_0,info_3_1,info_3_2,info_4_0,info_4_1,info_4_2,info_5_0,info_5_1,info_5_2,info_6_0,info_6_1,info_7_0,info_8_0,info_8_1,info_9_0,info_10_0,info_11_0,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
0,1,William Chappell,", 86, British dancer, ballet designer and director.",https://en.wikipedia.org/wiki/William_Chappell_(dancer),21,1994,January,,86.0,,United Kingdom of Great Britain and Northern Ireland,,,3.091042,,,,,ballet designer,director,,,,,,,,,,,,,,,,0,0,0,0,0,1,0,0,0,0,0,0,1
1,1,Raymond Crotty,", 68, Irish economist, writer, and academic.",https://en.wikipedia.org/wiki/Raymond_Crotty,12,1994,January,,68.0,,Ireland,,,2.564949,,,,,writer,,,and academic,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,1,0,0,0,1


<IPython.core.display.Javascript object>

In [3]:
# Checking last 2 rows of the data
df.tail(2)

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,info_1_0,info_2_1,info_2_2,info_2_3,info_3_0,info_3_1,info_3_2,info_4_0,info_4_1,info_4_2,info_5_0,info_5_1,info_5_2,info_6_0,info_6_1,info_7_0,info_8_0,info_8_1,info_9_0,info_10_0,info_11_0,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
98043,9,Aamir Liaquat Hussain,", 50, Pakistani journalist and politician, MNA .",https://en.wikipedia.org/wiki/Aamir_Liaquat_Hussain,99,2022,June,", since",50.0,,Pakistan,,"2002 2007, since 2018",4.60517,,politician,,,MNA,,,,,,,,,,,,,,,,,0,0,0,0,0,1,0,0,0,0,0,0,1
98044,9,Zou Jing,", 86, Chinese engineer, member of the Chinese Academy of Engineering.",https://en.wikipedia.org/wiki/Zou_Jing_(engineer),3,2022,June,,86.0,,"China, People's Republic of",,,1.386294,,,,,member of the Academy of Engineering,,,,,,,,,,,,,,,,,1,0,0,0,0,0,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

In [4]:
# Checking a sample of the data
df.sample(5)

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,info_1_0,info_2_1,info_2_2,info_2_3,info_3_0,info_3_1,info_3_2,info_4_0,info_4_1,info_4_2,info_5_0,info_5_1,info_5_2,info_6_0,info_6_1,info_7_0,info_8_0,info_8_1,info_9_0,info_10_0,info_11_0,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
42377,10,"Princess Lilian, Duchess of Halland",", 97, Welsh-born Swedish royal.","https://en.wikipedia.org/wiki/Princess_Lilian,_Duchess_of_Halland",15,2013,March,,97.0,,Wales,Sweden,,2.772589,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,1,0,0,0,1
27422,17,Edmund Leopold de Rothschild,", 93, British financier and horticulturist.",https://en.wikipedia.org/wiki/Edmund_Leopold_de_Rothschild,6,2009,January,,93.0,,United Kingdom of Great Britain and Northern Ireland,,,1.94591,,horticulturist,,,,,,,,,,,,,,,,,,,,0,0,0,0,1,0,0,0,0,0,0,0,1
70166,1,Bhishma Narain Singh,", 85, Indian politician, Governor of Assam .",https://en.wikipedia.org/wiki/Bhishma_Narain_Singh,3,2018,August,and Nadu,85.0,,India,,1984 1989 and Tamil Nadu 1991 1993,1.386294,,,,,Governor of Assam,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,1,0,0,0,1
27172,22,Anand Babla,", 54, Fijian politician, MP , after long illness.",https://en.wikipedia.org/wiki/Anand_Babla,3,2008,December,,54.0,,Fiji,,1992 2006,1.386294,,,,,MP,,,after long illness,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,1,0,0,0,1
9790,28,Dave Pope,", 78, American baseball player.",https://en.wikipedia.org/wiki/Dave_Pope,6,1999,August,,78.0,,United States of America,,,1.94591,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,1,0,0,0,0,0,1


<IPython.core.display.Javascript object>

### Checking Data Types, Duplicates, and Null Values

In [5]:
# Checking data types and null values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98045 entries, 0 to 98044
Data columns (total 48 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   day                        98045 non-null  object 
 1   name                       98045 non-null  object 
 2   info                       98045 non-null  object 
 3   link                       98045 non-null  object 
 4   num_references             98045 non-null  int64  
 5   year                       98045 non-null  int64  
 6   month                      98045 non-null  object 
 7   info_parenth               36660 non-null  object 
 8   age                        98045 non-null  float64
 9   cause_of_death             13 non-null     object 
 10  place_1                    97891 non-null  object 
 11  place_2                    8115 non-null   object 
 12  info_parenth_copy          36660 non-null  object 
 13  log_num_references         98045 non-null  flo

<IPython.core.display.Javascript object>

#### Observations:
- With our dataset loaded, we can pick up where we left off with extracting known_for values by rebuilding `known_for_dict`.
- We will proceed with the next `info_2` column, `info_2_1`, as `info_2` is the Wikipedia field that contains the majority of `known_for` information.

### Extracting `known_for` Continued

#### Finding `known_for` Roles in `info_2_1`

In [246]:
# # Obtaining values for column and their counts
# roles_list = df["info_2_1"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [247]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

In [248]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_2_1"].notna()].index
#             if "coach" in df.loc[index, "info_2_1"]
#         ],
#         "info_2_1",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [249]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_list, key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [250]:
# # Example code to quick-check a specific entry
# df[df["info_2_1"] == "headmaster"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [251]:
# Creating lists for each category and sorting by decreasing length and removing duplicates

politics_govt_law = [
    "politician who served as President of the Vermont State Senate",
    "politician Senator from Delaware from to",
    "politician in Valencian Community",
    "former Newfoundland politician",
    "anti communist politician",
    "Conservative politician",
    "Labour Party politician",
    "oppositional politician",
    "nationalist politician",
    "Republican politician",
    "pan Turkic politician",
    "communist politician",
    "post war politician",
    "one time politician",
    "Unionist politician",
    "eventual politician",
    "Marxist politician",
    "Labour politician",
    "Green politician",
    "politician from",
    "nazi politician",
    "East politician",
    "politician in",
    "a politician",
    "politician",
    "speechwriter for President",
    "defence expert",
    "acting president of",
    "anti divorce activist",
    "former head of the WHO AIDS program",
    "head of the dynasty",
    "head of the F D A",
]
politics_govt_law = sorted(
    list(set(politics_govt_law)), key=lambda x: len(x), reverse=True
)

arts = [
    "screenwriter specialising in comedies",
    "the wife of writer Dylan Thomas",
    "television screenplay writer",
    "songwriter for Guns N' Roses",
    "pasacalle singer songwriter",
    "writer in Gujarati language",
    "singer songwriter",
    "television screenwriter",
    "science fiction writer",
    "roll singer songwriter",
    "writer of TV comedies",
    "writer for children",
    "comics screenwriter",
    "radio script writer",
    "short story writer",
    "non fiction writer",
    "Gros Ventre writer",
    "singer songwriter",
    "television writer",
    "comic book writer",
    "screenplay writer",
    "film music writer",
    "soap opera writer",
    "theme songwriter",
    "writer publicist",
    "a mystery writer",
    "children writer",
    "writer on music",
    "cookbook writer",
    "fantasy writer",
    "fashion writer",
    "mystery writer",
    "fiction writer",
    "culture writer",
    "dialect writer",
    "western writer",
    "travel writer",
    "comics writer",
    "comedy writer",
    "horror writer",
    "jingle writer",
    "script writer",
    "screenwriter",
    "scriptwriter",
    "radio writer",
    "crime writer",
    "score writer",
    "dance writer",
    "story writer",
    "prose writer",
    "food writer",
    "film writer",
    "ghostwriter",
    "songwriter",
    "copywriter",
    "vocal coach involved in the Wrong Door Raid",
    "vocal session arranger",
    "Yiddish vocalist",
    "blues vocalist",
    "vocal coach",
    "vocalist",
    "acting coach",
    "track maker",
    "drama",
    "radio dramatist",
    "drama teacher",
    "drama coach",
    "dramaturge",
    "dramatist",
    "voice artist voice of the Flower Pot Men",
    "television voice actor",
    "voice dubbing artist",
    "voice of Baby Huey",
    "voice over artist",
    "voice over actor",
    "a voice teacher",
    "voice actress",
    "voice teacher",
    "voice artist",
    "voice actor",
    "voice coach",
    "former head of the Horticultural Society",
]
arts = sorted(list(set(arts)), key=lambda x: len(x), reverse=True)

sports = [
    "general manager of the Dallas Cowboys professional football team",
    "former owner of the Cleveland Cavaliers basketball team",
    "radio announcer for the Lakers basketball team since",
    "first head coach of the women national team",
    "principal of the Toyota F racing team",
    "national softball team member",
    "Formula One team principal",
    "baseball team part owner",
    "women basketball teams",
    "national team captain",
    "national team manager",
    "football team owner",
    "baseball team owner",
    "cricket team coach",
    "partial team owner",
    "sports team owner",
    "racing team owner",
    "hockey team owner",
    "race team owner",
    "team manager",
    "team leader",
    "team owner",
    "field team",
    "coach in the National Hockey League",
    "World Hockey Association defenseman",
    "Commonwealth Games athletics coach",
    "college athletics administrator",
    "athletics director",
    "athletics coach",
    "former owner of the Cleveland Cavaliers basketball team",
    "for the Lakers basketball team since",
    "Ohio State University basketball player",
    "high school basketball coach",
    "wheelchair basketball player",
    "college basketball player",
    "women basketball teams",
    "NBL basketball player",
    "basketball executive",
    "basketball",
    "basketball player",
    "basketball coach",
    "general manager of the Dallas Cowboys professional football team",
    "football placekicker with the Dallas Cowboys",
    "chairman of Bradford City football club",
    "professional rugby league footballer",
    "died on the football pitch in Lyon",
    "footballer for Newport County",
    "professional football player",
    "rugby league football player",
    "World Cup football referee",
    "high school football coach",
    "gridiron football player",
    "rugby league footballer",
    "football administrator",
    "football club chairman",
    "football club director",
    "college football coach",
    "World Cup footballer",
    "football team owner",
    "football club owner",
    "football executive",
    "CFL football coach",
    "football chairman",
    "football official",
    "football manager",
    "football referee",
    "football coach",
    "footballer",
    "Pittsburgh Penguins coach",
    "coach for the NFL Giants",
    "NFL player",
    "silver medallist handball player",
    "beach handball coach",
    "handball player",
    "figure skating",
    "Olympic figure skater",
    "figure skating coach",
    "speed skater",
    "international coach",
    "Olympic field hockey player",
    "Olympic hockey player",
    "ice hockey executive",
    "field hockey player",
    "field hockey coach",
    "ice hockey player",
    "hockey team owner",
    "ice hockey coach",
    "hockey player",
    "Hall of Fame coach",
    "cricket administrator",
    "test cricket umpire",
    "cricket team coach",
    "cricket player",
    "cricket umpire",
    "cricketer",
    "Norwich City record goalscorer",
    "goaltending coach",
    "cup winning coach",
    "Paralympic athlete",
    "Paralympic coach",
    'professional wrestler known as "Bad News Brown"',
    "professional wrestling ring",
    "professional wrestling manager",
    "world champion arm wrestler",
    "professional wrestler",
    "wrestling",
    "wrestling manager",
    "Olympic wrestler",
    "wrestling coach",
    "wrestler",
    "assistant coach",
    "executive coach",
    "Victorian coach",
    "baseball player Philadelphia Athletics",
    "baseball official scorer",
    "baseball team part owner",
    "baseball club owner",
    "baseball team owner",
    "baseball executive",
    "baseball player",
    "baseball coach",
    "pitching coach",
    "first woman to swim the Channel in both directions",
    "first Olympic swimming medallist",
    "first woman to swim the Channel",
    "bronze medal winning swimmer",
    "silver medal winning swimmer",
    "Olympic swimmer",
    "swimming coach",
    "swimmer",
    "strength coach",
    "national coach",
    "field athlete who won four gold medals at the Summer Olympics",
    "candidate for National Olympic Committee president",
    "second oldest national Olympic competitor",
    "Olympic silver medal winning pentathlete",
    "president of the Olympic Committee",
    "the nation first Olympic champion",
    "first Olympic swimming medallist",
    "Olympic gold medallist in discus",
    "the first Olympic gold medalist",
    "winner of the first Olympic m",
    "oldest living former Olympian",
    "Olympic long distance runner",
    "Olympic field hockey player",
    "three time Olympic medalist",
    "Olympic gold medal winner",
    "Olympic water polo player",
    "Olympic lightweight boxer",
    "oldest surviving Olympian",
    "Olympic silver medallist",
    "Olympic bronze medalist",
    "Olympic silver medalist",
    "Olympic sports shooter",
    "Olympic Games champion",
    "Olympic gold medalist",
    "Olympic sport shooter",
    "Olympic figure skater",
    "Olympic hockey player",
    "Olympic sharpshooter",
    "Olympic long jumper",
    "Olympic competitor",
    "Olympic ski jumper",
    "Olympic bobsledder",
    "Olympic medallist",
    "Olympic champion",
    "Olympic medalist",
    "Olympic wrestler",
    "Olympic sprinter",
    "Olympic Champion",
    "Olympic official",
    "Olympic swimmer",
    "Olympic shooter",
    "Olympic hurdler",
    "Olympic athlete",
    "Senior Olympian",
    "Olympic fencer",
    "field Olympian",
    "Olympic skier",
    "Olympic coach",
    "Olympiian",
    "Olympian",
    "Olympics",
    "college coach",
    "cycling advocate",
    "professional road bicycle racer",
    "professional cyclist",
    "road racing cyclist",
    "motorcycle racer",
    "cycling",
    "cycling advocate",
    "racing cyclist",
    "cycling coach",
    "road cyclist",
    "cyclist",
    "bronze medalist fencer",
    "fencing trainer",
    "Olympic fencer",
    "fencing master",
    "fencing coach",
    "foil fencer",
    "fencer",
    "tennis player",
    "tennis umpire",
    "tennis coach",
    "sports analyst",
    "sports media",
    "radio sports",
    "television sports",
    "Olympic sports shooter",
    "TF sports",
    "sports administrator",
    "sports team owner",
    "sports club owner",
    "sports executive",
    "sports official",
    "sports director",
    "sports shooter",
    "sports coach",
    "sportswoman",
    "sportsman",
    "underwater diver",
    "diving coach",
    "scuba diver",
    "diver",
    "javelin thrower",
    "discus thrower",
    "weight thrower",
    "rowing coach",
    "rower",
    "field athlete who won four gold medals at the Summer Olympics",
    "former chairman of Sheffield United",
    "field athlete",
    "field coach",
    "outfielder",
    "field team",
    "racetrack operator",
    "track athlete",
    "track coach",
    "first head coach of the women national team",
    "head coach",
]
sports = sorted(list(set(sports)), key=lambda x: len(x), reverse=True)

sciences = [
    "an authority on psychoanalysis",
    "bee breeding authority",
    "technical authority",
    "team member of the Manhattan Project",
    "scientific divulgator",
    "a pioneer in the field of vitro fertilization",
    "pioneer in the field of computer graphics",
    "field biologist",
    "former head of the Psychological Association",
]
sciences = sorted(list(set(sciences)), key=lambda x: len(x), reverse=True)

business_farming = [
    "uniform supplier to the International Ice Hockey Federation",
    "owner of Ellen Tracy sportswear",
    "sportswear pioneer",
    "the one time head of the Gucci fashion house",
]
business_farming = sorted(
    list(set(business_farming)), key=lambda x: len(x), reverse=True
)

academia_humanities = [
    "one of the world leading authorities on book conservation",
    "leading authority on Melanesian culture",
    "authority on Francis Drake",
    "authority on alabaster",
    "foremost encyclopedist",
    "debate coach",
    "head teacher",
    "headmaster",
]
academia_humanities = sorted(
    list(set(academia_humanities)), key=lambda x: len(x), reverse=True
)

law_enf_military_operator = [
    "division commander",
]
law_enf_military_operator = sorted(
    list(set(law_enf_military_operator)), key=lambda x: len(x), reverse=True
)

spiritual = [
    "the acting head of the UGCC",
    "head of the Sikh Dharma in the western hemisphere",
]
spiritual = sorted(list(set(spiritual)), key=lambda x: len(x), reverse=True)

social = []
social = sorted(list(set(social)), key=lambda x: len(x), reverse=True)

crime = [
    "founder of the Bandidos Motorcycle Club",
]
crime = sorted(list(set(crime)), key=lambda x: len(x), reverse=True)

event_record_other = [
    "ex wife of former football player O J Simpson",
    "championed by Edward R Murrow",
    "lung patient whose wrong transplant made headlines",
    "made national headlines",
]
event_record_other = sorted(
    list(set(event_record_other)), key=lambda x: len(x), reverse=True
)

other_species = []
other_species = sorted(list(set(other_species)), key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [252]:
# Dropping entry with link that points to event rather than individual's page
index = df[
    df["link"] == "https://en.wikipedia.org/wiki/Death_of_Paolo_Gislimberti"
].index
df.drop(index, inplace=True)
df.reset_index(inplace=True, drop=True)

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [253]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

<IPython.core.display.Javascript object>

#### Extracting Category from `info_2_1`

In [254]:
%%time

# Column to check
column = 'info_2_1'

# Start dataframe
dataframe = df[df[column].notna()]

# For loop to find role in column and extract it as category
for category, category_lst in known_for_dict.items():
    for role in category_lst:
        for index in dataframe.index:
                item = df.loc[index, column]
                if item:
                    if role in item:
                        df.loc[index, category] = 1
                        df.loc[index, column] = item.replace(role, '').strip()

# Calculating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

CPU times: total: 50.7 s
Wall time: 50.8 s


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,info_1_0,info_2_1,info_2_2,info_2_3,info_3_0,info_3_1,info_3_2,info_4_0,info_4_1,info_4_2,info_5_0,info_5_1,info_5_2,info_6_0,info_6_1,info_7_0,info_8_0,info_8_1,info_9_0,info_10_0,info_11_0,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
60105,25,Arnold Palmer,", 87, American Hall of Fame professional golfer.",https://en.wikipedia.org/wiki/Arnold_Palmer,71,2016,September,,87.0,,United States of America,,,4.276666,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,1,0,0,0,0,0,1
97628,10,Enrique Metinides,", 88, Mexican crime photographer.",https://en.wikipedia.org/wiki/Enrique_Metinides,7,2022,May,,88.0,,Mexico,,,2.079442,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,1,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

#### Checking Updated `num_categories` Value Counts

In [257]:
# Checking updated num_categories value counts
df["num_categories"].value_counts()

1    94696
2     3245
0       84
3       19
Name: num_categories, dtype: int64

<IPython.core.display.Javascript object>

#### Observations:
- Most likely, the number of entries without any category will not update until we search `info_1` columns. 
- The remaining search of `info_2` and above will either result in redundant categorization or adding categories to entries that have at least 1 category already.
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Finding `known_for` Roles in `info_2_1`

In [259]:
# Obtaining values for column and their counts
roles_list = df["info_2_1"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [None]:
# Code to check each value
roles_list.pop()

In [None]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_2_1"].notna()].index
#             if "general" in df.loc[index, "info_2_1"]
#         ],
#         "info_2_1",
#     ]
#     .value_counts()
#     .index.tolist()
# )

In [None]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_list, key=lambda x: len(x), reverse=True)

In [None]:
# # Example code to quick-check a specific entry
# df[df["info_2_1"] == "outlaw country music singer songwriter"]

#### Creating Lists for Each `known_for` Category

In [None]:
# Creating lists for each category and sorting by decreasing length and removing duplicates

politics_govt_law = []
politics_govt_law = sorted(list(set(politics_govt_law)), key=lambda x: len(x), reverse=True)  

arts = []
arts = sorted(list(set(arts)), key=lambda x: len(x), reverse=True)  

sports = []
sports = sorted(list(set(sports)), key=lambda x: len(x), reverse=True) 

sciences = []
sciences = sorted(list(set(sciences)), key=lambda x: len(x), reverse=True) 

business_farming = []
business_farming = sorted(list(set(business_farming)), key=lambda x: len(x), reverse=True)  

academia_humanities = []
academia_humanities = sorted(list(set(academia_humanities)), key=lambda x: len(x), reverse=True)  

law_enf_military_operator = []
law_enf_military_operator = sorted(list(set(law_enf_military_operator)), key=lambda x: len(x), reverse=True)  

spiritual = []
spiritual = sorted(list(set(spiritual)), key=lambda x: len(x), reverse=True)  

social = []
social = sorted(list(set(social)), key=lambda x: len(x), reverse=True)  

crime = []
crime = sorted(list(set(crime)), key=lambda x: len(x), reverse=True)  

event_record_other = []
event_record_other = sorted(list(set(event_record_other)), key=lambda x: len(x), reverse=True)  

other_species = []
other_species = sorted(list(set(other_species)), key=lambda x: len(x), reverse=True) 

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [None]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,

}

#### Extracting Category from `info_2_1`

In [None]:
%%time

# Column to check
column = 'info_2_1'

# Start dataframe
dataframe = df[df[column].notna()]

# For loop to find role in column and extract it as category
for category, category_lst in known_for_dict.items():
    for role in category_lst:
        for index in dataframe.index:
                item = df.loc[index, column]
                if item:
                    if role in item:
                        df.loc[index, category] = 1
                        df.loc[index, column] = item.replace(role, '').strip()

# Calculating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

#### Checking Updated `num_categories` Value Counts

In [258]:
# # Checking updated num_categories value counts
# df["num_categories"].value_counts()

<IPython.core.display.Javascript object>

In [256]:
print("dunzo!")

# Sound notification when cell executes
chime.success()

dunzo!


<IPython.core.display.Javascript object>

#### Finding `known_for` Roles in `info_2_1`

In [None]:
# Obtaining values for column and their counts
roles_list = df["info_2_1"].value_counts(ascending=True).index.tolist()

In [None]:
# Code to check each value
roles_list.pop()

In [None]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_2_1"].notna()].index
#             if "general" in df.loc[index, "info_2_1"]
#         ],
#         "info_2_1",
#     ]
#     .value_counts()
#     .index.tolist()
# )

In [None]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_list, key=lambda x: len(x), reverse=True)

In [None]:
# # Example code to quick-check a specific entry
# df[df["info_2_1"] == "outlaw country music singer songwriter"]

#### Creating Lists for Each `known_for` Category

In [None]:
# Creating lists for each category and sorting by decreasing length and removing duplicates

politics_govt_law = []
politics_govt_law = sorted(list(set(politics_govt_law)), key=lambda x: len(x), reverse=True)  

arts = []
arts = sorted(list(set(arts)), key=lambda x: len(x), reverse=True)  

sports = []
sports = sorted(list(set(sports)), key=lambda x: len(x), reverse=True) 

sciences = []
sciences = sorted(list(set(sciences)), key=lambda x: len(x), reverse=True) 

business_farming = []
business_farming = sorted(list(set(business_farming)), key=lambda x: len(x), reverse=True)  

academia_humanities = []
academia_humanities = sorted(list(set(academia_humanities)), key=lambda x: len(x), reverse=True)  

law_enf_military_operator = []
law_enf_military_operator = sorted(list(set(law_enf_military_operator)), key=lambda x: len(x), reverse=True)  

spiritual = []
spiritual = sorted(list(set(spiritual)), key=lambda x: len(x), reverse=True)  

social = []
social = sorted(list(set(social)), key=lambda x: len(x), reverse=True)  

crime = []
crime = sorted(list(set(crime)), key=lambda x: len(x), reverse=True)  

event_record_other = []
event_record_other = sorted(list(set(event_record_other)), key=lambda x: len(x), reverse=True)  

other_species = []
other_species = sorted(list(set(other_species)), key=lambda x: len(x), reverse=True) 

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [None]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,

}

#### Extracting Category from `info_2_1`

In [None]:
%%time

# Column to check
column = 'info_2_1'

# Start dataframe
dataframe = df[df[column].notna()]

# For loop to find role in column and extract it as category
for category, category_lst in known_for_dict.items():
    for role in category_lst:
        for index in dataframe.index:
                item = df.loc[index, column]
                if item:
                    if role in item:
                        df.loc[index, category] = 1
                        df.loc[index, column] = item.replace(role, '').strip()

# Calculating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

#### Checking Updated `num_categories` Value Counts

In [258]:
# # Checking updated num_categories value counts
# df["num_categories"].value_counts()

<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` for the next iteration.

#### Observations:
- It is time to export our dataframe and start a new notebook.

### Exporting Dataset to SQLite Database [wp_life_expect_clean5.db]()

In [None]:
# # Exporting dataframe

# # Saving dataset in a SQLite database
# conn = sql.connect("wp_life_expect_clean5.db")
# df.to_sql("wp_life_expect_clean5", conn, index=False)

# # Chime notification when cell executes
# chime.success()

# [Proceed to Data Cleaning Part 6]()