# Wikipedia Notable Life Expectancies
# [Notebook  : Data Cleaning Part 8](https://github.com/teresahanak/wikipedia-life-expectancy/blob/main/wp_life_expect_data_clean8_thanak_2022_07_26.ipynb)
### Context

The
### Objective

The
### Data Dictionary
- Feature: Description

### Importing Libraries

In [1]:
# To structure code automatically
%load_ext nb_black

# To import/export sqlite databases
import sqlite3 as sql

# To save/open python objects in pickle file
import pickle

# To help with reading, cleaning, and manipulating data
import pandas as pd
import numpy as np
import re

# To define maximum number of columns to be displayed in a dataframe
pd.set_option("display.max_columns", None)
# To define the maximum number of rows to be displayed in a dataframe
pd.set_option("display.max_rows", 200)

# To supress warnings
# import warnings

# warnings.filterwarnings("ignore")

# To set some visualization attributes
pd.set_option("max_colwidth", 150)

# To play auditory cue when cell has executed, has warning, or has error and set chime theme
import chime

chime.theme("zelda")

<IPython.core.display.Javascript object>

## Data Overview

### [Reading](https://github.com/teresahanak/wikipedia-life-expectancy/blob/main/wp_life_expect_clean7.db), Sampling, and Checking Data Shape

In [2]:
# Reading the dataset
conn = sql.connect("wp_life_expect_clean7.db")
data = pd.read_sql("SELECT * FROM wp_life_expect_clean7", conn)

# Making a working copy
df = data.copy()

# Checking the shape
print(f"There are {df.shape[0]} rows and {df.shape[1]} columns.")

# Checking first 2 rows of the data
df.head(2)

There are 98040 rows and 27 columns.


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
0,1,William Chappell,", 86, British dancer, ballet designer and director.",https://en.wikipedia.org/wiki/William_Chappell_(dancer),21,1994,January,,86.0,,United Kingdom of Great Britain and Northern Ireland,,,3.091042,0,0,0,0,0,1,0,0,0,0,0,0,1
1,1,Raymond Crotty,", 68, Irish economist, writer, and academic.",https://en.wikipedia.org/wiki/Raymond_Crotty,12,1994,January,,68.0,,Ireland,,,2.564949,0,0,0,1,0,1,0,0,1,0,0,0,3


<IPython.core.display.Javascript object>

In [3]:
# Checking last 2 rows of the data
df.tail(2)

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
98038,9,Aamir Liaquat Hussain,", 50, Pakistani journalist and politician, MNA .",https://en.wikipedia.org/wiki/Aamir_Liaquat_Hussain,99,2022,June,"2002 2007, since 2018",50.0,,Pakistan,,", since",4.60517,0,0,0,0,0,1,0,0,1,0,0,0,2
98039,9,Zou Jing,", 86, Chinese engineer, member of the Chinese Academy of Engineering.",https://en.wikipedia.org/wiki/Zou_Jing_(engineer),3,2022,June,,86.0,,"China, People's Republic of",,,1.386294,1,0,0,0,0,0,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

In [4]:
# Checking a sample of the data
df.sample(5)

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
64699,2,Shen Daren,", 89, Chinese politician, Communist Party Chief of Ningxia .",https://en.wikipedia.org/wiki/Shen_Daren,6,2017,August,1986 1989 and Jiangsu 1989 1993,89.0,,"China, People's Republic of",,and Jiangsu,1.94591,0,0,0,0,0,0,0,0,1,0,0,0,1
20748,20,William W. Howells,", 97, American anthropologist.",https://en.wikipedia.org/wiki/William_W._Howells,7,2005,December,,97.0,,United States of America,,,2.079442,0,0,0,1,0,0,0,0,0,0,0,0,1
93852,8,Pedro Feliciano,", 45, Puerto Rican baseball player .",https://en.wikipedia.org/wiki/Pedro_Feliciano,26,2021,November,"Mets, Fukuoka Daiei Hawks",45.0,,Puerto Rico,United States of America,"Mets, Fukuoka Daiei Hawks",3.295837,0,0,0,0,0,0,1,0,0,0,0,0,1
26423,12,David Foster Wallace,", 46, American author and essayist , suicide by hanging.",https://en.wikipedia.org/wiki/David_Foster_Wallace,82,2008,September,,46.0,suicide by hanging,United States of America,,,4.418841,0,0,0,0,0,1,0,0,0,0,0,0,1
56597,23,Nikolay Abramov,", 54, Russian Vepsian writer and translator.",https://en.wikipedia.org/wiki/Nikolay_Abramov_(writer),3,2016,January,,54.0,,Russia,,,1.386294,0,0,0,1,0,1,0,0,0,0,0,0,2


<IPython.core.display.Javascript object>

### Checking Data Types and Null Values

In [5]:
# Checking data types and null values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98040 entries, 0 to 98039
Data columns (total 27 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   day                        98040 non-null  object 
 1   name                       98040 non-null  object 
 2   info                       98040 non-null  object 
 3   link                       98040 non-null  object 
 4   num_references             98040 non-null  int64  
 5   year                       98040 non-null  int64  
 6   month                      98040 non-null  object 
 7   info_parenth               36660 non-null  object 
 8   age                        98040 non-null  float64
 9   cause_of_death             33336 non-null  object 
 10  place_1                    97887 non-null  object 
 11  place_2                    8116 non-null   object 
 12  info_parenth_copy          36660 non-null  object 
 13  log_num_references         98040 non-null  flo

<IPython.core.display.Javascript object>

#### Observations:
- With our dataset loaded, we can pick up where we left off with extracting known_for and `cause_of_death` values.
- As all of the numbered `info_` columns have been searched and dropped, we are left with `info_parenth` (and its copy).  
- By definition, we would expect `info_parenth` to contain non-essential values.  The column contains a lot of values, so we will begin by looking only for `known_for` information for the few entries that do not yet have a `known_for` category.
- Then we can consider an approach to searching for any `cause_of_death` information in `info_parenth`.

### Extracting Remaining `known_for` for Entries Still Lacking a `known_for` Category

#### Checking Entries Lacking lacking `known_for` Category

In [6]:
# Checking entries with num_categories == 0
df[df["num_categories"] == 0]

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
278,4,Aníbal,", 53, Mexican , brain cancer.",https://en.wikipedia.org/wiki/An%C3%ADbal_(wrestler),20,1994,March,professional wrestler,53.0,brain cancer,Mexico,,professional wrestler,3.044522,0,0,0,0,0,0,0,0,0,0,0,0,0
11490,10,Chandra Khonnokyoong,", 91, Thai .",https://en.wikipedia.org/wiki/Chandra_Khonnokyoong,25,2000,September,,91.0,,Thailand,,,3.258097,0,0,0,0,0,0,0,0,0,0,0,0,0
12052,3,Kung Fu,", 49, Mexican , arterial hyper tension.",https://en.wikipedia.org/wiki/Kung_Fu_(wrestler),11,2001,January,,49.0,arterial hyper tension,Mexico,,,2.484907,0,0,0,0,0,0,0,0,0,0,0,0,0
16376,10,Little Eva,", .",https://en.wikipedia.org/wiki/Little_Eva,14,2003,April,"née Eva Narcissus Boyd, , pop singer",59.0,,United States of America,,"née Eva Narcissus Boyd, , pop singer",2.70805,0,0,0,0,0,0,0,0,0,0,0,0,0
36930,12,Natalee Holloway,", 18",https://en.wikipedia.org/wiki/Natalee_Holloway,198,2012,January,"in 2005, student, missing since 2005 declared legally dead on this date",18.0,,United States of America,,"in , student, missing since declared legally dead on this date",5.293305,0,0,0,0,0,0,0,0,0,0,0,0,0
79603,27,Sudhakar Chaturvedi,", 122 .",https://en.wikipedia.org/wiki/Sudhakar_Chaturvedi,37,2020,February,"claimed, Vedic scholar and courier Mahatma Gandhi",122.0,,India,,"claimed, Vedic scholar and courier Mahatma Gandhi",3.637586,0,0,0,0,0,0,0,0,0,0,0,0,0


<IPython.core.display.Javascript object>

#### Observations:
- We can see some additional information in `info_parenth` for some of the values.
- Since we previously separated the information contained in parentheses from the original `info` column, we will maintain `info_parenth_copy` intact, and utilize `info_parenth` for any value extraction.
- We will hard-code the missing `known_for` info for the entries lacking that information, since there are only 2, and we have the link readily available to find it or it is apparent in the link value.  

#### Finding `known_for` Roles in `info_parenth_copy` for Entries Lacking any Category

In [7]:
# Obtaining values for column and their counts
roles_list = (
    df[df["num_categories"] == 0]["info_parenth_copy"]
    .value_counts(ascending=True)
    .index.tolist()
)

<IPython.core.display.Javascript object>

In [8]:
# Code to check each value
value = roles_list.pop()
value

'claimed, Vedic scholar and courier Mahatma Gandhi'

<IPython.core.display.Javascript object>

In [9]:
# # Create specific_roles_cause_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_parenth_copy"].notna()].index
#             if value in df.loc[index, "info_parenth_copy"]
#         ],
#         "info_parenth_copy",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [10]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_list, key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [11]:
# # Example code to quick-check a specific entry
# df[df["info_parenth_copy"] == value]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [12]:
# Creating lists for each category and sorting by decreasing length and removing duplicates

politics_govt_law = ["and courier Mahatma Gandhi"]
politics_govt_law = sorted(
    list(set(politics_govt_law)), key=lambda x: len(x), reverse=True
)

arts = ["née Eva Narcissus Boyd, , pop singer"]
arts = sorted(list(set(arts)), key=lambda x: len(x), reverse=True)

sports = ["professional wrestler", "wrestler"]
sports = sorted(list(set(sports)), key=lambda x: len(x), reverse=True)

sciences = []
sciences = sorted(list(set(sciences)), key=lambda x: len(x), reverse=True)

business_farming = []
business_farming = sorted(
    list(set(business_farming)), key=lambda x: len(x), reverse=True
)

academia_humanities = ["scholar"]
academia_humanities = sorted(
    list(set(academia_humanities)), key=lambda x: len(x), reverse=True
)

law_enf_military_operator = []
law_enf_military_operator = sorted(
    list(set(law_enf_military_operator)), key=lambda x: len(x), reverse=True
)

spiritual = ["claimed,  Vedic", "spiritual teacher"]
spiritual = sorted(list(set(spiritual)), key=lambda x: len(x), reverse=True)

social = []
social = sorted(list(set(social)), key=lambda x: len(x), reverse=True)

crime = []
crime = sorted(list(set(crime)), key=lambda x: len(x), reverse=True)

event_record_other = ["in , student, missing since declared legally dead on this date"]
event_record_other = sorted(
    list(set(event_record_other)), key=lambda x: len(x), reverse=True
)

other_species = []
other_species = sorted(list(set(other_species)), key=lambda x: len(x), reverse=True)

cause_of_death = []
cause_of_death = sorted(list(set(cause_of_death)), key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [13]:
# Hard-coding info_parenth_copy for entry lacking known_for values
df.loc[
    df[df["link"] == "https://en.wikipedia.org/wiki/Chandra_Khonnokyoong"].index,
    "info_parenth_copy",
] = "spiritual teacher"


# Hard-coding info_parenth_copy for entry lacking known_for values
df.loc[
    df[df["link"] == "https://en.wikipedia.org/wiki/Kung_Fu_(wrestler)"].index,
    "info_parenth_copy",
] = "wrestler"

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [14]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

<IPython.core.display.Javascript object>

#### Extracting `known_for` Categories Values from `info_parenth_copy` for Entries without a Category

In [15]:
%%time

# Column to check
column = 'info_parenth_copy'

# Start dataframe
dataframe = df[(df[column].notna()) & (df['num_categories']==0)]
                
# For loop to find role in column and extract it as category
for category, category_lst in known_for_dict.items():
    for role in category_lst:
        for index in dataframe.index:
                item = df.loc[index, column]
                if item:
                    if role in item:
                        df.loc[index, category] = 1
                        df.loc[index, column] = item.replace(role, '').strip()

# Calculating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

CPU times: total: 15.6 ms
Wall time: 13 ms


<IPython.core.display.Javascript object>

#### Checking Updated `num_categories` Value Counts

In [16]:
# Checking Updated num_categories Value Counts
df["num_categories"].value_counts()

1    84108
2    12805
3     1088
4       36
5        3
Name: num_categories, dtype: int64

<IPython.core.display.Javascript object>

#### Observations:
- All entries now have at least one `known_for` category.
- Next, we will proceed to examine the values in `cause_of_death` to potentially guide finding that information in `info_parenth_copy` for entries that lack a value for it.

### Searching for Remaining `cause_of_death` Values in `info_parenth_copy`

In [17]:
# # Creating list of cause_of_death values
# cause_list = df["cause_of_death"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [18]:
# # Updating cause_list to contain only causes that are in info_parenth_copy values
# cause_list = [
#     item
#     for item in cause_list
#     if any(
#         item in value
#         for value in df[df["info_parenth_copy"].notna()]["info_parenth_copy"]
#     )
# ]

<IPython.core.display.Javascript object>

In [19]:
# # Checking the cause_of_death values starting with most frequent
# value = cause_list.pop()
# value

<IPython.core.display.Javascript object>

In [20]:
# # Creating list of info_parenth_copy values that contain cause_of_death value
# df.loc[
#     [
#         index
#         for index in df[df["info_parenth_copy"].notna()].index
#         if value in df.loc[index, "info_parenth_copy"]
#     ],
#     "info_parenth_copy",
# ].value_counts().index.tolist()

<IPython.core.display.Javascript object>

In [21]:
# # Checking specific entries
# df[
#     df["info_parenth_copy"]
#     == "1969 1974 and Foreign Affairs 1974 1982; 1982 1992, Vice Chancellor 1974 1982; 1982 1992"
# ]

<IPython.core.display.Javascript object>

#### Creating List for `cause_of_death`

In [22]:
# Creating list for cause_of_death
cause_of_death = [
    "cancer",
    "pancreatic cancer",
    "adrenal cancer",
    "endometrial cancer",
    "nasopharynx cancer",
    "parotid cancer",
    "prostate cancer",
    "multiple myeloma, blood cancer",
    "bowel cancer",
    "oesophageal cancer",
    "liver cancer",
    "lung cancer",
    "cancer",
    "breast cancer",
    "testicular cancer",
    "ovarian cancer",
    "peritoneal cancer",
    "heart attack",
    "COVID",
    "congestive heart failure",
    "heart failure",
    "ischemic heart failure",
    "pneumonia",
    "AIDS, pneumonia",
    "pneumonia, infarctions",
    "bronchial pneumonia",
    "stroke",
    "heat stroke",
    "shot",
    "gunshot wounds",
    "traffic collision",
    "natural causes disease",
    "natural causes",
    "suicide",
    "suspected suicide",
    "suicide by drowning",
    "suicide by hydrogen sulfide",
    "suicide by hanging",
    "Alzheimer disease",
    "leukemia",
    "Parkinson disease",
    "Parkinson’s disease",
    "Creutzfeldt Jakob disease",
    "kidney disease",
    "Pick disease",
    "heart disease",
    "car accident",
    "injuries due to a fall",
    "fall",
    "subdural hematoma, fall",
    "multiple organ failure",
    "AIDS, lymphoma",
    "Hodgkin lymphoma",
    "gastric lymphoma",
    "plane crash",
    "amyotrophic lateral sclerosis",
    "euthanized",
    "uveal melanoma",
    "emphysema",
    "pulmonary emphysema",
    "emphysema, bronchitis",
    "Lewy body dementia",
    "renal failure",
    "intracerebral hemorrhage",
    "liver failure",
    "pulmonary embolism",
    "homicide",
    "pulmonary fibrosis",
    "idiopathic pulmonary fibrosis",
    "abdominal aortic aneurysm",
    "sepsis",
    "glioblastoma multiforme",
    "Jordanian bombings",
    "accidental shooting",
    "pulmonary edema",
    "septic infection",
    "myelodysplastic syndrome",
    "locked in syndrome",
    "multiple organ dysfunction syndrome",
    "superior vena cava syndrome",
    "Marfan syndrome",
    "Guillain Barré syndrome",
    "multiple sclerosis",
    "AIDS",
    "multiple organ failure",
    "pulmonary emphysema",
    "emphysema",
    "emphysema, bronchitis",
    "aortic dissection",
    "progressive supranuclear palsy",
    "Hodgkin lymphoma",
    "COPD",
    "pancreatitis",
    "cerebral haemorrhage",
    "ALS",
    "AL amyloidosis",
    "car accident",
    "accidental shooting",
    "epilepsy",
    "dilated cardiomyopathy",
    "thrombosis",
    "rheumatoid arthritis",
    "beheading",
    "leiomyosarcoma",
    "Ewing sarcoma",
    "sarcoma",
    "leptomeningeal carcinomatosis",
    "nasopharyngeal carcinoma",
    "small cell carcinoma",
    "myelodysplasia",
    "pulmonary embolism",
    "embolism",
    "suffocated",
    "cerebral haemorrhage",
    "assassination",
    "gastrointestinal hemorrhage",
    "intracerebral hemorrhage",
    "anaphylaxis",
    "progressive supranuclear palsy",
    "shelling",
    "pulmonary edema",
    "Jordanian bombings",
    "posterior cortical atrophy",
    "emphysema, bronchitis",
    "West Nile virus",
    "corticobasal degeneration",
    "heat stroke",
    "glioblastoma multiforme",
    "acute endocarditis",
    "arrhythmogenic right ventricular dysplasia",
    "alcoholism",
    "plane crash",
    "normal pressure hydrocephalus",
    "primary progressive aphasia",
    "dilated cardiomyopathy",
    "subdural haematoma",
    "arrhythmia",
    "thrombus",
    "thrombosis",
    "essential thrombocytosis",
    "thrombotic thrombocytopenic purpura",
    "vasculitis",
    "self defenestration",
    "ventricular tachycardia",
]

# Clearing out duplicate values and sorting in descending length order to use for extracting values
cause_of_death = sorted(list(set(cause_of_death)), key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [23]:
# Dropping info_parenth_copy value for entries to avoid incorrect cause_of_death
df.loc[
    [
        index
        for index in df[df["info_parenth_copy"].notna()].index
        if "breaststroke" in df.loc[index, "info_parenth_copy"]
        or "backstroke" in df.loc[index, "info_parenth_copy"]
    ],
    "info_parenth_copy",
] = ""

# Dropping info_parenth_copy value for entries to avoid incorrect cause_of_death
df.loc[
    [
        index
        for index in df[df["info_parenth_copy"].notna()].index
        if "shot put" in df.loc[index, "info_parenth_copy"]
        or "Aldershot" in df.loc[index, "info_parenth_copy"]
    ],
    "info_parenth_copy",
] = ""

# Dropping info_parent_copy value for entry to avoid incorrect cause_of_death
df.loc[
    df[df["info_parenth_copy"] == "fallout shelter sign"].index, "info_parenth_copy"
] = ""

# Dropping info_parent_copy value for entry to avoid incorrect cause_of_death
df.loc[
    df[
        df["info_parenth_copy"]
        == "HIV, President of the International AIDS Society 1994 1998"
    ].index,
    "info_parenth_copy",
] = ""

# Dropping info_parent_copy value for entry to avoid incorrect cause_of_death
df.loc[
    df[df["info_parenth_copy"] == "assassination of Orlando Letelier"].index,
    "info_parenth_copy",
] = ""

# Dropping info_parenth_copy value for entries to avoid incorrect cause_of_death
df.loc[
    [
        index
        for index in df[df["info_parenth_copy"].notna()].index
        if "Suicide" in df.loc[index, "info_parenth_copy"]
    ],
    "info_parenth_copy",
] = ""

<IPython.core.display.Javascript object>

#### Extracting `cause_of_death` Values from `info_parenth_copy`

In [24]:
%%time

# Column to search
column = "info_parenth_copy"

# Dataframe to search
dataframe = df[df[column].notna()]

# For loop to extract cause from column to cause_of_death
for cause in cause_of_death:
    for index in dataframe.index:
        item = df.loc[index, column]
        if item:
            if cause in item:
                if df.loc[index, 'cause_of_death']:
                    df.loc[index, 'cause_of_death'] = df.loc[index, 'cause_of_death'] + '/' + cause
                    df.loc[index, column] = item.replace(cause, "").strip()
                else:
                    df.loc[index, "cause_of_death"] = cause
                    df.loc[index, column] = item.replace(cause, "").strip()

# Checking number of cause_of_death values
print(
    f'There are {df["cause_of_death"].notna().sum()} values in cause_of_death column.\n'
)

There are 33461 values in cause_of_death column.

CPU times: total: 27.6 s
Wall time: 27.6 s


<IPython.core.display.Javascript object>

#### Observations:
- We extracted ~130 values to `cause_of_death` with our last search.
- There are additional category values in `info_parenth_copy` that were not previously captured.  The challenge of searching this column is that it has a very high proportion of unique values, so the cost of capturing the additional may be too high.
- Let us attempt to narrow the search by restricting it to most frequent key words, such as "MP", etc., then only search `info_parenth_copy` values for them for entries that do not already have the associated category.

### Search of `info_parenth_copy` for Additional `known_for` Categories with Constraints

#### Checking Initial Value Counts for `info_parenth_copy`

In [25]:
# Checking info_parenth_copy initial value counts
df["info_parenth_copy"].value_counts()

                                           12570
, ,                                         3293
,                                           1999
since                                       1202
national team                                167
                                           ...  
Silver Oak Cellars                             1
, Foreign Secretary                            1
UCLA, Milwaukee Bucks                          1
Brownsville Station                            1
and minister of industry and technology        1
Name: info_parenth_copy, Length: 13248, dtype: int64

<IPython.core.display.Javascript object>

#### Observations:
- Almost 1/3 of `info_parenth_copy` values are unique, so we will aim to streamline the search for information that will add a new category to an entry, but not take excessive time.  By definition, information in parentheses is anticipated to add detail that is non-essential to the primary information.

#### Function to Save Indices of Rows Matching Regular Expressions Pattern to a List and Print Number of Rows with Match

In [26]:
# Define a function that takes dataframe, column name, and re pattern as arguments and returns list of indices
# for which column value matches re pattern
def rows_with_pattern(dataframe, column, pattern):
    """
    Takes input of dataframe, column name, and re pattern 
    and returns list of indices for rows that contain match
    for pattern anywhere within value for given column.
    
    dataframe: dataframe
    column: column name
    pattern: re pattern
    """
    index_list = []

    for i in dataframe.index:
        item = dataframe.loc[i, column]
        match = re.search(pattern, item)
        if match:
            index_list.append(i)
    print(
        f"There are {len(index_list)} rows with matching pattern in column '{column}'."
    )
    return index_list

<IPython.core.display.Javascript object>

#### Function to Use rows_with_pattern Function for Multiple Regular Expression Patterns

In [27]:
# Define a function that calls rows_with_pattern function for multiple re patterns
# returning a single list of indices for all rows with any pattern match


def multiple_patterns(dataframe, column, patterns):
    """
    Takes input dataframe, column, and list of re patterns and returns single list 
    of indices for rows in which a match for any pattern is found with re.search
    
    dataframe: dataframe
    column: column name
    patterns: list of re patterns
    """
    rows_combined = []

    # For loop to check each pattern
    for pattern in patterns:

        # List and number of rows matching each pattern
        print(pattern)
        rows_to_check = rows_with_pattern(dataframe, column, pattern)
        print("")

        # Add list for each pattern to combined list
        rows_combined += rows_to_check

    return rows_combined

<IPython.core.display.Javascript object>

#### Checking a Sample of `info_parenth_copy` Unique Values

In [28]:
# Checking a sample of info_parenth_copy Unique Values
pd.Series(df["info_parenth_copy"].value_counts().index.tolist()).sample(100)

593                                                                                Bangkok Bank
181                                                                                         BBC
2632                                                                         Accrington Stanley
7654                                                                    , nominal head of GKChP
6804                                             Argeș Pitești, Steaua București, national team
9491                                              Simon Property Group, producer Indiana Pacers
3955                                                                                  Accenture
1873                                                                        The Jazz Messengers
4438                                                              Guild of Film Critics, winner
5703                                                                           Hapoel Kfar Saba
9401                                    

<IPython.core.display.Javascript object>

#### Observations:
- We can see that many of the values are proper nouns of places, people, or titles (some in quotations).  
- First, we can drop the titles in quotations using regular expressions.
-  Then we will take an approach of combining all of the values into a single list, then converting to a single string, then back to a list, to reduce then reduce to a set of individual word values that can be prioritized.

#### Checking and Dropping Titles in Quotations from `info_parenth_copy`

In [29]:
# Column to check
column = "info_parenth_copy"

# Dataframe to check
dataframe = df[df[column].notna()]

# Patterns for re
pattern = f'".*"'

# Finding indices of rows that do and do not have pattern
rows_to_check = rows_with_pattern(dataframe, column, pattern)

# Checking a sample of rows
df.loc[rows_to_check, :].sample(2)

There are 676 rows with matching pattern in column 'info_parenth_copy'.


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
51474,8,Curtis Lee,", 75, American singer , cancer.",https://en.wikipedia.org/wiki/Curtis_Lee,6,2015,January,"""Pretty Little Angel Eyes""",75.0,cancer,United States of America,,"""Pretty Little Angel Eyes""",1.94591,0,0,0,0,0,1,0,0,0,0,0,0,1
81524,9,Little Richard,", 87, American Hall of Fame rock and roll singer , pianist and songwriter, bone cancer.",https://en.wikipedia.org/wiki/Little_Richard,291,2020,May,"""Tutti Frutti"", ""Long Tall Sally"", ""Lucille""",87.0,bone cancer,United States of America,,"""Tutti Frutti"", ""Long Tall Sally"", ""Lucille""",5.676754,0,0,0,0,0,1,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

In [30]:
# For loop to extract quotations and characters within from info_parenth_copy
for index in rows_to_check:
    item = df.loc[index, column]
    match = re.search(pattern, item)
    if match:
        df.loc[index, column] = re.sub(pattern, "", df.loc[index, column]).strip()

# Recheck a sample of treated rows
df.loc[rows_to_check, :].sample(2)

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
60563,30,Curly Putman,", 85, American songwriter .",https://en.wikipedia.org/wiki/Curly_Putman,16,2016,October,"""Green, Green Grass of Home"", ""D I V O R C E"", ""He Stopped Loving Her Today""",85.0,,United States of America,,,2.833213,0,0,0,0,0,1,0,0,0,0,0,0,1
59868,7,Clifford Curry,", 79, American beach music and R&B singer .",https://en.wikipedia.org/wiki/Clifford_Curry,10,2016,September,"""She Shot a Hole in My Soul""",79.0,,United States of America,,,2.397895,0,0,0,0,0,1,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

In [31]:
# Rechecking info_parenth_copy value counts
df["info_parenth_copy"].value_counts()

                                            13061
, ,                                          3293
,                                            2000
since                                        1202
national team                                 167
                                            ...  
, President of the Governing Council            1
Gloucestershire, Worcestershire,                1
North Sydney, Eastern Suburbs, New South        1
neuroendocrine, ,                               1
and minister of industry and technology         1
Name: info_parenth_copy, Length: 12723, dtype: int64

<IPython.core.display.Javascript object>

#### Observations:
- Dropping song and other titles in quotations simplified the remaining values.
- Next, we will create an abbreviated single list of all of the values, then use the values to extract additional categories.

#### Extracting Additional `known_for` Category from `info_parenth_copy` Using `roles_list`

In [747]:
## Combining `info_parenth_copy` Values into a Single List of Unique Values for Searching

# Creating a single list of info_parenth_copy values
roles_list = df["info_parenth_copy"].value_counts().index.tolist()

# Converting to a single string and removing commas, semicolons, and extra whitespace
roles_list = (
    " ".join(roles_list).replace(",", "").replace(";", "").replace("  ", " ").strip()
)

# Splitting into a list of individual words and converting to a Series to easily check value counts
roles_list = roles_list.split()

# Converting to a series for value_counts in ascending order for use of pop() on most frequent values first
# and dropping obvious extraneous values
roles_list = (
    pd.Series(roles_list)
    .value_counts(ascending=True)
    .drop(
        [
            "and",
            "of",
            "the",
            "The",
            "since",
            "on",
            "nd",
            "th",
            "for",
            "to",
            "&",
            "de",
            "winner",
            "in",
            "at",
            "this",
        ]
    )
)

# Dropping values that occur fewer than 3 times
roles_list = roles_list[roles_list > 2]

# Converting back to list
roles_list = roles_list.index.tolist()

print(f"There are {len(roles_list)} remaining unique individual words in roles_list.\n")

There are 2497 remaining unique individual words in roles_list.



<IPython.core.display.Javascript object>

In [1104]:
# # Example code to check each value in roles_list in descending order of frequency
# value = roles_list.pop()
# value

<IPython.core.display.Javascript object>

In [1096]:
# # Create specific_roles_cause_list for above popped value
# # only checking entries not already in category associated with popped value
# specific_roles_cause_list = (
#     df.loc[
#         [
#             index
#             for index in df[
#                 (df["info_parenth_copy"].notna())
#                 & (df["politics_govt_law"] == 0)
#                 #                 & (df["law_enf_military_operator"] == 0)
#                 #                 & (df["spiritual"] == 0)
#                 #                 & (df["sports"] == 0)
#                 #                 & (df["academia_humanities"] == 0)
#                 #                 & (df["arts"] == 0)
#                 #                 & (df["business_farming"] == 0)
#             ].index
#             if value in df.loc[index, "info_parenth_copy"]
#         ],
#         "info_parenth_copy",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [1097]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_cause_list, key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [1098]:
# # Checking individua entries as needed
# df[df[column] == "The Cardinals"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category and for `cause_of_death`

In [1099]:
# Creating lists for each category and sorting by decreasing length and removing duplicates

politics_govt_law = [
    "Middletown and politician, Senator and Deputy First Minister",
    "Deputy Minister of Foreign Affairs and politician,",
    ", Minister of Military Production",
    "and Minister of Education",
    "and politician, member of the Arizona House of Representatives and Senate",
    "and politician, member of the Tennessee House of Representatives",
    "and politician, member of the House of Representatives",
    "House of Representatives and politician, member of the Senate and",
    "Dewan Negara, director and politician, member of the",
    "Landtag of Bavaria, , and politician, member of the",
    "House of Councillors and singer, member of the",
    "Sejm, member of the",
    "John Paul II Catholic University of Lublin and member of the Senate",
    "Michigan Wolverines, Eastern Michigan Eagles and politician, member of the Michigan House of Representatives",
    "Detroit Tigers, Philadelphia Phillies and politician, member of the House of Representatives and Senator",
    "Montreal Alouettes, Duke Blue Devils, Baltimore Colts and politician, member of the Maryland Senate",
    "Winnipeg Blue Bombers and politician, member of the Washington House of Representatives and Senate",
    "San Francisco ers and politician, member of the State Assembly and County Board of Supervisors",
    "Tennessee Volunteers and politician, member of the House of Representatives and Senate",
    "Detroit Lions, businessman and politician, member of the House of Representatives",
    "St Louis Cardinals, Dolphins and politician, member of the Nebraska Legislature",
    "Giants and politician, member of the Minnesota House of Representatives",
    "WBRE TV and politician, member of the House of Representatives since",
    "Augusta National Golf Club, member of the House of Representatives",
    "Oglethorpe and politician, member of the House of Representatives",
    "Oakland Raiders and politician, member of the Minnesota Senate",
    "Gators and politician, member of the House of Representatives",
    "Cleveland Browns and politician, member of the Ohio Senate",
    "and member of the House of Lords since",
    "Random House and human rights activist Helsinki Watch",
    "and member of the House of Lords since",
    "Royal Opera House, and conductor",
    "Parliament House, Canberra",
    "Sydney Opera House",
    "Cairo Opera House",
    "Ballymaloe House",
    "Crowded House",
    "Random House",
    "Feral House",
    "Chaos theory, Chief Scientific Advisor to the Government",
    "Journalists' Union of the Athens Daily Newspapers and politician, MEP and Vice President",
    "National Rifle Association of America, Oscar, , President of the , winner",
    "Pulitzer Prize for Presidential Medal of Freedom, Order of the South,",
    "caretaker minister of information & broadcasting and politician,",
    "minister of culture, , and politician,",
    "minister of culture, ,",
    "Toronto Maple Leafs, commentator MP and politician, Stanley Cup",
    "MP and politician, Liberal People Party , and leader of the",
    "Detroit Red Wings, Toronto Maple Leafs and politician, MP",
    "Venizelos SA and politician, MP , and Deputy Speaker",
    "Interacting boson model and politician, MP",
    "MP, human rights activist and politician,",
    "WF and politician, MP for Stirling",
    "MP, civic activist and politician,",
    ", , economist, and politician, MP",
    "LRT televizija and politician, MP",
    "MP, , , composer and politician,",
    "Partex Group and politician, MP",
    "MP, MEP, and politician, , , ,",
    "MLA, MP, and politician, and",
    ", coach and politician, MP",
    "MP, and politician, since",
    "Rustavi Ensemble, MP",
    "MP, and politician,",
    "MP and politician,",
    "and politician, MP",
    "MP, , ,",
    ", MP, ,",
    "Chicago Cardinals St Louis Cardinals and politician, mayor of Starkville,",
    "Cleveland Browns, Baltimore Ravens and politician, mayor of Tangipahoa,",
    "Brainerd International Raceway and politician, mayor of Flint, Michigan",
    "Lézignan, national team and politician, mayor of Lézignan Corbières",
    "PGA Tour, Tour and politician, mayor of Villa Allende since",
    "and politician, mayor of Ouray, Colorado",
]
politics_govt_law = sorted(
    list(set(politics_govt_law)), key=lambda x: len(x), reverse=True
)

arts = [
    "Minister Frederick Gray in the James Bond films",
    "WCGA and TV WTVC broadcaster",
    "The Wrecking Crew and original member of Herb Alpert Tijuana Brass",
    "Agence Presse, member",
    "Rams, member of Pro Football Hall of Fame, and actor ,",
    "Sydney Opera House",
    "News Corporation, President of the Academy of Television Arts & Sciences since",
    "Royal Court Theatre",
]
arts = sorted(list(set(arts)), key=lambda x: len(x), reverse=True)

sports = [
    "and sports team owner Islanders",
    "and rugby union player national team, Wellington",
    "and owner of the New Patriots football team",
    "and sports team owner Dolphins, Panthers",
    "and sports team owner Orlando Magic",
    "and Baseball team owner Mets",
    "and college football coach Columbia University",
    "ThyssenKrupp, member of IOC",
    "Plugged Nickle, member of National Museum of Racing and Hall of Fame",
    "FC Köln, member of World Cup winning team",
    "since and member of FIFA Council since",
    "TVS, CBS Sports, Sportsvision and baseball Chicago White Sox",
    "and baseball player Memphis Red Sox",
    "baseball beat and San Francisco Giants writer J G Taylor Spink Award, recipient of the",
    "Yankees, Giants",
    ", Senator for Minnesota , Olympic silver medalist in ice hockey",
    ", and Olympic eventing chef d'équipe ,",
    ", President of Olympic Committee",
    ", Olympic speed skater",
    ", and Olympic swimmer ,",
    "Olympic medallist,",
    "Little Caesars, Detroit Red Wings, Detroit Tigers",
]
sports = sorted(list(set(sports)), key=lambda x: len(x), reverse=True)

sciences = [
    "operated on President John Fitzgerald Kennedy and Lee Harvey Oswald",
]
sciences = sorted(list(set(sciences)), key=lambda x: len(x), reverse=True)

business_farming = [
    "Uni President Enterprises Corporation death announced on this date",
    "and Air , President of Delta Air Lines",
]
business_farming = sorted(
    list(set(business_farming)), key=lambda x: len(x), reverse=True
)

academia_humanities = [
    "Northern Arizona University",
    "and President of Royal Society",
    "University of Chicago, President of Physical Society",
    "Graham number, President of the Mathematical Society",
    "HIV, President of the International  Society",
    "National Gypsum and academic administrator, President of the UNC",
    "Clemson Tigers and academic administrator, President of Clemson University",
]
academia_humanities = sorted(
    list(set(academia_humanities)), key=lambda x: len(x), reverse=True
)

law_enf_military_operator = [
    "People Army, Minister of Defence",
    ", first Defence Minister of ia",
    "and Minister of Defence of",
    "and Minister of Defense",
    "Flying Tigers",
]
law_enf_military_operator = sorted(
    list(set(law_enf_military_operator)), key=lambda x: len(x), reverse=True
)

spiritual = [
    "and dean of the College of Cardinals",
]
spiritual = sorted(list(set(spiritual)), key=lambda x: len(x), reverse=True)

social = []
social = sorted(list(set(social)), key=lambda x: len(x), reverse=True)

crime = [
    "'Manson Family' member",
]
crime = sorted(list(set(crime)), key=lambda x: len(x), reverse=True)

event_record_other = []
event_record_other = sorted(
    list(set(event_record_other)), key=lambda x: len(x), reverse=True
)

other_species = []
other_species = sorted(list(set(other_species)), key=lambda x: len(x), reverse=True)

cause_of_death = []
cause_of_death = sorted(list(set(cause_of_death)), key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [1100]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

<IPython.core.display.Javascript object>

#### Extracting `known_for` Categories and `cause_of_death` Values from `info_parenth_copy`

In [1102]:
%%time

# Column to check
column = 'info_parenth_copy'

# Start dataframe
dataframe = df[df[column].notna()]

# For loop to find cause in column and extract it to cause_of_death
for cause in cause_of_death:
    for index in dataframe.index:
        item = df.loc[index, column]
        if item:
            if cause in item:
                if df.loc[index, 'cause_of_death']:
                    df.loc[index, 'cause_of_death'] = df.loc[index, 'cause_of_death'] + '/' + cause
                    df.loc[index, column] = item.replace(cause, '').strip()
                else:
                    df.loc[index, 'cause_of_death'] = cause
                    df.loc[index, column] = item.replace(cause, '').strip()
                
                
# For loop to find role in column and extract it as category
for category, category_lst in known_for_dict.items():
    for role in category_lst:
        for index in dataframe.index:
                item = df.loc[index, column]
                if item:
                    if role in item:
                        df.loc[index, category] = 1
                        df.loc[index, column] = item.replace(role, '').strip()

# Calculating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking number of cause_of_death values
print(f'There are {df["cause_of_death"].notna().sum()} values in cause_of_death column.\n')

There are 33461 values in cause_of_death column.

CPU times: total: 26 s
Wall time: 26.1 s


<IPython.core.display.Javascript object>

#### Checking Updated `num_categories` Value Counts

In [1103]:
# Checking Updated num_categories Value Counts
df["num_categories"].value_counts()

1    84009
2    12896
3     1096
4       36
5        3
Name: num_categories, dtype: int64

<IPython.core.display.Javascript object>

#### Observations:
- We will proceed to rebuild `known_for_dict` and `cause_of_death` for the next iteration.

In [None]:
print("dunzo!")

# Sound notification when cell executes
chime.success()

#### Extracting Additional `known_for` Category from `info_parenth_copy` Using `roles_list`

In [747]:
## Combining `info_parenth_copy` Values into a Single List of Unique Values for Searching

# Creating a single list of info_parenth_copy values
roles_list = df["info_parenth_copy"].value_counts().index.tolist()

# Converting to a single string and removing commas, semicolons, and extra whitespace
roles_list = (
    " ".join(roles_list).replace(",", "").replace(";", "").replace("  ", " ").strip()
)

# Splitting into a list of individual words and converting to a Series to easily check value counts
roles_list = roles_list.split()

# Converting to a series for value_counts in ascending order for use of pop() on most frequent values first
# and dropping obvious extraneous values
roles_list = (
    pd.Series(roles_list)
    .value_counts(ascending=True)
    .drop(
        [
            "and",
            "of",
            "the",
            "The",
            "since",
            "on",
            "nd",
            "th",
            "for",
            "to",
            "&",
            "de",
            "winner",
            "in",
            "at",
            "this",
        ]
    )
)

# Dropping values that occur fewer than 3 times
roles_list = roles_list[roles_list > 2]

# Converting back to list
roles_list = roles_list.index.tolist()

print(f"There are {len(roles_list)} remaining unique individual words in roles_list.\n")

There are 2497 remaining unique individual words in roles_list.



<IPython.core.display.Javascript object>

In [1104]:
# # Example code to check each value in roles_list in descending order of frequency
# value = roles_list.pop()
# value

<IPython.core.display.Javascript object>

In [1096]:
# # Create specific_roles_cause_list for above popped value
# # only checking entries not already in category associated with popped value
# specific_roles_cause_list = (
#     df.loc[
#         [
#             index
#             for index in df[
#                 (df["info_parenth_copy"].notna())
#                 & (df["politics_govt_law"] == 0)
#                 #                 & (df["law_enf_military_operator"] == 0)
#                 #                 & (df["spiritual"] == 0)
#                 #                 & (df["sports"] == 0)
#                 #                 & (df["academia_humanities"] == 0)
#                 #                 & (df["arts"] == 0)
#                 #                 & (df["business_farming"] == 0)
#             ].index
#             if value in df.loc[index, "info_parenth_copy"]
#         ],
#         "info_parenth_copy",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [1097]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_cause_list, key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [1098]:
# # Checking individua entries as needed
# df[df[column] == "The Cardinals"]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category and for `cause_of_death`

In [None]:
# Creating lists for each category and sorting by decreasing length and removing duplicates

politics_govt_law = []
politics_govt_law = sorted(list(set(politics_govt_law)), key=lambda x: len(x), reverse=True)  

arts = []
arts = sorted(list(set(arts)), key=lambda x: len(x), reverse=True)  

sports = []
sports = sorted(list(set(sports)), key=lambda x: len(x), reverse=True) 

sciences = []
sciences = sorted(list(set(sciences)), key=lambda x: len(x), reverse=True) 

business_farming = []
business_farming = sorted(list(set(business_farming)), key=lambda x: len(x), reverse=True)  

academia_humanities = []
academia_humanities = sorted(list(set(academia_humanities)), key=lambda x: len(x), reverse=True)  

law_enf_military_operator = []
law_enf_military_operator = sorted(list(set(law_enf_military_operator)), key=lambda x: len(x), reverse=True)  

spiritual = []
spiritual = sorted(list(set(spiritual)), key=lambda x: len(x), reverse=True)  

social = []
social = sorted(list(set(social)), key=lambda x: len(x), reverse=True)  

crime = []
crime = sorted(list(set(crime)), key=lambda x: len(x), reverse=True)  

event_record_other = []
event_record_other = sorted(list(set(event_record_other)), key=lambda x: len(x), reverse=True)  

other_species = []
other_species = sorted(list(set(other_species)), key=lambda x: len(x), reverse=True) 

cause_of_death = []
cause_of_death = sorted(list(set(cause_of_death)), key=lambda x: len(x), reverse=True)

#### Creating known_for_dict Dictionary of Category Keys and Specific Role Lists of Values

In [None]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,

}

#### Extracting `known_for` Categories and `cause_of_death` Values from `info_parenth_copy`

In [None]:
%%time

# Column to check
column = 'info_parenth_copy'

# Start dataframe
dataframe = df[df[column].notna()]

# For loop to find cause in column and extract it to cause_of_death
for cause in cause_of_death:
    for index in dataframe.index:
        item = df.loc[index, column]
        if item:
            if cause in item:
                if df.loc[index, 'cause_of_death']:
                    df.loc[index, 'cause_of_death'] = df.loc[index, 'cause_of_death'] + '/' + cause
                    df.loc[index, column] = item.replace(cause, '').strip()
                else:
                    df.loc[index, 'cause_of_death'] = cause
                    df.loc[index, column] = item.replace(cause, '').strip()
                
                
# For loop to find role in column and extract it as category
for category, category_lst in known_for_dict.items():
    for role in category_lst:
        for index in dataframe.index:
                item = df.loc[index, column]
                if item:
                    if role in item:
                        df.loc[index, category] = 1
                        df.loc[index, column] = item.replace(role, '').strip()

# Calculating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

# Checking number of cause_of_death values
print(f'There are {df["cause_of_death"].notna().sum()} values in cause_of_death column.\n')

#### Checking Updated num_categories Value Counts

In [None]:
# Checking Updated num_categories Value Counts
df["num_categories"].value_counts()

#### Observations:
- We will proceed to rebuild known_for_dict and cause_of_death for the next iteration.

#### Verifying that Values in info_3_0 Are Exhausted

In [None]:
# # Verifying that `info_3_0` is exhausted
# df["info_3_0"].value_counts()

#### Dropping info_3_0

In [None]:
# # Dropping info_3_0
# df.drop("info_3_0", axis=1, inplace=True)

# # Checking sample
# df.sample()

#### Observations:
- Our search of column info_3_0 is finished and have dropped that column.
- We will now save our dataset and pick back up in a new notebook.

### Exporting Dataset to SQLite Database [wp_life_expect_clean8.db]()

In [None]:
# # Exporting dataframe

# # Saving dataset in a SQLite database
# conn = sql.connect("wp_life_expect_clean8.db")
# df.to_sql("wp_life_expect_clean8", conn, index=False)

# # Chime notification when cell executes
# chime.success()

# [Proceed to Data Cleaning Part ]()