# Wikipedia Notable Life Expectancies
# [Notebook  : Data Cleaning Part 8](https://github.com/teresahanak/wikipedia-life-expectancy/blob/main/wp_life_expect_data_clean8_thanak_2022_07_26.ipynb)
### Context

The
### Objective

The
### Data Dictionary
- Feature: Description

### Importing Libraries

In [1]:
# To structure code automatically
%load_ext nb_black

# To import/export sqlite databases
import sqlite3 as sql

# To save/open python objects in pickle file
import pickle

# To help with reading, cleaning, and manipulating data
import pandas as pd
import numpy as np
import re

# To define maximum number of columns to be displayed in a dataframe
pd.set_option("display.max_columns", None)
# To define the maximum number of rows to be displayed in a dataframe
pd.set_option("display.max_rows", 200)

# To supress warnings
# import warnings

# warnings.filterwarnings("ignore")

# To set some visualization attributes
pd.set_option("max_colwidth", 150)

# To play auditory cue when cell has executed, has warning, or has error and set chime theme
import chime

chime.theme("zelda")

<IPython.core.display.Javascript object>

## Data Overview

### [Reading](https://github.com/teresahanak/wikipedia-life-expectancy/blob/main/wp_life_expect_clean7.db), Sampling, and Checking Data Shape

In [2]:
# Reading the dataset
conn = sql.connect("wp_life_expect_clean7.db")
data = pd.read_sql("SELECT * FROM wp_life_expect_clean7", conn)

# Making a working copy
df = data.copy()

# Checking the shape
print(f"There are {df.shape[0]} rows and {df.shape[1]} columns.")

# Checking first 2 rows of the data
df.head(2)

There are 98040 rows and 27 columns.


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
0,1,William Chappell,", 86, British dancer, ballet designer and director.",https://en.wikipedia.org/wiki/William_Chappell_(dancer),21,1994,January,,86.0,,United Kingdom of Great Britain and Northern Ireland,,,3.091042,0,0,0,0,0,1,0,0,0,0,0,0,1
1,1,Raymond Crotty,", 68, Irish economist, writer, and academic.",https://en.wikipedia.org/wiki/Raymond_Crotty,12,1994,January,,68.0,,Ireland,,,2.564949,0,0,0,1,0,1,0,0,1,0,0,0,3


<IPython.core.display.Javascript object>

In [3]:
# Checking last 2 rows of the data
df.tail(2)

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
98038,9,Aamir Liaquat Hussain,", 50, Pakistani journalist and politician, MNA .",https://en.wikipedia.org/wiki/Aamir_Liaquat_Hussain,99,2022,June,", since",50.0,,Pakistan,,"2002 2007, since 2018",4.60517,0,0,0,0,0,1,0,0,1,0,0,0,2
98039,9,Zou Jing,", 86, Chinese engineer, member of the Chinese Academy of Engineering.",https://en.wikipedia.org/wiki/Zou_Jing_(engineer),3,2022,June,,86.0,,"China, People's Republic of",,,1.386294,1,0,0,0,0,0,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

In [4]:
# Checking a sample of the data
df.sample(5)

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
19918,27,Frederick G. Dutton,", 82, American lawyer, advisor to President Kennedy.",https://en.wikipedia.org/wiki/Fred_Dutton,8,2005,June,,82.0,,United States of America,,,2.197225,0,0,0,0,0,0,0,0,1,0,0,0,1
54732,7,Susan Allen,", 64, American harpist, brain cancer.",https://en.wikipedia.org/wiki/Susan_Allen_(musician),36,2015,September,,64.0,brain cancer,United States of America,,,3.610918,0,0,0,0,0,1,0,0,0,0,0,0,1
29031,14,Ted Kennedy,", 83, Canadian hockey player , heart failure.",https://en.wikipedia.org/wiki/Ted_Kennedy_(ice_hockey),242,2009,August,Toronto Maple Leafs,83.0,heart failure,Canada,,Toronto Maple Leafs,5.493061,0,0,0,0,0,0,1,0,0,0,0,0,1
64832,11,Segun Bucknor,", 71, Nigerian musician, complications from multiple strokes.",https://en.wikipedia.org/wiki/Segun_Bucknor,23,2017,August,,71.0,complications from multiple strokes,Nigeria,,,3.178054,0,0,0,0,0,1,0,0,0,0,0,0,1
33285,23,Gordon P. Allen,", 81, American politician.",https://en.wikipedia.org/wiki/Gordon_P._Allen,4,2010,December,,81.0,,United States of America,,,1.609438,0,0,0,0,0,0,0,0,1,0,0,0,1


<IPython.core.display.Javascript object>

### Checking Data Types and Null Values

In [5]:
# Checking data types and null values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98040 entries, 0 to 98039
Data columns (total 27 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   day                        98040 non-null  object 
 1   name                       98040 non-null  object 
 2   info                       98040 non-null  object 
 3   link                       98040 non-null  object 
 4   num_references             98040 non-null  int64  
 5   year                       98040 non-null  int64  
 6   month                      98040 non-null  object 
 7   info_parenth               36660 non-null  object 
 8   age                        98040 non-null  float64
 9   cause_of_death             33336 non-null  object 
 10  place_1                    97887 non-null  object 
 11  place_2                    8116 non-null   object 
 12  info_parenth_copy          36660 non-null  object 
 13  log_num_references         98040 non-null  flo

<IPython.core.display.Javascript object>

#### Observations:
- With our dataset loaded, we can pick up where we left off with extracting known_for and `cause_of_death` values.
- As all of the numbered `info_` columns have been searched and dropped, we are left with `info_parenth` (and its copy).  
- By definition, we would expect `info_parenth` to contain non-essential values.  The column contains a lot of values, so we will begin by looking only for `known_for` information for the few entries that do not yet have a `known_for` category.
- Then we can consider an approach to searching for any `cause_of_death` information in `info_parenth`.

### Extracting Remaining `known_for` for Entries Still Lacking a `known_for` Category

#### Checking Entries Lacking lacking `known_for` Category

In [6]:
# Checking entries with num_categories == 0
df[df["num_categories"] == 0]

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
278,4,Aníbal,", 53, Mexican , brain cancer.",https://en.wikipedia.org/wiki/An%C3%ADbal_(wrestler),20,1994,March,professional wrestler,53.0,brain cancer,Mexico,,professional wrestler,3.044522,0,0,0,0,0,0,0,0,0,0,0,0,0
11490,10,Chandra Khonnokyoong,", 91, Thai .",https://en.wikipedia.org/wiki/Chandra_Khonnokyoong,25,2000,September,,91.0,,Thailand,,,3.258097,0,0,0,0,0,0,0,0,0,0,0,0,0
12052,3,Kung Fu,", 49, Mexican , arterial hyper tension.",https://en.wikipedia.org/wiki/Kung_Fu_(wrestler),11,2001,January,,49.0,arterial hyper tension,Mexico,,,2.484907,0,0,0,0,0,0,0,0,0,0,0,0,0
16376,10,Little Eva,", .",https://en.wikipedia.org/wiki/Little_Eva,14,2003,April,"née Eva Narcissus Boyd, , pop singer",59.0,,United States of America,,"née Eva Narcissus Boyd, , pop singer",2.70805,0,0,0,0,0,0,0,0,0,0,0,0,0
36930,12,Natalee Holloway,", 18",https://en.wikipedia.org/wiki/Natalee_Holloway,198,2012,January,"in , student, missing since declared legally dead on this date",18.0,,United States of America,,"in 2005, student, missing since 2005 declared legally dead on this date",5.293305,0,0,0,0,0,0,0,0,0,0,0,0,0
79603,27,Sudhakar Chaturvedi,", 122 .",https://en.wikipedia.org/wiki/Sudhakar_Chaturvedi,37,2020,February,"claimed, Vedic scholar and courier Mahatma Gandhi",122.0,,India,,"claimed, Vedic scholar and courier Mahatma Gandhi",3.637586,0,0,0,0,0,0,0,0,0,0,0,0,0


<IPython.core.display.Javascript object>

#### Observations:
- We can see some additional information in `info_parenth` for some of the values.
- Since we previously separated the information contained in parentheses from the original `info` column, we will maintain `info_parenth_copy` intact, and utilize `info_parenth` for any value extraction.
- We will hard-code the missing `known_for` info for the entries lacking that information, since there are only 2, and we have the link readily available to find it or it is apparent in the link value.  

#### Finding `known_for` Roles in `info_parenth` for Entries Lacking any Category

In [7]:
# Obtaining values for column and their counts
roles_list = (
    df[df["num_categories"] == 0]["info_parenth"]
    .value_counts(ascending=True)
    .index.tolist()
)

<IPython.core.display.Javascript object>

In [8]:
# Code to check each value
value = roles_list.pop()
value

'claimed, Vedic scholar and courier Mahatma Gandhi'

<IPython.core.display.Javascript object>

In [9]:
# # Create specific_roles_cause_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_parenth"].notna()].index
#             if value in df.loc[index, "info_parenth"]
#         ],
#         "info_parenth",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [10]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_list, key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [11]:
# # Example code to quick-check a specific entry
# df[df["info_parenth"] == value]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [12]:
# Creating lists for each category and sorting by decreasing length and removing duplicates

politics_govt_law = ["and courier Mahatma Gandhi"]
politics_govt_law = sorted(
    list(set(politics_govt_law)), key=lambda x: len(x), reverse=True
)

arts = ["née Eva Narcissus Boyd, , pop singer"]
arts = sorted(list(set(arts)), key=lambda x: len(x), reverse=True)

sports = ["professional wrestler", "wrestler"]
sports = sorted(list(set(sports)), key=lambda x: len(x), reverse=True)

sciences = []
sciences = sorted(list(set(sciences)), key=lambda x: len(x), reverse=True)

business_farming = []
business_farming = sorted(
    list(set(business_farming)), key=lambda x: len(x), reverse=True
)

academia_humanities = ["scholar"]
academia_humanities = sorted(
    list(set(academia_humanities)), key=lambda x: len(x), reverse=True
)

law_enf_military_operator = []
law_enf_military_operator = sorted(
    list(set(law_enf_military_operator)), key=lambda x: len(x), reverse=True
)

spiritual = ["claimed,  Vedic", "spiritual teacher"]
spiritual = sorted(list(set(spiritual)), key=lambda x: len(x), reverse=True)

social = []
social = sorted(list(set(social)), key=lambda x: len(x), reverse=True)

crime = []
crime = sorted(list(set(crime)), key=lambda x: len(x), reverse=True)

event_record_other = ["in , student, missing since declared legally dead on this date"]
event_record_other = sorted(
    list(set(event_record_other)), key=lambda x: len(x), reverse=True
)

other_species = []
other_species = sorted(list(set(other_species)), key=lambda x: len(x), reverse=True)

cause_of_death = []
cause_of_death = sorted(list(set(cause_of_death)), key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [13]:
# Hard-coding info_parenth for entry lacking known_for values
df.loc[
    df[df["link"] == "https://en.wikipedia.org/wiki/Chandra_Khonnokyoong"].index,
    "info_parenth",
] = "spiritual teacher"


# Hard-coding info_parenth for entry lacking known_for values
df.loc[
    df[df["link"] == "https://en.wikipedia.org/wiki/Kung_Fu_(wrestler)"].index,
    "info_parenth",
] = "wrestler"

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [14]:
# Combining separate lists into one dictionary
known_for_dict = {
    "social": social,
    "spiritual": spiritual,
    "academia_humanities": academia_humanities,
    "business_farming": business_farming,
    "sciences": sciences,
    "politics_govt_law": politics_govt_law,
    "law_enf_military_operator": law_enf_military_operator,
    "crime": crime,
    "event_record_other": event_record_other,
    "other_species": other_species,
    "arts": arts,
    "sports": sports,
}

<IPython.core.display.Javascript object>

#### Extracting `known_for` Categories Values from `info_parenth` for Entries without a Category

In [15]:
%%time

# Column to check
column = 'info_parenth'

# Start dataframe
dataframe = df[(df[column].notna()) & (df['num_categories']==0)]
                
# For loop to find role in column and extract it as category
for category, category_lst in known_for_dict.items():
    for role in category_lst:
        for index in dataframe.index:
                item = df.loc[index, column]
                if item:
                    if role in item:
                        df.loc[index, category] = 1
                        df.loc[index, column] = item.replace(role, '').strip()

# Calculating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

CPU times: total: 15.6 ms
Wall time: 12 ms


<IPython.core.display.Javascript object>

#### Checking Updated `num_categories` Value Counts

In [16]:
# Checking Updated num_categories Value Counts
df["num_categories"].value_counts()

1    84108
2    12805
3     1088
4       36
5        3
Name: num_categories, dtype: int64

<IPython.core.display.Javascript object>

#### Observations:
- All entries now have at least one `known_for` category.
- Next, we will proceed to examine the values in `cause_of_death` to potentially guide finding that information in `info_parenth` for entries that lack a value for it.

### Searching for Remaining `cause_of_death` Values in `info_parenth`

In [17]:
# # Creating list of cause_of_death values
# cause_list = df["cause_of_death"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [18]:
# # Updating cause_list to contain only causes that are in info_parenth values
# cause_list = [
#     item
#     for item in cause_list
#     if any(
#         item in value
#         for value in df[df["info_parenth"].notna()]["info_parenth"]
#     )
# ]

<IPython.core.display.Javascript object>

In [19]:
# # Checking the cause_of_death values starting with most frequent
# value = cause_list.pop()
# value

<IPython.core.display.Javascript object>

In [20]:
# # Creating list of info_parenth values that contain cause_of_death value
# df.loc[
#     [
#         index
#         for index in df[df["info_parenth"].notna()].index
#         if value in df.loc[index, "info_parenth"]
#     ],
#     "info_parenth",
# ].value_counts().index.tolist()

<IPython.core.display.Javascript object>

In [27]:
# # Checking specific entries
# df[
#     df["info_parenth"]
#     == "1969 1974 and Foreign Affairs 1974 1982; 1982 1992, Vice Chancellor 1974 1982; 1982 1992"
# ]

<IPython.core.display.Javascript object>

#### Creating List for `cause_of_death`

In [22]:
# Creating list for cause_of_death
cause_of_death = [
    "cancer",
    "pancreatic cancer",
    "adrenal cancer",
    "endometrial cancer",
    "nasopharynx cancer",
    "parotid cancer",
    "prostate cancer",
    "multiple myeloma, blood cancer",
    "bowel cancer",
    "oesophageal cancer",
    "liver cancer",
    "lung cancer",
    "cancer",
    "breast cancer",
    "testicular cancer",
    "ovarian cancer",
    "peritoneal cancer",
    "heart attack",
    "COVID",
    "congestive heart failure",
    "heart failure",
    "ischemic heart failure",
    "pneumonia",
    "AIDS, pneumonia",
    "pneumonia, infarctions",
    "bronchial pneumonia",
    "stroke",
    "heat stroke",
    "shot",
    "gunshot wounds",
    "traffic collision",
    "natural causes disease",
    "natural causes",
    "suicide",
    "suspected suicide",
    "suicide by drowning",
    "suicide by hydrogen sulfide",
    "suicide by hanging",
    "Alzheimer disease",
    "leukemia",
    "Parkinson disease",
    "Parkinson’s disease",
    "Creutzfeldt Jakob disease",
    "kidney disease",
    "Pick disease",
    "heart disease",
    "car accident",
    "injuries due to a fall",
    "fall",
    "subdural hematoma, fall",
    "multiple organ failure",
    "AIDS, lymphoma",
    "Hodgkin lymphoma",
    "gastric lymphoma",
    "plane crash",
    "amyotrophic lateral sclerosis",
    "euthanized",
    "uveal melanoma",
    "emphysema",
    "pulmonary emphysema",
    "emphysema, bronchitis",
    "Lewy body dementia",
    "renal failure",
    "intracerebral hemorrhage",
    "liver failure",
    "pulmonary embolism",
    "homicide",
    "pulmonary fibrosis",
    "idiopathic pulmonary fibrosis",
    "abdominal aortic aneurysm",
    "sepsis",
    "glioblastoma multiforme",
    "Jordanian bombings",
    "accidental shooting",
    "pulmonary edema",
    "septic infection",
    "myelodysplastic syndrome",
    "locked in syndrome",
    "multiple organ dysfunction syndrome",
    "superior vena cava syndrome",
    "Marfan syndrome",
    "Guillain Barré syndrome",
    "multiple sclerosis",
    "AIDS",
    "multiple organ failure",
    "pulmonary emphysema",
    "emphysema",
    "emphysema, bronchitis",
    "aortic dissection",
    "progressive supranuclear palsy",
    "Hodgkin lymphoma",
    "COPD",
    "pancreatitis",
    "cerebral haemorrhage",
    "ALS",
    "AL amyloidosis",
    "car accident",
    "accidental shooting",
    "epilepsy",
    "dilated cardiomyopathy",
    "thrombosis",
    "rheumatoid arthritis",
    "beheading",
    "leiomyosarcoma",
    "Ewing sarcoma",
    "sarcoma",
    "leptomeningeal carcinomatosis",
    "nasopharyngeal carcinoma",
    "small cell carcinoma",
    "myelodysplasia",
    "pulmonary embolism",
    "embolism",
    "suffocated",
    "cerebral haemorrhage",
    "assassination",
    "gastrointestinal hemorrhage",
    "intracerebral hemorrhage",
    "anaphylaxis",
    "progressive supranuclear palsy",
    "shelling",
    "pulmonary edema",
    "Jordanian bombings",
    "posterior cortical atrophy",
    "emphysema, bronchitis",
    "West Nile virus",
    "corticobasal degeneration",
    "heat stroke",
    "glioblastoma multiforme",
    "acute endocarditis",
    "arrhythmogenic right ventricular dysplasia",
    "alcoholism",
    "plane crash",
    "normal pressure hydrocephalus",
    "primary progressive aphasia",
    "dilated cardiomyopathy",
    "subdural haematoma",
    "arrhythmia",
    "thrombus",
    "thrombosis",
    "essential thrombocytosis",
    "thrombotic thrombocytopenic purpura",
    "vasculitis",
    "self defenestration",
    "ventricular tachycardia",
]

# Clearing out duplicate values and sorting in descending length order to use for extracting values
cause_of_death = sorted(list(set(cause_of_death)), key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [23]:
# Dropping info_parenth value for entries to avoid incorrect cause_of_death
df.loc[
    [
        index
        for index in df[df["info_parenth"].notna()].index
        if "breaststroke" in df.loc[index, "info_parenth"]
        or "backstroke" in df.loc[index, "info_parenth"]
    ],
    "info_parenth",
] = ""

# Dropping info_parenth value for entries to avoid incorrect cause_of_death
df.loc[
    [
        index
        for index in df[df["info_parenth"].notna()].index
        if "shot put" in df.loc[index, "info_parenth"]
        or "Aldershot" in df.loc[index, "info_parenth"]
    ],
    "info_parenth",
] = ""

# Dropping info_parent_copy value for entry to avoid incorrect cause_of_death
df.loc[df[df["info_parenth"] == "fallout shelter sign"].index, "info_parenth"] = ""

# Dropping info_parent_copy value for entry to avoid incorrect cause_of_death
df.loc[
    df[
        df["info_parenth"]
        == "HIV, President of the International AIDS Society 1994 1998"
    ].index,
    "info_parenth",
] = ""

# Dropping info_parent_copy value for entry to avoid incorrect cause_of_death
df.loc[
    df[df["info_parenth"] == "assassination of Orlando Letelier"].index, "info_parenth",
] = ""

# Dropping info_parenth value for entries to avoid incorrect cause_of_death
df.loc[
    [
        index
        for index in df[df["info_parenth"].notna()].index
        if "Suicide" in df.loc[index, "info_parenth"]
    ],
    "info_parenth",
] = ""

<IPython.core.display.Javascript object>

#### Extracting `cause_of_death` Values from `info_parenth`

In [24]:
%%time

# Column to search
column = "info_parenth"

# Dataframe to search
dataframe = df[df[column].notna()]

# For loop to extract cause from column to cause_of_death
for cause in cause_of_death:
    for index in dataframe.index:
        item = df.loc[index, column]
        if item:
            if cause in item:
                if df.loc[index, 'cause_of_death']:
                    df.loc[index, 'cause_of_death'] = df.loc[index, 'cause_of_death'] + '/' + cause
                    df.loc[index, column] = item.replace(cause, "").strip()
                else:
                    df.loc[index, "cause_of_death"] = cause
                    df.loc[index, column] = item.replace(cause, "").strip()

# Checking number of cause_of_death values
print(
    f'There are {df["cause_of_death"].notna().sum()} values in cause_of_death column.\n'
)

There are 33461 values in cause_of_death column.

CPU times: total: 27.1 s
Wall time: 27.1 s


<IPython.core.display.Javascript object>

In [26]:
print("dunzo!")

# Sound notification when cell executes
chime.success()

dunzo!


<IPython.core.display.Javascript object>

#### Verifying that Values in info_3_0 Are Exhausted

In [None]:
# # Verifying that `info_3_0` is exhausted
# df["info_3_0"].value_counts()

#### Dropping info_3_0

In [None]:
# # Dropping info_3_0
# df.drop("info_3_0", axis=1, inplace=True)

# # Checking sample
# df.sample()

#### Observations:
- Our search of column info_3_0 is finished and have dropped that column.
- We will now save our dataset and pick back up in a new notebook.

### Exporting Dataset to SQLite Database [wp_life_expect_clean8.db]()

In [None]:
# # Exporting dataframe

# # Saving dataset in a SQLite database
# conn = sql.connect("wp_life_expect_clean8.db")
# df.to_sql("wp_life_expect_clean8", conn, index=False)

# # Chime notification when cell executes
# chime.success()

# [Proceed to Data Cleaning Part ]()