# Wikipedia Notable Life Expectancies
# [Notebook  : Data Cleaning Part 8](https://github.com/teresahanak/wikipedia-life-expectancy/blob/main/wp_life_expect_data_clean8_thanak_2022_07_26.ipynb)
### Context

The
### Objective

The
### Data Dictionary
- Feature: Description

### Importing Libraries

In [1]:
# To structure code automatically
%load_ext nb_black

# To import/export sqlite databases
import sqlite3 as sql

# To save/open python objects in pickle file
import pickle

# To help with reading, cleaning, and manipulating data
import pandas as pd
import numpy as np
import re

# To define maximum number of columns to be displayed in a dataframe
pd.set_option("display.max_columns", None)
# To define the maximum number of rows to be displayed in a dataframe
pd.set_option("display.max_rows", 200)

# To suppress warnings
# import warnings

# warnings.filterwarnings("ignore")

# To set some visualization attributes
pd.set_option("max_colwidth", 150)

# To play auditory cue when cell has executed, has warning, or has error and set chime theme
import chime

chime.theme("zelda")

<IPython.core.display.Javascript object>

## Data Overview

### [Reading](https://github.com/teresahanak/wikipedia-life-expectancy/blob/main/wp_life_expect_clean7.db), Sampling, and Checking Data Shape

In [2]:
# Reading the dataset; the connection is closed as soon as the table is in memory
# so the database file is not left open for the rest of the session
conn = sql.connect("wp_life_expect_clean7.db")
try:
    data = pd.read_sql("SELECT * FROM wp_life_expect_clean7", conn)
finally:
    conn.close()

# Making a working copy so the loaded data stays untouched
df = data.copy()

# Checking the shape
print(f"There are {df.shape[0]} rows and {df.shape[1]} columns.")

# Checking first 2 rows of the data
df.head(2)

There are 98040 rows and 27 columns.


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
0,1,William Chappell,", 86, British dancer, ballet designer and director.",https://en.wikipedia.org/wiki/William_Chappell_(dancer),21,1994,January,,86.0,,United Kingdom of Great Britain and Northern Ireland,,,3.091042,0,0,0,0,0,1,0,0,0,0,0,0,1
1,1,Raymond Crotty,", 68, Irish economist, writer, and academic.",https://en.wikipedia.org/wiki/Raymond_Crotty,12,1994,January,,68.0,,Ireland,,,2.564949,0,0,0,1,0,1,0,0,1,0,0,0,3


<IPython.core.display.Javascript object>

In [3]:
# Displaying the final 2 rows of the working copy
df.tail(n=2)

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
98038,9,Aamir Liaquat Hussain,", 50, Pakistani journalist and politician, MNA .",https://en.wikipedia.org/wiki/Aamir_Liaquat_Hussain,99,2022,June,", since",50.0,,Pakistan,,"2002 2007, since 2018",4.60517,0,0,0,0,0,1,0,0,1,0,0,0,2
98039,9,Zou Jing,", 86, Chinese engineer, member of the Chinese Academy of Engineering.",https://en.wikipedia.org/wiki/Zou_Jing_(engineer),3,2022,June,,86.0,,"China, People's Republic of",,,1.386294,1,0,0,0,0,0,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

In [4]:
# Checking a sample of the data
# (fixed random_state so the same rows are shown on every run of the notebook)
df.sample(5, random_state=42)

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
30683,28,Edward L. Athey,", 88, American football, basketball and baseball player, baseball and basketball coach.",https://en.wikipedia.org/wiki/Edward_L._Athey,9,2010,February,,88.0,,United States of America,,,2.302585,0,0,0,0,0,0,1,0,0,0,0,0,1
78332,27,Sachhidanand Narayan Deb,", 98, Indian politician, MLA .",https://en.wikipedia.org/wiki/Sachhidanand_Narayan_Deb,6,2019,December,,98.0,,India,,1971 1977,1.94591,0,0,0,0,0,0,0,0,1,0,0,0,1
89761,5,Lucinda Franks,", 74, American journalist , cancer.",https://en.wikipedia.org/wiki/Lucinda_Franks,17,2021,May,"Pulitzer Prize, , , winner",74.0,cancer,United States of America,,"Pulitzer Prize, , , winner 1971",2.890372,0,0,0,0,0,1,0,0,0,0,0,0,1
77611,13,Sean Bonney,", 50, English poet.",https://en.wikipedia.org/wiki/Sean_Bonney,9,2019,November,,50.0,,United Kingdom of Great Britain and Northern Ireland,,,2.302585,0,0,0,0,0,1,0,0,0,0,0,0,1
12954,19,Stanley Mosk,", 88, American jurist, politician, and attorney.",https://en.wikipedia.org/wiki/Stanley_Mosk,43,2001,June,,88.0,,United States of America,,,3.78419,0,0,0,0,0,0,0,0,1,0,0,0,1


<IPython.core.display.Javascript object>

### Checking Data Types and Null Values

In [5]:
# Checking data types and null values (non-null count and dtype for each column)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98040 entries, 0 to 98039
Data columns (total 27 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   day                        98040 non-null  object 
 1   name                       98040 non-null  object 
 2   info                       98040 non-null  object 
 3   link                       98040 non-null  object 
 4   num_references             98040 non-null  int64  
 5   year                       98040 non-null  int64  
 6   month                      98040 non-null  object 
 7   info_parenth               36660 non-null  object 
 8   age                        98040 non-null  float64
 9   cause_of_death             33336 non-null  object 
 10  place_1                    97887 non-null  object 
 11  place_2                    8116 non-null   object 
 12  info_parenth_copy          36660 non-null  object 
 13  log_num_references         98040 non-null  flo

<IPython.core.display.Javascript object>

#### Observations:
- With our dataset loaded, we can pick up where we left off with extracting `known_for` and `cause_of_death` values.
- As all of the numbered `info_` columns have been searched and dropped, we are left with `info_parenth` (and its copy).  
- By definition, we would expect `info_parenth` to contain non-essential values.  The column contains a lot of values, so we will begin by looking only for `known_for` information for the few entries that do not yet have a `known_for` category.
- Then we can consider an approach to searching for any `cause_of_death` information in `info_parenth`.

### Extracting Remaining `known_for` for Entries Still Lacking a `known_for` Category

#### Checking Entries Lacking a `known_for` Category

In [54]:
# Displaying the entries that have not been assigned any known_for category
df.loc[df["num_categories"].eq(0)]

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
11490,10,Chandra Khonnokyoong,", 91, Thai .",https://en.wikipedia.org/wiki/Chandra_Khonnokyoong,25,2000,September,,91.0,,Thailand,,spiritual teacher,3.258097,0,0,0,0,0,0,0,0,0,0,0,0,0
12052,3,Kung Fu,", 49, Mexican , arterial hyper tension.",https://en.wikipedia.org/wiki/Kung_Fu_(wrestler),11,2001,January,,49.0,arterial hyper tension,Mexico,,wrestler,2.484907,0,0,0,0,0,0,0,0,0,0,0,0,0


<IPython.core.display.Javascript object>

#### Observations:
- We can see some additional information in `info_parenth` for some of the values.
- Since we previously separated the information contained in parentheses from the original `info` column, we will maintain `info_parenth` intact, and utilize `info_parenth_copy` for any value extraction.
- We will hard-code the missing `known_for` info for the entries lacking that information, since there are only 2, and we have the link readily available to find it or it is apparent in the link value.  

#### Finding `known_for` Roles in `info_parenth_copy` for Entries Lacking any Category

In [34]:
# # Obtaining values for column and their counts
# roles_list = (
#     df[df["num_categories"] == 0]["info_parenth_copy"]
#     .value_counts(ascending=True)
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [35]:
# # Code to check each value
# value = roles_list.pop()
# value

<IPython.core.display.Javascript object>

In [36]:
# # Create specific_roles_list for above popped value
# specific_roles_list = (
#     df.loc[
#         [
#             index
#             for index in df[df["info_parenth_copy"].notna()].index
#             if value in df.loc[index, "info_parenth_copy"]
#         ],
#         "info_parenth_copy",
#     ]
#     .value_counts()
#     .index.tolist()
# )

<IPython.core.display.Javascript object>

In [37]:
# # Viewing list sorted by descending length to copy to dictionary below and screen values
# sorted(specific_roles_list, key=lambda x: len(x), reverse=True)

<IPython.core.display.Javascript object>

In [38]:
# # Example code to quick-check a specific entry
# df[df["info_parenth_copy"] == value]

<IPython.core.display.Javascript object>

#### Creating Lists for Each `known_for` Category

In [52]:
# Creating lists for each category and sorting by decreasing length and removing duplicates


def _longest_first(roles):
    """Return the unique strings in `roles`, longest first.

    Ties in length are broken alphabetically so the order is the same on every
    run (iteration order of a set of strings is not stable between sessions).
    """
    return sorted(set(roles), key=lambda role: (-len(role), role))


politics_govt_law = _longest_first(["and courier Mahatma Gandhi"])

arts = _longest_first(["née Eva Narcissus Boyd, ,  pop singer"])

# "professional wrestler" contains "wrestler"; longest-first keeps the longer role ahead
sports = _longest_first(["professional wrestler", "wrestler"])

sciences = _longest_first([])

business_farming = _longest_first([])

academia_humanities = _longest_first(["scholar"])

law_enf_military_operator = _longest_first([])

spiritual = _longest_first(["claimed,  Vedic", "spiritual teacher"])

social = _longest_first([])

crime = _longest_first([])

event_record_other = _longest_first(
    ["in 2005,  student, missing since 2005 declared legally dead on this date"]
)

other_species = _longest_first([])

cause_of_death = _longest_first([])

<IPython.core.display.Javascript object>

In [53]:
# Hard-coding info_parenth_copy for the entries lacking known_for values,
# looked up by each entry's link
hard_coded_roles = {
    "https://en.wikipedia.org/wiki/Chandra_Khonnokyoong": "spiritual teacher",
    "https://en.wikipedia.org/wiki/Kung_Fu_(wrestler)": "wrestler",
}

for entry_link, role_text in hard_coded_roles.items():
    df.loc[df["link"] == entry_link, "info_parenth_copy"] = role_text

<IPython.core.display.Javascript object>

#### Creating `known_for_dict` Dictionary of Category Keys and Specific Role Lists of Values

In [55]:
# Gathering the per-category role lists into one dictionary
# (category column name -> list of role strings), in the same key order as before
known_for_dict = dict(
    social=social,
    spiritual=spiritual,
    academia_humanities=academia_humanities,
    business_farming=business_farming,
    sciences=sciences,
    politics_govt_law=politics_govt_law,
    law_enf_military_operator=law_enf_military_operator,
    crime=crime,
    event_record_other=event_record_other,
    other_species=other_species,
    arts=arts,
    sports=sports,
)

<IPython.core.display.Javascript object>

#### Extracting `known_for` Category Values from `info_parenth_copy` for Entries without a Category

In [56]:
%%time

# Column to check
column = 'info_parenth_copy'

# Start dataframe
dataframe = df[(df[column].notna()) & (df['num_categories']==0)]
                
# For loop to find role in column and extract it as category
for category, category_lst in known_for_dict.items():
    for role in category_lst:
        for index in dataframe.index:
                item = df.loc[index, column]
                if item:
                    if role in item:
                        df.loc[index, category] = 1
                        df.loc[index, column] = item.replace(role, '').strip()

# Calculating num_categories
df["num_categories"] = df[known_for_dict.keys()].sum(axis=1)

CPU times: total: 15.6 ms
Wall time: 18.5 ms


<IPython.core.display.Javascript object>

#### Checking Updated `num_categories` Value Counts

In [57]:
# Checking updated num_categories value counts (how many entries have each number of categories)
df["num_categories"].value_counts()

1    84088
2    12806
3     1107
4       36
5        3
Name: num_categories, dtype: int64

<IPython.core.display.Javascript object>

#### Observations:
- All entries now have at least one `known_for` category.
- Next, we will proceed to examine the values in `cause_of_death` to potentially guide finding that information in `info_parenth_copy` for entries that lack a value for it.

### Searching for Remaining `cause_of_death` Values in `info_parenth_copy`

In [72]:
# Creating list of cause_of_death values, ordered from least to most frequent
# so that the most frequent value sits at the end of the list
cause_list = df["cause_of_death"].value_counts(ascending=True).index.tolist()

<IPython.core.display.Javascript object>

In [521]:
# Checking the cause_of_death values starting with most frequent
# NOTE: pop() removes the value from cause_list, so every run of this cell
# moves on to the next remaining value
value = cause_list.pop()
value

'COPD'

<IPython.core.display.Javascript object>

In [524]:
# Checking the info_parenth_copy values and their counts
df["info_parenth_copy"].value_counts()

, ,                      3124
                         2944
,                        1214
national team             167
1952                      157
                         ... 
2008, 2011                  1
"Sea Cruise"                1
1964 and coach Bacău        1
Barnet Museum               1
2002 2007, since 2018       1
Name: info_parenth_copy, Length: 16924, dtype: int64

<IPython.core.display.Javascript object>

In [522]:
# Listing the distinct info_parenth_copy values that contain the cause_of_death value
# (literal substring match, not a regex; missing values never match)
contains_value = df["info_parenth_copy"].str.contains(value, regex=False, na=False)
df.loc[contains_value, "info_parenth_copy"].value_counts().index.tolist()

['Ambassador and diplomat, Ghana to Czechoslovakia 1974 1976; COPD 1989 1992',
 'COPD, ,',
 'Police Department, COPD and police officer',
 'Miss America 1942 and actress COPD,']

<IPython.core.display.Javascript object>

In [523]:
# Displaying the entries whose info_parenth_copy equals the text below exactly
entry_text = (
    "Ambassador and diplomat, Ghana to Czechoslovakia 1974 1976; COPD 1989 1992"
)
df[df["info_parenth_copy"] == entry_text]

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,other_species,num_categories
46735,10,Shirley Temple,", 85, American actress , .",https://en.wikipedia.org/wiki/Shirley_Temple,145,2014,February,"Ambassador and diplomat, to Czechoslovakia ; COPD",85.0,,United States of America,,"Ambassador and diplomat, Ghana to Czechoslovakia 1974 1976; COPD 1989 1992",4.983607,0,0,0,0,0,1,0,0,0,0,0,0,1


<IPython.core.display.Javascript object>

In [314]:
# Creating list for cause_of_death
# Values are kept exactly as they appear in the text (including trailing commas).
cause_of_death = [
    "pancreatic cancer",
    "oesophageal cancer",
    "ovarian cancer",
    "testicular cancer",
    "breast cancer",
    "cancer",
    "lung cancer",
    "liver cancer",
    "bowel cancer",
    "adrenal cancer",
    "cancer Russian",
    "prostate cancer",
    "parotid cancer",
    "nasopharynx cancer",
    "endometrial cancer,",
    "peritoneal cancer",
    "heart attack",
    "COVID",
    "congestive heart failure",
    "heart failure",
    "ischemic heart failure",
    "AIDS, pneumonia,",
    "pneumonia",
    "bronchial pneumonia",
    "pneumonia, infarctions",
    "stroke",
    "heat stroke",
    "gunshot wounds",
    "shot",
    "traffic collision",
    "natural causes disease",
    "natural causes",
    "suspected suicide",
    "suicide by drowning",
    "suicide by hydrogen sulfide",
    "suicide by hanging",
    "suicide",
    "Alzheimer disease",
    "leukemia",
    "Parkinson disease",
    "heart disease",
    "car accident",
    "injuries due to a fall",
    "subdural hematoma, fall",
    "fall",
    "multiple organ failure",
    "AIDS, lymphoma,",
    "Hodgkin lymphoma",
    "gastric lymphoma",
    "stomach ulcer complications",
    "plane crash",
    "amyotrophic lateral sclerosis",
    "euthanized",
    "uveal melanoma",
    "pulmonary emphysema",
    "emphysema, bronchitis",
    "emphysema",
    "Lewy body dementia",
    "multiple myeloma, blood cancer",
    "renal failure",
    "intracerebral hemorrhage",
    "kidney disease",
    "liver failure",
    "pulmonary embolism",
    "homicide",
    "idiopathic pulmonary fibrosis",
    "pulmonary fibrosis",
    "abdominal aortic aneurysm",
    "sepsis",
    "accidental shooting",
    "glioblastoma multiforme",
    "Jordanian bombings",
    "pulmonary edema",
    "septic infection",
    "myelodysplastic syndrome",
    "multiple sclerosis",
    "AIDS",
    "AIDS, lymphoma",
    "executed",
    "aortic dissection",
    "progressive supranuclear palsy",
    "multiple organ dysfunction syndrome",
    "COPD",
]

# Removing duplicates and sorting by decreasing length (same convention as the
# known_for lists), so a specific value such as "lung cancer" comes before the
# shorter value it contains ("cancer"); ties are broken alphabetically so the
# order is the same on every run
cause_of_death = sorted(set(cause_of_death), key=lambda cause: (-len(cause), cause))

<IPython.core.display.Javascript object>

In [403]:
# Dropping info_parenth_copy value for entries to avoid incorrect cause_of_death:
# these texts contain a cause_of_death search term ("stroke", "shot", "fall", "AIDS")
# inside an unrelated word or phrase

# Texts containing any of these substrings are blanked
misleading_substrings = ["breaststroke", "backstroke", "shot put", "Aldershot"]

# Texts equal to one of these values are blanked
misleading_values = [
    "fallout shelter sign",
    "HIV, President of the International AIDS Society 1994 1998",
]

# Building one mask: exact-value matches plus literal (non-regex) substring matches;
# missing values never match and are left as they are
to_blank = df["info_parenth_copy"].isin(misleading_values)
for substring in misleading_substrings:
    to_blank |= df["info_parenth_copy"].str.contains(substring, regex=False, na=False)

df.loc[to_blank, "info_parenth_copy"] = ""


<IPython.core.display.Javascript object>

In [None]:
# Printing a completion message
print("dunzo!")

# Sound notification when cell executes
chime.success()

#### Verifying that Values in info_3_0 Are Exhausted

In [2]:
# # Verifying that `info_3_0` is exhausted
# df["info_3_0"].value_counts()

#### Dropping info_3_0

In [None]:
# # Dropping info_3_0
# df.drop("info_3_0", axis=1, inplace=True)

# # Checking sample
# df.sample()

#### Observations:
- Our search of column `info_3_0` is finished, and we have dropped that column.
- We will now save our dataset and pick back up in a new notebook.

### Exporting Dataset to SQLite Database [wp_life_expect_clean8.db]()

In [None]:
# # Exporting dataframe

# # Saving dataset in a SQLite database
# conn = sql.connect("wp_life_expect_clean8.db")
# df.to_sql("wp_life_expect_clean8", conn, index=False)

# # Chime notification when cell executes
# chime.success()

# [Proceed to Data Cleaning Part ]()