# Wikipedia Notable Life Expectancies
# [Notebook 6 : Data Cleaning Part 5](https://github.com/teresahanak/wikipedia-life-expectancy/blob/main/wp_life_expect_data_clean5_thanak_2022_07_17.ipynb)
### Context

The
### Objective

The
### Data Dictionary
- Feature: Description

### Importing Libraries

In [1]:
# To structure code automatically
%load_ext nb_black

# To import/export sqlite databases
import sqlite3 as sql

# To save/open python objects in pickle file
import pickle

# To help with reading, cleaning, and manipulating data
import pandas as pd
import numpy as np
import re

# To define maximum number of columns to be displayed in a dataframe
# (None removes the limit, so every column is shown)
pd.set_option("display.max_columns", None)
# To define the maximum number of rows to be displayed in a dataframe
pd.set_option("display.max_rows", 200)

# To suppress warnings (left disabled so that warnings stay visible)
# import warnings

# warnings.filterwarnings("ignore")

# To set the maximum width, in characters, of a column in displayed output
pd.set_option("max_colwidth", 150)

# To play auditory cue when cell has executed, has warning, or has error and set chime theme
import chime

chime.theme("zelda")

<IPython.core.display.Javascript object>

## Data Overview

### Reading, Sampling, and Checking Data Shape

In [2]:
# Connect to the SQLite file and pull the whole table into a dataframe
conn = sql.connect("wp_life_expect_clean4.db")
data = pd.read_sql(sql="SELECT * FROM wp_life_expect_clean4", con=conn)

# `data` stays untouched as loaded; all cleaning is done on the copy `df`
df = data.copy()

# Report the dimensions of the working copy
print(f"There are {df.shape[0]} rows and {df.shape[1]} columns.")

# Preview the first 2 rows
df.head(n=2)

There are 98061 rows and 38 columns.


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,social,spiritual,academia_humanities,business,sciences,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,num_categories,known_for_1
0,1,William Chappell,", 86, British dancer, ballet designer and director.",https://en.wikipedia.org/wiki/William_Chappell_(dancer),21,1994,January,,,r,ballet designer director,,,,,,,,,86.0,,United Kingdom of Great Britain and Northern Ireland,,,3.091042,0,0,0,0,0,1,0,0,0,0,0,1,
1,1,Raymond Crotty,", 68, Irish economist, writer, and academic.",https://en.wikipedia.org/wiki/Raymond_Crotty,12,1994,January,,,ist,,and c,,,,,,,,68.0,,Ireland,,,2.564949,0,0,1,0,0,1,0,0,1,0,0,3,


<IPython.core.display.Javascript object>

In [3]:
# Preview the final 2 rows of the working copy
df.tail(n=2)

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,social,spiritual,academia_humanities,business,sciences,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,num_categories,known_for_1
98059,9,Aamir Liaquat Hussain,", 50, Pakistani journalist and politician, MNA .",https://en.wikipedia.org/wiki/Aamir_Liaquat_Hussain,99,2022,June,", since",,ian,MNA,,,,,,,,,50.0,,Pakistan,,"2002 2007, since 2018",4.60517,0,0,0,0,0,1,0,0,1,0,0,2,
98060,9,Zou Jing,", 86, Chinese engineer, member of the Chinese Academy of Engineering.",https://en.wikipedia.org/wiki/Zou_Jing_(engineer),3,2022,June,,,,member of the Academy of Engineering,,,,,,,,,86.0,,"China, People's Republic of",,,1.386294,0,0,0,0,1,0,0,0,0,0,0,1,


<IPython.core.display.Javascript object>

In [4]:
# Preview 5 randomly drawn rows (unseeded, so the rows differ on every run)
df.sample(n=5)

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,social,spiritual,academia_humanities,business,sciences,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,num_categories,known_for_1
15667,18,Pete Orr,", 46, American stock car racing driver, cancer.",https://en.wikipedia.org/wiki/Pete_Orr_(racing_driver),27,2002,November,,,stock car,cancer,,,,,,,,,46.0,,United States of America,,,3.332205,0,0,0,0,0,0,1,0,0,0,0,1,
75564,2,Michelle Medina,", 32, Ecuadorian singer, athlete and TV presenter, skin cancer.",https://en.wikipedia.org/wiki/Michelle_Medina,8,2019,July,,,,e TV,skin cancer,,,,,,,,32.0,,Ecuador,,,2.197225,0,0,0,0,0,1,1,0,0,0,0,2,
68326,29,David Bartov,", 94, Polish-born Israeli judge.",https://en.wikipedia.org/wiki/David_Bartov,3,2018,March,,,,,,,,,,,,,94.0,,Poland,Israel,,1.386294,0,0,0,0,0,0,0,0,1,0,0,1,
44432,12,Hans-Ekkehard Bob,", 96, German military pilot, World War II flying ace.",https://en.wikipedia.org/wiki/Hans-Ekkehard_Bob,8,2013,August,,,military,,,,,,,,,,96.0,,Germany,,,2.197225,0,0,0,0,0,0,0,1,0,0,0,1,
550,8,Edith Bullock,", 91, American businesswoman and politician.",https://en.wikipedia.org/wiki/Edith_Bullock,7,1994,May,,,woman ian,,,,,,,,,,91.0,,United States of America,,,2.079442,0,0,0,1,0,0,0,0,1,0,0,2,


<IPython.core.display.Javascript object>

### Checking Data Types, Duplicates, and Null Values

In [5]:
# Checking data types and null values
# (prints the index range plus each column's non-null count and dtype)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98061 entries, 0 to 98060
Data columns (total 38 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   day                        98061 non-null  object 
 1   name                       98061 non-null  object 
 2   info                       98061 non-null  object 
 3   link                       98061 non-null  object 
 4   num_references             98061 non-null  int64  
 5   year                       98061 non-null  int64  
 6   month                      98061 non-null  object 
 7   info_parenth               36661 non-null  object 
 8   info_1                     22 non-null     object 
 9   info_2                     98027 non-null  object 
 10  info_3                     48894 non-null  object 
 11  info_4                     10264 non-null  object 
 12  info_5                     1265 non-null   object 
 13  info_6                     181 non-null    obj

<IPython.core.display.Javascript object>

In [10]:
# TODO: REMOVE THIS CELL AFTER RERUN OF NB4
#
# Manual fixes for two entries, each identified by its link: set the stated
# category flag to 1 and clear the entry's info_2 value.
#   - John Blackburn (educator): administrator -> academia_humanities
#   - Susan Travers: woman who served in the Foreign Legion -> law_enf_military_operator
for link, category in (
    (
        "https://en.wikipedia.org/wiki/John_Blackburn_(educator)",
        "academia_humanities",
    ),
    ("https://en.wikipedia.org/wiki/Susan_Travers", "law_enf_military_operator"),
):
    index = df.index[df["link"] == link]
    df.loc[index, category] = 1
    df.loc[index, "info_2"] = None

<IPython.core.display.Javascript object>

In [11]:

# TODO: REMOVE THIS CELL AFTER RERUN OF NB4
# Display the Susan Travers row to inspect its current column values
df.loc[df["link"] == "https://en.wikipedia.org/wiki/Susan_Travers"]

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,social,spiritual,academia_humanities,business,sciences,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,num_categories,known_for_1
17492,18,Susan Travers,", 94, only English woman to serve in the French Foreign Legion.",https://en.wikipedia.org/wiki/Susan_Travers,3,2003,December,,,,,,,,,,,,,94.0,,France,,,1.386294,0,0,0,0,0,0,0,1,0,0,0,0,law_enf_military_operator


<IPython.core.display.Javascript object>

#### Observations:
- With our dataset loaded, we can pick up where we left off with extracting `known_for` values.
- First we will import `other_species` then proceed to make the next dictionary for searching.
- From this point we will begin with the top values in the column.

#### Loading `other_species`

In [14]:
# Read the saved species values from csv
other_species_df = pd.read_csv("other_species.csv")

# The same csv file has been appended to repeatedly, so collapse duplicates
species_values = other_species_df["species"].tolist()
other_species = list(set(species_values))

<IPython.core.display.Javascript object>

### Extracting `known_for` Continued

#### Finding `known_for` Roles in info_2

In [22]:
# Distinct info_2 values of entries that have no category yet, ordered from
# least to most frequent (so the most frequent value is last in the list)
uncategorized = df["num_categories"] == 0
roles_list = (
    df.loc[uncategorized, "info_2"].value_counts(ascending=True).index.tolist()
)

<IPython.core.display.Javascript object>

In [35]:
# Checking whether "racehorse" is already present in the other_species list
"racehorse" in other_species

True

<IPython.core.display.Javascript object>

In [39]:
# Code to check each value
# pop() removes and returns the last item of roles_list, so each run of this
# cell steps to the next value; the cell is therefore not idempotent.
roles_list.pop()

'general'

<IPython.core.display.Javascript object>

#### Creating Lists for Each known_for Category

In [None]:
# Creating lists for each category
#
# Each list holds the role substrings that map an entry to that category.
# "Before <category>" notes record search-order requirements: the longer phrase
# has to be searched (and removed) before the shorter role of the named
# category would match inside it.
politics_govt_law = [
    "royal servant",
    "royal steward",
    "royal man",
    "royal courtier",
    "royal consort",
    "royal",
    "secretary general of the Communist Party",  # Must go before law_enf_military_operator
    "deputy director general of the Secretariat of the Pacific Community",  # Must go before law_enf_military_operator
    "Deputy director general of the Civil Affairs Department of Hubei Province",  # Before law_enf_military_operator
    "under secretary general of the United Nations",  # Before law_enf_military_operator
    "secretary general of the Organization of States",  # Before law_enf_military_operator
    "secretary general of the Presidency",  # Before law_enf_military_operator
    "secretary general of the Ministry of Home Affairs",  # Before law_enf_military_operator
    "secretary general of the Lok Sabha",
    "general of the Isle of Man",
    # The comma after the next item was missing, which silently fused it with
    # "general of the" into one string, so neither role was ever searched.
    "ian fourth general of",
    "general of the",  # Must go right before law_enf_military_operator  *****
]

arts = [
    "rapper",
]
sports = [
    "tennis",
    "coach secretary general of the Association of the",  # Before law_enf_military_operator
    "general secretary of FIFA",
]
sciences = [
    "mathematic",
    "archaeolog",
    "director general of the Department of Scientific Industrial Research",  # Before law_enf_military_operator
    "director general of the National Astronomical Observatory",  # Before law_enf_military_operator
    "president of the IAU",  # contains president
    "director general of the Council of Agricultural Research",  # Before law_enf_military_operator
]

business = []
academia_humanities = [
    "royal librarian",  # Must go before politics_govt_law
    "director general of the Bangla Academy",  # Before politics_govt_law
]
law_enf_military_operator = [
    "military officer",
    "director general of the Space Agency",
    "commanding general of the",  # a doubled comma here was a SyntaxError
    "Army general",
    "Army brigadier general",
]
spiritual = [
    "secretary general of the s' Council",  # Before politics_govt_law
    "general ity LDS Church",
    "general ity",
]
social = []
crime = [
    "serial killer",
    "fraudster",
]
event_record_other = [
    "claimant of royalty",  # Must go before politics_govt_law
]

# Only values not already present are appended, so re-running this cell does
# not keep growing other_species with duplicates.
other_species = other_species + [
    species
    for species in (
        "royal cocker spaniel",  # Should go before politics_govt_law if searched with this dictionary
        "cocker spaniel",
    )
    if species not in other_species
]

In [49]:
# Example code to quick-screen values that may overlap categories:
# entries whose info text contains "general", rows 100-199 by position
general_mask = df["info"].str.contains("general", regex=False)
df[general_mask].iloc[100:200]

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,social,spiritual,academia_humanities,business,sciences,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,num_categories,known_for_1
10721,13,Paramasiva Prabhakar Kumaramangalam,", 86, Indian Army general.",https://en.wikipedia.org/wiki/Paramasiva_Prabhakar_Kumaramangalam,26,2000,March,,,Army general,,,,,,,,,,86.0,,India,,,3.295837,0,0,0,0,0,0,0,0,0,0,0,0,
10894,23,Sir David Thorne,", 66, British army general.",https://en.wikipedia.org/wiki/David_Thorne_(British_Army_officer),6,2000,April,,,general,,,,,,,,,,66.0,,United Kingdom of Great Britain and Northern Ireland,,,1.94591,0,0,0,0,0,0,0,1,0,0,0,1,
10968,10,Sir Martin Farndale,", 71, British army general.",https://en.wikipedia.org/wiki/Martin_Farndale,3,2000,May,,,general,,,,,,,,,,71.0,,United Kingdom of Great Britain and Northern Ireland,,,1.386294,0,0,0,0,0,0,0,1,0,0,0,1,
11041,28,Maraden Panggabean,", 77, Indonesian Army general and Defense Minister.",https://en.wikipedia.org/wiki/Maraden_Panggabean,8,2000,May,,,Army general,,,,,,,,,,77.0,,Indonesia,,,2.197225,0,0,0,0,0,0,0,1,0,0,0,1,
11047,30,Iko Carreira,", 66, Angolan army general and politician.",https://en.wikipedia.org/wiki/Iko_Carreira,3,2000,May,,,general ian,,,,,,,,,,66.0,,Angola,,,1.386294,0,0,0,0,0,0,0,1,1,0,0,2,
11065,4,Sir James Glover,", 71, British army general.",https://en.wikipedia.org/wiki/James_Glover_(British_Army_officer),4,2000,June,,,general,,,,,,,,,,71.0,,United Kingdom of Great Britain and Northern Ireland,,,1.609438,0,0,0,0,0,0,0,1,0,0,0,1,
11230,13,James Ferguson,", 86, U.S. Air Force general.",https://en.wikipedia.org/wiki/James_Ferguson_(general),3,2000,July,,,general,,,,,,,,,,86.0,,United States of America,,,1.386294,0,0,0,0,0,0,0,1,0,0,0,1,
11236,14,Robert B. Landry,", 90, United States Air Force major general.",https://en.wikipedia.org/wiki/Robert_B._Landry,10,2000,July,,,,,,,,,,,,,90.0,,United States of America,,,2.397895,0,0,0,0,0,0,0,1,0,0,0,1,
11287,25,Fred C. Sheffey,", 71, United States Army major general, lung cancer.",https://en.wikipedia.org/wiki/Fred_C._Sheffey,5,2000,July,,,Army,lung cancer,,,,,,,,,71.0,,United States of America,,,1.791759,0,0,0,0,0,0,0,1,0,0,0,1,
11291,26,Dalkhan Khozhaev,", 39, Chechen historian, field commander, brigadier general and author, murdered.",https://en.wikipedia.org/wiki/Dalkhan_Khozhaev,3,2000,July,,,,field,brigadier general,ed,,,,,,,39.0,,Russia,,,1.386294,0,0,1,0,0,1,0,1,0,0,1,4,


<IPython.core.display.Javascript object>

In [None]:
# NOTE(review): entry link left here as a reminder (reason not recorded).
# Kept as a comment because a bare URL is not valid Python and would stop
# Restart & Run All with a SyntaxError.
# https://en.wikipedia.org/wiki/Percy_Yutar
    

In [51]:
# Example code to quick-check a specific entry:
# all rows whose info_2 value is exactly "general ity"
df.loc[df["info_2"].eq("general ity")]

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2,info_parenth_copy,log_num_references,social,spiritual,academia_humanities,business,sciences,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,num_categories,known_for_1
12761,16,Loren C. Dunn,", 70, American Mormon general authority.",https://en.wikipedia.org/wiki/Loren_C._Dunn,9,2001,May,,,general ity,,,,,,,,,,70.0,,United States of America,,,2.302585,0,1,0,0,0,1,0,0,0,0,0,2,
28903,27,Joseph C. Muren,", 73, American Mormon general authority.",https://en.wikipedia.org/wiki/Joseph_C._Muren,3,2009,July,,,general ity,,,,,,,,,,73.0,,United States of America,,,1.386294,0,1,0,0,0,1,0,0,0,0,0,2,


<IPython.core.display.Javascript object>

#### Observations:
- 

#### Updating known_for_dict_5 Dictionary of Category Keys and Specific Role Sets of Values

In [None]:
# Combining separate lists as sets into one dictionary
#
# Keys are the category column names; values are the role substrings to search.
# Dictionaries iterate in insertion order, so the key order below is the order
# in which categories are searched.  event_record_other is placed ahead of
# politics_govt_law because its role "claimant of royalty" contains the
# politics_govt_law role "royal" and has to be extracted first.
known_for_dict_5 = {
    "social": set(social),
    "academia_humanities": set(academia_humanities),
    "spiritual": set(spiritual),
    "business": set(business),
    "sciences": set(sciences),
    "arts": set(arts),
    "sports": set(sports),
    "law_enf_military_operator": set(law_enf_military_operator),
    "event_record_other": set(event_record_other),
    "politics_govt_law": set(politics_govt_law),
    "crime": set(crime),
    #     "other_species": set(other_species),
}

#### Extracting Category from info_2 with known_for_dict_5

In [None]:
%%time

# Dictionary version
search_dict = known_for_dict_1

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_set in search_dict.items():
    for role in category_set:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Checking a sample of rows
df[df['event_record_other'] ==1].sample(2)

#### Checking the Number of Rows without a First Category

In [None]:
# Checking the number of rows for each number of categories
# The category column names are the keys of the dictionary built in this
# section (known_for_dict_5); num_categories is the row-wise sum of those flags.
category_columns = list(known_for_dict_5)
df["num_categories"] = df[category_columns].sum(axis=1)

print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)

In [None]:
# Signal that execution reached this point: console message, then the
# chime success sound
print("dunzo!")
chime.success()

#### Finding `known_for` Roles in info_2

In [22]:
# Distinct info_2 values of entries that have no category yet, ordered from
# least to most frequent (so the most frequent value is last in the list)
uncategorized = df["num_categories"] == 0
roles_list = (
    df.loc[uncategorized, "info_2"].value_counts(ascending=True).index.tolist()
)

<IPython.core.display.Javascript object>

In [24]:
# # Code to check each value
# roles_list.pop()

<IPython.core.display.Javascript object>

#### Creating Lists for Each known_for Category

In [None]:
# Empty role lists, one per category, to be filled in for the next pass
social = []
academia_humanities = []
spiritual = []
business = []
sciences = []
arts = []
sports = []
law_enf_military_operator = []
politics_govt_law = []
crime = []
event_record_other = []

# No new species values yet; other_species is rebuilt as a new list
other_species = other_species + []

In [None]:
# # Example code to quick-screen values that may overlap categories
# df.loc[[index for index in df.index if "King" in df.loc[index, "info"]], :]

In [None]:
## Example code to quick-check a specific entry
# df[df["info_2"] == "defence correspondent for"]

#### Observations:
- 

#### Updating known_for_dict_1 Dictionary of Category Keys and Specific Role Sets of Values

In [None]:
# Pair each category name with its role list, in search order, then build the
# dictionary of category -> set of roles from those pairs
category_lists = (
    ("social", social),
    ("academia_humanities", academia_humanities),
    ("spiritual", spiritual),
    ("business", business),
    ("sciences", sciences),
    ("arts", arts),
    ("sports", sports),
    ("law_enf_military_operator", law_enf_military_operator),
    ("politics_govt_law", politics_govt_law),
    ("crime", crime),
    ("event_record_other", event_record_other),
    #     ("other_species", other_species),
)
known_for_dict_1 = {category: set(roles) for category, roles in category_lists}

#### Extracting Category from info_2 with known_for_dict_1

In [None]:
%%time

# Dictionary version
search_dict = known_for_dict_1

# Column to check
column = 'info_2'

# Dataframe
dataframe = df[column].notna()

# For loop to find role in column and extract it as category
for category, category_set in search_dict.items():
    for role in category_set:
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if role in item:
                    df.loc[index, category] = 1
                    df.loc[index, column] = item.replace(role, '').strip()

# Checking a sample of rows
df[df['event_record_other'] ==1].sample(2)

#### Checking the Number of Rows without a First Category

In [None]:
# Recompute num_categories as the row-wise sum of the category flag columns
# (the column names are the keys of known_for_dict_1)
category_columns = list(known_for_dict_1)
df["num_categories"] = df[category_columns].sum(axis=1)

# Report how many entries still have no category assigned
print(
    f'There are {len(df[df["num_categories"]==0])} entries without any known_for category.'
)