# Wikipedia Notable Life Expectancies

# [Notebook 5 of : Data Cleaning](https://github.com/teresahanak/wikipedia-life-expectancy/blob/main/wp_life_expect_data_clean4_thanak_2022_06_23.ipynb)

## Context

The


## Objective

The

### Data Dictionary

- Feature: Description

## Importing Necessary Libraries

In [1]:
# To structure code automatically
%load_ext nb_black

# To import/export sqlite databases
import sqlite3 as sql

# To save/open python objects in pickle file
import pickle

# To help with reading, cleaning, and manipulating data
import pandas as pd
import numpy as np
import re

# To define maximum number of columns to be displayed in a dataframe
pd.set_option("display.max_columns", None)
# To define the maximum number of rows to be displayed in a dataframe
pd.set_option("display.max_rows", 200)

# To supress warnings
# import warnings

# warnings.filterwarnings("ignore")

# To set some visualization attributes
pd.set_option("max_colwidth", 150)

# To play auditory cue when cell has executed, has warning, or has error and set chime theme
import chime

chime.theme("zelda")

<IPython.core.display.Javascript object>

## Data Overview

### Reading, Sampling, and Checking Data Shape

In [2]:
# Reading the dataset
conn = sql.connect("wp_life_expect_clean3.db")
data = pd.read_sql("SELECT * FROM wp_life_expect_clean3", conn)

# Making a working copy
df = data.copy()

# Checking the shape
print(f"There are {df.shape[0]} rows and {df.shape[1]} columns.")

# Checking first 2 rows of the data
df.head(2)

There are 132652 rows and 23 columns.


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2
0,1,William Chappell,", 86, British dancer, ballet designer and director.",https://en.wikipedia.org/wiki/William_Chappell_(dancer),21,1994,January,,,dancer,ballet designer and director,,,,,,,,,86.0,,United Kingdom of Great Britain and Northern Ireland,
1,1,Raymond Crotty,", 68, Irish economist, writer, and academic.",https://en.wikipedia.org/wiki/Raymond_Crotty,12,1994,January,,,economist,writer,and academic,,,,,,,,68.0,,Ireland,


<IPython.core.display.Javascript object>

In [3]:
# Checking last 2 rows of the data
df.tail(2)

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2
132650,9,Oleg Moliboga,", 69, Russian volleyball player, Olympic champion and coach.",https://en.wikipedia.org/wiki/Oleg_Moliboga,2,2022,June,1980.0,,volleyball player,Olympic champion and coach,,,,,,,,,69.0,,Russia,
132651,9,Zou Jing,", 86, Chinese engineer, member of the Chinese Academy of Engineering.",https://en.wikipedia.org/wiki/Zou_Jing_(engineer),3,2022,June,,,engineer,member of the Chinese Academy of Engineering,,,,,,,,,86.0,,"China, People's Republic of",


<IPython.core.display.Javascript object>

In [4]:
# Checking a sample of the data
df.sample(5)

Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2
131836,1,Mike Liles,", 76, American politician, member of the Tennessee House of Representatives .",https://en.wikipedia.org/wiki/Mike_Liles,3,2022,May,1991 1995,,politician,member of the Tennessee House of Representatives,,,,,,,,,76.0,,United States of America,
63856,18,Carlos Alberto Raffo,", 87, Argentine-born Ecuadorian footballer .",https://en.wikipedia.org/wiki/Carlos_Alberto_Raffo,2,2013,September,Emelec,,Ecuadorian footballer,,,,,,,,,,87.0,,Argentina,Ecuador
37359,6,Roy Howard,", 85, Australian cricketer.",https://en.wikipedia.org/wiki/Roy_Howard,1,2008,August,,,cricketer,,,,,,,,,,85.0,,Australia,
14921,11,Helena Carter,", 76, American actress.",https://en.wikipedia.org/wiki/Helena_Carter,21,2000,January,,,actress,,,,,,,,,,76.0,,United States of America,
101784,6,Grayston Burgess,", 86, English opera singer and conductor.",https://en.wikipedia.org/wiki/Grayston_Burgess,4,2019,March,,,opera singer and conductor,,,,,,,,,,86.0,,United Kingdom of Great Britain and Northern Ireland,


<IPython.core.display.Javascript object>

### Checking Data Types, Duplicates, and Null Values

In [5]:
# Checking data types and null values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 132652 entries, 0 to 132651
Data columns (total 23 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   day             132652 non-null  object 
 1   name            132652 non-null  object 
 2   info            132652 non-null  object 
 3   link            132652 non-null  object 
 4   num_references  132652 non-null  object 
 5   year            132652 non-null  int64  
 6   month           132652 non-null  object 
 7   info_parenth    49830 non-null   object 
 8   info_1          35 non-null      object 
 9   info_2          132604 non-null  object 
 10  info_3          62571 non-null   object 
 11  info_4          12605 non-null   object 
 12  info_5          1497 non-null    object 
 13  info_6          216 non-null     object 
 14  info_7          31 non-null      object 
 15  info_8          6 non-null       object 
 16  info_9          1 non-null       object 
 17  info_10   

<IPython.core.display.Javascript object>

#### Loading `nation_map` from Pickle File to Dictionary nation_map

In [6]:
# Load the nation_map
with open("nation_map.pkl", "rb") as f:
    nation_map = pickle.load(f)

<IPython.core.display.Javascript object>

#### Loading `other_species` list from other_species.csv

In [7]:
# Loading other_species list
other_species_df = pd.read_csv("other_species.csv")
other_species = other_species_df["species"].tolist()

<IPython.core.display.Javascript object>

#### Observations:
- With our dataframe, `nation_map`, and `other_species` list loaded, we can proceed to extracting the other features.
- First, we will clean up the divided `info` columns by removing any remaining digits and nationality and country values.
- We will use the same functions from previous notebooks.

#### Function to Save Indices of Rows Matching Regular Expressions Pattern to a List and Print Number of Rows with Match 

In [8]:
# Define a function that takes dataframe, column name, and re pattern as arguments and returns list of indices
# for which column value matches re pattern
def rows_with_pattern(dataframe, column, pattern):
    """
    Takes input of dataframe, column name, and re pattern 
    and returns list of indices for rows that contain match
    for pattern anywhere within value for given column.
    
    dataframe: dataframe
    column: column name
    pattern: re pattern
    """
    index_list = []

    for i in dataframe.index:
        item = dataframe.loc[i, column]
        match = re.search(pattern, item)
        if match:
            index_list.append(i)
    print(
        f"There are {len(index_list)} rows with matching pattern in column '{column}'."
    )
    return index_list

<IPython.core.display.Javascript object>

#### Function to Use rows_with_pattern Function for Multiple Regular Expression Patterns

In [9]:
# Define a function that calls rows_with_pattern function for multiple re patterns
# returning a single list of indices for all rows with any pattern match


def multiple_patterns(dataframe, column, patterns):
    """
    Takes input dataframe, column, and list of re patterns and returns single list 
    of indices for rows in which a match for any pattern is found with re.search
    
    dataframe: dataframe
    column: column name
    patterns: list of re patterns
    """
    rows_combined = []

    # For loop to check each pattern
    for pattern in patterns:

        # List and number of rows matching each pattern
        print(pattern)
        rows_to_check = rows_with_pattern(dataframe, column, pattern)
        print("")

        # Add list for each pattern to combined list
        rows_combined += rows_to_check

    return rows_combined

<IPython.core.display.Javascript object>

### Removing Remaining Digits and Nationality/Country Values from Divided `info` Columns

#### List of Columns to Treat

In [10]:
# List of columns to treat
cols_lst = [
    "info_1",
    "info_2",
    "info_3",
    "info_4",
    "info_5",
    "info_6",
    "info_7",
    "info_8",
    "info_9",
    "info_10",
    "info_11",
    "info_parenth",
]

<IPython.core.display.Javascript object>

#### Removing Digits

In [11]:
# Regular expression for parenthesis and its contents
pattern = r"\d"

# For loop to find indices of rows that have pattern
rows_combined = []
for column in cols_lst:
    dataframe = df[df[column].notna()]
    rows_to_check = rows_with_pattern(dataframe, column, pattern)
    rows_combined += rows_to_check

# Checking a sample of rows
df.loc[rows_combined, :].sample(2)

There are 0 rows with matching pattern in column 'info_1'.
There are 442 rows with matching pattern in column 'info_2'.
There are 2252 rows with matching pattern in column 'info_3'.
There are 1060 rows with matching pattern in column 'info_4'.
There are 69 rows with matching pattern in column 'info_5'.
There are 5 rows with matching pattern in column 'info_6'.
There are 0 rows with matching pattern in column 'info_7'.
There are 0 rows with matching pattern in column 'info_8'.
There are 0 rows with matching pattern in column 'info_9'.
There are 0 rows with matching pattern in column 'info_10'.
There are 0 rows with matching pattern in column 'info_11'.
There are 24403 rows with matching pattern in column 'info_parenth'.


Unnamed: 0,day,name,info,link,num_references,year,month,info_parenth,info_1,info_2,info_3,info_4,info_5,info_6,info_7,info_8,info_9,info_10,info_11,age,cause_of_death,place_1,place_2
110989,17,Iris Love,", 86, American archaeologist and dog breeder, COVID-19.",https://en.wikipedia.org/wiki/Iris_Love,16,2020,April,,,archaeologist and dog breeder,COVID 19,,,,,,,,,86.0,,United States of America,
102264,28,Alphonse D'Arco,", 86, American mobster, acting boss of the Lucchese crime family",https://en.wikipedia.org/wiki/Alphonse_D%27Arco,17,2019,March,"1990 1991, kidney disease death announced on this date",,mobster,acting boss of the Lucchese crime family,,,,,,,,,86.0,,United States of America,


<IPython.core.display.Javascript object>

In [12]:
# For loop to extract digits
for column in cols_lst:
    for index in set(rows_combined):
        item = df.loc[index, column]
        if item:
            match = re.search(pattern, item)
            if match:
                df.loc[index, column] = re.sub(pattern, "", item)

# Rechecking number and example rows after treatment
# For loop to find indices of rows that have pattern
recheck_rows = []
for column in cols_lst:
    dataframe = df[df[column].notna()]
    rows_to_check = rows_with_pattern(dataframe, column, pattern)
    recheck_rows += rows_to_check

There are 0 rows with matching pattern in column 'info_1'.
There are 0 rows with matching pattern in column 'info_2'.
There are 0 rows with matching pattern in column 'info_3'.
There are 0 rows with matching pattern in column 'info_4'.
There are 0 rows with matching pattern in column 'info_5'.
There are 0 rows with matching pattern in column 'info_6'.
There are 0 rows with matching pattern in column 'info_7'.
There are 0 rows with matching pattern in column 'info_8'.
There are 0 rows with matching pattern in column 'info_9'.
There are 0 rows with matching pattern in column 'info_10'.
There are 0 rows with matching pattern in column 'info_11'.
There are 0 rows with matching pattern in column 'info_parenth'.


<IPython.core.display.Javascript object>

#### Removing Any Remaining Matches with  `nation_map` Keys and Values

In [13]:
%%time

# For loop to extract remaining information matching items in nation_map
for column in cols_lst:
    dataframe = df[df[column].notna()]
    for nationality, country in nation_map.items():
        for index in dataframe.index:
            item = df.loc[index, column]
            if item:
                if nationality in item or country in item:
                    df.loc[index, column] = item.replace(nationality, "").strip().replace(country,'').strip()

CPU times: total: 14min 4s
Wall time: 14min 4s


<IPython.core.display.Javascript object>

#### Observations:
- After that bit of tidying, we can proceed to extracting `known_for` values.
- The bulk of these values should be in `info_2`, according to the Wikipedia defined fields, so we will start there.

## Extracting `known_for` Data
Our goal will be to have some broader categories into which the specific values will fit.  `known_for` is a diverse feature, in that an individual may be known for a long-term role or roles, a specific event, a relationship with another person who is famous, etc.  So, to some extent we will see what we find and adapt as we go.

In [317]:
# Obtaining values for column and their counts
col_values = df["info_2"].value_counts()

# Creating a list for values that occur more than set number of time
roles_list = [index for index in col_values.index if col_values[index] > 30]

# Checking length of list
len(roles_list)

448

<IPython.core.display.Javascript object>

In [695]:
# Using pop to check list items and add to associated dictionary below
roles_list.pop()

'Olympic fencer'

<IPython.core.display.Javascript object>

In [398]:
politics_govt_law_related = set(
    [
        "politician",
        "economist",
        "attorney",
        "trade unionist",
        "unionist",
        "aristocrat",
        "diplomat",
        "lawyer",
        "activist",
        "civil rights",
        "federal",
        "judge",
        "political",
        "politic",
        "royal",
        "civil servant",
        "jurist",
        "judge",
        "conservationist",
        "government official",
        "government",
        "barrister",
        'militant',
        'environmentalist',
        'public servant'
    ]
)
arts = set(
    [
        "actor",
        "dancer",
        "choreographer",
        "model",
        "hockey",
        "soccer",
        "television",
        "jazz",
        "singer",
        "composer",
        "conductor",
        "journalist",
        "writer",
        "saxophonist",
        "film director",
        "comedian",
        "photojournalist",
        "poet",
        "actress",
        "film",
        "editor",
        "drummer",
        "producer",
        "songwriter",
        "publisher",
        "author",
        "violinist",
        "rapper",
        "musician",
        "animator",
        "filmmaker",
        "pianist",
        "historian",
        "comic",
        "screenwriter",
        "fashion",
        "designer",
        "guitarist",
        "voice",
        "opera",
        "cinematographer",
        "playwright",
        "cartoonist",
        "sculptor",
        "novelist",
        "photographer",
        "architect",
        "painter",
        "artist",
        "disc jockey",
        "dj",
        "DJ",
        "bridge player",
        "tenor",
        "trombonist",
        "filmmaker",
        "ballerina",
        "bassist",
        "film critic",
        "critic",
        "personality",
        'organist',
        'operatic',
        'lyricist',
        'translator',
        'visual artist',
        'soprano',
        'cellist',
        'broadcaster',
        'chef',
        'literary critic',
        'ballet',
        'illustrator',
        'theatre director', 
        'trumpeter',
        'presenter',
        "sportscaster",

    ]
)
sports = set(
    [
        "football",
        "footballer",
        "Olympic",
        "skier",
        "cricket",
        "soccer",
        "sprinter",
        "equestrian",
        "gymnast",
        "fencer",
        "chess",
        "wrestler",
        "swimmer",
        "basketball",
        "hurler",
        "racehorse",
        "sailor",
        "rower",
        "rugby",
        "athlete",
        "golfer",
        "boxer",
        "tennis",
        "cyclist",
        "racing",
        "driver",
        "cricketer",
        "baseball",
        "speedway rider",
        "speedway",
        "rider",
        "badminton",
        "sport shooter",
        "runner",
        "umpire",
        "sports",
        "judoka",
        "sportswriter",
        "volleyball",
        "track and field",
        "bobsledder",
        'canoer',
        'bodybuilder',
        'skater',
        'curler',
        'Olympic diver',
        'martial artist',
        'racer', 
        'handball',
        'ski jumper',
        'racehorse trainer',
        'racecar driver',
        'hurdler',
        'polo',
        'Olympic shooter',
        'weightlifter',
        'Baseball',
        'mountaineer',
        'jockey',
        'Olympic sports shooter',
        'referee'
    ]
)
sciences = set(
    [
        "engineer",
        "physicist",
        "geologist",
        "psychiatrist",
        "botanist",
        "biologist",
        "anthropologist",
        "astronomer",
        "biochemist",
        "scientist",
        "computer",
        "archaeologist",
        "psychologist",
        "sociologist",
        "physician",
        "chemist",
        "physicist",
        "mathematician",
        "cosmonaut",
        "pediatrician",
        "astronaut",
        "entomologist",
        "cardiologist",
        "doctor",
        "nurse",
        'immunologist',
        'meteorologist',
        'medical researcher',
        'ornithologist',
        'neuroscientist',
        'microbiologist',
        'zoologist',
        'geographer',
        'inventor',
        'geneticist',
        'surgeon',
        'astrophysicist',
        'statistician',
        
        
    ]
)
business = set(["executive", "business", "businessman", "banker", "entrepreneur", 'real estate developer','restaurateur', 'businesswoman'])
scholar_academia_education = set(
    [
        "scholar",
        "linguist",
        "educator",
        "philosopher",
        "academic",
        "historian",
        "educationalist",
        'philologist',
        'librarian',
        'industrialist',
        'professor',
        'musicologist'
    ]
)
law_enf_military_operator = set(
    [
        "officer",
        "army",
        "Army",
        "general",
        "police",
        "admiral",
        "soldier",
        "Air Force",
        "intelligence",
        "major",
        "lieutenant",
        'admiral',
        'fighter pilot',
        'pilot',
        'naval',
        'Navy',
        'aviator'
    ]
)
religion = set(
    [
        "rabbi",
        "Catholic",
        "priest",
        "Anglican",
        "cardinal",
        "theologian",
        "prelate",
        "Orthodox",
        "Episcopal",
        "bishop",
        "Jesuit",
        'hierarch',
        'Islamic',
        'religious leader'
    ]
)
social = set(["philanthropist", 'socialite'])
crime = set(["serial killer", "murderer", "convicted", "terrorist", 'mobster', 'criminal'])
victim = set(["Holocaust survivor"])
age = set(['supercentenarian'])

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

#### Adding `known_for` Column

In [15]:
# # Initializing known_for column
# df["known_for"] = ""

<IPython.core.display.Javascript object>

#### Dictionary of `known_for` Keys and List of Values

In [14]:
# # Initializing dictionary of known_for keys and list of values
# known_for_dict = {
#     "relative": [
#         "wife of",
#         "husband of",
#         "sister of",
#         "brother of",
#         "mother of",
#         "father of",
#         "daughter of",
#         "son of",
#         "granddaughter of",
#         "grandson of",
#         "grandmother of",
#         "grandfather of",
#         "aunt of",
#         "uncle of",
#     ]
# }

<IPython.core.display.Javascript object>

In [16]:
# # Column to check
# column = "info_2"

# # Extract to Column
# extract_to = "known_for"

# # For loop to extract value from one column to another
# for key, value in known_for_dict.items():
#     for entry in value:
#         dataframe = df[(df[column].notna()) & (df[extract_to] == "")]
#         for index in dataframe.index:
#             item = df.loc[index, column]
#             if entry in item:
#                 df.loc[index, extract_to] = key
#                 df.loc[index, column] = item.replace(entry, "").strip()

# # Check sample of rows
# df[df[extract_to] != ""]

<IPython.core.display.Javascript object>

In [17]:
print("dunzo!")
chime.success()

dunzo!


<IPython.core.display.Javascript object>