# Wikipedia Notable Life Expectancies
# [Notebook 11: Data Pre-processing ](https://github.com/teresahanak/wikipedia-life-expectancy/blob/main/wp_life_expect_data_preproc_2022_10_06.ipynb)
### Context

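The dataset used in this notebook, `wp_life_expect_EDA`, is the output of the exploratory data analysis (EDA) stage of the Wikipedia Notable Life Expectancies project.  **TODO:** expand with project background.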
### Objective

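The objective of this notebook is to pre-process the EDA dataset: drop unnecessary columns and entries, typecast the categorical columns, treat the `known for` category columns, and check for outliers.  **TODO:** expand as the notebook is completed.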
### Data Dictionary
- Feature: Description

### Importing Libraries

In [1]:
# To structure code automatically
%load_ext nb_black

# To import/export sqlite databases
import sqlite3 as sql

# To save/open python objects in pickle file
# import pickle

# To help with reading, cleaning, and manipulating data
import pandas as pd
import numpy as np

# To help with data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# randomized data splitting
from sklearn.model_selection import train_test_split

# building regression model
import statsmodels.api as sm

# check model performance
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# check linear regression assumptions
from statsmodels.stats.outliers_influence import variance_inflation_factor
import pylab
import scipy.stats as stats
import statsmodels.stats.api as sms
from statsmodels.compat import lzip

# to compare fit between models
from scipy.stats.distributions import chi2

# To show every column when a dataframe is displayed (None removes the limit)
pd.set_option("display.max_columns", None)
# To define the maximum number of rows to be displayed in a dataframe
pd.set_option("display.max_rows", 211)

# To widen text columns before they are truncated; the fully qualified option
# name is used because abbreviated names are resolved by pattern matching and
# can become ambiguous if pandas adds similarly named options
pd.set_option("display.max_colwidth", 150)

# To suppress scientific notation by rendering floats with 3 decimal places
pd.set_option("display.float_format", "{:.3f}".format)

# To suppress warnings
# import warnings

# warnings.filterwarnings("ignore")

# Seaborn theme and project colour palette for every plot in the notebook
sns.set_theme()
palette_colors = (
    "midnightblue",
    "goldenrod",
    "maroon",
    "darkolivegreen",
    "cadetblue",
    "tab:purple",
    "yellowgreen",
)
sns.set_palette(palette_colors)

# Font sizes for text, axes, ticks, legend, and figure titles
plt.rcParams.update(
    {
        "font.size": 12,
        "axes.titlesize": 15,
        "axes.labelsize": 14,
        "xtick.labelsize": 13,
        "ytick.labelsize": 13,
        "legend.fontsize": 14,
        "figure.titlesize": 16,
    }
)

# To play auditory cue when cell has executed, has warning, or has error and set chime theme
import chime

chime.theme("zelda")

<IPython.core.display.Javascript object>

## Data Overview

### [Reading](https://github.com/teresahanak/wikipedia-life-expectancy/blob/main/wp_life_expect_EDA.db), Sampling, and Checking Data Shape

In [2]:
# Reading the dataset; the connection is closed as soon as the table has been
# loaded (or the read has failed) so the database file is not left open
conn = sql.connect("wp_life_expect_EDA.db")
try:
    data = pd.read_sql("SELECT * FROM wp_life_expect_EDA", conn)
finally:
    conn.close()

# Making a working copy so the loaded data stays untouched
df = data.copy()

# Checking the shape
print(f"There are {df.shape[0]} rows and {df.shape[1]} columns.")

# Checking first 2 rows of the data
df.head(2)

There are 97549 rows and 25 columns.


Unnamed: 0,info,link,num_references,year,info_parenth,age,cause_of_death,place_1,place_2,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,num_categories,recip_num_references,region,prior_region,known_for
0,", 86, British dancer, ballet designer and director.",https://en.wikipedia.org/wiki/William_Chappell_(dancer),21,1994,,86.0,,United Kingdom of Great Britain and Northern Ireland,,0,0,0,0,0,1,0,0,0,0,0,1,0.048,Europe,,arts
1,", 68, Irish economist, writer, and academic.",https://en.wikipedia.org/wiki/Raymond_Crotty,12,1994,,68.0,,Ireland,,0,0,0,1,0,1,0,0,1,0,0,3,0.083,Europe,,three_to_five


<IPython.core.display.Javascript object>

In [3]:
# Displaying the last 2 rows of the working copy
df.tail(2)

Unnamed: 0,info,link,num_references,year,info_parenth,age,cause_of_death,place_1,place_2,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,num_categories,recip_num_references,region,prior_region,known_for
97547,", 50, Pakistani journalist and politician, MNA .",https://en.wikipedia.org/wiki/Aamir_Liaquat_Hussain,99,2022,"2002 2007, since 2018",50.0,,Pakistan,,0,0,0,0,0,1,0,0,1,0,0,2,0.01,Asia,,two
97548,", 86, Chinese engineer, member of the Chinese Academy of Engineering.",https://en.wikipedia.org/wiki/Zou_Jing_(engineer),3,2022,,86.0,,"China, People's Republic of",,1,0,0,0,0,0,0,0,0,0,0,1,0.333,Asia,,sciences


<IPython.core.display.Javascript object>

In [4]:
# Checking a random sample of the data; the fixed seed keeps the displayed
# rows the same on every run of the notebook
df.sample(5, random_state=42)

Unnamed: 0,info,link,num_references,year,info_parenth,age,cause_of_death,place_1,place_2,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,num_categories,recip_num_references,region,prior_region,known_for
20723,", 82, British Olympic skater and film actress.",https://en.wikipedia.org/wiki/Belita,6,2005,,82.0,,United Kingdom of Great Britain and Northern Ireland,,0,0,0,0,0,1,1,0,0,0,0,2,0.167,Europe,,two
70599,", 95, German-born British physicist and researcher.",https://en.wikipedia.org/wiki/Lewis_Elton,8,2018,,95.0,,Germany,United Kingdom of Great Britain and Northern Ireland,1,0,0,0,0,0,0,0,0,0,0,1,0.125,Europe,Europe,sciences
17011,", 84, British classical music impresario.",https://en.wikipedia.org/wiki/Ian_Hunter_(impresario),7,2003,,84.0,,United Kingdom of Great Britain and Northern Ireland,,0,0,0,0,0,1,0,0,0,0,0,1,0.143,Europe,,arts
10613,", 89, Polish-British painter.",https://en.wikipedia.org/wiki/Josef_Herman,12,2000,,89.0,,Poland,United Kingdom of Great Britain and Northern Ireland,0,0,0,0,0,1,0,0,0,0,0,1,0.083,Europe,Europe,arts
85641,", 65, Indonesian politician, regent of North Toraja , COVID-19.",https://en.wikipedia.org/wiki/Frederik_Batti_Sorring,3,2020,2011 2016,65.0,COVID,Indonesia,,0,0,0,0,0,0,0,0,1,0,0,1,0.333,South East Asia,,politics_govt_law


<IPython.core.display.Javascript object>

### Checking Data Types and Null Values

In [5]:
# Listing each column's non-null count and data type
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 97549 entries, 0 to 97548
Data columns (total 25 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   info                       97549 non-null  object 
 1   link                       97549 non-null  object 
 2   num_references             97549 non-null  int64  
 3   year                       97549 non-null  int64  
 4   info_parenth               36536 non-null  object 
 5   age                        97549 non-null  float64
 6   cause_of_death             33179 non-null  object 
 7   place_1                    97406 non-null  object 
 8   place_2                    5442 non-null   object 
 9   sciences                   97549 non-null  int64  
 10  social                     97549 non-null  int64  
 11  spiritual                  97549 non-null  int64  
 12  academia_humanities        97549 non-null  int64  
 13  business_farming           97549 non-null  int

<IPython.core.display.Javascript object>

#### Observations:
- There are 97549 rows and 25 columns.
- With our dataset loaded, we begin data pre-processing.
- We will need to typecast our categorical columns again, as well as `year`, after dropping unnecessary columns.

## Data Pre-processing
- At the outset, we can drop the following columns:
    - `num_references`
    - `cause_of_death`
    - `place_1`
    - `place_2`
- We can also drop the entries with `event_record_other` as their sole `known_for` category.
- We will make a decision regarding `known_for`, `num_categories`, and `known for` category columns and drop the extraneous columns after treatment.
- After outlier detection is complete, we can drop `info`, `info_parenth`, and `link` columns.

In [6]:
# Removing the columns that are not needed going forward
cols_to_drop = ["num_references", "cause_of_death", "place_1", "place_2"]
df = df.drop(columns=cols_to_drop)

# Confirming the reduced shape
df.shape

(97549, 21)

<IPython.core.display.Javascript object>

#### Dropping Entries with `event_record_other` as Sole Category

In [7]:
# Keeping every entry except those with event_record_other as sole category,
# then renumbering the rows from 0
sole_event_record_other = df["known_for"] == "event_record_other"
df = df[~sole_event_record_other].reset_index(drop=True)

# Confirming the reduced shape
df.shape

(97092, 21)

<IPython.core.display.Javascript object>

#### Typecasting Categorical Columns

In [8]:
# Converting year, region, prior_region, and known_for to the category dtype
cols_to_cast = ["year", "region", "prior_region", "known_for"]
df = df.astype({col: "category" for col in cols_to_cast})

<IPython.core.display.Javascript object>

### Treating `known_for` Categories
We will take the approach of dropping the `known_for` column that was added during EDA and keeping and treating the individual `known for` category columns.

We will replace each value with its value divided by the entry's total `num_categories`.  So, the resulting `num_categories` for any individual will equal 1.  Entries with multiple categories will have evenly split values for their respective `known for` categories.  Strictly speaking, it's not feasible to accurately assign proportions of these categories, which is why the approach of creating a single `known_for` column was applied during EDA, with additional categories for individuals with multiple categories.  The suspicion is that relatively more information was lost with that approach than will be distorted by dividing equally between categories.

#### Dividing `known for` Category Column Value by `num_categories`

In [9]:
# Replacing known for category column values with value/num_categories
category_cols = [
    "sciences",
    "social",
    "spiritual",
    "academia_humanities",
    "business_farming",
    "arts",
    "sports",
    "law_enf_military_operator",
    "politics_govt_law",
    "crime",
    "event_record_other",
]

# Entries flagged during EDA as having more than one known for category
multi_category = df["known_for"].isin(["two", "three_to_five"])

# Casting to float first so the fractional shares can be stored without
# relying on implicit upcasting of the integer indicator columns
df[category_cols] = df[category_cols].astype("float64")

# Splitting each multi-category entry's indicators evenly across its
# categories, in one vectorized step (rows are aligned on the index)
df.loc[multi_category, category_cols] = df.loc[multi_category, category_cols].div(
    df.loc[multi_category, "num_categories"], axis=0
)

# Updating num_categories column
df["num_categories"] = df[category_cols].sum(axis=1)

# Every entry's category shares are expected to total 1 after the split
assert np.allclose(df["num_categories"], 1), "category shares do not sum to 1"

# Checking unique values in num_categories
print("Unique values in num_categories:\n\n", df["num_categories"].value_counts())

# Checking a sample of rows; the fixed seed keeps the displayed rows the same
# on every run of the notebook
df.sample(2, random_state=42)

Unique values in num_categories:

 1.000    97092
Name: num_categories, dtype: int64


Unnamed: 0,info,link,year,info_parenth,age,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,num_categories,recip_num_references,region,prior_region,known_for
96537,", 16, American beauty pageant and reality show contestant , suicide.",https://en.wikipedia.org/wiki/Kailia_Posey,2022,,16.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.2,North America,,arts
52283,", 84, Israeli lawyer and diplomat, Ambassador to France .",https://en.wikipedia.org/wiki/Meir_Rosenne,2015,1979 1983 and United States 1983 1987,84.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.333,Middle East,,politics_govt_law


<IPython.core.display.Javascript object>

#### Observations:
- With the updated `known for` category column values, we can proceed to drop `num_categories` and `known_for` columns.

#### Dropping `num_categories` and `known_for`

In [11]:
# Removing num_categories and known_for now that the category columns are treated
cols_to_drop = ["num_categories", "known_for"]
df = df.drop(columns=cols_to_drop)

# Confirming the reduced shape
df.shape

(97092, 19)

<IPython.core.display.Javascript object>

## Splitting Dataset

## Outlier Detection

#### Function to View Outliers for Each Predictor Column

In [None]:
# Define a function that returns new dataframe of count and percentage of outliers of input dataframe columns
def outside_IQRends_pls1pt5(dataframe):
    """
    Summarize, per numeric column, how many values lie outside the boxplot whiskers.

    A value x is counted as an outlier when
    x < Q1 - 1.5*IQR  or  x > Q3 + 1.5*IQR,
    where Q1 and Q3 are the column's 25th and 75th percentiles and IQR = Q3 - Q1.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input data.  Non-numeric columns (e.g., object or category dtype) are
        skipped, since quantiles cannot be computed for them.

    Returns
    -------
    pandas.DataFrame
        Indexed by numeric column name, with columns `outlier_count` and
        `percentage_outliers` (share of all rows, in percent, rounded to 2
        decimals), sorted by `outlier_count` in descending order.  For an input
        with no rows, the percentage is reported as 0.0.
    """
    numeric = dataframe.select_dtypes(include="number")
    n_rows = len(numeric)

    count_lst = []
    perc_lst = []

    for column in numeric:
        values = numeric[column]
        Q1 = values.quantile(0.25)
        Q3 = values.quantile(0.75)
        IQR = Q3 - Q1
        lower_whisker = Q1 - 1.5 * IQR
        upper_whisker = Q3 + 1.5 * IQR

        is_outlier = (values < lower_whisker) | (values > upper_whisker)
        count = int(is_outlier.sum())
        # Guarding against division by zero when the input has no rows
        percentage = round(count / n_rows * 100, 2) if n_rows else 0.0

        count_lst.append(count)
        perc_lst.append(percentage)

    outlier_dict = {"outlier_count": count_lst, "percentage_outliers": perc_lst}

    return pd.DataFrame(data=outlier_dict, index=numeric.columns).sort_values(
        by="outlier_count", ascending=False
    )

#### Visualization of outliers

In [None]:
# Numeric predictor columns only: boxplots and quantiles cannot be computed
# for the text and category columns
# NOTE(review): df has no "Target" column; `age` is assumed to be the
# response variable of this project -- confirm
target_col = "age"
predictors = df.select_dtypes(include="number").drop(columns=target_col)

# Boxplots of numerical features to view outliers, 3 per row, with the grid
# sized to the number of columns actually plotted
n_plot_cols = 3
n_plot_rows = int(np.ceil(predictors.shape[1] / n_plot_cols))
predictors.plot.box(
    subplots=True, figsize=(25, 6 * n_plot_rows), layout=(n_plot_rows, n_plot_cols)
)
plt.show()

# Outliers outside of IQR ends +/- 1.5 * IQR for each numerical column
outside_IQRends_pls1pt5(predictors)

#### Observations:
- We will now save our dataset and pick back up in a new notebook.

### Exporting Dataset to SQLite Database [wp_life_expect_clean.db]()

In [None]:
# # Exporting dataframe
# NOTE(review): the export below is commented out, so running this cell does
# not write wp_life_expect_clean.db -- confirm whether it should be re-enabled

# # Saving dataset in a SQLite database
# conn = sql.connect("wp_life_expect_clean.db")
# df.to_sql("wp_life_expect_clean", conn, index=False)

In [None]:
# Printing a marker to show that this final cell was reached
print('Complete')

# Chime notification when cell executes
chime.success()

# [Proceed to Data Cleaning Part ]()