# Wikipedia Notable Life Expectancies
# [Notebook 11: Data Pre-processing ](https://github.com/teresahanak/wikipedia-life-expectancy/blob/main/wp_life_expect_data_preproc_2022_10_06.ipynb)
### Context

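The dataset covers notable individuals with Wikipedia pages and records, for each person, fields such as age, year, place, cause of death (where available), and known-for categories. *(TODO: complete the project context.)*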
### Objective

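The objective of this notebook is to pre-process the `wp_life_expect_EDA.db` dataset and save the result so that work can continue in the next notebook. *(TODO: expand the objective.)*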
### Data Dictionary
- Feature: Description

### Importing Libraries

In [1]:
# To structure code automatically
%load_ext nb_black

# To import/export sqlite databases
import sqlite3 as sql

# To save/open python objects in pickle file
# import pickle

# To help with reading, cleaning, and manipulating data
import pandas as pd
import numpy as np

# To help with data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# randomized data splitting
from sklearn.model_selection import train_test_split

# building regression model
import statsmodels.api as sm

# check model performance
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# check linear regression assumptions
from statsmodels.stats.outliers_influence import variance_inflation_factor
import pylab
import scipy.stats as stats
import statsmodels.stats.api as sms
from statsmodels.compat import lzip

# to compare fit between models
from scipy.stats.distributions import chi2

# To define maximum number of columns to be displayed in a dataframe (None = no limit)
pd.set_option("display.max_columns", None)
# To define the maximum number of rows to be displayed in a dataframe
pd.set_option("display.max_rows", 211)

# To set the maximum displayed width (in characters) of a column's contents
# The full option name is used: abbreviated names are resolved by pattern matching
# and can become ambiguous if pandas adds a similarly named option
pd.set_option("display.max_colwidth", 150)

# To suppress scientific notation for a dataframe (floats are shown with 3 decimals)
pd.set_option("display.float_format", lambda x: "%.3f" % x)

# To suppress warnings
# import warnings

# warnings.filterwarnings("ignore")

# To set some plot visualization attributes
# sns.set_theme() applies seaborn's own rc defaults, so the size overrides
# below are issued after it
sns.set_theme()
sns.set_palette(
    (
        "midnightblue",
        "goldenrod",
        "maroon",
        "darkolivegreen",
        "cadetblue",
        "tab:purple",
        "yellowgreen",
    )
)
plt.rc("font", size=12)
plt.rc("axes", titlesize=15)
plt.rc("axes", labelsize=14)
plt.rc("xtick", labelsize=13)
plt.rc("ytick", labelsize=13)
# Legend font size was previously set twice (13, then 14); only the last call
# took effect, so the redundant first call is dropped and 14 is kept
plt.rc("legend", fontsize=14)
plt.rc("figure", titlesize=16)

# To play auditory cue when cell has executed, has warning, or has error and set chime theme
import chime

chime.theme("zelda")

<IPython.core.display.Javascript object>

## Data Overview

### [Reading](https://github.com/teresahanak/wikipedia-life-expectancy/blob/main/wp_life_expect_EDA.db), Sampling, and Checking Data Shape

In [2]:
# Reading the dataset
# The connection is closed explicitly once the table is loaded; sqlite3's own
# `with conn:` form only manages transactions and does not close the connection
conn = sql.connect("wp_life_expect_EDA.db")
try:
    data = pd.read_sql("SELECT * FROM wp_life_expect_EDA", conn)
finally:
    conn.close()

# Making a working copy so that `data` stays as loaded
df = data.copy()

# Checking the shape
print(f"There are {df.shape[0]} rows and {df.shape[1]} columns.")

# Checking first 2 rows of the data
df.head(2)

There are 97549 rows and 25 columns.


Unnamed: 0,info,link,num_references,year,info_parenth,age,cause_of_death,place_1,place_2,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,num_categories,recip_num_references,region,prior_region,known_for
0,", 86, British dancer, ballet designer and director.",https://en.wikipedia.org/wiki/William_Chappell_(dancer),21,1994,,86.0,,United Kingdom of Great Britain and Northern Ireland,,0,0,0,0,0,1,0,0,0,0,0,1,0.048,Europe,,arts
1,", 68, Irish economist, writer, and academic.",https://en.wikipedia.org/wiki/Raymond_Crotty,12,1994,,68.0,,Ireland,,0,0,0,1,0,1,0,0,1,0,0,3,0.083,Europe,,three_to_five


<IPython.core.display.Javascript object>

In [3]:
# Checking last 2 rows of the data (together with the first 2 rows shown earlier,
# this gives a quick look at both ends of the loaded table)
df.tail(2)

Unnamed: 0,info,link,num_references,year,info_parenth,age,cause_of_death,place_1,place_2,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,num_categories,recip_num_references,region,prior_region,known_for
97547,", 50, Pakistani journalist and politician, MNA .",https://en.wikipedia.org/wiki/Aamir_Liaquat_Hussain,99,2022,"2002 2007, since 2018",50.0,,Pakistan,,0,0,0,0,0,1,0,0,1,0,0,2,0.01,Asia,,two
97548,", 86, Chinese engineer, member of the Chinese Academy of Engineering.",https://en.wikipedia.org/wiki/Zou_Jing_(engineer),3,2022,,86.0,,"China, People's Republic of",,1,0,0,0,0,0,0,0,0,0,0,1,0.333,Asia,,sciences


<IPython.core.display.Javascript object>

In [4]:
# Checking a sample of the data
# A fixed random_state makes the same 5 rows appear on every Restart & Run All
df.sample(5, random_state=42)

Unnamed: 0,info,link,num_references,year,info_parenth,age,cause_of_death,place_1,place_2,sciences,social,spiritual,academia_humanities,business_farming,arts,sports,law_enf_military_operator,politics_govt_law,crime,event_record_other,num_categories,recip_num_references,region,prior_region,known_for
88596,", 44, Irish filmmaker and cycling advocate.",https://en.wikipedia.org/wiki/Paddy_Cahill,10,2021,,44.0,,Ireland,,0,0,0,0,0,1,1,0,0,0,0,2,0.1,Europe,,two
86444,", 86, British railway preservationist and writer.",https://en.wikipedia.org/wiki/Vic_Mitchell,6,2021,,86.0,,United Kingdom of Great Britain and Northern Ireland,,0,0,0,1,0,1,0,0,0,0,0,2,0.167,Europe,,two
63154,", 89, English Olympic clergyman .",https://en.wikipedia.org/wiki/Nicolas_Stacey,6,2017,1952.0,89.0,,United Kingdom of Great Britain and Northern Ireland,,0,0,0,0,0,0,1,0,0,0,0,1,0.167,Europe,,sports
1485,", 85, American politician and Secretary of State, heart failure.",https://en.wikipedia.org/wiki/Dean_Rusk,135,1994,,85.0,heart failure,United States of America,,0,0,0,0,0,0,0,0,1,0,0,1,0.007,North America,,politics_govt_law
2344,", 91, Norwegian poet, essayist and professor of American literature.",https://en.wikipedia.org/wiki/Sigmund_Skard,7,1995,,91.0,,Norway,,0,0,0,1,0,1,0,0,0,0,0,2,0.143,Europe,,two


<IPython.core.display.Javascript object>

### Checking Data Types and Null Values

In [5]:
# Checking data types and null values
# df.info() prints, per column, the non-null count and dtype
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 97549 entries, 0 to 97548
Data columns (total 25 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   info                       97549 non-null  object 
 1   link                       97549 non-null  object 
 2   num_references             97549 non-null  int64  
 3   year                       97549 non-null  int64  
 4   info_parenth               36536 non-null  object 
 5   age                        97549 non-null  float64
 6   cause_of_death             33179 non-null  object 
 7   place_1                    97406 non-null  object 
 8   place_2                    5442 non-null   object 
 9   sciences                   97549 non-null  int64  
 10  social                     97549 non-null  int64  
 11  spiritual                  97549 non-null  int64  
 12  academia_humanities        97549 non-null  int64  
 13  business_farming           97549 non-null  int

<IPython.core.display.Javascript object>

#### Observations:
- With our dataset loaded, we begin data pre-processing.

#### Observations:
- We will now save our dataset and pick back up in a new notebook.

### Exporting Dataset to SQLite Database `wp_life_expect_clean.db`

In [None]:
# # Exporting dataframe

# # Saving dataset in a SQLite database
# conn = sql.connect("wp_life_expect_clean.db")
# df.to_sql("wp_life_expect_clean", conn, index=False)

In [None]:
# Visible marker that the notebook ran through to the final cell
print("Complete")

# Audible marker: chime notification when this cell executes
chime.success()

# Proceed to Data Cleaning *(TODO: add part number and notebook link)*