# Wikipedia Notable Life Expectancies
# [Notebook 11: Basic Linear Regression Model ](https://github.com/teresahanak/wikipedia-life-expectancy/blob/main/wp_life_expect_data_modeling_thanak_2022_10_9.ipynb)
### Context

The
### Objective

The
### Data Dictionary
- Feature: Description

### Importing Libraries

In [1]:
# To structure code automatically
%load_ext nb_black

# To import/export sqlite databases
# import sqlite3 as sql

# To save/open python objects in pickle file
import pickle

# To help with reading, cleaning, and manipulating data
import pandas as pd
import numpy as np

# To help with data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# randomized data splitting
from sklearn.model_selection import train_test_split

# building regression model
import statsmodels.api as sm

# check model performance
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# check linear regression assumptions
from statsmodels.stats.outliers_influence import variance_inflation_factor
import pylab
import scipy.stats as stats
import statsmodels.stats.api as sms
from statsmodels.compat import lzip

# to compare fit between models
from scipy.stats.distributions import chi2

# To define maximum number of columns to be displayed in a dataframe
pd.set_option("display.max_columns", None)
# To define the maximum number of rows to be displayed in a dataframe
pd.set_option("display.max_rows", 211)

# To set some dataframe visualization attributes
pd.set_option("max_colwidth", 150)

# To supress scientific notations for a dataframe
# pd.set_option("display.float_format", lambda x: "%.3f" % x)

# To supress warnings
# import warnings

# warnings.filterwarnings("ignore")

# To set some plot visualization attributes
sns.set_theme()
sns.set(font_scale=1.4)
sns.set_palette(
    (
        "midnightblue",
        "goldenrod",
        "maroon",
        "darkolivegreen",
        "cadetblue",
        "tab:purple",
        "yellowgreen",
    )
)
# plt.rc("font", size=12)
# plt.rc("axes", titlesize=15)
# plt.rc("axes", labelsize=14)
# plt.rc("xtick", labelsize=13)
# plt.rc("ytick", labelsize=13)
# plt.rc("legend", fontsize=13)
# plt.rc("legend", fontsize=14)
# plt.rc("figure", titlesize=16)

# To play auditory cue when cell has executed, has warning, or has error and set chime theme
import chime

chime.theme("zelda")

<IPython.core.display.Javascript object>

## Data Overview

### [Reading](https://github.com/teresahanak/wikipedia-life-expectancy/blob/main/wp_life_expect_preproc.csv), Sampling, and Checking Data Shape

In [2]:
# Reading the train set
data = pd.read_csv("wp_life_expect_preproc.csv")

# Making a working copy
df = data.copy()

# Checking the shape
print(f"There are {df.shape[0]} rows and {df.shape[1]} columns.")

# Checking first 2 rows of the data
df.head(2)

There are 77661 rows and 6 columns.


Unnamed: 0,age,recip_num_references,region,prior_region,known_for,years
0,73.0,0.25,Europe,No Prior Region,spiritual,8
1,90.0,0.333333,North America,No Prior Region,two,13


<IPython.core.display.Javascript object>

In [3]:
# Checking last 2 rows of the data
df.tail(2)

Unnamed: 0,age,recip_num_references,region,prior_region,known_for,years
77659,74.0,0.142857,North America,No Prior Region,arts,0
77660,92.0,0.2,Europe,No Prior Region,sports,8


<IPython.core.display.Javascript object>

In [4]:
# Checking a sample of the data
df.sample(5)

Unnamed: 0,age,recip_num_references,region,prior_region,known_for,years
56572,89.0,0.2,North America,No Prior Region,arts,23
66438,88.0,0.066667,North America,No Prior Region,spiritual,21
19698,85.0,0.142857,North America,No Prior Region,arts,5
44706,83.0,0.029412,Middle East,No Prior Region,academia_humanities,21
13590,89.0,0.142857,Europe,No Prior Region,arts,9


<IPython.core.display.Javascript object>

### Checking Data Types and Null Values

In [5]:
# Checking data types and null values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77661 entries, 0 to 77660
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   age                   77661 non-null  float64
 1   recip_num_references  77661 non-null  float64
 2   region                77661 non-null  object 
 3   prior_region          77661 non-null  object 
 4   known_for             77661 non-null  object 
 5   years                 77661 non-null  int64  
dtypes: float64(2), int64(1), object(3)
memory usage: 3.6+ MB


<IPython.core.display.Javascript object>

#### Observations:
- With our dataset loaded, we are ready for modeling.
- We have two variables that need typcasting from object to category, then one hot encoding.

In [6]:
df["prior_region"].value_counts()

No Prior Region               73352
Europe                         2326
North America                   432
Asia                            341
Russian Federation              279
Africa                          242
Mid-Cent America/Caribbean      202
Middle East                     176
Oceania                         136
South America                   109
South East Asia                  58
Central Asia                      8
Name: prior_region, dtype: int64

<IPython.core.display.Javascript object>

#### Typecasting `prior_region` and `region` as Categorical

In [7]:
# Typcasting prior_region and region as categorical
df[["prior_region", "region"]] = df[["prior_region", "region"]].astype("category")

# Re-check info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77661 entries, 0 to 77660
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   age                   77661 non-null  float64 
 1   recip_num_references  77661 non-null  float64 
 2   region                77661 non-null  category
 3   prior_region          77661 non-null  category
 4   known_for             77661 non-null  object  
 5   years                 77661 non-null  int64   
dtypes: category(2), float64(2), int64(1), object(1)
memory usage: 2.5+ MB


<IPython.core.display.Javascript object>

## Building a Linear Regression Model

### Data Preparation for Modeling
As there is no model tuning in this basic linear regression model, we can train directly with the `train` set and check performance direclty on the `test` set.  We will need to do the appropriate treatments on the `test` set first.

### Test Set

#### Loading [Test Set](https://github.com/teresahanak/wikipedia-life-expectancy/blob/main/wp_life_expect_test.csv) and [region_place_dict](https://github.com/teresahanak/wikipedia-life-expectancy/blob/main/region_place_dict.pkl)

In [8]:
# Reading test.csv
test = pd.read_csv("wp_life_expect_test.csv")

# Checking shape
print(f"There are {test.shape[0]} rows and {test.shape[1]} columns.")

# Loading region_place_dict
with open("region_place_dict.pkl", "rb") as f:
    region_place_dict = pickle.load(f)

There are 19608 rows and 25 columns.


<IPython.core.display.Javascript object>

In [9]:
# Checking info
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19608 entries, 0 to 19607
Data columns (total 25 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   day                        19608 non-null  int64  
 1   name                       19608 non-null  object 
 2   info                       19608 non-null  object 
 3   link                       19608 non-null  object 
 4   num_references             19608 non-null  int64  
 5   year                       19608 non-null  int64  
 6   month                      19608 non-null  object 
 7   info_parenth               6832 non-null   object 
 8   age                        19608 non-null  float64
 9   cause_of_death             6641 non-null   object 
 10  place_1                    19580 non-null  object 
 11  place_2                    1193 non-null   object 
 12  sciences                   19608 non-null  int64  
 13  social                     19608 non-null  int

<IPython.core.display.Javascript object>

#### Pre-processing Steps

In [19]:
%%time

# 1. Dropping non-human entries
rows_to_drop = test[test["other_species"] == 1].index
test.drop(rows_to_drop, inplace=True)
test.reset_index(inplace=True, drop=True)

# 2. Dropping entries with event_record_other as sole category
rows_to_drop = test[(test['event_record_other']==1) & (test['num_categories']==1)].index
test.drop(rows_to_drop, inplace=True)
test.reset_index(inplace=True, drop=True)

# 3. Creating recip_num_references column
test["recip_num_references"] = test["num_references"].apply(lambda x: 1 / x)

# 4. Creating region column
# Dropping place_2 values that are duplicates of place_1
index = [
    index
    for index in test.index
    if test.loc[index, "place_2"] == test.loc[index, "place_1"]
]
test.loc[index, "place_2"] = None

# For loop to create region column
test["region"] = None
for region, places in region_place_dict.items():
    for place in places:
        for index in test[(test["region"].isna()) & (test["place_2"].notna())].index:
            item = test.loc[index, "place_2"]
            if item:
                if item == place:
                    test.loc[index, "region"] = region
        for index in test[(test["region"].isna()) & (test["place_2"].isna())].index:
            item = test.loc[index, "place_1"]
            if item:
                if item == place:
                    test.loc[index, "region"] = region

# 5. Creating prior_region column
# For loop to create prior_region column
test["prior_region"] = None
for region, places in region_place_dict.items():
    for place in places:
        for index in test[
            (test["place_2"].notna()) & (test["prior_region"].isna())
        ].index:
            item = test.loc[index, "place_1"]
            if item:
                if item == place:
                    test.loc[index, "prior_region"] = region
# Adding No Prior Region category
test["prior_region"].fillna("No Prior Region", inplace=True)

# 6. Typecasting region and prior region as category
test[['region', 'prior_region']] = test[['region', 'prior_region']].astype('category')

# 7. Creating known_for column
# Initializing known_for
test['known_for']=None
# List of known for columns
cols = ['sciences', 'social', 'spiritual', 'academia_humanities',
       'business_farming', 'arts', 'sports', 'law_enf_military_operator',
       'politics_govt_law', 'crime', 'event_record_other', 'other_species']
# For loop to assign known_for
for index in test[test['known_for'].isna()].index:
    if test.loc[index, 'num_categories']==2:
        test.loc[index, 'known_for'] = 'two'
    elif test.loc[index, 'num_categories'] > 2:
        test.loc[index, 'known_for'] = 'three_to_five'
    else: 
        for column in cols:
            if test.loc[index, column]==1:
                test.loc[index, 'known_for'] = column

# 8. Creating years column
test["years"] = test["year"].apply(lambda x: x - 1994)

# 9. Dropping Unnecessary Columns
cols_to_drop = [
   'day', 'name', 'info', 'link', 'num_references', 'year', 'month',
       'info_parenth', 'cause_of_death', 'place_1', 'place_2',
       'sciences', 'social', 'spiritual', 'academia_humanities',
       'business_farming', 'arts', 'sports', 'law_enf_military_operator',
       'politics_govt_law', 'crime', 'event_record_other', 'other_species',
       'num_categories', 
]
test.drop(cols_to_drop, axis=1, inplace=True)

# Rechecking columns
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19430 entries, 0 to 19429
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   age                   19430 non-null  float64 
 1   recip_num_references  19430 non-null  float64 
 2   region                19410 non-null  category
 3   prior_region          19430 non-null  category
 4   known_for             19430 non-null  object  
 5   years                 19430 non-null  int64   
dtypes: category(2), float64(2), int64(1), object(1)
memory usage: 646.0+ KB
CPU times: total: 10.6 s
Wall time: 10.7 s


<IPython.core.display.Javascript object>

In [21]:
# Checking a sample of rows
test.sample(10)

Unnamed: 0,age,recip_num_references,region,prior_region,known_for,years
17169,73.0,0.166667,Europe,No Prior Region,sciences,16
5219,66.0,0.166667,North America,No Prior Region,arts,8
10469,78.0,0.052632,Asia,No Prior Region,two,27
5893,91.0,0.2,Europe,No Prior Region,two,11
1981,92.0,0.333333,Europe,No Prior Region,arts,26
3535,88.0,0.25,Europe,No Prior Region,sports,25
4905,77.0,0.055556,North America,No Prior Region,arts,25
5665,85.0,0.166667,Europe,No Prior Region,sports,2
6997,86.0,0.016667,North America,No Prior Region,arts,17
3153,67.0,0.017544,North America,No Prior Region,arts,16


<IPython.core.display.Javascript object>

#### Observations:
- Treatment of `test` missing values is the only remaining step.
- We will use the modes for the `known_for` groups from `train` to fill missing values for `test` to avoid data leakage.  `region` is the only column with missing values.

#### Treating Missing Values for `region` in Test

In [22]:
# Checking the starting missing values
print(f'There are {test["region"].isna().sum()} missing values.')

There are 20 missing values.


<IPython.core.display.Javascript object>

In [None]:
# For loop to impute test missing values with mode of train rows with matching sole known for category
null_col = "region"
group_rows = df[df[column] == 1].index.tolist()
    group_mode = df.loc[group_rows, null_col].mode()[0]

    for index in test[test.isna()].index:
        if test.loc[index, column] == 1:
            test.loc[index, "region"] = group_mode

# Imputing missing values with train column mode for remaining entries with multiple known for categories
column_mode = df[null_col].mode()[0]
test[null_col].fillna(column_mode, inplace=True)

# Checking the starting missing values
print(f'There are {test["region"].isna().sum()} missing values.')

#### Observations:
- `train` and `test` are both ready for modeling.
- We will perform one hot encoding when defining our independent variables.

#### Defining Independent and Dependent Variables for Train and Test Sets

In [None]:
# Defining independent and dependent variables
X_train, X_test = df.drop("age", axis=1), test.drop("age", axis=1)
y_train, y_test = df["age"], test["age"]


# One hot encoding independent categorical features
X_train = pd.get_dummies(X_train, drop_first=True)
X_test = pd.get_dummies(X_test, drop_first=True)

# Adding the intercept
X_train = sm.add_constant(X_train)
X_test = sm.add_constant(X_test)

# Typecasting independent variables as float
X_train = X_train.astype("float64")
X_test = X_test.astype("float64")

# Checking shape of train and test sets
print(f"There are {X_train.shape[0]} rows and {X_train.shape[1]} columns in X_train.")
print(f"There are {X_test.shape[0]} rows and {X_test.shape[1]} columns in X_test.\n")

# Checking a sample of train set
X_train.sample(5)

#### Observations:
- We are ready to build our model.

### Fitting a Linear Model

In [None]:
# Fitting a linear model and displaying model summary
olsmodel = sm.OLS(y_train, X_train).fit()
print(olsmodel.summary())

### Model Performance Check

#### Functions to check model performance

In [None]:
# function to compute adjusted R-squared
def adj_r2_score(predictors, targets, predictions):
    r2 = r2_score(targets, predictions)
    n = predictors.shape[0]
    k = predictors.shape[1]
    return 1 - ((1 - r2) * (n - 1) / (n - k - 1))


# function to compute MAPE
def mape_score(targets, predictions):
    return np.mean(np.abs((targets - predictions) / targets)) * 100


# function to compute and display different metrics to check performance of a regression model
# with conversion back to original scale for RMSE, MAE, and MAPE for ease of explainability
def model_performance_regression(model, predictors, target):
    """
    Function to compute and return a dataframe of different metrics to check
    regression model performance
    
    model: regressor
    predictors: independent variables
    target: dependent variable
    """
    # predictions
    pred = model.predict(predictors)

    r2 = r2_score(target, pred)  # to compute R-squared
    adjr2 = adj_r2_score(predictors, target, pred)  # to compute adjusted R-squared
    rmse = np.sqrt(mean_squared_error(target, pred))  # to compute RMSE
    mae = mean_absolute_error(target, pred)  # to compute MAE
    mape = mape_score(target, pred)  # to compute MAPE

    # creating a dataframe of metrics
    df_perf = pd.DataFrame(
        {
            "RMSE": rmse,
            "MAE": mae,
            "R-squared": r2,
            "Adj. R-squared": adjr2,
            "MAPE": mape,
        },
        index=[0],
    )

    return df_perf

#### Model Performance on Train Set

In [None]:
# Checking model performance on train set
print("Training Performance\n")
olsmodel_train_perf = model_performance_regression(olsmodel, X_train, y_train)
olsmodel_train_perf

### Model Performance on Test Set

In [None]:
# Checking model performance on validation set
print("Test Peformance\n")
olsmodel_test_perf = model_performance_regression(olsmodel, X_test, y_test)
olsmodel_test_perf

#### Observations:
- The train and test $R^2$ are 0.098 and 0.066, respectively, indicating that the model explains 9.8% of total variation in the train set and 6.6% of variation in the test set, so very little.
- RMSE values for the two sets are comparable.

### Checking Linear Regression Assumptions

### Test for Multicollinearity

#### Function to check VIF of Each Independent Variable

In [None]:
# Define a function to check and display VIF of each independent variable
def checking_vif(predictors):
    """
    Takes input dependent variables predictors and returns
    a dataframe of variable name and VIF
    """
    vif = pd.DataFrame()
    vif["feature"] = predictors.columns

    # calculating VIF for each feature
    vif["VIF"] = [
        variance_inflation_factor(predictors.values, i)
        for i in range(len(predictors.columns))
    ]
    return vif

#### VIF

In [None]:
# Checking vif
vif = checking_vif(X_train).sort_values(by="VIF", ascending=False)
vif

#### Function to Treat Multicollinearity

In [None]:
def treating_multicollinearity(predictors, target, high_vif_columns):
    """
    Checking the effect of dropping the columns showing high multicollinearity
    on model performance (adj. R-squared and RMSE)

    predictors: independent variables
    target: dependent variable
    high_vif_columns: columns having high VIF
    """
    # empty lists to store adj. R-squared and RMSE values
    adj_r2 = []
    rmse = []

    # build ols models by dropping one of the high VIF columns at a time
    # store the adjusted R-squared and RMSE in the lists defined previously
    for cols in high_vif_columns:
        # defining the new train set
        train = predictors.loc[:, ~predictors.columns.str.startswith(cols)]

        # create the model
        olsmodel = sm.OLS(target, train).fit()

        # adding adj. R-squared and RMSE to the lists
        adj_r2.append(olsmodel.rsquared_adj)
        rmse.append(np.sqrt(olsmodel.mse_resid))

    # creating a dataframe for the results
    temp = pd.DataFrame(
        {
            "col": high_vif_columns,
            "Adj. R-squared after_dropping col": adj_r2,
            "RMSE after dropping col": rmse,
        }
    ).sort_values(by="Adj. R-squared after_dropping col", ascending=False)
    temp.reset_index(drop=True, inplace=True)

    return temp

#### Checking Impact of Dropping Columns on Model Performance

In [None]:
# Checking impact of dropping columns
col_list = vif[vif["VIF"] >= 5]["feature"].tolist()

results = treating_multicollinearity(X_train, y_train, col_list)
results

#### Dropping `event_record_other`

In [None]:
# Dropping event_record_other column
col_to_drop = "event_record_other"
X_train = X_train.drop(col_to_drop, axis=1)
X_test = X_test.drop(col_to_drop, axis=1)

# Recheck VIF
vif = checking_vif(X_train).sort_values(by="VIF", ascending=False)
vif

#### Checking Impact of Dropping Columns on Model Performance

In [None]:
# Checking impact of dropping columns
col_list = vif[vif["VIF"] >= 5]["feature"].tolist()
col_list.remove("const")

results = treating_multicollinearity(X_train, y_train, col_list)
results

#### Dropping `arts`

In [None]:
# Dropping arts column
col_to_drop = "arts"
X_train = X_train.drop(col_to_drop, axis=1)
X_test = X_test.drop(col_to_drop, axis=1)

# Recheck VIF
vif = checking_vif(X_train).sort_values(by="VIF", ascending=False)
vif

#### Checking Impact of Dropping Columns on Model Performance

In [None]:
# Checking impact of dropping columns
col_list = vif[vif["VIF"] >= 5]["feature"].tolist()
col_list.remove("const")

results = treating_multicollinearity(X_train, y_train, col_list)
results

#### Dropping `prior_region_No Prior Region`

In [None]:
# Dropping prior_region_No Prior Region column
col_to_drop = "prior_region_No Prior Region"
X_train = X_train.drop(col_to_drop, axis=1)
X_test = X_test.drop(col_to_drop, axis=1)

# Recheck VIF
vif = checking_vif(X_train).sort_values(by="VIF", ascending=False)
vif

#### Checking Impact of Dropping Columns on Model Performance

In [None]:
# Checking impact of dropping columns
col_list = vif[vif["VIF"] >= 5]["feature"].tolist()
col_list.remove("const")

results = treating_multicollinearity(X_train, y_train, col_list)
results

#### Dropping `region_Europe`

In [None]:
# Dropping region_Europe column
col_to_drop = "region_Europe"
X_train = X_train.drop(col_to_drop, axis=1)
X_test = X_test.drop(col_to_drop, axis=1)

# Recheck VIF
vif = checking_vif(X_train).sort_values(by="VIF", ascending=False)
vif

#### Fitting Second Model

In [None]:
# Fitting a linear model and displaying model summary
olsmodel2 = sm.OLS(y_train, X_train).fit()
print(olsmodel2.summary())

#### Checking Features with High Coefficient p-values

In [None]:
## Loop for Dropping variables with high coefficient p-values one at a time excluding constant and dummy variables
# Current model predictors
train_predictors = X_train

# initial list of all independent variable columns including constant
cols = train_predictors.columns.to_list()

# initial empty list of independent variables to exclude
excluded_features = []

# setting an initial max p_value
max_p_value = 1

for i in range(len(cols)):
    # defining the train set
    x_train_aux = train_predictors[cols]

    # fitting the model
    model = sm.OLS(y_train, x_train_aux).fit()

    # getting the p-values and the maximum p-value
    p_values = model.pvalues[cols]
    max_p_value = max(p_values)

    # name of the variable with maximum p-value
    feature_with_p_max = p_values.idxmax()

    if max_p_value > 0.05:
        cols.remove(feature_with_p_max)
        excluded_features.append(feature_with_p_max)
    else:
        break

print(f"Features with high p-values to exclude: {excluded_features}")

#### Dropping `excluded_features` with High p-values

In [None]:
# Dropping excluded_features
X_train.drop(excluded_features, axis=1, inplace=True)
X_test.drop(excluded_features, axis=1, inplace=True)

#### Fitting Third Model

In [None]:
# fitting third model and printing summary
olsmodel3 = sm.OLS(y_train, X_train).fit()
print(olsmodel3.summary())

### Test for Linearity and Independence

#### Dataframe of Actual, Fitted, and Residual Values

In [None]:
# Dataframe of actual, fitted, and residual values
df_pred = pd.DataFrame()

df_pred["Actual Values"] = y_train  # actual values
df_pred["Fitted Values"] = olsmodel3.fittedvalues  # predicted values
df_pred["Residuals"] = olsmodel3.resid  # residuals

df_pred.sample(5)

#### Plot of Fitted Values vs Residuals

In [None]:
# Plot of fitted values vs residuals
sns.residplot(
    data=df_pred, x="Fitted Values", y="Residuals", color="purple", lowess=True
)
plt.xlabel("Fitted Values")
plt.ylabel("Residuals")
plt.title("Fitted vs Residual plot")
plt.show()

### Test for Normality

#### Histogram

In [None]:
# Histogram of residuals
sns.histplot(data=df_pred, x="Residuals", kde=True)
plt.title("Normality of Residuals")
plt.show()

#### Q-Q plot

In [None]:
# checking the Q-Q plot
stats.probplot(df_pred["Residuals"], dist="norm", plot=pylab)
plt.show()

#### Shapiro-Wilk Test

In [None]:
# Shapiro-Wilk Test for normality
test_stat, p_value = stats.shapiro(df_pred["Residuals"])
print(f"p-value = {p_value}")

### Test for Homoscedasticity

#### Goldfeld-Quandt Test

In [None]:
# Goldfeld-quandt test
name = ["F statistic", "p-value"]
test = sms.het_goldfeldquandt(df_pred["Residuals"], X_train)
lzip(name, test)

## Final Model Summary

### *olsmodel3* summary

In [None]:
print(olsmodel3.summary())

#### Final Model Performance on Train Set

In [None]:
# Checking final model performance on train set
print("Training Performance\n")
olsmodel_train_perf = model_performance_regression(olsmodel3, X_train, y_train)
olsmodel_train_perf

#### Final Model Performance on Validation Set

In [None]:
# Checking final model performance on test set
print("Testing Performance\n")
olsmodel_test_perf = model_performance_regression(olsmodel3, X_test, y_test)
olsmodel_test_perf

## Interpreting Model Parameters


In [None]:
# For loop to make list of parameters with p-values < 0.05
# Final model predictors
final_pred = X_train

# initial list of all independent variable columns excluding constant
cols = final_pred.columns[1:]

# initial list of columns to interpret
feat_to_interp = []

# for loop to create list of features with p < 0.05
for feature in cols:
    p_value = olsmodel3.pvalues[feature]
    if p_value < 0.05:
        feat_to_interp.append(feature)

print(
    "Features with interpretable coefficient confidence intervals: \n\n", feat_to_interp
)

#### Interpreting Features with Coefficient p-values < 0.05

In [None]:
print("dunzo!")

# Sound notification when cell executes
chime.success()

#### Observations:
- We will now save our dataset and pick back up in a new notebook.

### Exporting Dataset to SQLite Database [wp_life_expect_clean.db]()

In [None]:
# # Exporting dataframe

# # Saving dataset in a SQLite database
# conn = sql.connect("wp_life_expect_clean.db")
# df.to_sql("wp_life_expect_clean", conn, index=False)

In [None]:
print('Complete')

# Chime notification when cell executes
chime.success()

# [Proceed to Data Cleaning Part ]()