# WHO Project - Model Test (HW)

In [1]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# The train-test split function - we need to use this before we create the model
from sklearn.model_selection import train_test_split

# We will be using statsmodels for LinReg and then metrics to measure its performance
import statsmodels.api as sm    # Linear regression
import statsmodels.tools        # Evaluation metrics

In [2]:
# Read the WHO life-expectancy dataset from the working directory and preview the first rows
df = pd.read_csv(filepath_or_buffer="Life Expectancy Data.csv")
df.head(n=5)

Unnamed: 0,Country,Region,Year,Infant_deaths,Under_five_deaths,Adult_mortality,Alcohol_consumption,Hepatitis_B,Measles,BMI,...,Diphtheria,Incidents_HIV,GDP_per_capita,Population_mln,Thinness_ten_nineteen_years,Thinness_five_nine_years,Schooling,Economy_status_Developed,Economy_status_Developing,Life_expectancy
0,Turkiye,Middle East,2015,11.1,13.0,105.824,1.32,97,65,27.8,...,97,0.08,11006,78.53,4.9,4.8,7.8,0,1,76.5
1,Spain,European Union,2015,2.7,3.3,57.9025,10.35,97,94,26.0,...,97,0.09,25742,46.44,0.6,0.5,9.7,1,0,82.8
2,India,Asia,2007,51.5,67.9,201.0765,1.57,60,35,21.2,...,64,0.13,1076,1183.21,27.1,28.0,5.0,0,1,65.4
3,Guyana,South America,2006,32.8,40.5,222.1965,5.68,93,74,25.3,...,93,0.79,4146,0.75,5.7,5.5,7.9,0,1,67.0
4,Israel,Middle East,2012,3.4,4.3,57.951,2.89,97,89,27.0,...,94,0.08,33995,7.91,1.2,1.1,12.8,1,0,81.7


In [3]:
# List the distinct region labels (these become the one-hot encoded levels later on)
df['Region'].unique()

array(['Middle East', 'European Union', 'Asia', 'South America',
       'Central America and Caribbean', 'Rest of Europe', 'Africa',
       'Oceania', 'North America'], dtype=object)

## Separate Features and Target

In [None]:
# Start from every column name, then take out the target so only candidate features remain
feature_cols = [*df.columns]
del feature_cols[feature_cols.index('Life_expectancy')]   # ValueError if the target column is missing
feature_cols

In [None]:
# Split the dataframe column-wise into the value we predict (y) and the model inputs (X)

y = df.loc[:, 'Life_expectancy']    # target series
X = df.loc[:, feature_cols]         # feature matrix

## Train-Test Split

In [None]:
# With features and target separated, hold out part of the data for evaluation

X_train, X_test, y_train, y_test = train_test_split(
    X,                     # features
    y,                     # target
    test_size = 0.2,       # 20% of the rows go to the test set
    random_state = 1000,   # fixed seed so the split is the same on every run
)

In [None]:
## Sanity checks for the TEST-TRAIN split: features and target must stay row-aligned
# Index.equals() returns False on any mismatch (including a different number of rows),
# whereas an element-wise `==` between indexes of different lengths raises a ValueError
# before the check can report anything.
print(f'Indices match in X_train and y_train: {X_train.index.equals(y_train.index)}')
print(f'Indices match in X_test and y_test: {X_test.index.equals(y_test.index)}')
print(f'Number of obs. match in X_train and y_train: {X_train.shape[0] == len(y_train)}')
print(f'Number of obs. match in X_test and y_test: {X_test.shape[0] == len(y_test)}')

## Feature Engineering

In [None]:
## We define a (reusable) function which will carry out our feature engineering

def feature_engineer(df, region_categories=None):
    """Turn a raw feature dataframe into the design matrix used for the regression.

    The steps are: one-hot encode 'Region' (dropping the first level), add the
    derived columns 'log_gdp' and 'immunisation_avg', and prepend the intercept
    column 'const'. The input dataframe is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Region', 'GDP_per_capita', 'Polio',
        'Diphtheria' and 'Hepatitis_B'.
    region_categories : sequence of str, optional
        Complete, ordered list of region levels to encode. When given, one dummy
        column is created for every level except the first, even for levels that
        do not occur in ``df``, so train and test frames get identical columns.
        Region values not in the list get all-zero dummies. When omitted
        (default), the levels are taken from the values present in ``df``.

    Returns
    -------
    pandas.DataFrame
        A new dataframe with the engineered features and a leading 'const' column.
    """
    df = df.copy()       # IMPORTANT! - don't want to interact with the original, global dataframe

    # Pin the set of region levels (if supplied) so the dummy columns do not depend
    # on which regions happen to be present in this particular frame
    if region_categories is not None:
        df['Region'] = pd.Categorical(df['Region'], categories=list(region_categories))

    # Now we can OHE the 'Region' column
    df = pd.get_dummies(df, columns = ['Region'], drop_first = True, prefix = 'Region', dtype = int)

    # Other Features that we wanted to change
    df['log_gdp'] = np.log(df['GDP_per_capita'])                                         # creates an additional column for the log of GDP since it was a non-linear relationship
    df['immunisation_avg'] = (df['Polio'] + df['Diphtheria'] + df['Hepatitis_B']) / 3    # averages the immunisation rates into one feature to avoid multicollinearity

    # VITAL!!! - we MUST also add the constant term for linreg to run in statsmodels...
    # has_constant='add' forces the intercept column to be created; the default ('skip')
    # silently leaves it out whenever some column is already a non-zero constant, which
    # always happens for a single-row frame and can happen for small subsets.
    df = sm.add_constant(df, has_constant = 'add')

    # Return the Feature Engineered version of the dataframe
    return df