In [1]:
# Import dependencies
import numpy as np
import pandas as pd
import hvplot.pandas
from pathlib import Path
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report

In [2]:
# Read in CSV and create pandas dataframe
# The original absolute location is tried first. If it does not exist on this
# machine, fall back to a file with the same name in the current working
# directory so the notebook is not tied to a single user's Desktop.
csv_path = Path("/Users/tymberhamilton/Desktop/Modules/Project_Final/UoM_Project_4/SQL_Joined_Tables_2.csv")
if not csv_path.exists():
    csv_path = Path(csv_path.name)

# `csv` keeps its original name and string type for any later cell that uses
# it (note that it shadows the standard-library module name `csv`).
csv = str(csv_path)
df = pd.read_csv(csv)

# Review the last rows of the dataframe
df.tail()

Unnamed: 0,id,entity,code,year_1,current_health_expenditure_per_capita_ppp_current_international,gdp_per_capita_ppp_current_international_$,population_historical_estimates,id-2,entity-2,code-2,...,population_historical_estimates-2,id-3,entity-3,code-3,year_3,indicator_current_health_expenditure_che_as_percentage_of_gross,id-4,entity-4,code-4,share_of_population_covered_by_health_insurance_ilo_2014
2874,55864,Zimbabwe,ZWE,2014,212.490036,2612.456055,13586710,55902,Zimbabwe,ZWE,...,13586710,3968,Zimbabwe,ZWE,2014,8.13,161,Zimbabwe,ZWE,1.0
2875,55865,Zimbabwe,ZWE,2015,199.677658,2679.507568,13814642,55903,Zimbabwe,ZWE,...,13814642,3969,Zimbabwe,ZWE,2015,7.45,161,Zimbabwe,ZWE,1.0
2876,55866,Zimbabwe,ZWE,2016,215.404556,2806.468994,14030338,55904,Zimbabwe,ZWE,...,14030338,3970,Zimbabwe,ZWE,2016,7.68,161,Zimbabwe,ZWE,1.0
2877,55867,Zimbabwe,ZWE,2017,226.205872,3795.642334,14236599,55905,Zimbabwe,ZWE,...,14236599,3971,Zimbabwe,ZWE,2017,7.47,161,Zimbabwe,ZWE,1.0
2878,55868,Zimbabwe,ZWE,2018,269.203705,4017.22168,14438812,55906,Zimbabwe,ZWE,...,14438812,3972,Zimbabwe,ZWE,2018,8.68,161,Zimbabwe,ZWE,1.0


In [3]:
# Check the shape of the loaded table as (rows, columns)
df.shape


(2879, 23)

In [4]:
# List the column names; the suffixed names ('-2', '-3', '-4') are the ones
# considered for removal in the next cell
df.columns


Index(['id', 'entity', 'code', 'year_1',
       'current_health_expenditure_per_capita_ppp_current_international',
       'gdp_per_capita_ppp_current_international_$',
       'population_historical_estimates', 'id-2', 'entity-2', 'code-2',
       'year_4', 'mortality_rate_under_5_per_1000_live_births',
       'current_health_expenditure_per_capita_ppp_current_international-2',
       'population_historical_estimates-2', 'id-3', 'entity-3', 'code-3',
       'year_3',
       'indicator_current_health_expenditure_che_as_percentage_of_gross',
       'id-4', 'entity-4', 'code-4',
       'share_of_population_covered_by_health_insurance_ilo_2014'],
      dtype='object')

In [5]:
# Dropping duplicate columns
# These are the repeated id / entity / code / year columns carried over from
# the other joined tables (presumably the join keys -- confirm against the SQL).
redundant_join_columns = [
    'id-2', 'entity-2', 'code-2', 'year_4',
    'id-3', 'entity-3', 'code-3', 'year_3',
    'id-4', 'entity-4', 'code-4',
]

# `columns=` already says which axis is meant, so no `axis` argument is needed.
# errors="ignore" makes the cell safe to re-run: because `df` is overwritten,
# a second run would otherwise raise KeyError for columns that are already gone.
# Trade-off: a misspelled name in the list is skipped silently, so check the
# column listing afterwards.
df = df.drop(columns=redundant_join_columns, errors="ignore")

In [6]:
# Confirm the drop took effect: the repeated id/entity/code/year columns
# should no longer appear in the preview
df.tail()

Unnamed: 0,id,entity,code,year_1,current_health_expenditure_per_capita_ppp_current_international,gdp_per_capita_ppp_current_international_$,population_historical_estimates,mortality_rate_under_5_per_1000_live_births,current_health_expenditure_per_capita_ppp_current_international-2,population_historical_estimates-2,indicator_current_health_expenditure_che_as_percentage_of_gross,share_of_population_covered_by_health_insurance_ilo_2014
2874,55864,Zimbabwe,ZWE,2014,212.490036,2612.456055,13586710,6.27,212.490036,13586710,8.13,1.0
2875,55865,Zimbabwe,ZWE,2015,199.677658,2679.507568,13814642,6.13,199.677658,13814642,7.45,1.0
2876,55866,Zimbabwe,ZWE,2016,215.404556,2806.468994,14030338,5.87,215.404556,14030338,7.68,1.0
2877,55867,Zimbabwe,ZWE,2017,226.205872,3795.642334,14236599,5.7,226.205872,14236599,7.47,1.0
2878,55868,Zimbabwe,ZWE,2018,269.203705,4017.22168,14438812,5.48,269.203705,14438812,8.68,1.0


In [7]:
# Target is going to be child mortality
# (presumably the 'mortality_rate_under_5_per_1000_live_births' column).
# NOTE(review): that same column is included in the list of columns that are
# scaled further down -- confirm whether it should be held out as the target.
df.columns

Index(['id', 'entity', 'code', 'year_1',
       'current_health_expenditure_per_capita_ppp_current_international',
       'gdp_per_capita_ppp_current_international_$',
       'population_historical_estimates',
       'mortality_rate_under_5_per_1000_live_births',
       'current_health_expenditure_per_capita_ppp_current_international-2',
       'population_historical_estimates-2',
       'indicator_current_health_expenditure_che_as_percentage_of_gross',
       'share_of_population_covered_by_health_insurance_ilo_2014'],
      dtype='object')

In [8]:
# Scale the data
# Every numeric column is standardised independently (StandardScaler defaults:
# centre on the mean, divide by the standard deviation). The result is, by
# default, a NumPy array without column labels.
#
# NOTE(review): in the rows previewed above, the two '-2' columns show exactly
# the same values as their un-suffixed counterparts. If they are full copies,
# they count those features twice in any distance-based model -- confirm and
# consider leaving them out.
# NOTE(review): the under-5 mortality column (the stated target) is scaled here
# together with the other columns -- confirm that is intended.
numeric_columns = [
    'year_1',
    'current_health_expenditure_per_capita_ppp_current_international',
    'gdp_per_capita_ppp_current_international_$',
    'population_historical_estimates',
    'mortality_rate_under_5_per_1000_live_births',
    'current_health_expenditure_per_capita_ppp_current_international-2',
    'population_historical_estimates-2',
    'indicator_current_health_expenditure_che_as_percentage_of_gross',
    'share_of_population_covered_by_health_insurance_ilo_2014',
]
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df[numeric_columns])

In [9]:
# Wrap the scaled array back into a labelled DataFrame.
# The column names must be listed in the same order as the columns that were
# passed to the scaler.
scaled_columns = [
    'year_1',
    'current_health_expenditure_per_capita_ppp_current_international',
    'gdp_per_capita_ppp_current_international_$',
    'population_historical_estimates',
    'mortality_rate_under_5_per_1000_live_births',
    'current_health_expenditure_per_capita_ppp_current_international-2',
    'population_historical_estimates-2',
    'indicator_current_health_expenditure_che_as_percentage_of_gross',
    'share_of_population_covered_by_health_insurance_ilo_2014',
]

# index=df.index keeps the row labels of the source frame. Without it the new
# frame gets a fresh 0..n-1 index, and any later label-aligned assignment from
# `df` would only line up while `df` itself still has that default index.
scaled_data = pd.DataFrame(df_scaled, columns=scaled_columns, index=df.index)

In [10]:
# scaled_data.head()

In [11]:
# Label every scaled row with its country ("entity") by making it the index.
# The values are attached by position (row i of scaled_data was computed from
# row i of df) and the index is replaced in one step, so running this cell
# again gives the same frame. Adding df['entity'] as a column first aligns on
# index labels instead; on a second run scaled_data is already indexed by
# entity, those labels no longer match df's, and the entity names are lost.
# A row-count mismatch between the two frames raises ValueError here.
scaled_data = scaled_data.set_index(
    pd.Index(df['entity'].to_numpy(), name='entity')
)
scaled_data.head()

Unnamed: 0_level_0,year_1,current_health_expenditure_per_capita_ppp_current_international,gdp_per_capita_ppp_current_international_$,population_historical_estimates,mortality_rate_under_5_per_1000_live_births,current_health_expenditure_per_capita_ppp_current_international-2,population_historical_estimates-2,indicator_current_health_expenditure_che_as_percentage_of_gross,share_of_population_covered_by_health_insurance_ilo_2014
entity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Albania,-1.64771,-0.616298,-0.694518,-0.259752,-0.227911,-0.616298,-0.259752,0.45307,-0.976432
Albania,-1.465054,-0.59928,-0.672144,-0.259749,-0.261938,-0.59928,-0.259749,0.416098,-0.976432
Albania,-1.282398,-0.589512,-0.653691,-0.259772,-0.295965,-0.589512,-0.259772,0.321613,-0.976432
Albania,-1.099742,-0.568921,-0.636421,-0.259826,-0.329992,-0.568921,-0.259826,0.383234,-0.976432
Albania,-0.917086,-0.559071,-0.614611,-0.259912,-0.366449,-0.559071,-0.259912,0.264101,-0.976432
