# Uninsured or Not?
Imagine we are being asked by one of our clients, ACME X, to predict if a given individual is uninsured or not (health insurance). Using the data in training_set_data.csv, please train a model to predict whether each person in the data set is uninsured or not. Furthermore, our main contact at ACME X will need to explain the basics of the model to their smart, but non-technical coworkers, so interpretability of how the model inputs relate to the predictions will be important. 

In [None]:
from google.colab import files
import pandas as pd
import io
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [None]:
uploaded = files.upload()

Saving training_set_data.csv to training_set_data (1).csv


In [None]:
train = pd.read_csv(io.BytesIO(uploaded['training_set_data.csv']))
train.head()

Unnamed: 0,person_id,age,citizen_status,nativity_status,weekly_hours_worked,total_income,self_employed_income,wage_income,interest_income,other_income,marital_status,school_status,sex,when_last_worked,worked_last_week,language,race_native_american,race_asian,race_black,race_native_hawaiian,race_pacific_islander,race_white,race_other,uninsured,household_id
0,1,45,citizen_birth,native_born,15,908.0,62.0,800.0,11.0,35.0,married,not_student,FEMALE,in_last_year,worked,English,0,0,0,0,0,1,0,0,1
1,2,47,citizen_birth,native_born,50,180499.0,49.0,160007.0,20424.0,19.0,married,not_student,MALE,in_last_year,worked,English,0,0,0,0,0,1,0,0,1
2,4,26,citizen_birth,native_born,0,7367.0,93.0,4.0,20.0,7250.0,never_married,not_student,male,over_five_years_ago,not_reported,English,0,0,1,0,0,0,0,0,2
3,5,42,citizen_naturalized,foreign_born,0,22970.0,51.0,46.0,283.0,22590.0,never_married,public_school,MALE,in_last_five_years,did_not_work,Korean,0,1,0,0,0,0,0,0,3
4,6,79,citizen_naturalized,foreign_born,0,7941.0,23.0,94.0,73.0,7751.0,married,not_student,MALE,over_five_years_ago,did_not_work,Korean,0,1,0,0,0,0,0,0,3


**Looking for most important columns**

In [None]:
train.corr()['uninsured']

person_id               -0.004864
age                     -0.089969
weekly_hours_worked     -0.039366
total_income            -0.108771
self_employed_income    -0.000609
wage_income             -0.089422
interest_income         -0.028752
other_income            -0.077122
race_native_american     0.000047
race_asian               0.013656
race_black               0.056895
race_native_hawaiian    -0.005443
race_pacific_islander    0.008937
race_white              -0.121387
race_other               0.144702
uninsured                1.000000
household_id            -0.004836
Name: uninsured, dtype: float64

In [None]:
train.isna().sum() # no missing data

person_id                0
age                      0
citizen_status           0
nativity_status          0
weekly_hours_worked      0
total_income             0
self_employed_income     0
wage_income              0
interest_income          0
other_income             0
marital_status           0
school_status            0
sex                      0
when_last_worked         0
worked_last_week         0
language                 0
race_native_american     0
race_asian               0
race_black               0
race_native_hawaiian     0
race_pacific_islander    0
race_white               0
race_other               0
uninsured                0
household_id             0
dtype: int64

Can always improve model after performing feature importance after the using model.

**Prepare dataset for models**

In [None]:
# Fix sex column to help model find patterns in the data
train.sex.value_counts()

FEMALE    21551
MALE      20111
F          6131
M          5759
female     3152
male       2893
Name: sex, dtype: int64

In [None]:
train['sex'] = train.sex.replace(['FEMALE', 'female'], 'F')
train['sex'] = train.sex.replace(['MALE', 'male'], 'M')
train.drop(['person_id', 'household_id'], axis = 1, inplace = True)

**Evaluator to aid comparison & visualization process**

In [None]:
def evaluator(y_true, y_pred):
  accuracy = accuracy_score(y_true, y_preds)
  precision = precision_score(y_true, y_preds)
  recall = recall_score(y_true, y_preds)
  f1 = f1_score(y_true, y_preds)
  metrics = {'accuracy': round(accuracy, 2), 'precision': round(precision, 2),
                 'recall': round(recall, 2), 'f1': round(f1, 2)}
  print(f'Acc: {accuracy * 100:.2f}%', f'Precision: {precision:.2f}',
        f'Recall: {recall:.2f}', f'F1 score: {f1:.2f}')
  return metrics

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

X = train.drop('uninsured', axis = 1)
y = train['uninsured']
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size = 0.8, random_state=42)
onehot_encoder = OneHotEncoder(handle_unknown='ignore')
onehot_encoder.fit(X_train)
X_train = onehot_encoder.transform(X_train)
X_valid = onehot_encoder.transform(X_valid)

svc = make_pipeline(StandardScaler(with_mean=False), LinearSVC())
knn = make_pipeline(StandardScaler(with_mean=False), KNeighborsClassifier())
rfc = make_pipeline(RandomForestClassifier()) # doesn't need scaler            
xgb = make_pipeline(XGBClassifier())

In [None]:
svc.fit(X_train, y_train)
svc.score(X_valid, y_valid) # 0.83498322147651



0.8781040268456376

In [None]:
knn.fit(X_train, y_train)
knn.score(X_valid, y_valid) # 0.9122483221476511

0.9122483221476511

In [None]:
rfc.fit(X_train, y_train)
rfc.score(X_valid, y_valid) # 0.9268456375838926

0.9270973154362416

In [None]:
# Ensembles are showing to be the best models for the data
xgb.fit(X_train, y_train)
xgb.score(X_valid, y_valid) # 0.9274328859060402

0.9274328859060402

**Predict with best model**

In [None]:
xgb_y_pred = xgb.predict(X_valid)
xgb_metrics = evaluator(y_valid, y_pred)

Acc: 92.74% Precision: 0.55 Recall: 0.08 F1 score: 0.15
