In [1]:
# Import Packages
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np

In [2]:
# open data file into a Pandas Data Frame
df_insurance = pd.read_csv('insurance.csv')
df_insurance.head(10)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552
5,31,female,25.74,0,no,southeast,3756.6216
6,46,female,33.44,1,no,southeast,8240.5896
7,37,female,27.74,3,no,northwest,7281.5056
8,37,male,29.83,2,no,northeast,6406.4107
9,60,female,25.84,0,no,northwest,28923.13692


In [3]:
# discretize charges into a numerical charge code where 0 is low and 1 is high
df_insurance['charges_code'] = pd.cut(df_insurance['charges'],bins=[0,10000, float('inf')], include_lowest=True, labels=[0,1])

# drop origianl charges feature
df_insurance.drop('charges', axis=1, inplace=True)

# display top 10 rows
df_insurance.head(10)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges_code
0,19,female,27.9,0,yes,southwest,1
1,18,male,33.77,1,no,southeast,0
2,28,male,33.0,3,no,southeast,0
3,33,male,22.705,0,no,northwest,1
4,32,male,28.88,0,no,northwest,0
5,31,female,25.74,0,no,southeast,0
6,46,female,33.44,1,no,southeast,0
7,37,female,27.74,3,no,northwest,0
8,37,male,29.83,2,no,northeast,0
9,60,female,25.84,0,no,northwest,1


In [4]:
# Recode region column as dummy using 'southwest' as the reference group
# 4 regions mean that there are three dummy features, one for each region other than 'southwest'
df_dummies = pd.get_dummies(df_insurance, columns=['region'])
df_insurance = df_dummies.drop(['region_southwest'], axis=1)

# Recode smoker column as dummy using 'yes' as the reference group
# Note that I am using a different technique here becasue it is easier as we have only
# categories, meaning only one feature 
smoker_mapping = {'yes':0
                  ,'no':1
                  }
df_insurance = df_insurance.assign(smoker = df_insurance.smoker.map(smoker_mapping))

# Recode sex column as dummy using 'female' as the reference group
sex_mapping = {'female':0
               ,'male':1
               }
df_insurance = df_insurance.assign(sex = df_insurance.sex.map(sex_mapping))

df_insurance.head(10)

Unnamed: 0,age,sex,bmi,children,smoker,charges_code,region_northeast,region_northwest,region_southeast
0,19,0,27.9,0,0,1,0,0,0
1,18,1,33.77,1,1,0,0,0,1
2,28,1,33.0,3,1,0,0,0,1
3,33,1,22.705,0,1,1,0,1,0
4,32,1,28.88,0,1,0,0,1,0
5,31,0,25.74,0,1,0,0,0,1
6,46,0,33.44,1,1,0,0,0,1
7,37,0,27.74,3,1,0,0,1,0
8,37,1,29.83,2,1,0,1,0,0
9,60,0,25.84,0,1,1,0,1,0


In [5]:
# Split data into features and target
feature_names = list(['age', 'sex', 'bmi', 'children', 'smoker', 'region_northeast','region_northwest','region_southeast'])
X = df_insurance[feature_names]

target_name = list(['charges_code'])
y = df_insurance[target_name]

print(X.head(10))
print(y.head(10))



   age  sex     bmi  children  smoker  region_northeast  region_northwest  \
0   19    0  27.900         0       0                 0                 0   
1   18    1  33.770         1       1                 0                 0   
2   28    1  33.000         3       1                 0                 0   
3   33    1  22.705         0       1                 0                 1   
4   32    1  28.880         0       1                 0                 1   
5   31    0  25.740         0       1                 0                 0   
6   46    0  33.440         1       1                 0                 0   
7   37    0  27.740         3       1                 0                 1   
8   37    1  29.830         2       1                 1                 0   
9   60    0  25.840         0       1                 0                 1   

   region_southeast  
0                 0  
1                 1  
2                 1  
3                 0  
4                 0  
5                 1 

In [6]:
# Do a train test split of 30% with random state of 101
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=101)

In [7]:
# Create and Fit Naive Bayes model
nb = GaussianNB()
nb.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


GaussianNB(priors=None)

In [8]:
# Generate training set predictions based on the model
y_train_pred_nb = nb.predict(X_train)

# Report the 'actual' vs 'prediction' in a confusion matrix
cm = np.array(confusion_matrix(y_train, y_train_pred_nb, labels=[1,0]))
confusion = pd.DataFrame(cm, index=['high_charges', 'low_charges'],
                         columns=['predicted_high_charges','predicted_low_charges'])
print(confusion)
print()
print(classification_report(y_train, y_train_pred_nb)) 

accuracy = accuracy_score(y_true=y_train, y_pred=y_train_pred_nb)
accuracy.round(3)

              predicted_high_charges  predicted_low_charges
high_charges                     203                    241
low_charges                        0                    492

             precision    recall  f1-score   support

          0       0.67      1.00      0.80       492
          1       1.00      0.46      0.63       444

avg / total       0.83      0.74      0.72       936



0.743

In [9]:
# Generate training set predictions based on the model
y_test_pred_nb = nb.predict(X_test)
confusion_matrix(y_test,y_test_pred_nb)

# Report the 'actual' vs 'prediction' in a confusion matrix
cm = np.array(confusion_matrix(y_test, y_test_pred_nb, labels=[1,0]))
confusion = pd.DataFrame(cm, index=['high_charges', 'low_charges'],
                         columns=['predicted_high_charges','predicted_low_charges'])
print(confusion)
print()
print(classification_report(y_test, y_test_pred_nb)) 

accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred_nb)
accuracy.round(3)

              predicted_high_charges  predicted_low_charges
high_charges                      71                    111
low_charges                        0                    220

             precision    recall  f1-score   support

          0       0.66      1.00      0.80       220
          1       1.00      0.39      0.56       182

avg / total       0.82      0.72      0.69       402



0.724

#### Interpretation (Naive Bayes)
The model seems to be slightly less accurate when working with the testing data versus the training data. This appears to be primarily due to problems predicting high charges. While the precision for predicting high charges is perfect, meaning that all of the charges predicted as high charges actually are high charges, a substantial number of low charges were misclassified as high (poor recall).  This means that many of the insurance applicaitons would be rated higher than their actual risk.

In [10]:
# Create and Fit logistic regression model
log = LogisticRegression()
log.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [11]:
# Generate predictions based on the model
y_train_pred_log = log.predict(X_train)

# Report the 'actual' vs 'prediction' in a confusion matrix
cm = np.array(confusion_matrix(y_train, y_train_pred_log, labels=[1,0]))
confusion = pd.DataFrame(cm, index=['high_charges', 'low_charges'],
                         columns=['predicted_high_charges','predicted_low_charges'])
print(confusion)
print()
print(classification_report(y_train, y_train_pred_log)) 

accuracy = accuracy_score(y_true=y_train, y_pred=y_train_pred_log)
accuracy.round(3)

              predicted_high_charges  predicted_low_charges
high_charges                     400                     44
low_charges                       39                    453

             precision    recall  f1-score   support

          0       0.91      0.92      0.92       492
          1       0.91      0.90      0.91       444

avg / total       0.91      0.91      0.91       936



0.911

In [12]:
# print model intercept and coeficcients
print('intercept: {}'.format(log.intercept_))
print('coefficients (log odds) (age, sex, bmi, children, smoker, region_northeast,region_northwest,region_southeast): {}'.format(log.coef_))
print('coefficients (odds ratio) (age, sex, bmi, children, smoker, region_northeast,region_northwest,region_southeast): {}'.format(np.exp(log.coef_)))

intercept: [-0.86478952]
coefficients (log odds) (age, sex, bmi, children, smoker, region_northeast,region_northwest,region_southeast): [[ 0.12781757 -0.18623086  0.01665624  0.01781049 -5.98785936  0.44403063
   0.45508098  0.25752791]]
coefficients (odds ratio) (age, sex, bmi, children, smoker, region_northeast,region_northwest,region_southeast): [[1.13634568 0.83008194 1.01679573 1.01797004 0.00250903 1.55897824
  1.57630102 1.29372791]]


In [13]:
# Generate predictions based on the model
y_test_pred_log = log.predict(X_test)

# Report the 'actual' vs 'prediction' in a confusion matrix
cm = np.array(confusion_matrix(y_test, y_test_pred_log, labels=[1,0]))
confusion = pd.DataFrame(cm, index=['high_charges', 'low_charges'],
                         columns=['predicted_high_charges','predicted_low_charges'])
print(confusion)
print()
print(classification_report(y_test, y_test_pred_log)) 

accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred_log)
accuracy.round(3)

              predicted_high_charges  predicted_low_charges
high_charges                     162                     20
low_charges                       35                    185

             precision    recall  f1-score   support

          0       0.90      0.84      0.87       220
          1       0.82      0.89      0.85       182

avg / total       0.87      0.86      0.86       402



0.863

#### Interpretation (Logistic Regression)
The model seems to be even less accurate (compared to Naive Bayes) when working with the testing data versus the training data. However the balance between precision and recall is pretty good. Predicting high charges is still a bit more challenging that predicting low charge, however.

The log odds coefficients for sex and smoker are negative. Since smoker = yes is coded as 0, this means that smokers tend to have higher charges and he movement from smoker to non-smoker (0 to 1) shows a negative effect. The same is true for sex, where a move from females to males has a negative effect on charges. The rest of the coefficients are positive, indicateing a positive relationship between the variable and charges.  Note that since the southwest region is the reference case, all other regions experience higher charges than the southwest region.

Using the odds ratio conversion of the coefficients, which is easier to interpret, we see that the value of the coefficient indicates the incresaing (or decreasing) probability of being classified as high charges with a one unit change in the predictor value, holding all other predictors constant. For example, the odds ratio coefficient of age is 1.14, indicating that for each additional year in age, the odds of being classified as high charges increases by 1.14 times. Note that sex and smoker are now positive in the odds ratio interpretation (they were negative for log odds), however the values are less than one, meaning that there is a decreasing marginal effect.

#### Summary (Comparison)
It appears that logistic regression is handling this data better than naive bayes. The accuracy and precision/recall balance is better than the Bayesian approach.  Even though there is slightly greater imbalance between training and testing for logistic regression, the testing accuracy for logistic regression is better that for Naive Bayes, so I would say that logistic regression is better for this data set.

However, we may also want to ask ourselves why this is the case? Why is Naive Bayes performing poorly by comparison? At least part of the problem is the fact that the assumptopn of a normal distribution is not supported for this current data set.  Baysian models are very sensitive to the underlying data distribution.  The further it strays from a standard normal distribution, the stranger the results tend to become.