In [1]:
from pandas import Series, DataFrame
import pandas as pd
from patsy import dmatrices
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
insurance = pd.read_csv('insurance.csv').drop_duplicates()

In [3]:
conditions = [
    (insurance['bmi'] <= 18.49),
    (insurance['bmi'] >= 18.50) & (insurance['bmi'] <= 24.99),
    (insurance['bmi'] >= 25 ) & (insurance['bmi'] <= 29.99),
    (insurance['bmi'] >= 30)
    ]

# create a list of the values we want to assign for each condition 
values = ['Underweight', 'Healthy', 'Overweight', 'Obese']

# create a new column and use np.select to assign values to it using our lists as arguments
insurance['bmiCat'] = np.select(conditions, values)

# display updated DataFrame
insurance.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,bmiCat
0,19,female,27.9,0,yes,southwest,16884.924,Overweight
1,18,male,33.77,1,no,southeast,1725.5523,Obese
2,28,male,33.0,3,no,southeast,4449.462,Obese
3,33,male,22.705,0,no,northwest,21984.47061,Healthy
4,32,male,28.88,0,no,northwest,3866.8552,Overweight


In [4]:
ageConditions = [
    (insurance['age'] <= 29),
    (insurance['age'] >= 30) & (insurance['age'] <= 39),
    (insurance['age'] >= 40 ) & (insurance['age'] <= 49),
    (insurance['age'] >= 50 ) & (insurance['age'] <= 59),
    (insurance['age'] >= 60)
    ]

# create a list of the values we want to assign for each condition 
ageValues = ['18-29', '30-39', '40-49', '50-59','60-65']

# create a new column and use np.select to assign values to it using our lists as arguments
insurance['ageCategory'] = np.select(ageConditions, ageValues)

# display updated DataFrame
insurance.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,bmiCat,ageCategory
0,19,female,27.9,0,yes,southwest,16884.924,Overweight,18-29
1,18,male,33.77,1,no,southeast,1725.5523,Obese,18-29
2,28,male,33.0,3,no,southeast,4449.462,Obese,18-29
3,33,male,22.705,0,no,northwest,21984.47061,Healthy,30-39
4,32,male,28.88,0,no,northwest,3866.8552,Overweight,30-39


In [5]:
insurance['chargesQuant']=insurance['charges'].rank(pct=True).round(2)
print(insurance)

      age     sex     bmi  children smoker     region      charges  \
0      19  female  27.900         0    yes  southwest  16884.92400   
1      18    male  33.770         1     no  southeast   1725.55230   
2      28    male  33.000         3     no  southeast   4449.46200   
3      33    male  22.705         0     no  northwest  21984.47061   
4      32    male  28.880         0     no  northwest   3866.85520   
...   ...     ...     ...       ...    ...        ...          ...   
1333   50    male  30.970         3     no  northwest  10600.54830   
1334   18  female  31.920         0     no  northeast   2205.98080   
1335   18  female  36.850         0     no  southeast   1629.83350   
1336   21  female  25.800         0     no  southwest   2007.94500   
1337   61  female  29.070         0    yes  northwest  29141.36030   

          bmiCat ageCategory  chargesQuant  
0     Overweight       18-29          0.75  
1          Obese       18-29          0.04  
2          Obese       1

In [6]:
conditionsRisk = [
    (insurance['chargesQuant'] < .70),
    (insurance['chargesQuant'] >= .70)
    ]

# create a list of the values we want to assign for each condition 
valuesRisk = ['Normal Risk', 'High Risk']

# create a new column and use np.select to assign values to it using our lists as arguments
insurance['Risk Classification'] = np.select(conditionsRisk, valuesRisk)

# display updated DataFrame
insurance.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,bmiCat,ageCategory,chargesQuant,Risk Classification
0,19,female,27.9,0,yes,southwest,16884.924,Overweight,18-29,0.75,High Risk
1,18,male,33.77,1,no,southeast,1725.5523,Obese,18-29,0.04,Normal Risk
2,28,male,33.0,3,no,southeast,4449.462,Obese,18-29,0.23,Normal Risk
3,33,male,22.705,0,no,northwest,21984.47061,Healthy,30-39,0.82,High Risk
4,32,male,28.88,0,no,northwest,3866.8552,Overweight,30-39,0.19,Normal Risk


In [7]:
insurance['target'] = 0.0
mask = (insurance["Risk Classification"] == "High Risk")
insurance['target'][mask] = 1.0 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  insurance['target'][mask] = 1.0


In [8]:
insurance['target'].value_counts()

0.0    929
1.0    408
Name: target, dtype: int64

In [9]:
categorical_columns = ['sex', 'children', 'smoker', 'region',
          'bmiCat', 'ageCategory']
insurance_dummies = pd.get_dummies(insurance[categorical_columns], #creates dummy variable for all categorical columns
                            prefix=categorical_columns, #Calls categorical column then the bin we created above as title
                            columns=categorical_columns)
insurance_column_names = insurance_dummies.columns.values
insurance_column_names

array(['sex_female', 'sex_male', 'children_0', 'children_1', 'children_2',
       'children_3', 'children_4', 'children_5', 'smoker_no',
       'smoker_yes', 'region_northeast', 'region_northwest',
       'region_southeast', 'region_southwest', 'bmiCat_Healthy',
       'bmiCat_Obese', 'bmiCat_Overweight', 'bmiCat_Underweight',
       'ageCategory_18-29', 'ageCategory_30-39', 'ageCategory_40-49',
       'ageCategory_50-59', 'ageCategory_60-65'], dtype=object)

In [10]:
df2 = pd.concat([insurance, insurance_dummies], axis=1)
df2

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,bmiCat,ageCategory,chargesQuant,...,region_southwest,bmiCat_Healthy,bmiCat_Obese,bmiCat_Overweight,bmiCat_Underweight,ageCategory_18-29,ageCategory_30-39,ageCategory_40-49,ageCategory_50-59,ageCategory_60-65
0,19,female,27.900,0,yes,southwest,16884.92400,Overweight,18-29,0.75,...,1,0,0,1,0,1,0,0,0,0
1,18,male,33.770,1,no,southeast,1725.55230,Obese,18-29,0.04,...,0,0,1,0,0,1,0,0,0,0
2,28,male,33.000,3,no,southeast,4449.46200,Obese,18-29,0.23,...,0,0,1,0,0,1,0,0,0,0
3,33,male,22.705,0,no,northwest,21984.47061,Healthy,30-39,0.82,...,0,1,0,0,0,0,1,0,0,0
4,32,male,28.880,0,no,northwest,3866.85520,Overweight,30-39,0.19,...,0,0,0,1,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830,Obese,50-59,0.56,...,0,0,1,0,0,0,0,0,1,0
1334,18,female,31.920,0,no,northeast,2205.98080,Obese,18-29,0.09,...,0,0,1,0,0,1,0,0,0,0
1335,18,female,36.850,0,no,southeast,1629.83350,Obese,18-29,0.02,...,0,0,1,0,0,1,0,0,0,0
1336,21,female,25.800,0,no,southwest,2007.94500,Overweight,18-29,0.07,...,1,0,0,1,0,1,0,0,0,0


In [11]:
formula = 'target ~ 0 + {}'.format(' + '.join(['Q("{}")'.format(x) for x in insurance_column_names]))
print(formula) #Q tells pandas that its a quote. Needed for column names or else dmatrices will be confused
#Understanding column names

target ~ 0 + Q("sex_female") + Q("sex_male") + Q("children_0") + Q("children_1") + Q("children_2") + Q("children_3") + Q("children_4") + Q("children_5") + Q("smoker_no") + Q("smoker_yes") + Q("region_northeast") + Q("region_northwest") + Q("region_southeast") + Q("region_southwest") + Q("bmiCat_Healthy") + Q("bmiCat_Obese") + Q("bmiCat_Overweight") + Q("bmiCat_Underweight") + Q("ageCategory_18-29") + Q("ageCategory_30-39") + Q("ageCategory_40-49") + Q("ageCategory_50-59") + Q("ageCategory_60-65")


In [12]:
Y, X = dmatrices(formula,df2, return_type = 'dataframe')

In [13]:
y = Y['target'].values
y[:10]

array([1., 0., 0., 1., 0., 0., 0., 0., 0., 1.])

In [14]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=1)

In [15]:
from sklearn import naive_bayes
model = naive_bayes.MultinomialNB()

In [16]:
model.fit(X,y)

MultinomialNB()

In [17]:
print('Prediction')
print(model.predict(X[:10]))
print('Actual')
print(y[:10])

Prediction
[1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
Actual
[1. 0. 0. 1. 0. 0. 0. 0. 0. 1.]


In [18]:
from sklearn import metrics
prediction_train = model.predict(X)
print(metrics.accuracy_score(y,prediction_train))

0.9012715033657442


In [19]:
prediction_test = model.predict(X_test)
print(metrics.accuracy_score(y_test, prediction_test))

0.9102803738317757


In [20]:
print('Prior probability for the normal risk class is',)
print(exp(model.class_log_prior_[0]))
print('Prior probability for the high risk class is',)
print(exp(model.class_log_prior_[1]))

Prior probability for the normal risk class is
0.6948391922213908
Prior probability for the high risk class is
0.3051608077786087


In [21]:
insurance['target'].value_counts() / len(insurance)

0.0    0.694839
1.0    0.305161
Name: target, dtype: float64

In [22]:
feature_importances = abs(model.feature_log_prob_[1] - model.feature_log_prob_[0])
feature_importances

array([0.0818758 , 0.07305267, 0.05884663, 0.16181596, 0.16754422,
       0.31683252, 0.28697956, 0.91699324, 1.10384604, 5.73758874,
       0.12905834, 0.08583478, 0.1973674 , 0.29327457, 0.01088072,
       0.07962922, 0.12225609, 0.40616762, 0.16322144, 0.19530975,
       0.12121203, 0.17163192, 1.49785159])

In [23]:
feature_importance_series = Series(feature_importances, index=X.columns.values)
feature_importance_series.sort_values(ascending = False)[:10]

Q("smoker_yes")            5.737589
Q("ageCategory_60-65")     1.497852
Q("smoker_no")             1.103846
Q("children_5")            0.916993
Q("bmiCat_Underweight")    0.406168
Q("children_3")            0.316833
Q("region_southwest")      0.293275
Q("children_4")            0.286980
Q("region_southeast")      0.197367
Q("ageCategory_30-39")     0.195310
dtype: float64

In [24]:
top_10_feature_indices = feature_importance_series.sort_values(ascending=False)[:10].index.values

In [25]:
inter_class_differences = model.feature_log_prob_[1] - model.feature_log_prob_[0]
new_feature_importance_series = Series(inter_class_differences, index=X.columns.values)

new_feature_importance_series.loc[top_10_feature_indices]

Q("smoker_yes")            5.737589
Q("ageCategory_60-65")     1.497852
Q("smoker_no")            -1.103846
Q("children_5")           -0.916993
Q("bmiCat_Underweight")   -0.406168
Q("children_3")            0.316833
Q("region_southwest")     -0.293275
Q("children_4")            0.286980
Q("region_southeast")      0.197367
Q("ageCategory_30-39")    -0.195310
dtype: float64