## Insurance Claim dataset: Model Creation
***
- In this notebook (second of two parts), we build a predictive model on the insurance claim data. 
- We train several models: Logistic Regression, Random Forest and Artificial Neural Network.
- We test the models on some holdout data for accuracy and other evaluation metrics.
- We save these models to the disk for later use.
- We also show how saved, trained models can be loaded and used with new data.

In [11]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from  sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from keras.models import model_from_json
from sklearn.grid_search import GridSearchCV
import pickle
from sklearn.externals import joblib

from keras.models import Sequential
from keras.layers import Dense, Dropout

from sklearn.decomposition import PCA 
from fancyimpute import KNN

In [49]:
! head -n 2 ./datasets/train2.csv

x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16,x17,x18,x19,x20,x21,x22,x23,x24,x25,x26,x27,x28,x29,x30,x31,x32,x33,x36,x37,x38,x39,x40,x41,x42,x43,x44,x45,x46,x47,x48,x49,x50,x51,x52,x53,x54,x55,x56,x57,x58,x59,x60,x61,x62,x63,x64,x65,x66,x67,x69,x70,x71,x72,x73,x74,x75,x76,x77,x78,x79,x80,x81,x82,x83,x84,x85,x86,x87,x88,x89,x90,x91,x92,x94,x95,x96,x97,x98,x99,brand_bmw,brand_chevrolet,brand_chrystler,brand_ford,brand_honda,brand_mercades,brand_nissan,brand_tesla,brand_toyota,brand_volkswagon,day_fri,day_mon,day_thur,day_tues,day_wed,month_apr,month_aug,month_dec,month_feb,month_jan,month_july,month_jun,month_mar,month_may,month_nov,month_oct,month_sept,country_america,country_asia,country_euorpe,y
0.198559608853,74.425319549,67.6277449069,-3.09511140802,-6.82232689348,19.0480707858,-0.362378194917,-10.6991738418,-22.6997911893,-1.56126206373,1.14861768226,-3.04335060595,-3.89655764707,2.17061450755,6.36629833434,-7.34086634511,0.267735423569,1.08131144748,-15.2368850395,1.

In [3]:
# Read the training set; row 0 of the CSV supplies the column names.
train_csv_path = "./datasets/train2.csv"
df_train = pd.read_csv(train_csv_path, header=0)

In [4]:
# Dimensions of the loaded training frame as (rows, columns)
df_train.shape

(39965, 127)

In [4]:
# The whole data matrix: every column of df_train (the features plus the
# trailing y column) as a single NumPy array.
data_mat = df_train.values
# Display its dimensions as (rows, columns)
data_mat.shape

(39965, 127)

## Preparing the data

In [5]:
# Hold out 10% of the rows for dev/test purposes.
# Columns 0..125 are the features; column 126 is the target.
features = data_mat[:, 0:126]
labels = data_mat[:, 126]
X_train, X_test, Y_train, Y_test = train_test_split(
    features, labels, test_size=0.10, random_state=10)

In [7]:
# Dimensions of the training split as (rows, feature columns)
X_train.shape

(35968, 126)

In [8]:
# Sum of the training labels over positions 1..9999 (a quick look at class balance).
# NOTE(review): the slice starts at index 1, so the first label is skipped - confirm this is intended.
np.sum(Y_train[1:10000])

2099.0

## Standardization of data

In [9]:
# Standardize the features: the scaler is fitted on the training split only
# and then applied unchanged to both the training and the held-out split.
scaler = StandardScaler().fit(X_train)
Xn_train, Xn_test = scaler.transform(X_train), scaler.transform(X_test)



## Logistic Regression

In [8]:
# Logistic Regression baseline with one hand-picked parameter set.
# Try different values of tol / C / max_iter here to see their effect.
log_reg = LogisticRegression(tol=0.0000001, C=10, max_iter=1000000)
log_reg.fit(Xn_train, Y_train)
Y_pred = log_reg.predict(Xn_test)
# Use the library metric rather than 1 - mean(|pred - truth|), which is only
# a valid accuracy when the labels are exactly 0 and 1.
accuracy = accuracy_score(Y_test, Y_pred) * 100
print(accuracy)

88.2912184138


In [13]:
# Grid search over the regularisation parameter C of logistic regression.

# Candidate values for C
parameters = {'C': [0.01, 0.1, 1, 10]}

# Base estimator whose C is tuned
lr = LogisticRegression(tol=0.0001, max_iter=10000)

# 5-fold cross-validation; scoring=None falls back to the estimator's own score
clf = GridSearchCV(estimator=lr, param_grid=parameters, scoring=None, cv=5)

# Fit the estimator at every point of the grid
clf.fit(Xn_train, Y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=10000, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': [0.01, 0.1, 1, 10]}, pre_dispatch='2*n_jobs',
       refit=True, scoring=None, verbose=0)

In [15]:
# Report the parameter combination selected by the grid search and the
# cross-validated score it achieved.
print(clf.best_params_)
print(clf.best_score_)

{'C': 0.01}


0.8912366548042705

## Random Forest

In [16]:
# Random forest of 100 trees, each split considering 40 candidate features;
# the fixed random_state keeps the fit repeatable.
randF_clf = RandomForestClassifier(
    n_estimators=100,
    max_features=40,
    random_state=111,
)
randF_clf.fit(Xn_train, Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=40, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=111, verbose=0, warm_start=False)

In [19]:
# Score the fitted random forest on the held-out split
randF_clf.score(Xn_test, Y_test)

0.9034275706780085

In [20]:
# Per-class precision/recall and overall accuracy of the random forest on the
# held-out split.
Y_pred = randF_clf.predict(Xn_test)
# print(...) with a single argument behaves the same on Python 2 and Python 3
print(classification_report(Y_test, Y_pred))
accuracy = accuracy_score(Y_test, Y_pred) * 100
# One pre-formatted string: print("label", value) prints a tuple on Python 2
print("Accuracy: %s" % accuracy)

             precision    recall  f1-score   support

        0.0       0.89      1.00      0.94      3172
        1.0       0.98      0.54      0.70       825

avg / total       0.91      0.90      0.89      3997

('Accuracy: ', 90.342757067800846)


### Artificial Neural Network using Keras
***
We create a neural network with three hidden layers of 32, 16 and 8 nodes, followed by a single output node. Dropout is applied after each hidden layer for regularization.

In [54]:
# Neural Network using Keras
def model_ann(input_nodes, hidden_units=(32, 16, 8), dropout_rate=0.25):
    '''
    Build and compile a feed-forward neural network for binary classification.

    Each hidden layer is a ReLU Dense layer followed by a Dropout layer; the
    output is a single sigmoid node. With the default arguments the network is
    32 -> 16 -> 8 -> 1 with a dropout rate of 0.25 after every hidden layer.
    ---
    Parameters:
    ----
    input_nodes: Number of input nodes (feature columns) in the network
    hidden_units: Sequence with the number of nodes of each hidden layer,
                  in order. May be empty, which leaves only the output layer.
    dropout_rate: Dropout rate applied after every hidden layer
    ----
    Returns: A compiled sequential neural network model (binary cross-entropy
             loss, rmsprop optimizer, accuracy metric)
    '''
    model = Sequential()

    # Only the first layer added to the model is given the input width;
    # the dict is emptied once that layer has been created.
    first_layer_kwargs = {'input_dim': input_nodes}

    # Hidden layers, each followed by dropout
    for units in hidden_units:
        model.add(Dense(units, activation='relu', **first_layer_kwargs))
        model.add(Dropout(dropout_rate))
        first_layer_kwargs = {}

    # Output layer (receives input_dim itself when there are no hidden layers)
    model.add(Dense(1, activation='sigmoid', **first_layer_kwargs))

    model.compile(loss='binary_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'])
    return model

In [55]:
# Build the network with one input node per feature column.
# NOTE(review): this rebinds the name `model_ann` from the builder function to
# the built model, so this cell cannot be re-run without first re-running the
# cell that defines the function.
model_ann = model_ann(Xn_train.shape[1])
# summary() prints the layer table itself and returns None, so it is called
# directly; wrapping it in print would add a stray "None" line.
model_ann.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_9 (Dense)              (None, 32)                4064      
_________________________________________________________________
dropout_7 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_10 (Dense)             (None, 16)                528       
_________________________________________________________________
dropout_8 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_11 (Dense)             (None, 8)                 136       
_________________________________________________________________
dropout_9 (Dropout)          (None, 8)                 0         
_________________________________________________________________
dense_12 (Dense)             (None, 1)                 9         
Total para

In [56]:
# Train for 50 epochs in mini-batches of 128 rows, then evaluate the trained
# network on the held-out split.
model_ann.fit(Xn_train, Y_train, epochs=50, batch_size=128)
score = model_ann.evaluate(Xn_test, Y_test, batch_size=128)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50

In [57]:
# `score` holds two values, [loss, accuracy] (the model is compiled with
# metrics=['accuracy']), so label each one instead of printing the whole list
# under the heading "Accuracy".
print("Loss: %.4f, Accuracy: %.2f%%" % (score[0], score[1] * 100))

# Hard class predictions on the held-out split
Y_pred = model_ann.predict_classes(Xn_test)

# Single pre-formatted string so the call works as a function or a statement
print("\n The Classification Report: \n" + classification_report(Y_test, Y_pred))

Accuracy:  [0.072297399163186504, 0.98423817863397545]
 The Classification Report: 
             precision    recall  f1-score   support

        0.0       0.99      0.99      0.99      3172
        1.0       0.98      0.95      0.96       825

avg / total       0.98      0.98      0.98      3997



# Save the model

In [34]:
# Save the fitted scaler so the same standardization can be re-applied later.
# The with-block guarantees the file is flushed and closed after the dump.
with open("./datasets/Scaler.pkl", 'wb') as scaler_fl:
    pickle.dump(scaler, scaler_fl)


# SAVE PCA on disk
#pickle.dump(skl_pca, open("./data/SKL_PCA_120517.pkl", 'wb'))

In [36]:
# Save the trained random forest model.
# The with-block guarantees the file is flushed and closed after the dump.
with open("./datasets/RandomForest1.pkl", 'wb') as model_fl:
    pickle.dump(randF_clf, model_fl)

In [37]:
# Serialize the model architecture to JSON and save it to disk.
# The JSON is text, so the file is opened in text mode; writing a str to a
# binary-mode handle raises a TypeError on Python 3.
model_ann_json = model_ann.to_json()
with open("./datasets/keras_mode.json", "w") as json_fl:
    json_fl.write(model_ann_json)

# Also serialize the weights to HDF5 and save
model_ann.save_weights("./datasets/keras_model_weights.h5")
print("Successfully saved the model to disk")

Successfully saved the model to disk


### Probability prediction after loading the model from disk

In [39]:
## Load the test data that has no labels
df_test = pd.read_csv("./datasets/test2.csv")

In [40]:
# Dimensions of the loaded test frame as (rows, columns)
df_test.shape

(10000, 126)

In [42]:
# Reload the fitted scaler from disk.
# NOTE: unpickling can execute arbitrary code - only load pickle files you trust.
with open("./datasets/Scaler.pkl", 'rb') as scaler_fl:
    scaler_loaded = pickle.load(scaler_fl)

In [44]:
# Standardize the held-out split with the reloaded scaler.
# NOTE(review): this transforms X_test (the split made with train_test_split),
# not the unlabeled df_test frame; df_test is not used here - confirm intended.
Xn_test = scaler_loaded.transform(X_test)

In [45]:
# Load the trained RandomForestClassifier from disk.
# NOTE: unpickling can execute arbitrary code - only load pickle files you trust.
with open("./datasets/RandomForest1.pkl", 'rb') as model_fl:
    randF_loaded = pickle.load(model_fl)

In [46]:
# Check that the loaded model is working fine: predict on the held-out split
# and report its score against the known labels.
Y_pred2 = randF_loaded.predict(Xn_test)
# One pre-formatted string: print("label", value) prints a tuple on Python 2
print("Accuracy: %s" % randF_loaded.score(Xn_test, Y_test))

('Accuracy: ', 0.9034275706780085)


In [49]:
# LOAD ANN: read the architecture JSON and rebuild the model.
# JSON is text, so the file is opened in text mode (binary mode would hand
# back bytes instead of str on Python 3).
with open('./datasets/keras_mode.json', 'r') as json_file:
    loaded_model_json = json_file.read()

loaded_model_ann = model_from_json(loaded_model_json)

# Load the saved weights into the rebuilt model
loaded_model_ann.load_weights("./datasets/keras_model_weights.h5")
print("Loaded model from disk")

Loaded model from disk


In [50]:
 # MAKE SURE that loaded ANN is working fine
# Compile the reloaded network with the same loss, optimizer and metric used
# at training time, then score it on the held-out split.
loaded_model_ann.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
score = loaded_model_ann.evaluate(Xn_test, Y_test, batch_size=128, verbose=0)
# score[1] is the accuracy metric; show it as a percentage
accuracy_pct = score[1] * 100
print("Accuracy: %.2f%%" % accuracy_pct)

Accuracy: 98.32%
