In [80]:
#Import Library
from time import time
import matplotlib.pyplot as plt

from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors.nearest_centroid import NearestCentroid
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

import pandas as pd
import numpy as np
import scipy as sp

from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

In [81]:
data = pd.read_csv('ZenHealthAppEngine/dataset/user_classifier.csv')
Y = data['label']
data.head()


Unnamed: 0,id,user,bloodglucoselevel,bmi,gender,sugarcomsumed,label
0,1,user1,126,29.0,male,8.0,1
1,2,user2,127,30.0,female,8.5,1
2,3,user3,130,31.0,male,9.0,1
3,4,user4,140,32.0,female,9.5,1
4,5,user5,160,33.0,male,10.0,1


In [82]:
#preprocessing
X = pd.DataFrame()

X_dec = data['gender']
X_dec_user = data['user']

le = preprocessing.LabelEncoder()
X_enc = le.fit_transform(data['gender'])
print(X_enc)
X['gender'] = X_enc
X['bloodglucoselevel'] = data['bloodglucoselevel']
X['bmi'] = data['bmi']
X['sugarcomsumed'] = data['sugarcomsumed']
X['label'] = data['label']
#X['user'] = X_enc_user

X.head(3)

[1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1
 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0
 1 1 0 1 0 1 1 0 1 0 1 1 0 1 0 1 1 0 1 0 1 1 0 1 0 1]


Unnamed: 0,gender,bloodglucoselevel,bmi,sugarcomsumed,label
0,1,126,29.0,8.0,1
1,0,127,30.0,8.5,1
2,1,130,31.0,9.0,1


In [83]:
print(X.shape)
print(Y.shape)

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, random_state=99)

print("X_train total length:",len(X_train))
print("X_test total length:",len(X_test))
print("Y_train total length:",len(y_train))

(100, 5)
(100,)
X_train total length: 70
X_test total length: 30
Y_train total length: 70


In [84]:
model_nb = GaussianNB()
model_svm = svm.SVC()
model_lr = LogisticRegression()
model_knn_centroid = NearestCentroid()
model_knn = KNeighborsClassifier()
eclf = VotingClassifier(estimators=[('lr', model_lr), ('knn_centroid', model_knn_centroid), ('gnb', model_nb), ('svc', model_svm), ('knn', model_knn)],
voting='hard', weights=[1,1,1,1,1])

models = [ model_nb, model_svm, model_lr, model_knn_centroid, model_knn, eclf]
model_names = [ "Naive Bayes", "SVM", "Logistic Regression", "Nearest Neighbors using Centroid", "K-nearest Neighbors", "Ensemble"]
    
best_model = None
best_accuracy = 0
best_preds = None

In [85]:
print("Performance of models")
print("======================")
for model, name in zip(models, model_names):
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    accuracy = accuracy_score(y_test, preds)
    rmse_nb = mean_squared_error(y_test, preds)
    print("Name:", name)
    print("Accuracy score: ", accuracy)
    print("RMSE: ", rmse_nb) 
    if accuracy >= best_accuracy:
        best_accuracy = accuracy
        best_model = model
        best_preds = preds
        
        
        
print("======================")
print("Best model:",best_model)
print("Best accuracy:",best_accuracy)
print("Best predictions:",best_preds)  

Performance of models
Name: Naive Bayes
Accuracy score:  1.0
RMSE:  0.0
Name: SVM
Accuracy score:  0.833333333333
RMSE:  0.266666666667
Name: Logistic Regression
Accuracy score:  0.9
RMSE:  0.1
Name: Nearest Neighbors using Centroid
Accuracy score:  1.0
RMSE:  0.0
Name: K-nearest Neighbors
Accuracy score:  1.0
RMSE:  0.0
Name: Ensemble
Accuracy score:  1.0
RMSE:  0.0
Best model: VotingClassifier(estimators=[('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)), ('knn_centroid...owski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'))],
         n_jobs=1, voting='hard', weights=[1, 1, 1, 1, 1])
Best accuracy: 1.0
Best predictions: [1 2 2 1 1 1 1 2 1 1 1 3 1 1 2 2 3 3 2 3 2 1 2 3 2 2 2 1 1 3]


In [86]:
results = X_train
X_test = X_test.drop('label',1)
X_test['label'] = best_preds
results.append(X_test)
results.head()

Unnamed: 0,gender,bloodglucoselevel,bmi,sugarcomsumed,label
8,1,130,38.0,10.2,1
28,1,105,29.0,6.0,2
18,1,130,39.0,9.5,1
67,0,97,18.0,0.5,3
83,0,210,36.0,9.3,1


In [98]:

results = results.drop('gender',1)
#results = results.drop('user',1)

results['gender'] = X_dec
#results['user'] = X_dec_user

results.head()

Unnamed: 0,bloodglucoselevel,bmi,sugarcomsumed,label,gender
8,130,38.0,10.2,1,male
28,105,29.0,6.0,2,male
18,130,39.0,9.5,1,male
67,97,18.0,0.5,3,female
83,210,36.0,9.3,1,female


In [99]:
#predict for the new user 

from pandas.io.json import json_normalize
df = pd.DataFrame.from_dict(json_normalize({ 'bloodglucodelevel':180, 'bmi' :28 , 
                                            'sugarcomsumed' :10, 'gender':0,  'label': 0}), orient='columns')


predicted = best_model.predict(df)

print(predicted)

[3]


In [94]:
import pickle
s = pickle.dumps(best_model)

In [95]:
from sklearn.externals import joblib
joblib.dump(best_model, 'ZenHealthAppEngine/models/user_classifier.pkl')

['ZenHealthAppEngine/models/user_classifier.pkl']

In [96]:
best_model_pkl = joblib.load('ZenHealthAppEngine/models/user_classifier.pkl') 

In [97]:
predicted = best_model_pkl.predict(df)

print(predicted)

[3]
