In [1]:
#Import Library
from time import time
import matplotlib.pyplot as plt

from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors.nearest_centroid import NearestCentroid
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

import pandas as pd
import numpy as np
import scipy as sp

from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

In [2]:
X = pd.read_csv('ZenHealthAppEngine/dataset/user_classifier.csv')
Y = X['label']
X.head()


Unnamed: 0,id,user,bloodglucoselevel,bmi,gender,sugarcomsumed,label
0,1,user1,9.0,29.0,male,8.0,1
1,2,user2,9.1,30.0,female,8.5,1
2,3,user3,9.6,31.0,male,9.0,1
3,4,user4,10.0,32.0,female,9.5,1
4,5,user5,11.0,33.0,male,10.0,1


In [3]:
#preprocessing
X_dec = X['gender']
X_dec_user = X['user']

le = preprocessing.LabelEncoder()
X_enc = le.fit_transform(X['gender'])
X_enc_user = le.fit_transform(X['user'])

X = X.drop('gender',1)
X = X.drop('user',1)


X['gender'] = X_enc
X['user'] = X_enc_user

X.head(3)

Unnamed: 0,id,bloodglucoselevel,bmi,sugarcomsumed,label,gender,user
0,1,9.0,29.0,8.0,1,1,0
1,2,9.1,30.0,8.5,1,0,11
2,3,9.6,31.0,9.0,1,1,22


In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, random_state=99)

print("X_train total length:",len(X_train))
print("X_test total length:",len(X_test))
print("Y_train total length:",len(y_train))

X_train total length: 52
X_test total length: 23
Y_train total length: 52


In [5]:
model_nb = GaussianNB()
model_svm = svm.SVC()
model_lr = LogisticRegression()
model_knn_centroid = NearestCentroid()
model_knn = KNeighborsClassifier()
eclf = VotingClassifier(estimators=[('lr', model_lr), ('knn_centroid', model_knn_centroid), ('gnb', model_nb), ('svc', model_svm), ('knn', model_knn)],
voting='hard', weights=[1,1,1,1,1])

models = [ model_nb, model_svm, model_lr, model_knn_centroid, model_knn, eclf]
model_names = [ "Naive Bayes", "SVM", "Logistic Regression", "Nearest Neighbors using Centroid", "K-nearest Neighbors", "Ensemble"]
    
best_model = None
best_accuracy = 0
best_preds = None

In [6]:
print("Performance of models")
print("======================")
for model, name in zip(models, model_names):
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    accuracy = accuracy_score(y_test, preds)
    rmse_nb = mean_squared_error(y_test, preds)
    print("Name:", name)
    print("Accuracy score: ", accuracy)
    print("RMSE: ", rmse_nb) 
    if accuracy >= best_accuracy:
        best_accuracy = accuracy
        best_model = model
        best_preds = preds
        
        
        
print("======================")
print("Best model:",best_model)
print("Best accuracy:",best_accuracy)
print("Best predictions:",best_preds)  

Performance of models
Name: Naive Bayes
Accuracy score:  1.0
RMSE:  0.0
Name: SVM
Accuracy score:  0.95652173913
RMSE:  0.0434782608696
Name: Logistic Regression
Accuracy score:  1.0
RMSE:  0.0
Name: Nearest Neighbors using Centroid
Accuracy score:  0.869565217391
RMSE:  0.130434782609
Name: K-nearest Neighbors
Accuracy score:  1.0
RMSE:  0.0
Name: Ensemble
Accuracy score:  1.0
RMSE:  0.0
Best model: VotingClassifier(estimators=[('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)), ('knn_centroid...owski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'))],
         n_jobs=1, voting='hard', weights=[1, 1, 1, 1, 1])
Best accuracy: 1.0
Best predictions: [2 2 1 1 3 1 2 3 1 2 3 3 1 2 1 3 2 3 2 3 1 3 1]


In [7]:
results = X_train
X_test = X_test.drop('label',1)
X_test['label'] = best_preds
results.append(X_test)
results.head()

Unnamed: 0,id,bloodglucoselevel,bmi,sugarcomsumed,label,gender,user
30,31,7.1,25.0,8.0,2,1,24
31,32,7.5,25.0,2.3,2,0,25
42,43,6.9,27.0,6.6,2,1,37
72,73,5.2,20.1,0.3,3,1,70
60,61,3.1,22.5,1.5,3,1,57


In [8]:

results = results.drop('gender',1)
results = results.drop('user',1)

results['gender'] = X_dec
results['user'] = X_dec_user

results.head()

Unnamed: 0,id,bloodglucoselevel,bmi,sugarcomsumed,label,gender,user
30,31,7.1,25.0,8.0,2,male,user31
31,32,7.5,25.0,2.3,2,female,user32
42,43,6.9,27.0,6.6,2,male,user43
72,73,5.2,20.1,0.3,3,male,user73
60,61,3.1,22.5,1.5,3,male,user61


In [9]:
#predict for the new user 

from pandas.io.json import json_normalize
df = pd.DataFrame.from_dict(json_normalize({'id':23 , 'bloodglucodelevel':2.2 , 'bmi' :27 , 
                                            'sugarcomsumed' :2.1, 'gender':'male', 'user':'appo' , 'label': 0}), orient='columns')

df_enc = le.fit_transform(df['gender'])
df_enc_user = le.fit_transform(df['user'])
df = df.drop('gender',1)
df = df.drop('user',1)
df['gender'] = df_enc
df['user'] =df_enc_user

predicted = best_model.predict(df)

print(predicted)

[1]


In [10]:
import pickle
s = pickle.dumps(best_model)

In [11]:
from sklearn.externals import joblib
joblib.dump(best_model, 'ZenHealthAppEngine/dataset/user_classifier.pkl')

['user_classifier.pkl']

In [12]:
best_model_pkl = joblib.load('user_classifier.pkl') 

In [13]:
predicted = best_model_pkl.predict(df)

print(predicted)

[1]
