# Evaluate the Diabetes Dataset
### Note: Credit for this work goes to Simplilearn; you may visit their site:
https://www.simplilearn.com/

#### 1: Import the dataset

In [4]:
#Import the required libraries
import pandas as pd

In [5]:
# Load the raw diabetes data; header=None tells pandas the file has no header row,
# so the columns receive integer labels
df = pd.read_csv(
    'pima-indians-diabetes.data',
    header=None,
)

#### 2: Analyze the dataset

In [6]:
# Preview the first five observations of the dataset
df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


#### 3: Find the features of the dataset

In [7]:
# Column headers for the dataset, listed in column order (taken from the .NAMES file);
# the last entry, 'label', is the outcome column
feature_names = [
    'Pregnant',
    'glucose',
    'bp',
    'skin',
    'insulin',
    'bmi',
    'pedigree',
    'age',
    'label',
]

In [8]:
# Reload the file, this time applying the feature names defined earlier as the column headers
df = pd.read_csv(
    'pima-indians-diabetes.data',
    header=None,
    names=feature_names,
)

In [9]:
# Verify that the dataset now carries the new column headers
df.head(n=5)

Unnamed: 0,Pregnant,glucose,bp,skin,insulin,bmi,pedigree,age,label
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [10]:
# View the dataset dimensions as a (number of observations, number of columns) tuple;
# the column count includes the 'label' column
df.shape

(768, 9)

#### 4: Select the features and the response of the dataset

In [11]:
# Columns used as model inputs; 'glucose', 'bp', 'skin' and 'pedigree' are left out
feature_select_cols = [
    'Pregnant',
    'insulin',
    'bmi',
    'age',
]

In [12]:
# Build the feature object from the selected columns only (all rows kept)
X = df.loc[:, feature_select_cols]

In [13]:
# Build the response object from the 'label' column (all rows kept)
y = df.loc[:, 'label']

In [14]:
# View the shape of the feature object as (observations, selected features)
X.shape

(768, 4)

In [15]:
# View the shape of the target object; it is one-dimensional, with one entry per observation
y.shape

(768,)

#### 5: Use training and testing datasets to train the model

In [16]:
# Split the features and response into training and testing subsets.
# No test_size is passed, so scikit-learn's default hold-out fraction applies;
# random_state=1 fixes the shuffle so the same split is produced on every run.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

#### 6: Create a model to predict the diabetes outcome

In [17]:
# Create a logistic regression model with scikit-learn's default settings
# and fit it on the training set
from sklearn.linear_model import LogisticRegression
logReg = LogisticRegression()
# fit() is the last expression in the cell, so the fitted estimator is echoed as the cell output
logReg.fit(X_train, y_train)

LogisticRegression()

In [18]:
# Predict a class label for every observation in the testing set using the fitted model
y_pred = logReg.predict(X_test)

#### 7: Check the accuracy of the model

In [19]:
# Evaluate the model: accuracy compares the predicted labels against the actual test labels
from sklearn import metrics
print(
    'Accuracy of the Model is:',
    metrics.accuracy_score(y_true=y_test, y_pred=y_pred),
)

Accuracy of the Model is: 0.6927083333333334


In [20]:
# Show the first 30 actual labels next to the first 30 predicted labels for a quick visual comparison
print('Actual values:', y_test.values[:30])
print('Predicted values:', y_pred[:30])

Actual values: [0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 1 0 0 0 1 1 1 1 0 0 0 1 0 1]
Predicted values: [0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0]


In [21]:
import pickle as pkl

In [22]:
# Serialize the fitted model to an in-memory bytes object with pickle
persist_model = pkl.dumps(logReg)
# Bare expression: echoes the raw pickled bytes as the cell output
persist_model

b'\x80\x04\x95\xda\x02\x00\x00\x00\x00\x00\x00\x8c\x1esklearn.linear_model._logistic\x94\x8c\x12LogisticRegression\x94\x93\x94)\x81\x94}\x94(\x8c\x07penalty\x94\x8c\x02l2\x94\x8c\x04dual\x94\x89\x8c\x03tol\x94G?\x1a6\xe2\xeb\x1cC-\x8c\x01C\x94G?\xf0\x00\x00\x00\x00\x00\x00\x8c\rfit_intercept\x94\x88\x8c\x11intercept_scaling\x94K\x01\x8c\x0cclass_weight\x94N\x8c\x0crandom_state\x94N\x8c\x06solver\x94\x8c\x05lbfgs\x94\x8c\x08max_iter\x94Kd\x8c\x0bmulti_class\x94\x8c\x04auto\x94\x8c\x07verbose\x94K\x00\x8c\nwarm_start\x94\x89\x8c\x06n_jobs\x94N\x8c\x08l1_ratio\x94N\x8c\x0en_features_in_\x94K\x04\x8c\x08classes_\x94\x8c\x15numpy.core.multiarray\x94\x8c\x0c_reconstruct\x94\x93\x94\x8c\x05numpy\x94\x8c\x07ndarray\x94\x93\x94K\x00\x85\x94C\x01b\x94\x87\x94R\x94(K\x01K\x02\x85\x94h\x1c\x8c\x05dtype\x94\x93\x94\x8c\x02i8\x94\x89\x88\x87\x94R\x94(K\x03\x8c\x01<\x94NNNJ\xff\xff\xff\xffJ\xff\xff\xff\xffK\x00t\x94b\x89C\x10\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x94t\x94b\x

In [26]:
# Persist the fitted model to the file 'regresfilename.pkl' using joblib
import joblib
# dump() is the last expression in the cell, so its return value (the list of written file names) is echoed
joblib.dump(logReg, 'regresfilename.pkl')

['regresfilename.pkl']

In [27]:
# Load the persisted model back from 'regresfilename.pkl' into a new estimator object.
# NOTE(review): joblib.load unpickles the file, so only load files that come from a trusted source.
new_logReg_estimator = joblib.load('regresfilename.pkl')

In [29]:
# Predict on the testing set with the reloaded estimator; this rebinds y_pred from the earlier prediction cell
y_pred = new_logReg_estimator.predict(X_test)

In [30]:
# Evaluate the reloaded model: y_pred now holds the predictions made by the estimator loaded from disk
from sklearn import metrics
print(
    'Accuracy of the Model is:',
    metrics.accuracy_score(y_true=y_test, y_pred=y_pred),
)

Accuracy of the Model is: 0.6927083333333334


In [31]:
# Show the first 30 actual labels next to the first 30 predictions made by the reloaded model
print('Actual values:', y_test.values[:30])
print('Predicted values:', y_pred[:30])

Actual values: [0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 1 0 0 0 1 1 1 1 0 0 0 1 0 1]
Predicted values: [0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0]


### Thanks for stopping by <3