#### 1: Import the dataset

In [2]:
#Import the required libraries
#Problem statement: the given dataset lists the glucose level readings of several pregnant women, taken either during a survey examination or during routine medical care. It specifies whether the 2-hour post-load plasma glucose was at least 200 mg/dl. Analyze the dataset to:
# - find the features of the dataset,
# - find the response label of the dataset,
# - create a model to predict the diabetes outcome,
# - use training and testing datasets to train the model, and
# - check the accuracy of the model.

import numpy as np
import pandas as pd

In [3]:
#Import the diabetes dataset
# The file has no header row, so header=None tells pandas to treat the first
# line as data and to label the columns with integers (0, 1, 2, ...).
# The path is a raw string so the Windows backslashes stay literal: '\D' and
# '\p' are invalid escape sequences that newer Python versions warn about.
csv_diab = pd.read_csv(r'I:\DataScience\pima-indians-diabetes.data', header=None)

#### 2: Analyze the dataset

In [4]:
#Preview the first five observations; the columns still carry pandas' default integer labels
csv_diab.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


#### 3: Find the features of the dataset

In [8]:
#Column names taken from the dataset's .NAMES file, listed in file order
feature_names = [
    'Pregnant',   # 1. Number of times pregnant
    'glucose',    # 2. Plasma glucose concentration at 2 hours in an oral glucose tolerance test
    'bp',         # 3. Diastolic blood pressure (mm Hg)
    'skip',       # 4. Triceps skin fold thickness (mm) -- NOTE(review): 'skip' looks like a typo for 'skin'; confirm before renaming
    'insulin',    # 5. 2-Hour serum insulin (mu U/ml)
    'bmi',        # 6. Body mass index (weight in kg/(height in m)^2)
    'pedigree',   # 7. Diabetes pedigree function
    'age',        # 8. Age (years)
    'label',      # 9. Class variable (0 or 1)
]

In [9]:
#Reload the dataset, applying the feature names set earlier as the column headers
# header=None still marks the file as header-less, so `names` labels the
# columns without consuming the first data row.
# The path is a raw string so the Windows backslashes stay literal: '\D' and
# '\p' are invalid escape sequences that newer Python versions warn about.
csv_diab = pd.read_csv(r'I:\DataScience\pima-indians-diabetes.data', header=None, names=feature_names)

In [10]:
#Confirm that the reloaded dataset now shows the new column headers
csv_diab.head(n=5)

Unnamed: 0,Pregnant,glucose,bp,skip,insulin,bmi,pedigree,age,label
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [11]:
#View the number of observations (rows) and columns of the dataset
csv_diab.shape # (rows, columns): 768 observations, 9 columns (8 features plus the label)

(768, 9)

#### 4: Find the response of the dataset

In [12]:
#Choose the predictor columns the model will be trained on
#Feature object: every row, restricted to the four selected columns
X_feature = csv_diab.loc[:, ['Pregnant','insulin','bmi','age']]
#Response object: the class label column
Y_target = csv_diab.loc[:, 'label']

In [13]:
#View the shape of the feature object: (rows, selected feature columns)
X_feature.shape

(768, 4)

In [14]:
#View the shape of the target object: one-dimensional, one label per observation
Y_target.shape

(768,)

#### 5: Use training and testing datasets to train the model

In [17]:
#Split the data so the model is evaluated on observations it was not trained on
from sklearn.model_selection import train_test_split
# random_state belongs to train_test_split here: it seeds the pseudo-random
# number generator used to shuffle the data, so the same split is reproduced on every run.
x_train, x_test, y_train, y_test = train_test_split(X_feature, Y_target, random_state=1)
# Show the shape of each of the four partitions, one per line
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape, sep='\n')

(576, 4)
(192, 4)
(576,)
(192,)


#### 6: Create a model to predict the diabetes outcome

In [18]:
# Build a logistic regression classifier and train it on the training split
from sklearn.linear_model import LogisticRegression
logReg = LogisticRegression() # instantiate the estimator with its default settings
# fit() returns the estimator, so the cell displays the fitted model's repr
logReg.fit(X=x_train, y=y_train)

LogisticRegression()

In [20]:
#Predict the class label of every observation in the testing set
# No fitting happens here: the model trained above is applied to unseen data
y_pred = logReg.predict(X=x_test)
# Display the predicted labels
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0], dtype=int64)

#### 7: Check the accuracy of the model

In [22]:
#Measure how often the predicted labels match the true test labels
from sklearn import metrics
# Accuracy is the fraction of correct predictions: the closer to 1, the better
print(metrics.accuracy_score(y_true=y_test, y_pred=y_pred))

0.6927083333333334


In [24]:
#Show the first 30 true labels next to the first 30 predictions for a quick visual comparison
print('Actual: ', y_test.values[:30])
print('Predicted: ', y_pred[:30])

Actual:  [0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 1 0 0 0 1 1 1 1 0 0 0 1 0 1]
Predicted:  [0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0]


In [None]:
# The higher the SSR or SSE (sum of squared residuals/errors), the less accurate the predictions and the poorer the model's fit to the selected attributes.