<a href="https://colab.research.google.com/github/utkarsh-dtu/Algorithms/blob/master/Diabetes_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score

Data Collection

In [4]:
# Load the diabetes records from the CSV file in the working directory
diabetes_dataset = pd.read_csv(filepath_or_buffer='diabetes.csv')

In [None]:
# Preview the first five rows
diabetes_dataset.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [None]:
# (number of rows, number of columns)
diabetes_dataset.shape

(768, 9)

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
# NOTE(review): the minimum of 0 reported for Glucose, BloodPressure,
# SkinThickness, Insulin and BMI looks like a placeholder for missing
# values — confirm before modelling.
diabetes_dataset.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [6]:
# Class balance: how many records are labelled non-diabetic (0) vs diabetic (1)
diabetes_dataset['Outcome'].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

0 ---> Non-diabetic
1 ---> Diabetic

In [7]:
# Mean of every feature within each Outcome class (0 and 1)
diabetes_dataset.groupby(by='Outcome').mean()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.298,109.98,68.184,19.664,68.792,30.3042,0.429734,31.19
1,4.865672,141.257463,70.824627,22.164179,100.335821,35.142537,0.5505,37.067164


In [8]:
# Separate the features (X) from the label (Y).
# `columns=` already says that a column is being dropped, so no `axis`
# argument is needed alongside it.
X = diabetes_dataset.drop(columns='Outcome')
Y = diabetes_dataset['Outcome']

Standardizing Data (rescale every feature to zero mean and unit variance, so that a feature with large values does not overshadow the other features)


In [9]:
# Standardizer with its default settings written out explicitly
scaler = StandardScaler(copy=True, with_mean=True, with_std=True)


In [10]:
# We want to bring the values of the features onto a common scale.
# fit() calculates the mean and variance of the given dataset, and
# transform() then turns the data into the actual scaled values.
# On training data the two are used one after the other, e.g.
#   scaler.fit(X)
#   scaler.transform(X)
# Instead of calling fit and transform separately, scaler.fit_transform()
# can be used.
# NOTE(review): the scaler is fitted on the whole dataset here, before the
# train/test split, so test-set statistics influence the scaling — consider
# fitting on the training split only.
scaler.fit(X)


StandardScaler(copy=True, with_mean=True, with_std=True)

In [11]:
# Apply the statistics learned by fit() to obtain the scaled feature values
standardized_data = scaler.transform(X)

In [12]:
# Inspect the scaled values
print(standardized_data)

[[ 0.63994726  0.84832379  0.14964075 ...  0.20401277  0.46849198
   1.4259954 ]
 [-0.84488505 -1.12339636 -0.16054575 ... -0.68442195 -0.36506078
  -0.19067191]
 [ 1.23388019  1.94372388 -0.26394125 ... -1.10325546  0.60439732
  -0.10558415]
 ...
 [ 0.3429808   0.00330087  0.14964075 ... -0.73518964 -0.68519336
  -0.27575966]
 [-0.84488505  0.1597866  -0.47073225 ... -0.24020459 -0.37110101
   1.17073215]
 [-0.84488505 -0.8730192   0.04624525 ... -0.20212881 -0.47378505
  -0.87137393]]


In [13]:
# From here on X holds the standardized features; Y is still the Outcome column
X, Y = standardized_data, diabetes_dataset['Outcome']

In [14]:
# Features after standardization
print(X)



[[ 0.63994726  0.84832379  0.14964075 ...  0.20401277  0.46849198
   1.4259954 ]
 [-0.84488505 -1.12339636 -0.16054575 ... -0.68442195 -0.36506078
  -0.19067191]
 [ 1.23388019  1.94372388 -0.26394125 ... -1.10325546  0.60439732
  -0.10558415]
 ...
 [ 0.3429808   0.00330087  0.14964075 ... -0.73518964 -0.68519336
  -0.27575966]
 [-0.84488505  0.1597866  -0.47073225 ... -0.24020459 -0.37110101
   1.17073215]
 [-0.84488505 -0.8730192   0.04624525 ... -0.20212881 -0.47378505
  -0.87137393]]


In [15]:
# Labels (the Outcome column)
print(Y)

0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64


In [16]:
# Hold out 20% of the samples as test data (test_size=0.2).
# Stratifying on Y splits the labels proportionally, so the diabetic cases
# do not all end up in only one of the training or test sets.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

In [17]:
# Sanity-check the split sizes: full set, training part, test part
print(*(split.shape for split in (X, x_train, x_test)))

(768, 8) (614, 8) (154, 8)


In [18]:
# Support vector classifier with a linear kernel
classifier = svm.SVC(kernel = 'linear')

In [19]:
# Train the classifier on the training split
classifier.fit(x_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

Model Evaluation

Accuracy Score

In [20]:
# Accuracy on the data the model was trained on.
# accuracy_score expects (y_true, y_pred), so the ground-truth labels go first.
x_train_predictions = classifier.predict(x_train)
training_data_accuracy = accuracy_score(y_train, x_train_predictions)

In [21]:
# Report the fraction of training samples classified correctly
print("Accuracy Score of training data : ", training_data_accuracy)

Accuracy Score of training data :  0.7866449511400652


In [22]:
# Accuracy on the held-out test split.
# accuracy_score expects (y_true, y_pred), so the ground-truth labels go first.
x_test_predictions = classifier.predict(x_test)
testing_data_accuracy = accuracy_score(y_test, x_test_predictions)

In [None]:
# Report the fraction of test samples classified correctly
print("Accuracy Score of test data: ", testing_data_accuracy)

Accuracy Score of test data:  0.7727272727272727


Build a predictive System

In [28]:
# Feature values for one person, one value per feature column.
# Another sample to try: [4,110,92,0,0,37.6,0.191,30]
input_data = [5,166,72,19,175,25.8,0.587,51]
input_data_as_np_array = np.asarray(input_data)  # change data into numpy array

# The scaler was fitted on a DataFrame, so the new instance is handed over
# with the same column labels (every column except the label). This keeps
# the feature names consistent between fit and transform, and raises
# immediately if the number of values does not match the number of features.
feature_names = diabetes_dataset.columns.drop('Outcome')
input_data_reshaped = pd.DataFrame(
    input_data_as_np_array.reshape(1, -1),  # a single row: one instance
    columns=feature_names,
)

# Scale the input with the mean and variance stored in the already-fitted
# scaler; it is only transformed here, never re-fitted.
std_data = scaler.transform(input_data_reshaped)
print(std_data)

prediction = classifier.predict(std_data)
if prediction[0] == 1:
  print("The person is diabetic")
else:
  print("The person is non diabetic")

[[ 0.3429808   1.41167241  0.14964075 -0.09637905  0.82661621 -0.78595734
   0.34768723  1.51108316]]
The person is diabetic
