# Diabetes Prediction Project

## Importing The Libraries

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

## Importing Dataset using Pandas Dataframe

In [4]:
diabetes_dataset = pd.read_csv('diabetes.csv')
diabetes_dataset.head() # printing the 5 first rows.

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
# number of row and columns in this dataset 
diabetes_dataset.shape

(768, 9)

In [6]:
diabetes_dataset.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [7]:
# checking is there any missing data or not
diabetes_dataset.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

## separating the diabetes patients
0 --> Non Diabetes <br/><br/>
1 --> Diabetes

In [29]:
# separating the diabetes patients
diabetes_dataset['Outcome'].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

In [9]:
#separating the dataset into x and y
X = diabetes_dataset.iloc[:, :-1]
y = diabetes_dataset.iloc[:, -1]

In [10]:
print(X.shape)
print(y.shape)

(768, 8)
(768,)


## Splitting the datasets into Training sets and Test sets

In [11]:
X_Train,X_Test,y_train,y_test = train_test_split(X,y,test_size=0.2,stratify=y,random_state=2)

In [12]:
print(X.shape,X_Train.shape)

(768, 8) (614, 8)


## Feature Scaling / Data Standardization

In [30]:
sc = StandardScaler()
X_Train = sc.fit_transform(X_Train)

In [32]:
X_Test = sc.fit_transform(X_Test)

In [36]:
print(X_Train)

[[-1.13796489 -0.07971099 -3.5556072  ...  0.02825037 -0.98159708
  -0.7885233 ]
 [ 0.64067858 -0.52091877  0.02549599 ... -0.17184452 -1.03823795
   0.31879426]
 [-0.84152431  2.12632792 -0.48609018 ... -0.25938604 -0.21545477
   2.19271628]
 ...
 [ 2.12288146 -1.15121561  0.23013046 ... -0.25938604 -0.50760242
   0.14843771]
 [ 0.04779742 -0.30031488  0.43476492 ...  0.90366551 -0.69839272
   0.40397253]
 [-1.13796489 -1.11970076 -0.07682125 ...  0.45345201 -0.69243053
  -0.70334503]]


## Training the Support Vector Machine model on the Training set 

In [37]:
classifier = SVC(kernel='linear')
classifier.fit(X_Train,y_train)

## Accuracy prediction

In [38]:
final_predict = classifier.predict(X_Train)
training_data_accuracy = accuracy_score(final_predict,y_train)
print('training_data_accuracy :',round(training_data_accuracy*100,2))

training_data_accuracy : 78.66


## Making a Prediction Model

In [39]:
input_data = (2,90,68,42,0,38.2,0.503,27) # input data from user

input_data_as_numpy_array = np.asarray(input_data) # converting into numpy array

data_reshaped = input_data_as_numpy_array.reshape(1,-1) # reshaping it into 2d array

stand_data = sc.transform(data_reshaped) # standardized the input data

final_prediction = classifier.predict(stand_data)
print(final_prediction)

if final_prediction[0] == 0:
  print("The patient does't have diabetes")
else:
  print("The patient is diabetes")

[1]
The patient is diabetes
