In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

In [2]:
diabetes_dataset = pd.read_csv('diabetes.csv')

In [3]:
diabetes_dataset.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
diabetes_dataset.shape

(768, 9)

In [5]:
diabetes_dataset.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [6]:
diabetes_dataset['Outcome'].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

In [7]:
diabetes_dataset.groupby('Outcome').mean()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.298,109.98,68.184,19.664,68.792,30.3042,0.429734,31.19
1,4.865672,141.257463,70.824627,22.164179,100.335821,35.142537,0.5505,37.067164


In [8]:
#Seprate data and labels
x = diabetes_dataset.drop(columns=['Outcome'],axis=1)
y = diabetes_dataset['Outcome']

In [9]:
x.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


In [10]:
y.head()

0    1
1    0
2    1
3    0
4    1
Name: Outcome, dtype: int64

In [11]:
# diabetes_dataset.plot(kind='scatter',x='Outcome',y='Glucose')
# diabetes_dataset.plot(kind='scatter',x='Outcome',y='Insulin',color='red')
# diabetes_dataset.plot(kind='scatter',x='Outcome',y='BloodPressure',color='green')
# # diabetes_dataset.plot(kind='scatter',x='Outcome',y='Age',color='red')

In [12]:
type(diabetes_dataset)

pandas.core.frame.DataFrame

In [13]:
scaler = StandardScaler()

In [14]:
scaler.fit(x)

In [15]:
standardize_data = scaler.transform(x)

In [16]:
standardize_data[0:10]

array([[ 0.63994726,  0.84832379,  0.14964075,  0.90726993, -0.69289057,
         0.20401277,  0.46849198,  1.4259954 ],
       [-0.84488505, -1.12339636, -0.16054575,  0.53090156, -0.69289057,
        -0.68442195, -0.36506078, -0.19067191],
       [ 1.23388019,  1.94372388, -0.26394125, -1.28821221, -0.69289057,
        -1.10325546,  0.60439732, -0.10558415],
       [-0.84488505, -0.99820778, -0.16054575,  0.15453319,  0.12330164,
        -0.49404308, -0.92076261, -1.04154944],
       [-1.14185152,  0.5040552 , -1.50468724,  0.90726993,  0.76583594,
         1.4097456 ,  5.4849091 , -0.0204964 ],
       [ 0.3429808 , -0.15318486,  0.25303625, -1.28821221, -0.69289057,
        -0.81134119, -0.81807858, -0.27575966],
       [-0.25095213, -1.34247638, -0.98770975,  0.71908574,  0.07120427,
        -0.12597727, -0.676133  , -0.61611067],
       [ 1.82781311, -0.184482  , -3.57259724, -1.28821221, -0.69289057,
         0.41977549, -1.02042653, -0.36084741],
       [-0.54791859,  2.38188392

In [17]:
labels = diabetes_dataset.columns.values
standardize_df = pd.DataFrame(standardize_data,columns=labels[0:8])

In [18]:
standardize_df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.639947,0.848324,0.149641,0.90727,-0.692891,0.204013,0.468492,1.425995
1,-0.844885,-1.123396,-0.160546,0.530902,-0.692891,-0.684422,-0.365061,-0.190672
2,1.23388,1.943724,-0.263941,-1.288212,-0.692891,-1.103255,0.604397,-0.105584
3,-0.844885,-0.998208,-0.160546,0.154533,0.123302,-0.494043,-0.920763,-1.041549
4,-1.141852,0.504055,-1.504687,0.90727,0.765836,1.409746,5.484909,-0.020496


In [19]:
X_train,X_test,y_train,y_test = train_test_split(standardize_data,y,test_size=0.2,stratify=y,random_state=2)

In [20]:
print(X_train.shape,X_test.shape)

(614, 8) (154, 8)


# Support Vector Machine Classifier

In [21]:
classifier = svm.SVC(kernel='linear')

In [22]:
classifier.fit(X_train,y_train)

### Predict data on X training 

Accuracy Score

In [23]:
X_train_predict = classifier.predict(X_train)

In [24]:
X_train_predictions_accuracy = accuracy_score(X_train_predict,y_train)

In [25]:
print("Accuracy score for x train data is : ", X_train_predictions_accuracy)

Accuracy score for x train data is :  0.7866449511400652


### Predict data on X test 

In [26]:
X_test_predict = classifier.predict(X_test)
X_test_predictions_accuracy = accuracy_score(X_test_predict,y_test)
print("Accuracy score for x test data is : ", X_test_predictions_accuracy)

Accuracy score for x test data is :  0.7727272727272727


In [27]:
# print(X_test[70:80],y_test[70:80])
test = X_test[51]
out = y_test[51]
print(test,out)

[-0.25095213 -0.77912776 -0.67752325  0.84454186  0.30564246 -0.92556851
  1.4258696   0.49003012] 0


In [28]:
classifier.predict([test])

array([0], dtype=int64)

In [29]:
from sklearn.metrics import classification_report


In [30]:
print(classification_report(X_test_predict,y_test))

              precision    recall  f1-score   support

           0       0.91      0.78      0.84       117
           1       0.52      0.76      0.62        37

    accuracy                           0.77       154
   macro avg       0.71      0.77      0.73       154
weighted avg       0.82      0.77      0.79       154



In [31]:
from sklearn.metrics import confusion_matrix


In [32]:
print(confusion_matrix(X_test_predict,y_test))

[[91 26]
 [ 9 28]]


# KNeighbors Classifier

In [33]:
from sklearn.neighbors import KNeighborsClassifier

In [34]:
k_neighbors = KNeighborsClassifier()

In [35]:
k_neighbors.fit(X_train,y_train)

### Predict data on X training 

In [36]:
X_train_knn_predicts = k_neighbors.predict(X_train)

In [37]:
X_train_knn_predicts_accuracy = accuracy_score(X_train_knn_predicts,y_train)
print("Accuracy score for x test data is : ", X_train_knn_predicts_accuracy)

Accuracy score for x test data is :  0.8289902280130294


In [38]:
k_neighbors.predict([test])

array([0], dtype=int64)

In [39]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(X_train_knn_predicts,y_train))

[[362  67]
 [ 38 147]]


### Predict data on X test 

In [40]:
X_test_knn_predicts = k_neighbors.predict(X_test)
X_test_knn_predicts_accuracy = accuracy_score(X_test_knn_predicts,y_test)
print("Accuracy score for x test data is : ", X_test_knn_predicts_accuracy)

Accuracy score for x test data is :  0.7207792207792207


In [41]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(X_test_knn_predicts,y_test))

[[362  67]
 [ 38 147]]
