## Importing dependencies

In [23]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import  StandardScaler
from sklearn.model_selection import  train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import  accuracy_score

## Data collection and Analysis

PIMA Diabetes Dataset

In [2]:
def load_data(path):
  return pd.read_csv(path)

In [5]:
# loading data with pandas
diabetes_dataset = load_data('diabetes.csv')
diabetes_dataset.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [6]:
# number of rows and columns in the dataset
diabetes_dataset.shape

(768, 9)

In [7]:
# getting statistical measures of data
diabetes_dataset.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [8]:
diabetes_dataset.Outcome.value_counts()

Unnamed: 0_level_0,count
Outcome,Unnamed: 1_level_1
0,500
1,268


0 --> Non diabetic

1 --> Diabetic

In [9]:
diabetes_dataset.groupby('Outcome').mean()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.298,109.98,68.184,19.664,68.792,30.3042,0.429734,31.19
1,4.865672,141.257463,70.824627,22.164179,100.335821,35.142537,0.5505,37.067164


In [10]:
# Separating features and labels
X = diabetes_dataset.drop(axis=1, columns='Outcome')
y = diabetes_dataset.Outcome

In [11]:
X.shape, y.shape

((768, 8), (768,))

## Data Standardization

In [12]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled

array([[ 0.63994726,  0.84832379,  0.14964075, ...,  0.20401277,
         0.46849198,  1.4259954 ],
       [-0.84488505, -1.12339636, -0.16054575, ..., -0.68442195,
        -0.36506078, -0.19067191],
       [ 1.23388019,  1.94372388, -0.26394125, ..., -1.10325546,
         0.60439732, -0.10558415],
       ...,
       [ 0.3429808 ,  0.00330087,  0.14964075, ..., -0.73518964,
        -0.68519336, -0.27575966],
       [-0.84488505,  0.1597866 , -0.47073225, ..., -0.24020459,
        -0.37110101,  1.17073215],
       [-0.84488505, -0.8730192 ,  0.04624525, ..., -0.20212881,
        -0.47378505, -0.87137393]])

## Data Splitting

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [18]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((614, 8), (614,), (154, 8), (154,))

## Training models

In [28]:
svm_model = SVC()
svm_model.fit(X_train, y_train)

In [24]:
log_reg_model = LogisticRegression()
log_reg_model.fit(X_train, y_train)

## Validation model

In [21]:
X_test, X_valid, y_test, y_valid = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

In [29]:
val_score = accuracy_score(svm_model.predict(X_valid), y_valid)
train_score = accuracy_score(svm_model.predict(X_train), y_train)
print(f'Validation score: {val_score}')
print(f'Train score: {train_score}')

Validation score: 0.7272727272727273
Train score: 0.8289902280130294


In [30]:
val_score = accuracy_score(log_reg_model.predict(X_valid), y_valid)
train_score = accuracy_score(log_reg_model.predict(X_train), y_train)
print(f'Validation score: {val_score}')
print(f'Train score: {train_score}')

Validation score: 0.7272727272727273
Train score: 0.7703583061889251


## Model fine tunning

In [31]:
from sklearn.model_selection import GridSearchCV

In [32]:
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga'],
    'max_iter': [100, 500, 1000]
}

grid_search = GridSearchCV(log_reg_model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

In [33]:
print(grid_search.best_params_)

{'C': 0.1, 'max_iter': 100, 'penalty': 'l1', 'solver': 'saga'}


In [34]:
model = grid_search.best_estimator_

## Model assessment

In [35]:
train_score = accuracy_score(model.predict(X_train), y_train)
test_score = accuracy_score(model.predict(X_test), y_test)
print(f'Train score: {train_score}')
print(f'Test score: {test_score}')

Train score: 0.7671009771986971
Test score: 0.7922077922077922


## Prediction

In [36]:
def predict_diabetes(input_data):
  input_data = np.asarray(input_data)
  input_data = input_data.reshape(1, -1)
  input_data = scaler.transform(input_data)
  prediction = model.predict(input_data)

  if prediction[0] == 0:
    print("The person is not diabetic")
  else:
    print("The person is diabetic")

In [38]:
data = [6, 148, 72, 35, 0, 33.6, 0.627, 50]
predict_diabetes(data)

The person is diabetic


