In [1]:
import pandas as pd
import numpy as np
import os

from sklearn.impute import SimpleImputer # For Handling Missing Values
from sklearn.preprocessing import StandardScaler # For Feature Scaling
from sklearn.preprocessing import OrdinalEncoder # For Ordinal Encoding

## Pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

import joblib

In [2]:
os.chdir('../../')
os.getcwd()

'/home/utpal108/dev/Upwork/Projects/Diabetic-Retinopathy-Prediction'

In [3]:
df = pd.read_csv('artifacts/data/diabetes.csv')
df.head(3)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1


In [4]:
# number of rows and Columns in this dataset
df.shape

(768, 9)

In [5]:
# getting the statistical measures of the data
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [6]:
# Check data types
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [7]:
# Check Null Values
df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [8]:
df['Outcome'].value_counts()

Outcome
0    500
1    268
Name: count, dtype: int64

In [9]:
# Split into independent and dependent features
X = df.iloc[:,:-1]
y = df.iloc[:,-1]

In [10]:
X.shape

(768, 8)

In [11]:
y.value_counts()

Outcome
0    500
1    268
Name: count, dtype: int64

In [12]:
from imblearn.over_sampling import SMOTE
oversample = SMOTE()
X,y = oversample.fit_resample(X, y)

In [13]:
y.value_counts()

Outcome
1    500
0    500
Name: count, dtype: int64

In [14]:
# Segregating Numerical and Categorical features
numerical_features = [feature for feature in X.columns if X[feature].dtypes !=object]
categorical_features = [feature for feature in X.columns if X[feature].dtypes ==object]

In [15]:
numerical_features

['Pregnancies',
 'Glucose',
 'BloodPressure',
 'SkinThickness',
 'Insulin',
 'BMI',
 'DiabetesPedigreeFunction',
 'Age']

In [16]:
categorical_features

[]

In [17]:
# Numerical Pipeline
num_pipeline = Pipeline(
    steps=[
        ('imputer',SimpleImputer(strategy='median')),
        ('scaler',StandardScaler())
    ]
)


In [18]:
# Categorical Pipeline
cat_pipeline = Pipeline(
    steps=[
        ('imputer',SimpleImputer(strategy='most_frequent')),
        ('ordinalencoder', OrdinalEncoder()),
        ('scaler', StandardScaler())
    ]
)


In [19]:
preprocessor = ColumnTransformer([
    ('num_pipeline',num_pipeline,numerical_features),
    ('cat_pipeline', cat_pipeline, categorical_features)
])

In [20]:
# Train, Test Split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.30,random_state=42,shuffle=True)


In [21]:
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.fit_transform(X_test)

In [22]:
joblib.dump(preprocessor, 'artifacts/training/preprocessor.pkl')

['artifacts/training/preprocessor.pkl']

In [23]:
X_train

array([[-2.99452618e-01,  3.25938878e-02,  8.14204769e-02, ...,
        -6.08258570e-02,  1.37000720e-01, -6.12686698e-01],
       [-1.21215291e+00,  1.82148722e+00,  1.92926109e+00, ...,
         2.02937802e-01, -2.05181108e-01,  6.27877277e-01],
       [ 4.78081103e-03, -1.22842928e+00, -7.27009790e-01, ...,
        -6.99411558e-01, -5.92386860e-01, -5.24074986e-01],
       ...,
       [ 9.17481099e-01,  1.11766033e+00,  2.36754578e-02, ...,
         8.05624918e-04, -1.08379135e+00,  8.05100703e-01],
       [-1.21215291e+00,  4.13833450e-01, -4.07622089e+00, ...,
         1.32740393e+00, -8.95547953e-01, -4.35463273e-01],
       [-1.21215291e+00, -5.53844726e-02,  1.46730093e+00, ...,
        -1.43517334e+00, -7.24457039e-01, -1.14435697e+00]])

In [24]:
X_test

array([[-0.3071943 , -0.03155595,  0.5631035 , ...,  0.10936736,
        -0.5297319 , -0.68442989],
       [ 1.12383503, -1.95093872,  0.23088905, ..., -0.03693276,
         0.31855537,  0.7346904 ],
       [ 1.98245263, -0.1616836 ,  0.5631035 , ...,  1.21880989,
         0.85053213,  1.26686051],
       ...,
       [-1.16581189,  0.32629507,  1.1444788 , ...,  1.0115514 ,
        -0.59011845, -0.68442989],
       [ 1.69624676, -0.32434316, -2.75904105, ...,  0.36539256,
        -1.02145096, -0.41834484],
       [ 0.83762917,  0.87933756,  0.89531796, ...,  2.15756896,
        -0.4377143 ,  0.20252029]])

In [25]:
# Model Training
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [26]:
# Train Multiple Model
models = {
    'SVC':SVC(kernel='linear', gamma='scale'),
    'DecisionTree':DecisionTreeClassifier(),
    'RandomForest':RandomForestClassifier(criterion='entropy', max_features='sqrt'),
    'GradientBoosting':GradientBoostingClassifier(criterion='squared_error', loss='exponential'),
    'KNeighbors':KNeighborsClassifier(algorithm='auto',n_neighbors=9, weights='distance')
}

In [27]:
def evaluate_model(models, X_train, X_test, y_train, y_test):
    try:
        report = {}
        best_model = {'': -np.inf}

        # Evaluate the models base on the accuracy_score scores
        for i in range(len(models)):
            model_name = list(models.keys())[i]
            model = list(models.values())[i]
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            score = accuracy_score(y_test, y_pred)
            if list(best_model.values())[0] < score:
                best_model = {model_name: score}

            report[model_name] = score

        return report, best_model

    except Exception as e:
        pass

In [28]:
model_report, best_model = evaluate_model(models, X_train, X_test, y_train, y_test)
best_model = models[list(best_model.keys())[0]]

In [29]:
model_report

{'SVC': 0.74,
 'DecisionTree': 0.73,
 'RandomForest': 0.8066666666666666,
 'GradientBoosting': 0.77,
 'KNeighbors': 0.78}

In [30]:
joblib.dump(best_model, 'artifacts/training/diabetes_prediction_model.pkl')

['artifacts/training/diabetes_prediction_model.pkl']

In [31]:
model = joblib.load('artifacts/training/diabetes_prediction_model.pkl')


In [32]:
model

In [33]:
input_data = {
    'Pregnancies':[1],
    'Glucose':[85],
    'BloodPressure':[66],
    'SkinThickness':[29],
    'Insulin':[0],
    'BMI':[26.6],
    'DiabetesPedigreeFunction':[0.351],
    'Age':[31]
}

input_data = pd.DataFrame(input_data)
input_data

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,1,85,66,29,0,26.6,0.351,31


In [34]:
# Scale the Features and Predict
preprocessor = joblib.load('artifacts/training/preprocessor.pkl')
features_scale = preprocessor.transform(input_data)
predict = model.predict(features_scale)
result = round(predict[0], 2)
result

0

In [35]:
# for x in models.values():
#     model = x
#     model.fit(X_train, y_train)
#     pred = model.predict(X_test)
#     accuracy = evaluate_model(y_test, pred)

#     print(f'Model Name : {x}')
#     print("Model Training Performance")
#     print("accuracy_score : ", accuracy)

#     print('='*50)
#     print('\n')


In [36]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
classifier = GradientBoostingClassifier()

In [37]:
# parameters = {
#     'kernel':['linear', 'poly', 'rbf', 'sigmoid'],
#     'gamma':['scale', 'auto']
# }

# parameters = {
#     'criterion':['gini', 'entropy', 'log_loss'],
#     'max_features':['sqrt', 'log2']
# }

# parameters = {
#     'n_neighbors':[3, 5, 7, 9],
#     'weights':['uniform', 'distance'],
#     'algorithm':['auto','ball_tree','kd_tree','brute']
# }

parameters = {
    'loss':['log_loss', 'exponential'],
    'criterion':['friedman_mse', 'squared_error']
}

In [38]:
clf = GridSearchCV(classifier, param_grid=parameters, cv=5, scoring='accuracy')
clf.fit(X_train, y_train)

In [39]:
clf.best_params_

{'criterion': 'friedman_mse', 'loss': 'exponential'}