In [3]:
#loading requisite libraries for this step of the project
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

from sklearn.metrics import classification_report
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

AttributeError: partially initialized module 'pandas' has no attribute 'core' (most likely due to a circular import)

In [None]:
#setting default values for visualizations
sns.set_style('darkgrid')
matplotlib.rcParams['font.size'] = 14
matplotlib.rcParams['figure.figsize'] = (14, 10)
matplotlib.rcParams['figure.facecolor'] = '#000000'

In [None]:
#loading the preprocessed dataset from step I
df_1 = pd.read_csv('../datasets/synthesized_diabetes_data.csv')

EXPLORING THE DATA

In [None]:
df_1

Unnamed: 0,chol,stab.glu,hdl,ratio,glyhb,location,age,gender,height,weight,...,bp.1d,bp.2s,bp.2d,waist,hip,time.ppn,height_meters,BMI,hip_waist_ratio,diabetic
0,156.700394,31,52.154896,3.314927,4.299748,1,7,1,68.496912,146.234609,...,85.186571,151.973658,92.697071,29.222653,34.012633,761.958003,1.667030,53.028606,0.815154,0
1,205.708528,50,74.558454,2.397619,4.750497,1,50,0,67.514518,155.252176,...,88.848128,152.708004,92.802874,32.887620,40.640470,715.021735,1.681850,42.542497,0.855346,1
2,184.259740,39,50.107744,2.436952,5.033467,1,56,0,63.226992,134.186382,...,94.615906,153.064309,92.212116,32.548504,36.741043,210.965098,1.565997,54.781513,0.863442,0
3,236.859522,40,46.163972,3.794523,5.382256,0,51,0,66.799333,202.246191,...,79.333852,151.949308,92.541618,45.726840,50.127763,801.800590,1.752456,65.815471,0.851003,1
4,189.616025,92,40.045044,3.633247,4.634788,1,10,1,69.283944,186.218617,...,67.017735,151.475941,92.036697,32.192694,41.216135,160.570897,1.779865,58.610103,0.890379,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6495,257.604430,98,35.771984,5.118372,4.753262,0,18,0,61.331854,235.607829,...,102.549858,152.100496,83.917625,35.150511,53.406133,88.264917,1.529035,68.481644,0.855944,1
6496,162.296900,45,35.999784,5.681376,4.227569,1,4,0,65.386967,296.862677,...,84.701618,125.647137,82.484329,39.052591,59.843454,443.647170,1.656517,80.007193,0.830202,0
6497,187.502574,57,33.007135,4.111922,5.364477,1,20,0,66.676238,190.406558,...,76.921248,151.736385,93.133092,49.009939,64.824856,187.248003,1.722115,115.404367,0.820453,1
6498,182.580909,32,52.969260,2.549922,4.153793,1,16,0,61.393461,132.810153,...,66.361413,151.984054,92.246495,31.932760,46.220675,895.800788,1.542742,65.397955,0.777446,0


In [None]:
df_1.shape

(6500, 22)

In [None]:
df_1.columns

Index(['chol', 'stab.glu', 'hdl', 'ratio', 'glyhb', 'location', 'age',
       'gender', 'height', 'weight', 'frame', 'bp.1s', 'bp.1d', 'bp.2s',
       'bp.2d', 'waist', 'hip', 'time.ppn', 'height_meters', 'BMI',
       'hip_waist_ratio', 'diabetic'],
      dtype='object')

In [None]:
df_1.isnull().values.any()

False

In [None]:
df_1.isnull().sum().sum()

0

In [None]:
df_1.columns

In [None]:
X = df_1.drop(['diabetic'])
Y = df_1['diabetic']

In [None]:
#splitting the dataset into traning and testing datasets 
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=17)

In [None]:
#scaling the dataset to reduce dimensionality and for easier and faster training
x_train_scaled =  StandardScaler().fit_transform(x_train)
x_test_scaled = StandardScaler().fit_transform(x_test)

In [None]:
#an auto-training function for the three models
def auto_train(x, y, model_alg):
    if model_alg == 'rf':
        model = RandomForestClassifier(random_state=30, n_estimators=100, max_features=8, max_depth=20)
    elif model_alg == 'svm':
        model = SVC(C=1, kernel='rbf', random_state=1)
    elif model_alg == 'xgb':
        model = XGBClassifier(use_label_encoder=False, objective='binary:hinge', colsample_bytree=0.8)
        
    model.fit(x_train_scaled, y_train)
    
    return model

In [None]:
#an evaluation function for the three models.
def auto_evaluate(model, x, y):
    
    predicted = model.predict(x)
    
    mae = mean_absolute_error(y, predicted)
    rmse = np.sqrt(mean_squared_error(y, predicted))
    rac = roc_auc_score(y, predicted)
    clas = classification_report(y, predicted)
    
    return [mae, rmse, rac, clas]

In [None]:
#fitting and training the models using a function call
random_forest = auto_train(x_train_scaled, y_train, 'rf')
svm = auto_train(x_train_scaled, y_train, 'svm')
gradient_boost = auto_train(x_train_scaled, y_train, 'xgb')

In [None]:
#model outputs; SVM
svm_output = auto_evaluate(svm, x_test_scaled, y_test)
svm_output

[0.15923076923076923,
 0.39903730305670576,
 0.7810967485305941,
 '              precision    recall  f1-score   support\n\n           0       0.81      0.63      0.71       399\n           1       0.85      0.94      0.89       901\n\n    accuracy                           0.84      1300\n   macro avg       0.83      0.78      0.80      1300\nweighted avg       0.84      0.84      0.83      1300\n']

In [None]:
#model outputs; RF
random_forest_output = auto_evaluate(random_forest, x_test_scaled, y_test)
random_forest_output

[0.1423076923076923,
 0.37723691800736087,
 0.8044765075841658,
 '              precision    recall  f1-score   support\n\n           0       0.84      0.67      0.74       399\n           1       0.86      0.94      0.90       901\n\n    accuracy                           0.86      1300\n   macro avg       0.85      0.80      0.82      1300\nweighted avg       0.86      0.86      0.85      1300\n']

In [None]:
#model outputs; XGB
xgb_output = auto_evaluate(gradient_boost, x_test_scaled, y_test)
xgb_output

[0.16384615384615384,
 0.4047791420591652,
 0.7959201555498068,
 '              precision    recall  f1-score   support\n\n           0       0.75      0.69      0.72       399\n           1       0.87      0.90      0.88       901\n\n    accuracy                           0.84      1300\n   macro avg       0.81      0.80      0.80      1300\nweighted avg       0.83      0.84      0.83      1300\n']