In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Location of the diabetes data file read by this notebook.
DATA_PATH = "/DATA/diabetes_data.csv"

# Load the table; the file separates fields with ';' instead of ','.
dataset = pd.read_csv(DATA_PATH, sep=';')

In [None]:
# Print the per-column summary (dtype and non-null count) as a side effect.
dataset.info()

# Render the first five rows as the cell's displayed value.
dataset.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 520 entries, 0 to 519
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   age                 520 non-null    int64 
 1   gender              520 non-null    object
 2   polyuria            520 non-null    int64 
 3   polydipsia          520 non-null    int64 
 4   sudden_weight_loss  520 non-null    int64 
 5   weakness            520 non-null    int64 
 6   polyphagia          520 non-null    int64 
 7   genital_thrush      520 non-null    int64 
 8   visual_blurring     520 non-null    int64 
 9   itching             520 non-null    int64 
 10  irritability        520 non-null    int64 
 11  delayed_healing     520 non-null    int64 
 12  partial_paresis     520 non-null    int64 
 13  muscle_stiffness    520 non-null    int64 
 14  alopecia            520 non-null    int64 
 15  obesity             520 non-null    int64 
 16  class               520 no

Unnamed: 0,age,gender,polyuria,polydipsia,sudden_weight_loss,weakness,polyphagia,genital_thrush,visual_blurring,itching,irritability,delayed_healing,partial_paresis,muscle_stiffness,alopecia,obesity,class
0,40,Male,0,1,0,1,0,0,0,1,0,1,0,1,1,1,1
1,58,Male,0,0,0,1,0,0,1,0,0,0,1,0,1,0,1
2,41,Male,1,0,0,1,1,0,0,1,0,1,0,1,1,0,1
3,45,Male,0,0,1,1,1,1,0,1,0,1,0,0,0,0,1
4,60,Male,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1


In [None]:
# Report the table dimensions and the distinct values in the "class" column.
table_dimensions = dataset.shape
class_labels = dataset["class"].unique()

print(table_dimensions)
print(class_labels)

(520, 17)
[1 0]


In [None]:
# Encode the gender labels as integers (Male -> 1, Female -> 0).
GENDER_CODES = {"Male": 1, "Female": 0}

# Guard on the dtype so that re-running this cell is harmless: once the column
# is numeric the text labels are gone, and looking the integers up in the
# label dictionary again would blank out every value.
if not pd.api.types.is_numeric_dtype(dataset["gender"]):
    gender_encoded = dataset["gender"].map(GENDER_CODES)

    # Series.map yields NaN for any label missing from the dictionary; report
    # those explicitly instead of letting them flow into scaling and training.
    unmapped_labels = dataset.loc[gender_encoded.isna(), "gender"].unique()
    if len(unmapped_labels) > 0:
        raise ValueError(
            "Unexpected gender labels: {}".format(list(unmapped_labels))
        )

    dataset["gender"] = gender_encoded.astype("int64")

# Show the first rows so the encoded column can be inspected.
dataset.head()

Unnamed: 0,age,gender,polyuria,polydipsia,sudden_weight_loss,weakness,polyphagia,genital_thrush,visual_blurring,itching,irritability,delayed_healing,partial_paresis,muscle_stiffness,alopecia,obesity,class
0,40,1,0,1,0,1,0,0,0,1,0,1,0,1,1,1,1
1,58,1,0,0,0,1,0,0,1,0,0,0,1,0,1,0,1
2,41,1,1,0,0,1,1,0,0,1,0,1,0,1,1,0,1
3,45,1,0,0,1,1,1,1,0,1,0,1,0,0,0,0,1
4,60,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1


In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler

In [None]:
# Every column except the last one is a predictor; the last column is the target.
feature_columns = dataset.columns[:-1]
target_column = dataset.columns[-1]

X = dataset[feature_columns]
Y = dataset[target_column]

# Print the dimensions of the feature table and of the target vector.
for part in (X, Y):
    print(part.shape)

(520, 16)
(520,)


In [None]:
# Standardise every feature column. Note that the scaling statistics are
# learned from all rows of X, i.e. before the train/test split is made.
scaler = StandardScaler()
scaler.fit(X)
scaled_X = scaler.transform(X)

# Display the resulting array.
scaled_X

array([[-0.6613669 ,  0.76509206, -0.99233705, ...,  1.29099445,
         1.38022749,  2.21564684],
       [ 0.82136224,  0.76509206, -0.99233705, ..., -0.77459667,
         1.38022749, -0.45133547],
       [-0.57899306,  0.76509206,  1.00772212, ...,  1.29099445,
         1.38022749, -0.45133547],
       ...,
       [ 0.82136224, -1.30703226,  1.00772212, ...,  1.29099445,
        -0.72451824,  2.21564684],
       [-1.32035762, -1.30703226, -0.99233705, ..., -0.77459667,
         1.38022749, -0.45133547],
       [-0.49661921,  0.76509206, -0.99233705, ..., -0.77459667,
        -0.72451824, -0.45133547]])

In [None]:
# Hold out 40% of the rows for testing; the fixed seed makes the shuffle repeatable.
split_parts = train_test_split(
    scaled_X,
    Y,
    test_size=0.4,
    random_state=101,
)
X_train, X_test, Y_train, Y_test = split_parts

In [None]:
# Peek at the first held-out sample (all of its scaled feature values).
X_test[0, :]

array([-0.00237617,  0.76509206,  1.00772212,  1.10984668,  1.18165704,
       -1.19105191,  1.09274555,  1.86621505, -0.90102535, -0.97342978,
       -0.56550593,  1.08431186, -0.86991767, -0.77459667, -0.72451824,
       -0.45133547])

In [None]:
# Number of feature columns in the training matrix, wrapped in a one-element list.
n_features = X_train.shape[1]
input_shape = [n_features]

print("Input shape: {}".format(input_shape))

Input shape: [16]


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, f1_score

In [None]:
# Fit a 100-tree random forest on the training split.
# Seeding the estimator pins its internal randomness (bootstrap samples and
# per-split feature sub-sampling), so the fitted model and every metric
# computed from it are the same on each Restart & Run All. The seed matches
# the one used for the train/test split.
model = RandomForestClassifier(n_estimators=100, random_state=101)
model.fit(X_train, Y_train)

RandomForestClassifier()

In [None]:
# Predict a class label for every row of the held-out test split.
predictions_model = model.predict(X=X_test)

In [None]:
# Per-class precision, recall and F1 of the test-split predictions.
report_text = classification_report(Y_test, predictions_model)
print(report_text)

              precision    recall  f1-score   support

           0       0.96      0.94      0.95        83
           1       0.96      0.98      0.97       125

    accuracy                           0.96       208
   macro avg       0.96      0.96      0.96       208
weighted avg       0.96      0.96      0.96       208



In [None]:
# Confusion matrix of true labels (first argument) against predicted labels.
test_confusion = confusion_matrix(Y_test, predictions_model)
print(test_confusion)

[[ 78   5]
 [  3 122]]


In [None]:
# Two hand-written samples, each holding 16 raw (unscaled) feature values.
sample_a = [40, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1]
sample_b = [32, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0]

# Wrap each sample in an outer list so it forms a one-row, two-dimensional input.
A = [sample_a]
B = [sample_b]

In [None]:
# The forest was trained on standardised features, so a raw sample has to go
# through the same fitted scaler before prediction. Fed unscaled, an age of 40
# lies far outside the standardised range the trees were split on, and the
# raw 0/1 codes no longer line up with the scaled values seen in training.
# Labelling the columns lets the scaler match them to the names it was fitted with.
A_scaled = scaler.transform(pd.DataFrame(A, columns=X.columns))
predictions_A = model.predict(A_scaled)
predictions_A

array([1])

In [None]:
# The forest was trained on standardised features, so a raw sample has to go
# through the same fitted scaler before prediction. Fed unscaled, an age of 32
# lies far outside the standardised range the trees were split on, and the
# raw 0/1 codes no longer line up with the scaled values seen in training.
# Labelling the columns lets the scaler match them to the names it was fitted with.
B_scaled = scaler.transform(pd.DataFrame(B, columns=X.columns))
predictions_B = model.predict(B_scaled)
predictions_B

array([0])

In [None]:
# Save the fitted model in pickle format.
import pickle

# The context manager guarantees the file is flushed and closed, even if
# pickling raises part-way through; a bare open() would leave the handle
# dangling until garbage collection.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

# NOTE(review): the model was trained on features transformed by `scaler`, so
# whoever loads model.pkl needs that fitted scaler too; consider persisting it.