In [125]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline
from scipy import stats
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns 
import numpy as np

__Data Preprocessing and Data cleaning__

In [127]:
df = pd.read_csv('diabetes_final.csv')
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [128]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB


In [129]:
df.duplicated().sum()

3854

In [130]:
df_clean = df.drop_duplicates()

In [131]:
df_clean.isna().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64

In [132]:
label_encoder = LabelEncoder()
df_clean['gender'] = label_encoder.fit_transform(df_clean['gender'])
df_clean['smoking_history'] = label_encoder.fit_transform(df_clean['smoking_history'])

df_clean.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['gender'] = label_encoder.fit_transform(df_clean['gender'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['smoking_history'] = label_encoder.fit_transform(df_clean['smoking_history'])


Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,0,80.0,0,1,4,25.19,6.6,140,0
1,0,54.0,0,0,0,27.32,6.6,80,0
2,1,28.0,0,0,4,27.32,5.7,158,0
3,0,36.0,0,0,1,23.45,5.0,155,0
4,1,76.0,1,1,1,20.14,4.8,155,0


In [133]:
features = ['age', 'bmi', 'HbA1c_level', 'blood_glucose_level']

In [134]:
plt.figure(figsize = (15,10))

#Iterate through each column and plot on a separate subplot
for i, column in enumerate(df_clean[features].columns):
    plt.subplot(7,3, i+1)
    sns.boxplot(df_clean[column])
    plt.title(f'Box plot of {column}')

#Adjust layout and show the plot
plt.tight_layout()
plt.show()

  plt.show()


In [135]:
filtered_data = df_clean[np.abs(stats.zscore(df_clean[features])) <= 3]

In [136]:
plt.figure(figsize = (15,10))

#Iterate through each column and plot on a separate subplot
for i, column in enumerate(filtered_data[features].columns):
    plt.subplot(7,3, i+1)
    sns.boxplot(filtered_data[column])
    plt.title(f'Box plot of {column}')

#Adjust layout and show the plot
plt.tight_layout()
plt.show()

  plt.show()


In [137]:
df_clean

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,0,80.0,0,1,4,25.19,6.6,140,0
1,0,54.0,0,0,0,27.32,6.6,80,0
2,1,28.0,0,0,4,27.32,5.7,158,0
3,0,36.0,0,0,1,23.45,5.0,155,0
4,1,76.0,1,1,1,20.14,4.8,155,0
...,...,...,...,...,...,...,...,...,...
99994,0,36.0,0,0,0,24.60,4.8,145,0
99996,0,2.0,0,0,0,17.37,6.5,100,0
99997,1,66.0,0,0,3,27.83,5.7,155,0
99998,0,24.0,0,0,4,35.42,4.0,100,0


In [138]:
X= df_clean.copy()
X.drop('diabetes', axis =1, inplace=True)

In [139]:
X.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level
0,0,80.0,0,1,4,25.19,6.6,140
1,0,54.0,0,0,0,27.32,6.6,80
2,1,28.0,0,0,4,27.32,5.7,158
3,0,36.0,0,0,1,23.45,5.0,155
4,1,76.0,1,1,1,20.14,4.8,155


In [140]:
y = df_clean['diabetes']
y

0        0
1        0
2        0
3        0
4        0
        ..
99994    0
99996    0
99997    0
99998    0
99999    0
Name: diabetes, Length: 96146, dtype: int64

__Data Modelling__

In [142]:
scaler = StandardScaler()
X = scaler.fit_transform(X)

Data Splitting (70-30)

In [144]:
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.3, random_state=4)

1. Decision Tree

In [146]:
L =[]
for i in range(3,30):
    ml_model = DecisionTreeClassifier(max_depth = i)
    # model = models[ml_model]
    ml_model.fit(X_train, y_train)
    y_pred = ml_model.predict(X_test)
    L.append(round(accuracy_score(y_test,y_pred),4))


print(max(L))

0.9723


In [147]:
print("[INFO] using '{}' model".format(DecisionTreeClassifier))
model = DecisionTreeClassifier(max_depth = 7)
model.fit(X_train, y_train)

[INFO] using '<class 'sklearn.tree._classes.DecisionTreeClassifier'>' model


In [148]:
print("[INFO] evaluating...")
predictions = model.predict(X_test)
print(classification_report(y_test, predictions))

print("Test set Accuracy: ", round(accuracy_score(y_test, predictions),4))

[INFO] evaluating...
              precision    recall  f1-score   support

           0       0.97      1.00      0.99     26287
           1       1.00      0.69      0.81      2557

    accuracy                           0.97     28844
   macro avg       0.99      0.84      0.90     28844
weighted avg       0.97      0.97      0.97     28844

Test set Accuracy:  0.9723


In [149]:
from sklearn.metrics import classification_report, confusion_matrix
conf_mat = confusion_matrix(y_test,predictions)
print(conf_mat)

[[26287     0]
 [  800  1757]]


2. Random Forest

In [151]:
from sklearn.metrics import roc_auc_score
# Memanggil model
rf_model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)

# Melaith model
rf_model.fit(X_train, y_train)

# Prediksi
y_pred = rf_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

print("ROC-AUC:", roc_auc)
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Akurasi:", accuracy)

ROC-AUC: 0.843547658894012
Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.98     26287
           1       1.00      0.69      0.81      2557

    accuracy                           0.97     28844
   macro avg       0.98      0.84      0.90     28844
weighted avg       0.97      0.97      0.97     28844

Akurasi: 0.9722299265011788


In [152]:
from sklearn.metrics import classification_report, confusion_matrix
conf_mat = confusion_matrix(y_test,y_pred)
print(conf_mat)

[[26286     1]
 [  800  1757]]


__Model Optimization__

In [154]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Load the dataset
diabetes_data = pd.read_csv("diabetes_final.csv")

In [155]:
# Encode categorical features
le_gender = LabelEncoder()
le_smoking = LabelEncoder()
diabetes_data['gender'] = le_gender.fit_transform(diabetes_data['gender'])
diabetes_data['smoking_history'] = le_smoking.fit_transform(diabetes_data['smoking_history'])

In [156]:
# Define feature and target variables
X = pd.DataFrame(diabetes_data.drop("diabetes", axis=1))
y = diabetes_data["diabetes"]

In [157]:
# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert X_train and X_test 
X_train = pd.DataFrame(X_train, columns=X.columns)
X_test = pd.DataFrame(X_test, columns=X.columns)

In [158]:
# Feature Scaling
scaler = StandardScaler()
X_train[['age', 'bmi', 'HbA1c_level', 'blood_glucose_level']] = scaler.fit_transform(X_train[['age', 'bmi', 'HbA1c_level', 'blood_glucose_level']])
X_test[['age', 'bmi', 'HbA1c_level', 'blood_glucose_level']] = scaler.transform(X_test[['age', 'bmi', 'HbA1c_level', 'blood_glucose_level']])

In [159]:
# Data Balancing: Oversampling 
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

In [160]:
# Model Optimization: Decision Tree with GridSearch for max_depth tuning only
param_grid_tree = {
    "max_depth": [3, 5, 7, 10, 15, None],
    "class_weight": ["balanced"]
}
optimized_dt_clf = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    param_grid_tree,
    cv=5,
    scoring="recall", 
    n_jobs=-1,
    verbose=1
)
optimized_dt_clf.fit(X_resampled, y_resampled)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


In [161]:
# Best parameters for Decision Tree
print("Best parameters for optimized Decision Tree:", optimized_dt_clf.best_params_)
print("Best recall score for optimized Decision Tree:", optimized_dt_clf.best_score_)

Best parameters for optimized Decision Tree: {'class_weight': 'balanced', 'max_depth': None}
Best recall score for optimized Decision Tree: 0.9663703607948373


In [162]:
print("[INFO] evaluating optimized model...")
optimized_predictions = optimized_dt_clf.predict(X_test)

[INFO] evaluating optimized model...


In [163]:
print(classification_report(y_test, optimized_predictions))
print("Test set Accuracy (Optimized Model): ", round(accuracy_score(y_test, optimized_predictions), 4))

              precision    recall  f1-score   support

           0       0.98      0.97      0.97     18292
           1       0.68      0.75      0.71      1708

    accuracy                           0.95     20000
   macro avg       0.83      0.86      0.84     20000
weighted avg       0.95      0.95      0.95     20000

Test set Accuracy (Optimized Model):  0.9489


In [164]:
optimized_conf_mat = confusion_matrix(y_test, optimized_predictions)
print("Confusion Matrix (Optimized Model):\n", optimized_conf_mat)

Confusion Matrix (Optimized Model):
 [[17698   594]
 [  428  1280]]


In [165]:
import streamlit as st

In [166]:
# Title of the app
st.title("Database Prediction")

# User input for a number
user_input = st.number_input("Enter a number", min_value=1, max_value=100, value=25)

# Display a simple message based on input
st.write(f"User input is: {user_input}")
st.write(f"The square of the input is: {user_input ** 2}")