In [18]:
import pandas as pd
import numpy as np
import pickle

In [19]:
# Load the early-stage diabetes survey data from the CSV next to the notebook
# and show it as the cell output.
df = pd.read_csv(filepath_or_buffer="diabetes_early.csv")
df

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,class
0,40,Male,No,Yes,No,Yes,No,No,No,Yes,No,Yes,No,Yes,Yes,Yes,Positive
1,58,Male,No,No,No,Yes,No,No,Yes,No,No,No,Yes,No,Yes,No,Positive
2,41,Male,Yes,No,No,Yes,Yes,No,No,Yes,No,Yes,No,Yes,Yes,No,Positive
3,45,Male,No,No,Yes,Yes,Yes,Yes,No,Yes,No,Yes,No,No,No,No,Positive
4,60,Male,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Positive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
515,39,Female,Yes,Yes,Yes,No,Yes,No,No,Yes,No,Yes,Yes,No,No,No,Positive
516,48,Female,Yes,Yes,Yes,Yes,Yes,No,No,Yes,Yes,Yes,Yes,No,No,No,Positive
517,58,Female,Yes,Yes,Yes,Yes,Yes,No,Yes,No,No,No,Yes,Yes,No,Yes,Positive
518,32,Female,No,No,No,Yes,No,No,Yes,Yes,No,Yes,No,No,Yes,No,Negative


## **Data Cleaning & Data Processing**

### Label Encoding

In [20]:
from sklearn.preprocessing import LabelEncoder

# Columns to encode: every column except the numeric 'Age'.
columns_to_encode = df.columns[df.columns != 'Age']

# Encode each categorical column as integers with its OWN LabelEncoder and keep
# the fitted encoders, keyed by column name. Re-fitting a single shared encoder
# (as was done before) throws away the category -> integer mapping of every
# column except the last one, so values could not be decoded or new input
# encoded consistently later on.
label_encoders = {}
for column in columns_to_encode:
    label_encoder = LabelEncoder()
    df[column] = label_encoder.fit_transform(df[column])
    label_encoders[column] = label_encoder

# After the loop `label_encoder` is still bound to the encoder fitted on the
# last encoded column, which matches what the shared instance used to hold.

In [21]:
# Display the frame again to inspect the result of the label-encoding cell.
df

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,class
0,40,1,0,1,0,1,0,0,0,1,0,1,0,1,1,1,1
1,58,1,0,0,0,1,0,0,1,0,0,0,1,0,1,0,1
2,41,1,1,0,0,1,1,0,0,1,0,1,0,1,1,0,1
3,45,1,0,0,1,1,1,1,0,1,0,1,0,0,0,0,1
4,60,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
515,39,0,1,1,1,0,1,0,0,1,0,1,1,0,0,0,1
516,48,0,1,1,1,1,1,0,0,1,1,1,1,0,0,0,1
517,58,0,1,1,1,1,1,0,1,0,0,0,1,1,0,1,1
518,32,0,0,0,0,1,0,0,1,1,0,1,0,0,1,0,0


### Feature Engineering

In [22]:
# Find the smallest and largest age present in the data; these bounds guide the
# choice of age bands in the next cell.
min_value, max_value = df['Age'].min(), df['Age'].max()

# Report the upper bound first, then the lower bound.
print(f"Range atas untuk kolom Age: {max_value}")
print(f"Range bawah untuk kolom Age: {min_value}")

Range atas untuk kolom Age: 90
Range bawah untuk kolom Age: 16


In [23]:
# Bucket Age into numbered bands (stored as floats):
#   1: [16, 25]   2: (25, 35]   3: (35, 45]   4: (45, 55]
#   5: (55, 65]   6: (65, 75]   7: (75, 85]   8: above 85
# Ages below 16 match no band and are left as NaN.
# pd.cut with labels=False yields zero-based band positions, hence the +1.
df['Age_Category'] = (
    pd.cut(
        df['Age'],
        bins=[16, 25, 35, 45, 55, 65, 75, 85, float("inf")],
        labels=False,
        include_lowest=True,
    )
    .add(1)
    .astype(float)
)

# The raw age is replaced by its band, so remove it from the frame in place.
df.drop(columns='Age', inplace=True)

# Per the original author, 'Polyuria' and 'Polydipsia' are two highly
# correlated variables; add their product as an interaction feature.
df['PolyInteraction'] = df['Polyuria'].mul(df['Polydipsia'])

In [24]:
# Display the frame to inspect the columns produced by the feature-engineering cell.
df

Unnamed: 0,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,class,Age_Category,PolyInteraction
0,1,0,1,0,1,0,0,0,1,0,1,0,1,1,1,1,3.0,0
1,1,0,0,0,1,0,0,1,0,0,0,1,0,1,0,1,5.0,0
2,1,1,0,0,1,1,0,0,1,0,1,0,1,1,0,1,3.0,0
3,1,0,0,1,1,1,1,0,1,0,1,0,0,0,0,1,3.0,0
4,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,5.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
515,0,1,1,1,0,1,0,0,1,0,1,1,0,0,0,1,3.0,1
516,0,1,1,1,1,1,0,0,1,1,1,1,0,0,0,1,4.0,1
517,0,1,1,1,1,1,0,1,0,0,0,1,1,0,1,1,5.0,1
518,0,0,0,0,1,0,0,1,1,0,1,0,0,1,0,0,2.0,0


### Missing Value

In [25]:
# Count the missing values in each column before going any further.
df.isna().sum()

Gender                0
Polyuria              0
Polydipsia            0
sudden weight loss    0
weakness              0
Polyphagia            0
Genital thrush        0
visual blurring       0
Itching               0
Irritability          0
delayed healing       0
partial paresis       0
muscle stiffness      0
Alopecia              0
Obesity               0
class                 0
Age_Category          0
PolyInteraction       0
dtype: int64

### Data Splitting

In [26]:
# Separate the target column ('class') from the predictor columns.
y = df["class"]
X = df.drop(columns=["class"])

In [27]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [28]:
# Report the row counts of the full frame and of each split, one per line
# (features first, then labels, for both the training and the testing part).
print(
    f'Total seluruh dataset: {len(df)}',
    f'Traning dataset: {len(X_train)}',
    f'Traning dataset: {len(y_train)}',
    f'Testing dataset: {len(X_test)}',
    f'Testing dataset: {len(y_test)}',
    sep='\n',
)

Total seluruh dataset: 520
Traning dataset: 416
Traning dataset: 416
Testing dataset: 104
Testing dataset: 104


### SMOTE (Handling Imbalanced Data)

In [29]:
from imblearn import under_sampling, over_sampling
from imblearn.over_sampling import SMOTE

# Oversample the training split with SMOTE (seeded for reproducibility).
# Only the training data is resampled; the test split is left untouched.
# The resampled data replaces X_train / y_train, so the original training
# split is no longer available after this cell.
sm = SMOTE(random_state=42)
X_train, y_train = sm.fit_resample(
    X_train,
    y_train,
)

### Dimensionality Reduction using PCA

In [30]:
from sklearn.preprocessing import StandardScaler

# Standardize the features. The scaler learns its statistics from the training
# data only and is then applied unchanged to both splits.
scaler = StandardScaler().fit(X_train)
X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)

In [31]:
from sklearn.decomposition import PCA

# Project the standardized features onto 14 principal components.
pca = PCA(n_components=14)
# Reduces the 17 feature columns to 14 components.

# Fit the projection on the training data only, then apply the same fitted
# projection to the test data.
X_train_pca = pca.fit_transform(X_train_std)
X_test_pca = pca.transform(X_test_std)

## **MODEL**

### Logistic Regression + SMOTE + PCA

In [33]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

In [34]:
from sklearn.model_selection import cross_val_score, KFold

# Estimate the accuracy of logistic regression on the PCA-transformed training
# data with shuffled, seeded 10-fold cross-validation.
# NOTE(review): SMOTE, the scaler and PCA were all fit on the whole training
# set before these folds are made, so each validation fold has influenced the
# preprocessing; the fold scores are therefore likely optimistic.
num_folds = 10
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)
logreg_diab = LogisticRegression(random_state=42)

# One accuracy score per fold.
logres_cv = cross_val_score(
    logreg_diab,
    X_train_pca,
    y_train,
    cv=kf,
    scoring='accuracy',
)

# Per-fold report, numbering folds from 1.
for fold, accuracy in enumerate(logres_cv, start=1):
    print(f'Fold {fold}: Accuracy = {accuracy:.4f}')

# Mean accuracy over all folds.
average_accuracy = logres_cv.mean()
print(f'\nAverage Accuracy Across {num_folds}-Fold Cross-Validation: {average_accuracy:.4f}')

Fold 1: Accuracy = 0.9200
Fold 2: Accuracy = 0.9600
Fold 3: Accuracy = 0.9200
Fold 4: Accuracy = 0.9400
Fold 5: Accuracy = 0.9200
Fold 6: Accuracy = 0.9200
Fold 7: Accuracy = 0.8600
Fold 8: Accuracy = 0.9800
Fold 9: Accuracy = 0.9592
Fold 10: Accuracy = 0.8980

Average Accuracy Across 10-Fold Cross-Validation: 0.9277


In [35]:
# Train the final logistic-regression model on the full (resampled,
# PCA-transformed) training data and evaluate it on the held-out test split.
logreg_diab = LogisticRegression(random_state=42).fit(X_train_pca, y_train)
y_pred = logreg_diab.predict(X_test_pca)

# Per-class precision / recall / F1 on the test split.
print("\nClassification Report on Test Set:")
print(classification_report(y_test, y_pred))

# Overall test accuracy.
accuracy_after = accuracy_score(y_test, y_pred)
print("\nAccuracy:", accuracy_after)


Classification Report on Test Set:
              precision    recall  f1-score   support

           0       0.91      0.94      0.93        33
           1       0.97      0.96      0.96        71

    accuracy                           0.95       104
   macro avg       0.94      0.95      0.94       104
weighted avg       0.95      0.95      0.95       104


Accuracy: 0.9519230769230769


In [None]:
# Persist the fitted logistic-regression model to model.pkl.
# The file is opened in a `with` block so the handle is always flushed and
# closed, even if pickling raises (the bare open() call never closed it).
# NOTE(review): the model was trained on features that went through `scaler`
# and `pca`; those fitted objects are not saved here — confirm they are
# persisted elsewhere if the model is to be used for inference on raw input.
with open("model.pkl", "wb") as model_file:
    pickle.dump(logreg_diab, model_file)