# Model Klasifikasi Alzheimer

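Dataset ini berisi data pasien (2.149 baris, 35 kolom) untuk memprediksi diagnosis Alzheimer. Kita akan melatih model machine learning (Random Forest) untuk mengklasifikasikan kolom `Diagnosis` (0 = tidak Alzheimer, 1 = Alzheimer), mengevaluasinya pada data uji, lalu mengekspor model beserta scaler ke file pickle.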

In [23]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report, confusion_matrix
import warnings
# Suppress every warning for the rest of the session so cell outputs stay short.
# NOTE(review): with no category/module filter this also hides deprecation and
# data-validation warnings from pandas/scikit-learn — consider narrowing it.
warnings.filterwarnings('ignore')

## 1. Load Data

In [24]:
# Read the raw patient table from the CSV file located in the notebook's
# working directory.
df = pd.read_csv('alzheimers_disease_data.csv')
# Report (rows, columns) as a quick sanity check on the load.
print(f"Dataset shape: {df.shape}")
# Last expression of the cell: renders a preview of the first rows.
df.head()

Dataset shape: (2149, 35)


Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,...,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness,Diagnosis,DoctorInCharge
0,4751,73,0,0,2,22.927749,0,13.297218,6.327112,1.347214,...,0,0,1.725883,0,0,0,1,0,0,XXXConfid
1,4752,89,0,0,0,26.827681,0,4.542524,7.619885,0.518767,...,0,0,2.592424,0,0,0,0,1,0,XXXConfid
2,4753,73,0,3,1,17.795882,0,19.555085,7.844988,1.826335,...,0,0,7.119548,0,1,0,1,0,0,XXXConfid
3,4754,74,1,0,1,33.800817,1,12.209266,8.428001,7.435604,...,0,1,6.481226,0,0,0,0,0,0,XXXConfid
4,4755,89,0,0,0,20.716974,0,18.454356,6.310461,0.795498,...,0,0,0.014691,0,0,1,1,0,0,XXXConfid


In [None]:
# Column dtypes and non-null counts. DataFrame.info() writes its report
# straight to stdout and returns None, so it is called directly; wrapping it
# in print() would append a stray "None" line to the output.
df.info()

# Class balance of the target column.
print(f"\nDistribusi Diagnosis:")
print(df['Diagnosis'].value_counts())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2149 entries, 0 to 2148
Data columns (total 35 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   PatientID                  2149 non-null   int64  
 1   Age                        2149 non-null   int64  
 2   Gender                     2149 non-null   int64  
 3   Ethnicity                  2149 non-null   int64  
 4   EducationLevel             2149 non-null   int64  
 5   BMI                        2149 non-null   float64
 6   Smoking                    2149 non-null   int64  
 7   AlcoholConsumption         2149 non-null   float64
 8   PhysicalActivity           2149 non-null   float64
 9   DietQuality                2149 non-null   float64
 10  SleepQuality               2149 non-null   float64
 11  FamilyHistoryAlzheimers    2149 non-null   int64  
 12  CardiovascularDisease      2149 non-null   int64  
 13  Diabetes                   2149 non-null   int64

## 2. Data Preprocessing

In [26]:
# Remove the identifier and doctor columns so they are not fed to the model.
df_clean = df.drop(columns=['PatientID', 'DoctorInCharge'])

# Separate the label column from the remaining feature columns.
y = df_clean['Diagnosis']
X = df_clean.drop(columns='Diagnosis')

# Total NaN count across the feature matrix, and NaN count in the target.
print(f"\nMissing values in features: {X.isnull().sum().sum()}")
print(f"Missing values in target: {y.isnull().sum()}")


Missing values in features: 0
Missing values in target: 0


## 3. Split Data (80% Train, 20% Test)

In [27]:
# Hold out 20% of the rows for testing. Stratifying on the label keeps the
# class ratio the same in both partitions; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training set size: {X_train.shape[0]} samples ({X_train.shape[0]/len(X)*100:.1f}%)")
print(f"Test set size: {X_test.shape[0]} samples ({X_test.shape[0]/len(X)*100:.1f}%)")

# Per-class counts for each partition, each printed under its own heading.
for heading, labels in (
    (f"\nDistribusi kelas di Training set:", y_train),
    (f"\nDistribusi kelas di Test set:", y_test),
):
    print(heading)
    print(labels.value_counts())

Training set size: 1719 samples (80.0%)
Test set size: 430 samples (20.0%)

Distribusi kelas di Training set:
Diagnosis
0    1111
1     608
Name: count, dtype: int64

Distribusi kelas di Test set:
Diagnosis
0    278
1    152
Name: count, dtype: int64


## 4. Feature Scaling

In [28]:
# Learn the per-feature mean and standard deviation from the training rows
# only, then apply that same transform to both partitions so no test-set
# statistics leak into the scaler.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Sanity check: statistics taken over the whole scaled training matrix.
print(f"Mean training set (after scaling): {X_train_scaled.mean():.6f}")
print(f"Std training set (after scaling): {X_train_scaled.std():.6f}")

Mean training set (after scaling): -0.000000
Std training set (after scaling): 1.000000


## 5. Train Model (Random Forest Classifier)

In [29]:
# Random forest of 100 trees with capped depth and minimum split/leaf sizes.
# The seed is fixed for reproducibility and n_jobs=-1 lets scikit-learn use
# all available cores. fit() returns the estimator itself, so the fitted
# model is bound directly.
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=20,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1,
).fit(X_train_scaled, y_train)

# Last expression of the cell: displays the fitted estimator's summary.
model

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,20
,min_samples_split,5
,min_samples_leaf,2
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


## 6. Model Evaluation (Accuracy, Precision, Recall)

In [30]:
def _weighted_metrics(y_true, y_pred):
    """Return (accuracy, weighted precision, weighted recall) for one split.

    Precision and recall are averaged over the classes, weighted by class
    support; zero_division=0 scores a class with no predictions as 0 instead
    of raising a warning.
    """
    return (
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred, average='weighted', zero_division=0),
        recall_score(y_true, y_pred, average='weighted', zero_division=0),
    )


# Predict on both partitions so training and held-out scores can be compared.
y_train_pred = model.predict(X_train_scaled)
y_test_pred = model.predict(X_test_scaled)

train_accuracy, train_precision, train_recall = _weighted_metrics(y_train, y_train_pred)
test_accuracy, test_precision, test_recall = _weighted_metrics(y_test, y_test_pred)

# NOTE: support-weighted recall is mathematically equal to accuracy, so the
# "Recall" line below always repeats the "Accuracy" line. Use the per-class
# classification report for the recall of the Alzheimer class itself.

# Each group is labelled so the two sets of numbers cannot be confused.
print("Training set:")
print(f"  Accuracy  : {train_accuracy:.4f} ({train_accuracy*100:.2f}%)")
print(f"  Precision : {train_precision:.4f} ({train_precision*100:.2f}%)")
print(f"  Recall    : {train_recall:.4f} ({train_recall*100:.2f}%)")

print("Test set:")
print(f"  Accuracy  : {test_accuracy:.4f} ({test_accuracy*100:.2f}%)")
print(f"  Precision : {test_precision:.4f} ({test_precision*100:.2f}%)")
print(f"  Recall    : {test_recall:.4f} ({test_recall*100:.2f}%)")

  Accuracy  : 0.9837 (98.37%)
  Precision : 0.9839 (98.39%)
  Recall    : 0.9837 (98.37%)
  Accuracy  : 0.9395 (93.95%)
  Precision : 0.9397 (93.97%)
  Recall    : 0.9395 (93.95%)


In [None]:
# Per-class precision / recall / F1 on the held-out split, followed by the
# confusion matrix (rows are true labels, columns are predicted labels).
test_report = classification_report(y_test, y_test_pred)
test_confusion = confusion_matrix(y_test, y_test_pred)

print(test_report)
print(test_confusion)

              precision    recall  f1-score   support

           0       0.94      0.97      0.95       278
           1       0.94      0.88      0.91       152

    accuracy                           0.94       430
   macro avg       0.94      0.93      0.93       430
weighted avg       0.94      0.94      0.94       430

[[270   8]
 [ 18 134]]


## 7. Export Model ke File Pickle

In [32]:
# Output paths for the two artifacts; both are needed at inference time
# because the model was trained on scaled features.
model_filename = 'alzheimer_model.pkl'
scaler_filename = 'alzheimer_scaler.pkl'

# Serialise each fitted object to its own pickle file.
for artifact_path, artifact in ((model_filename, model), (scaler_filename, scaler)):
    with open(artifact_path, 'wb') as file:
        pickle.dump(artifact, file)



## 8. Test Load Model 

In [33]:
# Round-trip check: reload both artifacts from the files written above.
# These pickles were produced by this notebook; never unpickle files from an
# untrusted source, since pickle.load can execute arbitrary code.
with open(model_filename, 'rb') as file:
    loaded_model = pickle.load(file)

with open(scaler_filename, 'rb') as file:
    loaded_scaler = pickle.load(file)

# Scale the raw test features with the *reloaded* scaler, not the in-memory
# X_test_scaled, so that both pickles are actually exercised end to end.
test_predictions = loaded_model.predict(loaded_scaler.transform(X_test))
loaded_accuracy = accuracy_score(y_test, test_predictions)

# The reloaded pipeline must reproduce the in-memory model's predictions.
assert np.array_equal(test_predictions, y_test_pred), (
    "Predictions from the reloaded model/scaler differ from the in-memory model"
)
print(f"Loaded model accuracy: {loaded_accuracy:.4f} ({loaded_accuracy*100:.2f}%)")


## 9. Feature Importance (Top 10)

In [34]:
# Pair every feature name with the forest's importance score and rank the
# features from most to least important.
feature_importance = (
    pd.DataFrame({'feature': X.columns, 'importance': model.feature_importances_})
    .sort_values(by='importance', ascending=False)
)

# Print the ten highest-ranked features as an aligned "name : score" table.
top_features = feature_importance.head(10)
for row in top_features.to_dict('records'):
    print(f"{row['feature']:30s} : {row['importance']:.4f}")

FunctionalAssessment           : 0.1945
ADL                            : 0.1871
MMSE                           : 0.1225
MemoryComplaints               : 0.0973
BehavioralProblems             : 0.0493
BMI                            : 0.0283
CholesterolTriglycerides       : 0.0277
DietQuality                    : 0.0276
SleepQuality                   : 0.0274
CholesterolHDL                 : 0.0268


## 10. Test Case

In [35]:
# Load the exported artifacts the way a separate inference script would.
with open('alzheimer_model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)

with open('alzheimer_scaler.pkl', 'rb') as f:
    loaded_scaler = pickle.load(f)

# Two hand-written patient profiles, one dict per patient. The keys are listed
# in the same order for both records, and that order becomes the column order
# of the frame handed to the scaler.
first_patient = {
    'Age': 65,
    'Gender': 1,
    'Ethnicity': 2,
    'EducationLevel': 3,
    'BMI': 24.5,
    'Smoking': 0,
    'AlcoholConsumption': 5.0,
    'PhysicalActivity': 8.0,
    'DietQuality': 7.5,
    'SleepQuality': 8.0,
    'FamilyHistoryAlzheimers': 0,
    'CardiovascularDisease': 0,
    'Diabetes': 0,
    'Depression': 0,
    'HeadInjury': 0,
    'Hypertension': 0,
    'SystolicBP': 120,
    'DiastolicBP': 80,
    'CholesterolTotal': 180.0,
    'CholesterolLDL': 100.0,
    'CholesterolHDL': 60.0,
    'CholesterolTriglycerides': 120.0,
    'MMSE': 28.0,
    'FunctionalAssessment': 9.0,
    'MemoryComplaints': 0,
    'BehavioralProblems': 0,
    'ADL': 1.0,
    'Confusion': 0,
    'Disorientation': 0,
    'PersonalityChanges': 0,
    'DifficultyCompletingTasks': 0,
    'Forgetfulness': 0,
}
second_patient = {
    'Age': 85,
    'Gender': 0,
    'Ethnicity': 1,
    'EducationLevel': 0,
    'BMI': 19.2,
    'Smoking': 1,
    'AlcoholConsumption': 18.5,
    'PhysicalActivity': 2.0,
    'DietQuality': 1.2,
    'SleepQuality': 4.5,
    'FamilyHistoryAlzheimers': 1,
    'CardiovascularDisease': 1,
    'Diabetes': 1,
    'Depression': 1,
    'HeadInjury': 1,
    'Hypertension': 1,
    'SystolicBP': 165,
    'DiastolicBP': 95,
    'CholesterolTotal': 280.0,
    'CholesterolLDL': 190.0,
    'CholesterolHDL': 30.0,
    'CholesterolTriglycerides': 340.0,
    'MMSE': 5.0,
    'FunctionalAssessment': 1.5,
    'MemoryComplaints': 1,
    'BehavioralProblems': 1,
    'ADL': 8.5,
    'Confusion': 1,
    'Disorientation': 1,
    'PersonalityChanges': 1,
    'DifficultyCompletingTasks': 1,
    'Forgetfulness': 1,
}
dummy_data = pd.DataFrame([first_patient, second_patient])

# Apply the training-time scaling before predicting; the model was fitted on
# scaled features.
dummy_data_scaled = loaded_scaler.transform(dummy_data)

predictions = loaded_model.predict(dummy_data_scaled)
prediction_proba = loaded_model.predict_proba(dummy_data_scaled)

# One report per patient: predicted label, then the probability of each class
# (column 0 is printed as "Tidak Alzheimer", column 1 as "Alzheimer").
for i in range(len(predictions)):
    print(f"   Diagnosis Prediksi : {'Alzheimer' if predictions[i] == 1 else 'Tidak Alzheimer'} (Class {predictions[i]})")
    print(f"   Probabilitas:")
    print(f"      - Tidak Alzheimer : {prediction_proba[i][0]*100:.2f}%")
    print(f"      - Alzheimer       : {prediction_proba[i][1]*100:.2f}%")

   Diagnosis Prediksi : Tidak Alzheimer (Class 0)
   Probabilitas:
      - Tidak Alzheimer : 93.08%
      - Alzheimer       : 6.92%
   Diagnosis Prediksi : Alzheimer (Class 1)
   Probabilitas:
      - Tidak Alzheimer : 22.16%
      - Alzheimer       : 77.84%
