# Assignment 3: The Operative (Advanced Level)

## Part 1 - Data Preprocessing

### Importing the libraries

In [107]:
import pandas as pd
import numpy as np
import streamlit as st
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.decomposition import PCA, KernelPCA
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.utils.class_weight import compute_class_weight

### Importing the dataset

In [21]:
dataset = pd.read_csv("alzheimers_disease_data.csv")

In [22]:
dataset.head()

Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,...,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness,Diagnosis,DoctorInCharge
0,4751,73,0,0,2,22.927749,0,13.297218,6.327112,1.347214,...,0,0,1.725883,0,0,0,1,0,0,XXXConfid
1,4752,89,0,0,0,26.827681,0,4.542524,7.619885,0.518767,...,0,0,2.592424,0,0,0,0,1,0,XXXConfid
2,4753,73,0,3,1,17.795882,0,19.555085,7.844988,1.826335,...,0,0,7.119548,0,1,0,1,0,0,XXXConfid
3,4754,74,1,0,1,33.800817,1,12.209266,8.428001,7.435604,...,0,1,6.481226,0,0,0,0,0,0,XXXConfid
4,4755,89,0,0,0,20.716974,0,18.454356,6.310461,0.795498,...,0,0,0.014691,0,0,1,1,0,0,XXXConfid


In [23]:
#check missing data
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2149 entries, 0 to 2148
Data columns (total 35 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   PatientID                  2149 non-null   int64  
 1   Age                        2149 non-null   int64  
 2   Gender                     2149 non-null   int64  
 3   Ethnicity                  2149 non-null   int64  
 4   EducationLevel             2149 non-null   int64  
 5   BMI                        2149 non-null   float64
 6   Smoking                    2149 non-null   int64  
 7   AlcoholConsumption         2149 non-null   float64
 8   PhysicalActivity           2149 non-null   float64
 9   DietQuality                2149 non-null   float64
 10  SleepQuality               2149 non-null   float64
 11  FamilyHistoryAlzheimers    2149 non-null   int64  
 12  CardiovascularDisease      2149 non-null   int64  
 13  Diabetes                   2149 non-null   int64

### Dropping the 'DoctorInCharge' column as it contains confidential information

In [25]:
dataset.drop('DoctorInCharge', axis=1, inplace=True)

In [26]:
#checking balance of dataset
dataset.Diagnosis.value_counts()

Diagnosis
0    1389
1     760
Name: count, dtype: int64

In [148]:
X = dataset.drop("Diagnosis", axis=1)
y = dataset["Diagnosis"]

In [89]:
# Compute cloass weights to handle imbalance
class_weights = compute_class_weight(class_weight = 'balanced', classes=np.array([0,1]), y=y)
class_weights_dict = {0:class_weights[0], 1:class_weights[1]}

In [28]:
def evaluate_model(y_test, y_pred):

  accuracy = accuracy_score(y_test, y_pred)
  conf_matrix = confusion_matrix(y_test, y_pred)
  class_report = classification_report(y_test, y_pred)

  print("Accuracy: {:.2f} %".format(accuracy*100))
  print()
  print("Confusion Matrix:")
  print(conf_matrix)
  print()
  print("Classification Report:")
  print(class_report)


### Getting the matrix of features X and the dependent variable vector y while applying Feature Selection

In [245]:
mi_scores = mutual_info_classif(X, y)
mi_df = pd.DataFrame({"Feature": X.columns, "MI Score": mi_scores})
mi_df.sort_values(by="MI Score", ascending=False, inplace=True)

In [247]:
mi_df

Unnamed: 0,Feature,MI Score
24,FunctionalAssessment,0.095454
27,ADL,0.076564
23,MMSE,0.066278
25,MemoryComplaints,0.044373
26,BehavioralProblems,0.024555
0,PatientID,0.021535
7,AlcoholConsumption,0.01778
21,CholesterolHDL,0.017349
3,Ethnicity,0.012432
1,Age,0.009938


In [249]:
#now select top five features
selector = SelectKBest(score_func=mutual_info_classif, k=5)
X = selector.fit_transform(X, y)

In [251]:
X

array([[21.46353236,  6.51887697,  0.        ,  0.        ,  1.72588346],
       [20.61326731,  7.1186955 ,  0.        ,  0.        ,  2.59242413],
       [ 7.35624862,  5.89507735,  0.        ,  0.        ,  7.11954774],
       ...,
       [17.0110031 ,  1.97213657,  0.        ,  0.        ,  5.03633399],
       [ 4.03049088,  5.17389096,  0.        ,  0.        ,  3.78539871],
       [11.11477737,  6.30754331,  0.        ,  1.        ,  8.32756301]])

In [255]:
selected_features = selector.get_support(indices = True)

In [257]:
best_feature_names = dataset.columns[selected_features]
best_feature_names

Index(['MMSE', 'FunctionalAssessment', 'MemoryComplaints',
       'BehavioralProblems', 'ADL'],
      dtype='object')

### Creating the Training Set and the Test Set




In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Part 2 - Building and Training the model

### Specify baseline models

In [36]:
#fit, predict and evaluate base models

models = [RandomForestClassifier(),  XGBClassifier(), LGBMClassifier(verbose = -1), CatBoostClassifier(silent = True), GradientBoostingClassifier() ]

for model in models:
  print()
  print(model)
  model.fit(X_train, y_train)
  evaluate_model(y_test, model.predict(X_test))
  cv_scores = cross_val_score(model, X, y, cv=5)
  print("Cross-validation accuracy: {:.2f} %".format(cv_scores.mean()*100))
  print('*' * 50)


RandomForestClassifier()
Accuracy: 95.12 %

Confusion Matrix:
[[272   5]
 [ 16 137]]

Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.98      0.96       277
           1       0.96      0.90      0.93       153

    accuracy                           0.95       430
   macro avg       0.95      0.94      0.95       430
weighted avg       0.95      0.95      0.95       430

Cross-validation accuracy: 94.50 %
**************************************************

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_oneho

### Tune the baseline models

In [102]:
#Define the models with the hyperparameters

rf_model = RandomForestClassifier(random_state=42, class_weight=class_weights_dict)
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None]
}

catboost_model = CatBoostClassifier(random_seed=42, class_weights=[class_weights[0], class_weights[1]], verbose=0)
catboost_param_grid = {
    'iterations': [100, 200, 300],
    'learning_rate': [0.1, 0.2, 0.3]
}

xgb_model = XGBClassifier(random_state=42, scale_pos_weight=class_weights[1]/class_weights[0])
xgb_param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.1, 0.2, 0.3]
}

lgbm_model = LGBMClassifier(random_state = 42, class_weight=class_weights_dict, verbose = -1)
lgbm_param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.1, 0.2, 0.3]
}
gbc_model = GradientBoostingClassifier(random_state = 42) #apply sample weights during fitting
gbc_param_grid = {
    'learning_rate': [0.2], 
    'max_depth': [7], 
    'n_estimators' : [200]
}

#### Running RandomizedSearchCV to get best parameters

In [109]:
# Perform RandomizedSearchCV for each model
rf_random = RandomizedSearchCV(estimator=rf_model, param_distributions=rf_param_grid, n_iter=10, cv=3, verbose=1, n_jobs=-1)
catboost_random = RandomizedSearchCV(estimator=catboost_model, param_distributions=catboost_param_grid, n_iter=10, cv=3, verbose=1, n_jobs=-1)
xgb_random = RandomizedSearchCV(estimator=xgb_model, param_distributions=xgb_param_grid, n_iter=10, cv=3, verbose=1, n_jobs=-1)
lgbm_random = RandomizedSearchCV(estimator=lgbm_model, param_distributions=lgbm_param_grid, n_iter=10, cv=3, verbose=1, n_jobs=-1)
gbc_random = RandomizedSearchCV(estimator=gbc_model, param_distributions=gbc_param_grid, n_iter=10, cv=3, verbose=1, n_jobs=-1)


In [118]:
# Fit the models with hyperparameter tuning
rf_random.fit(X_train, y_train)
catboost_random.fit(X_train, y_train)
xgb_random.fit(X_train, y_train)
lgbm_random.fit(X_train, y_train)
gbc_random.fit(X_train, y_train)


Fitting 3 folds for each of 10 candidates, totalling 30 fits




Fitting 3 folds for each of 9 candidates, totalling 27 fits




Fitting 3 folds for each of 9 candidates, totalling 27 fits




Fitting 3 folds for each of 9 candidates, totalling 27 fits




Fitting 3 folds for each of 1 candidates, totalling 3 fits


In [121]:
print("Best parameters for Random Forest:", rf_random.best_params_)
print("Best parameters for CatBoost:", catboost_random.best_params_)
print("Best parameters for XGBoost:", xgb_random.best_params_)
print("Best parameters for LightGBM:", lgbm_random.best_params_)
print("Best parameters for Gradient Boosting:", gbc_random.best_params_)

Best parameters for Random Forest: {'n_estimators': 200, 'max_depth': 10}
Best parameters for CatBoost: {'learning_rate': 0.2, 'iterations': 100}
Best parameters for XGBoost: {'n_estimators': 100, 'learning_rate': 0.1}
Best parameters for LightGBM: {'n_estimators': 100, 'learning_rate': 0.1}
Best parameters for Gradient Boosting: {'n_estimators': 200, 'max_depth': 7, 'learning_rate': 0.2}


### Building an Ensemble model using stacking

In [127]:
# Define meta-model
meta_model = LogisticRegression()

model_1 = rf_random.best_estimator_
model_2 = lgbm_random.best_estimator_
model_3 = catboost_random.best_estimator_
model_4 = xgb_random.best_estimator_
model_5 = gbc_random.best_estimator_
ensemble_model = StackingClassifier(estimators=[('model_1', model_1),
                                              ('model_2', model_2),
                                              ('model_3', model_3),
                                              ('model_4', model_4),
                                              ('model_5', model_5)], final_estimator=meta_model )

In [131]:
ensemble_model.fit(X_train, y_train)

In [None]:
# Define the hyperparameter grid for the meta-model (Logistic Regression)
meta_param_grid = {
    'final_estimator__C': [0.01, 0.1, 1.0, 10.0],
    'final_estimator__penalty': ['l1', 'l2']
}

# Perform grid search on the StackingClassifier
meta_grid_search = GridSearchCV(estimator=ensemble_model, param_grid=meta_param_grid, cv=5, scoring='accuracy', n_jobs=-1)
meta_grid_search.fit(X_train, y_train)

# Get the best Stacking Classifier with tuned meta-model
best_stacking_model = meta_grid_search.best_estimator_
print("Best meta-model parameters:", meta_grid_search.best_params_)

## Part 3 - Evaluating and Tuning the model

### Making predictions on the test set

In [135]:
y_pred = best_stacking_model.predict(X_test)

### Evaluating the model on the test set

In [137]:
evaluate_model(y_test, y_pred)

Accuracy: 96.05 %

Confusion Matrix:
[[272   5]
 [ 12 141]]

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.98      0.97       277
           1       0.97      0.92      0.94       153

    accuracy                           0.96       430
   macro avg       0.96      0.95      0.96       430
weighted avg       0.96      0.96      0.96       430



In [140]:
cv_scores = cross_val_score(best_stacking_model, X, y, cv=5)
print("Cross-validation accuracy: {:.2f} %".format(cv_scores.mean()*100))

Cross-validation accuracy: 95.01 %


In [144]:
#save the model for deployment
import joblib
joblib.dump(model, 'alzeimer_predictor_model.pkl')

['alzeimer_predictor_model.pkl']

#### Make a prediction for a single case and also get probability of prediction

In [331]:
input = [1,1,0,1,10]
prediction = best_stacking_model.predict([input])
prediction

array([1], dtype=int64)

In [333]:
prob = best_stacking_model.predict_proba([input])
prob = prob.tolist()[0]
prob

[0.06583375364815303, 0.934166246351847]

## Part 4: Deploying the saved best ensemble model through a Streamlit App

In [None]:
'''
import streamlit as st
import joblib
import numpy as np
# load the model
model = joblib.load('alzeimer_predictor_model.pkl')

# Get user input
st.write("""### Enter Patient Data
""")
f_a_score = st.text_input("Functional Assessment Score (0-10)", value="0.0")
adl_score = st.text_input(label = "Activities of Daily Living (ADL) Score (0-10)", value = "0.0")
mmse = st.text_input("Mini-Mental State Exam (MMSE) Score (0-30)", value="0.0")

memory_complaints = st.radio("Memory Complaints", ('No', 'Yes'))
behavioral_problems = st.radio("Behavioral Problems", ('No', 'Yes'))

# Encode features as categorical variables in order to use radio buttons in app
memory_complaints = 1 if memory_complaints =='Yes' else 0
behavioral_problems = 1 if behavioral_problems == 'Yes' else 0 

# Run prediction
if st.button('Get Prediction'):
    try:
        f_a_score = float(f_a_score)
        adl_score = float(adl_score)
        mmse = float(mmse)
        # Check if the values are within the predefined ranges
        if not (0 <= f_a_score <= 10):
            st.error("Functional Assessment Score must be between 0 and 10.")
        elif not (0 <= adl_score <= 10):
            st.error("ADL Score must be between 0 and 10.")
        elif not (0 <= mmse <= 30):
            st.error("MMSE Score must be between 0 and 30.")
        else:
            input_data = np.array([[mmse, f_a_score, memory_complaints, behavioral_problems, adl_score]])    
          
            prediction = model.predict(input_data)
            st.write(f"Prediction: {'Alzheimer’s Likely' if prediction[0] == 1 else 'Alzheimer’s not Likely'}")
            prediction_probability = model.predict_proba(input_data)
            prob = prediction_probability.tolist()[0]
            st.write(f"Prediction Confidence: Alzheimer’s {prob[1]* 100:.2f}%  ------ No Alzheimer’s {prob[0]* 100:.2f}%")
    except ValueError:
        st.error("Please ensure that all inputs are within the ranges stated")
        '''