In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,confusion_matrix, classification_report
from sklearn.ensemble import StackingClassifier
from sklearn.neural_network import MLPClassifier

In [3]:
from google.colab import drive # Load data
drive.mount('/content/drive')
train_data = pd.read_csv('/content/drive/MyDrive/Masters/ML_Final/Hepatitis-Train.csv')
test_data = pd.read_csv('/content/drive/MyDrive/Masters/ML_Final/Hepatitis-Test.csv')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
print(train_data.shape)

(134, 20)


In [5]:

print(test_data.shape)

(11, 20)


In [6]:
train_data.head()

Unnamed: 0,Age,Sex,Steroid,Antivirals,Fatigue,Malaise,Anorexia,Liver Big,Liver Firm,Spleen Palpable,Spiders,Ascites,Varices,Bilirubin,ALK Phosphate,SGOT,Albumin,PROTIME,Histology,TARGET
0,30,Female,no,yes,yes,yes,yes,no,yes,yes,yes,yes,yes,1.0,85,18,4.0,62.16,no,2
1,50,Male,no,yes,no,yes,yes,no,yes,yes,yes,yes,yes,0.9,135,42,3.5,62.16,no,2
2,78,Male,yes,yes,no,yes,yes,yes,yes,yes,yes,yes,yes,0.7,96,32,4.0,62.16,no,2
3,31,Male,no,no,yes,yes,yes,yes,yes,yes,yes,yes,yes,0.7,46,52,4.0,80.0,no,2
4,34,Male,yes,yes,yes,yes,yes,yes,yes,yes,yes,yes,yes,1.0,104,200,4.0,62.16,no,2


In [8]:
test_data.head()

Unnamed: 0,Age,Sex,Steroid,Antivirals,Fatigue,Malaise,Anorexia,Liver Big,Liver Firm,Spleen Palpable,Spiders,Ascites,Varices,Bilirubin,ALK Phosphate,SGOT,Albumin,PROTIME,Histology,TARGET
0,54,Male,no,yes,no,no,yes,yes,yes,no,yes,no,yes,3.9,120,28,3.5,43.0,yes,1
1,49,Male,no,yes,no,no,yes,yes,yes,no,no,yes,yes,1.4,85,70,3.5,35.0,yes,1
2,45,Male,yes,yes,no,no,no,yes,yes,yes,no,no,yes,1.9,104,114,2.4,62.16,yes,1
3,41,Male,yes,yes,no,yes,yes,yes,no,no,no,yes,no,4.2,65,120,3.4,62.16,yes,1
4,46,Male,yes,yes,no,no,no,yes,yes,yes,no,no,no,7.6,104,242,3.3,50.0,yes,1


In [9]:
train_data.dtypes

Unnamed: 0,0
Age,int64
Sex,object
Steroid,object
Antivirals,object
Fatigue,object
Malaise,object
Anorexia,object
Liver Big,object
Liver Firm,object
Spleen Palpable,object


In [10]:
# Preprocess the data (handle categorical and numeric data)
X_train = train_data.drop("TARGET", axis=1)
y_train = train_data["TARGET"]
X_test = test_data.drop("TARGET", axis=1)
y_test = test_data["TARGET"]

In [12]:
# One hot encoding
X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test)

# Ensure test and train datasets have the same columns
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

In [13]:
# Initialize classifiers
svc = LinearSVC(random_state=42, max_iter=10000)
dt = DecisionTreeClassifier(random_state=42)
rf = RandomForestClassifier(random_state=42)
knn = KNeighborsClassifier()

In [49]:
# function to evaluate and record metrics
def evaluate_model(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    cr = classification_report(y_test, y_pred,zero_division=0)
    return {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred,zero_division=0),
        "Recall": recall_score(y_test, y_pred,zero_division=0),
        "F1 Score": f1_score(y_test, y_pred,zero_division=0),
        "Confusion Matrix": cm,
        "Classification Report": cr
    }

# Evaluate each classifier
results = {}
models = {"LinearSVC": svc, "DecisionTree": dt, "RandomForest": rf, "KNN": knn}

for name, model in models.items():
    results[name] = evaluate_model(model, X_train, X_test, y_train, y_test)

In [50]:
for name, metrics in results.items():
    print(f"\nModel: {name}")
    print(f"Accuracy: {metrics['Accuracy']:.4f}")
    print(f"Precision: {metrics['Precision']:.4f}")
    print(f"Recall: {metrics['Recall']:.4f}")
    print(f"F1 Score: {metrics['F1 Score']:.4f}")
    print("Confusion Matrix:")
    print(metrics['Confusion Matrix'])
    print("\nClassification Report:")
    print(results[name]['Classification Report'])


Model: LinearSVC
Accuracy: 0.7273
Precision: 0.7143
Recall: 0.8333
F1 Score: 0.7692
Confusion Matrix:
[[5 1]
 [2 3]]

Classification Report:
              precision    recall  f1-score   support

           1       0.71      0.83      0.77         6
           2       0.75      0.60      0.67         5

    accuracy                           0.73        11
   macro avg       0.73      0.72      0.72        11
weighted avg       0.73      0.73      0.72        11


Model: DecisionTree
Accuracy: 0.8182
Precision: 0.8333
Recall: 0.8333
F1 Score: 0.8333
Confusion Matrix:
[[5 1]
 [1 4]]

Classification Report:
              precision    recall  f1-score   support

           1       0.83      0.83      0.83         6
           2       0.80      0.80      0.80         5

    accuracy                           0.82        11
   macro avg       0.82      0.82      0.82        11
weighted avg       0.82      0.82      0.82        11


Model: RandomForest
Accuracy: 0.7273
Precision: 0.7143
Rec

In [51]:
# Hyperparameter tuning for Random Forest
param_distributions = {
    "n_estimators": [50, 100, 200],
    "max_depth": [None, 10, 20, 30],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
}
random_search = RandomizedSearchCV(
    rf, param_distributions, n_iter=20, cv=3, scoring="accuracy", random_state=42
)
random_search.fit(X_train, y_train)
best_rf = random_search.best_estimator_

# Print the best parameters found by RandomizedSearchCV
print("Best Parameters found by RandomizedSearchCV:")
print(random_search.best_params_)

Best Parameters found by RandomizedSearchCV:
{'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_depth': 20}


In [52]:
# Evaluate tuned Random Forest
tuned_rf_metrics  = evaluate_model(best_rf, X_train, X_test, y_train, y_test)
results["Tuned RandomForest"] = tuned_rf_metrics

In [53]:
accuracy_tuned_rf = accuracy_score(y_test, best_rf.predict(X_test))
print("\nTuned Random Forest Metrics:")
print(f"Accuracy: {accuracy_tuned_rf:.2f}")
print(f"Precision: {tuned_rf_metrics['Precision']:.2f}")
print(f"Recall: {tuned_rf_metrics['Recall']:.2f}")
print(f"F1 Score: {tuned_rf_metrics['F1 Score']:.2f}")

# Print confusion matrix and classification report
print("\nConfusion Matrix for Tuned Random Forest:")
print(confusion_matrix(y_test, best_rf.predict(X_test)))

print("\nClassification Report for Tuned Random Forest:")
print(classification_report(y_test, best_rf.predict(X_test)))


Tuned Random Forest Metrics:
Accuracy: 0.64
Precision: 0.75
Recall: 0.50
F1 Score: 0.60

Confusion Matrix for Tuned Random Forest:
[[3 3]
 [1 4]]

Classification Report for Tuned Random Forest:
              precision    recall  f1-score   support

           1       0.75      0.50      0.60         6
           2       0.57      0.80      0.67         5

    accuracy                           0.64        11
   macro avg       0.66      0.65      0.63        11
weighted avg       0.67      0.64      0.63        11



In [54]:
# Feature importance from tuned Random Forest
feature_importances = pd.Series(best_rf.feature_importances_, index=X_train.columns).sort_values(ascending=False)
top_5_features = feature_importances.head(5)

# Display top 5 features
print("Top 5 Important Features from Random Forest:")
print(top_5_features)

Top 5 Important Features from Random Forest:
Albumin        0.141730
Bilirubin      0.114195
Varices_no     0.087366
PROTIME        0.085206
Ascites_yes    0.058633
dtype: float64


In [61]:
# Stacking ensemble
stacking_model = StackingClassifier(
    estimators=[("svc", svc), ("dt", dt), ("rf", rf), ("knn", knn)],
    final_estimator=MLPClassifier(random_state=42,max_iter=430),
)
results["Stacking"] = evaluate_model(stacking_model, X_train, X_test, y_train, y_test)

# Output the results
results_df = pd.DataFrame(results).T
print(results_df)



                    Accuracy Precision    Recall  F1 Score  Confusion Matrix  \
LinearSVC           0.727273  0.714286  0.833333  0.769231  [[5, 1], [2, 3]]   
DecisionTree        0.818182  0.833333  0.833333  0.833333  [[5, 1], [1, 4]]   
RandomForest        0.727273  0.714286  0.833333  0.769231  [[5, 1], [2, 3]]   
KNN                 0.454545       0.0       0.0       0.0  [[0, 6], [0, 5]]   
Tuned RandomForest  0.636364      0.75       0.5       0.6  [[3, 3], [1, 4]]   
Stacking            0.636364      0.75       0.5       0.6  [[3, 3], [1, 4]]   

                                                Classification Report  
LinearSVC                         precision    recall  f1-score   ...  
DecisionTree                      precision    recall  f1-score   ...  
RandomForest                      precision    recall  f1-score   ...  
KNN                               precision    recall  f1-score   ...  
Tuned RandomForest                precision    recall  f1-score   ...  
Stackin