# CS189 HW6
Timothy Quang Nguyen

timotqn2@uci.edu

## Environment Setup

In [1]:
# Task: Predict the Baseline histological staging (multi-class classification)

import numpy as np # for data organization
import pandas as pd # for data organization

import sklearn as sk # for machine learning
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score # For Evaluation
from sklearn.preprocessing import MinMaxScaler # To Scale Features
from sklearn.preprocessing import OneHotEncoder # To Encode Non-Numeric Columns
from sklearn.model_selection import GridSearchCV, train_test_split # For Hyper Parameter Tuning and Data Splitting
from skopt import BayesSearchCV # For Smart Hyper Parameter Tuning

from sklearn.linear_model import LogisticRegression # For Logistic Regression
from sklearn.ensemble import RandomForestClassifier # For Random Forest Classifier
from sklearn.svm import SVC # For Support Vector Classifier
from xgboost import XGBClassifier # For XGB Classifier

import matplotlib.pyplot as plt # for plotting
import seaborn as sns # for plotting

In [2]:
from ucimlrepo import fetch_ucirepo 
  
# Download the UCI "Hepatitis C Virus (HCV) for Egyptian patients" dataset (repository id 503).
hepatitis_c_virus_hcv_for_egyptian_patients = fetch_ucirepo(id=503)

# Pull the feature table and the target table out of the fetched payload
# (both are exposed as pandas dataframes).
X, y = (
    hepatitis_c_virus_hcv_for_egyptian_patients.data.features,
    hepatitis_c_virus_hcv_for_egyptian_patients.data.targets,
)

# Show the dataset-level metadata first, then the per-variable information,
# each starting on its own line.
print(
    hepatitis_c_virus_hcv_for_egyptian_patients.metadata,
    hepatitis_c_virus_hcv_for_egyptian_patients.variables,
    sep="\n",
)

{'uci_id': 503, 'name': 'Hepatitis C Virus (HCV) for Egyptian patients', 'repository_url': 'https://archive.ics.uci.edu/dataset/503/hepatitis+c+virus+hcv+for+egyptian+patients', 'data_url': 'https://archive.ics.uci.edu/static/public/503/data.csv', 'abstract': 'Egyptian patients who underwent treatment dosages for HCV about 18 months. Discretization should be applied based on expert recommendations; there is an attached file shows how.', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 1385, 'num_features': 28, 'feature_types': ['Real'], 'demographics': ['Age', 'Gender'], 'target_col': ['Baselinehistological staging'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2017, 'last_updated': 'Tue Apr 09 2024', 'dataset_doi': '10.24432/C5989V', 'creators': ['Sanaa Kamal', 'Mohamed ElEleimy', 'Doaa Hegazy', 'Mahmoud Nasr'], 'intro_paper': {'ID': 232, 'type': 'NATIVE', 'ti

## Function Declarations

In [None]:
def evaluate_model(y_true, y_pred, label_nums, label_names):
    """
    Plot a confusion matrix and print summary metrics for a classifier.

    The confusion matrix is drawn as an annotated heatmap. Accuracy and
    macro-averaged precision, recall and F1 are then printed, one per line,
    rounded to four decimal places.

    Parameters:
        y_true: True labels
        y_pred: Predicted labels
        label_nums: Label values, in the order the confusion-matrix
            rows/columns should appear
        label_names: Display names used as heatmap tick labels
            (presumably aligned one-to-one with label_nums -- verify at the
            call site)
    """

    # Confusion matrix, with rows/columns pinned to the caller-supplied label order
    cm = confusion_matrix(y_true, y_pred, labels=label_nums)

    # Explicit figure/axes so the heatmap and its labels target a known axes
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=label_names,
        yticklabels=label_names,
        ax=ax,
    )
    ax.set_xlabel("Predicted")
    ax.set_ylabel("True")
    ax.set_title("Confusion Matrix")
    plt.show()

    # Macro averaging weights every class equally, regardless of its support
    metrics = {
        'Accuracy': accuracy_score(y_true, y_pred),
        'Precision': precision_score(y_true, y_pred, average="macro"),
        'Recall': recall_score(y_true, y_pred, average="macro"),
        'F1 Score': f1_score(y_true, y_pred, average="macro")
    }

    # Print formatted metrics
    print("METRICS")
    print("="*40)
    for name, value in metrics.items():
        # Interpolate both the metric name and its value; the previous f-string
        # had no placeholders and printed the literal text "name:" every time.
        print(f'{name}:\t{value:.4f}')