In [1]:
"""
Model Training for HCV Disease Prediction
==========================================
Training multiple classification algorithms to predict HCV disease stages
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import time
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                           f1_score, confusion_matrix, classification_report,
                           roc_auc_score, roc_curve, auc)
from sklearn.model_selection import cross_val_score, GridSearchCV
import warnings
# NOTE(review): this suppresses *all* warnings for the rest of the session,
# not only noisy ones; consider narrowing the filter by category/module.
warnings.filterwarnings('ignore')

# Single source of truth for the seed so that later cells can pass the same
# value to estimators (e.g. ``random_state=RANDOM_SEED``) instead of repeating
# the literal. Seeding here only affects NumPy's global random state.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

print("Libraries loaded successfully")

Libraries loaded successfully


In [2]:
# Load preprocessed data
data_dir = '../data/processed/'

X_train = pd.read_csv(f'{data_dir}X_train.csv')
X_val = pd.read_csv(f'{data_dir}X_val.csv')
X_test = pd.read_csv(f'{data_dir}X_test.csv')

# Each label file is read as a frame and reduced to its 'Category' column.
y_train = pd.read_csv(f'{data_dir}y_train.csv')['Category']
y_val = pd.read_csv(f'{data_dir}y_val.csv')['Category']
y_test = pd.read_csv(f'{data_dir}y_test.csv')['Category']

# Fail fast on misaligned splits: a row-count or column mismatch would
# otherwise only surface later as a confusing error inside fit()/predict().
for split_name, X_split, y_split in (
    ('train', X_train, y_train),
    ('val', X_val, y_val),
    ('test', X_test, y_test),
):
    assert len(X_split) == len(y_split), (
        f"{split_name}: {len(X_split)} feature rows vs {len(y_split)} labels"
    )
    assert list(X_split.columns) == list(X_train.columns), (
        f"{split_name}: feature columns differ from the training split"
    )

print("Data shapes:")
print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"X_val: {X_val.shape}, y_val: {y_val.shape}")
print(f"X_test: {X_test.shape}, y_test: {y_test.shape}")

# NOTE(review): the "balanced with SMOTE" label describes upstream
# preprocessing that is not visible in this notebook; the counts printed
# below are the only evidence of balance here.
print("\nClass distribution in training (balanced with SMOTE):")
print(y_train.value_counts().sort_index())

Data shapes:
X_train: (1264, 22), y_train: (1264,)
X_val: (118, 22), y_val: (118,)
X_test: (118, 22), y_test: (118,)

Class distribution in training (balanced with SMOTE):
Category
0    316
1    316
2    316
3    316
Name: count, dtype: int64


In [None]:
class ModelTrainer:
    """Train registered classifiers and compare them on a validation set.

    Models are registered by name with ``select_model``. ``train_model`` and
    ``train_all`` fit them, store metrics in ``results`` and keep track of the
    model with the highest weighted validation F1-score.

    Attributes:
        models: Mapping of model name -> estimator object.
        results: Mapping of model name -> dict holding the fitted model, its
            metrics, the fit time and the validation predictions.
        best_model: Name of the model with the highest ``val_f1`` in
            ``results``, or None while nothing has been trained.
        best_score: ``val_f1`` of ``best_model`` (0 while nothing is trained).
    """

    # Columns returned by get_results_dataframe, in display order.
    METRIC_COLUMNS = ['train_accuracy', 'val_accuracy', 'val_f1',
                      'val_precision', 'val_recall', 'training_time']

    def __init__(self):
        self.models = {}
        self.results = {}
        self.best_model = None
        self.best_score = 0

    @property
    def best(self):
        """Alias of ``best_model`` (the attribute name the constructor used originally)."""
        return self.best_model

    @property
    def score(self):
        """Alias of ``best_score`` (the attribute name the constructor used originally)."""
        return self.best_score

    def select_model(self, name, model):
        """Register ``model`` under ``name``, replacing any previous entry."""
        self.models[name] = model

    def train_model(self, name, X_train, y_train, X_val, y_val):
        """Train a single registered model and evaluate it on the validation set.

        The argument order matches the call made by ``train_all``.

        Args:
            name: Key of a model previously registered with ``select_model``.
            X_train: Training features passed to ``model.fit``.
            y_train: Training labels passed to ``model.fit``.
            X_val: Validation features used for evaluation.
            y_val: Validation labels used for evaluation.

        Returns:
            The fitted model.

        Raises:
            KeyError: If ``name`` has not been registered.
        """
        if name not in self.models:
            raise KeyError(
                f"Unknown model {name!r}; register it with select_model() first"
            )
        model = self.models[name]

        print(f"\nTraining {name}...")

        # Time only the fit call so "training time" is not inflated by
        # prediction and metric computation.
        start_time = time.perf_counter()
        model.fit(X_train, y_train)
        training_time = time.perf_counter() - start_time

        # Predict
        y_train_pred = model.predict(X_train)
        y_val_pred = model.predict(X_val)

        # Calculate metrics; zero_division=0 is applied uniformly so the three
        # weighted validation metrics treat undefined per-class scores alike.
        train_acc = accuracy_score(y_train, y_train_pred)
        val_acc = accuracy_score(y_val, y_val_pred)
        val_f1 = f1_score(y_val, y_val_pred, average='weighted', zero_division=0)
        val_precision = precision_score(y_val, y_val_pred, average='weighted', zero_division=0)
        val_recall = recall_score(y_val, y_val_pred, average='weighted', zero_division=0)

        # Store results
        self.results[name] = {
            'model': model,
            'train_accuracy': train_acc,
            'val_accuracy': val_acc,
            'val_f1': val_f1,
            'val_precision': val_precision,
            'val_recall': val_recall,
            'training_time': training_time,
            'y_val_pred': y_val_pred
        }

        print(f"  Training Accuracy: {train_acc:.4f}")
        print(f"  Validation Accuracy: {val_acc:.4f}")
        print(f"  Validation F1-Score: {val_f1:.4f}")
        print(f"  Training Time: {training_time:.2f} seconds")

        self._refresh_best()

        return model

    def _refresh_best(self):
        """Recompute ``best_model``/``best_score`` from the stored results.

        Deriving the best entry from ``results`` (instead of comparing against
        a running maximum) keeps it correct when a model is retrained and its
        score changes. Ties keep the model that was trained first.
        """
        if not self.results:
            self.best_model, self.best_score = None, 0
            return
        best_name = max(self.results, key=lambda n: self.results[n]['val_f1'])
        self.best_model = best_name
        self.best_score = self.results[best_name]['val_f1']

    def train_all(self, X_train, y_train, X_val, y_val):
        """Train every registered model and report the best one.

        Args:
            X_train: Training features.
            y_train: Training labels.
            X_val: Validation features.
            y_val: Validation labels.
        """
        if not self.models:
            print("No models registered; call select_model() first.")
            return

        # Iterate over a snapshot of the names so the registry can be modified
        # without breaking the loop.
        for name in list(self.models):
            self.train_model(name, X_train, y_train, X_val, y_val)

        print(f"\n🏆 Best Model: {self.best_model} (F1-Score: {self.best_score:.4f})")

    def get_results_dataframe(self):
        """Return the stored metrics as a DataFrame sorted by validation F1.

        Returns:
            DataFrame indexed by model name with the columns listed in
            ``METRIC_COLUMNS``, sorted by ``val_f1`` in descending order. When
            nothing has been trained yet the frame is empty but still has
            those columns.
        """
        names = list(self.results)
        # Only the scalar metrics are tabulated; the fitted model and the
        # prediction array stay in ``results``.
        rows = [[self.results[n][col] for col in self.METRIC_COLUMNS] for n in names]
        df = pd.DataFrame(rows, index=names, columns=self.METRIC_COLUMNS)
        return df.sort_values('val_f1', ascending=False)

In [None]:
# TODO: instantiate and train the following models:
# - LogisticRegression
# - Random Forest Classifier
# - XGBoost