In [2]:
"""
Data Preprocessing Pipeline for HCV Disease Prediction
========================================================
This notebook handles:
1. Data cleaning and preparation
2. Feature engineering
3. Handling class imbalance
4. Feature scaling
5. Train/test splitting
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn  # needed for sklearn.__version__ in the version printout below
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from collections import Counter
import warnings
import pickle

# --- Global configuration ---
# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')
# Fix NumPy's legacy global RNG so stochastic steps repeat across runs.
np.random.seed(42)
# Show all columns when a wide DataFrame is displayed.
pd.set_option('display.max_columns', None)

# --- Plot defaults ---
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

# Report the library versions this run used.
print(
    "✅ Libraries imported successfully",
    f"Pandas version: {pd.__version__}",
    f"Scikit-learn version: {sklearn.__version__}",
    sep="\n",
)

✅ Libraries imported successfully
Pandas version: 2.3.2
Scikit-learn version: 1.7.1


In [3]:
# Read the dataset written by the exploration step.
df = pd.read_csv('../data/hcv_data_explored.csv')

# One-shot summary of size, class balance and data-quality counts.
# Each argument is printed on its own line, in order.
print(
    "=" * 60,
    "INITIAL DATA ASSESSMENT",
    "=" * 60,
    f"Dataset shape: {df.shape}",
    "\nTarget distribution:",
    df['Category'].value_counts().sort_index(),
    f"\nMissing values: {df.isnull().sum().sum()}",
    f"\nDuplicate rows: {df.duplicated().sum()}",
    sep="\n",
)

INITIAL DATA ASSESSMENT
Dataset shape: (615, 15)

Target distribution:
Category
0=Blood Donor             533
0s=suspect Blood Donor      7
1=Hepatitis                24
2=Fibrosis                 21
3=Cirrhosis                30
Name: count, dtype: int64

Missing values: 32

Duplicate rows: 0


In [5]:
class DataCleaner:
    """Chainable cleaning steps applied to a private copy of a DataFrame.

    Every step updates ``self.df``, records a count in
    ``self.cleaning_report`` and returns ``self`` so calls can be chained.
    The frame passed to the constructor is copied and never modified.
    """

    # Label column; the numeric steps never treat it as a feature.
    TARGET_COLUMN = 'Category'

    def __init__(self, df):
        """Store a copy of ``df`` and start with an empty cleaning report."""
        self.df = df.copy()
        self.cleaning_report = {}

    def _numeric_feature_columns(self):
        """Return the names of numeric columns, excluding the target column."""
        return [
            col
            for col in self.df.select_dtypes(include=[np.number]).columns
            if col != self.TARGET_COLUMN
        ]

    def remove_duplicates(self):
        """Drop fully duplicated rows.

        Records the number of dropped rows under ``'duplicates_removed'``.

        Returns:
            DataCleaner: ``self``, for chaining.
        """
        rows_before = self.df.shape[0]
        self.df = self.df.drop_duplicates()
        self.cleaning_report['duplicates_removed'] = rows_before - self.df.shape[0]
        return self

    def handle_missing_values(self, strategy='drop'):
        """Handle missing values with the given strategy.

        Args:
            strategy: ``'drop'`` (default) removes every row containing a
                missing value and records the number of removed rows under
                ``'missing_values_removed'``. ``'mean'`` or ``'median'``
                fills missing values in numeric feature columns with that
                column statistic and records the number of filled values
                under ``'missing_values_imputed'``; non-numeric columns and
                the target column are left untouched.

        Returns:
            DataCleaner: ``self``, for chaining.

        Raises:
            ValueError: if ``strategy`` is not one of the supported names
                (previously such a value was silently ignored).
        """
        if strategy == 'drop':
            rows_before = self.df.shape[0]
            self.df = self.df.dropna()
            self.cleaning_report['missing_values_removed'] = rows_before - self.df.shape[0]
        elif strategy in ('mean', 'median'):
            feature_cols = self._numeric_feature_columns()
            features = self.df[feature_cols]
            # NOTE: the statistic is computed on every row currently held, so
            # imputing before a train/test split leaks test information.
            fill_values = features.mean() if strategy == 'mean' else features.median()
            missing_before = int(features.isna().sum().sum())
            # fillna with a Series fills each column named in its index.
            self.df = self.df.fillna(fill_values)
            missing_after = int(self.df[feature_cols].isna().sum().sum())
            self.cleaning_report['missing_values_imputed'] = missing_before - missing_after
        else:
            raise ValueError(
                f"Unknown strategy {strategy!r}; expected 'drop', 'mean' or 'median'"
            )
        return self

    def handle_outliers(self, columns=None, n_std=3, remove=False):
        """Count, and optionally remove, values beyond ``n_std`` standard deviations.

        A value is an outlier when it lies outside ``mean ± n_std * std`` of
        its column. The total recorded under ``'outliers_found'`` is summed
        over columns, so a row that is out of range in several columns is
        counted once per column. Missing values are never flagged.

        Args:
            columns: columns to inspect. Defaults to every numeric column
                except the target column.
            n_std: half-width of the accepted range, in standard deviations.
            remove: when ``False`` (default) the data is left unchanged and
                only the count is recorded. When ``True`` every row flagged
                in at least one column is dropped and the number of dropped
                rows is recorded under ``'outlier_rows_removed'``.

        Returns:
            DataCleaner: ``self``, for chaining.
        """
        if columns is None:
            columns = self._numeric_feature_columns()

        outliers_found = 0
        # Positional mask of rows flagged in any column. All bounds come from
        # the unfiltered data, so the result does not depend on column order.
        flagged_rows = np.zeros(len(self.df), dtype=bool)
        for col in columns:
            values = self.df[col]
            mean = values.mean()
            std = values.std()
            lower_bound = mean - n_std * std
            upper_bound = mean + n_std * std

            is_outlier = ((values < lower_bound) | (values > upper_bound)).to_numpy()
            outliers_found += int(is_outlier.sum())
            flagged_rows |= is_outlier

        self.cleaning_report['outliers_found'] = outliers_found
        if remove:
            self.df = self.df[~flagged_rows].copy()
            self.cleaning_report['outlier_rows_removed'] = int(flagged_rows.sum())
        return self

    def get_clean_data(self):
        """Print the cleaning report and return the cleaned DataFrame."""
        print("\n📊 CLEANING REPORT:")
        print("-" * 40)
        for key, value in self.cleaning_report.items():
            print(f"{key}: {value}")
        return self.df

# Run the cleaning stages in order; get_clean_data() prints the report
# and hands back the cleaned frame.
cleaner = DataCleaner(df)
df_clean = (
    cleaner
    .remove_duplicates()
    .handle_missing_values()
    .handle_outliers()
    .get_clean_data()
)
print(f"\nCleaned dataset shape: {df_clean.shape}")


📊 CLEANING REPORT:
----------------------------------------
duplicates_removed: 0
missing_values_removed: 26
outliers_found: 92

Cleaned dataset shape: (589, 15)
