In [1]:
"""
compare.py - Model Comparison Framework

This module provides functions to load data, split it into train/test sets,
and compare the performance of different classification models.
"""

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

# Import our model implementations
from logistic import run_logistic_regression


In [2]:

def load_data(file_path, target_column=None):
    """
    Read a CSV file and separate it into features and target.

    Args:
        file_path: Location of the CSV file to read
        target_column: Column holding the target; when None, the last
            column of the file is used as the target

    Returns:
        tuple: (X, y) features and target, or (None, None) when reading
            the file failed
    """
    try:
        frame = pd.read_csv(file_path)
        print(f"Successfully loaded data with shape: {frame.shape}")
    except Exception as err:
        # Any failure while reading is reported and signalled to the caller
        # with a (None, None) pair instead of an exception.
        print(f"Error loading data: {err}")
        return None, None

    if target_column is not None:
        # Named target: every remaining column is treated as a feature.
        return frame.drop(target_column, axis=1), frame[target_column]

    # No target named: the last column is the target, the rest are features.
    return frame.iloc[:, :-1], frame.iloc[:, -1]


In [3]:

def preprocess_data(X, y, test_size=0.2, random_state=42, scale=True):
    """
    Split the data into train/test partitions and optionally standardize it.

    The split is stratified on y. When scaling is enabled, the scaler is
    fitted on the training partition only and then applied to the test
    partition.

    Args:
        X: Features
        y: Target
        test_size: Proportion of the data held out for testing
        random_state: Seed passed to the splitter for reproducibility
        scale: Whether to standardize the features

    Returns:
        tuple: (X_train, X_test, y_train, y_test)
    """
    split_options = {
        "test_size": test_size,
        "random_state": random_state,
        "stratify": y,
    }
    X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

    if scale:
        standardizer = StandardScaler()
        # Fit on the training partition first, then reuse the fitted
        # statistics for the test partition.
        X_train, X_test = (
            standardizer.fit_transform(X_train),
            standardizer.transform(X_test),
        )

    print(f"Train set shape: {X_train.shape}, Test set shape: {X_test.shape}")
    return X_train, X_test, y_train, y_test


In [4]:

def compare_models(X_train, X_test, y_train, y_test, models=('logistic',)):
    """
    Compare the performance of different models

    Each recognised model name is run through its runner function and the
    results object returned by that runner is stored under the model's
    canonical lower-case name. Names are matched case-insensitively.
    Unrecognised names are skipped with a printed warning rather than being
    dropped silently.

    Args:
        X_train: Training features
        X_test: Test features
        y_train: Training labels
        y_test: Test labels
        models: Iterable of model names to compare; a single name given as a
            plain string is also accepted. Currently only 'logistic' is
            recognised.

    Returns:
        dict: Dictionary of results for each model that was run (empty when
            no requested name was recognised)
    """
    # A bare string would otherwise be iterated character by character and
    # match nothing.
    if isinstance(models, str):
        models = [models]

    results = {}

    # Run models based on the list provided
    for model_name in models:
        key = model_name.lower()
        if key == 'logistic':
            print("Running Logistic Regression...")
            # The runner returns (model, results); only the results are kept.
            _, model_results = run_logistic_regression(X_train, y_train, X_test, y_test)
            results['logistic'] = model_results

        # Add other models here as needed, e.g.:
        # elif key == 'svm':
        #     _, model_results = run_svm(X_train, y_train, X_test, y_test)
        #     results['svm'] = model_results

        else:
            # Surface typos / unsupported names instead of ignoring them.
            print(f"Warning: unknown model '{model_name}' skipped")

    return results


In [5]:

def visualize_results(results):
    """
    Plot and save comparison charts for the supplied model results.

    Saves a grouped bar chart of accuracy, precision, recall and f1_score as
    'model_comparison.png', followed by one confusion-matrix heatmap per
    model as 'confusion_matrix_<model name>.png'.

    Args:
        results: Dictionary mapping each model name to its results; every
            entry is read for the keys 'accuracy', 'precision', 'recall',
            'f1_score' and 'confusion_matrix'
    """
    metric_names = ['accuracy', 'precision', 'recall', 'f1_score']
    model_names = list(results.keys())

    # Gather the scores per metric before opening any figure, so a missing
    # key fails early.
    scores_by_metric = {}
    for metric_name in metric_names:
        scores_by_metric[metric_name] = [
            results[name][metric_name] for name in model_names
        ]

    bar_width = 0.2
    positions = np.arange(len(model_names))

    # Grouped bar chart: one group per model, one bar per metric.
    plt.figure(figsize=(12, 6))
    for offset, (metric_name, scores) in enumerate(scores_by_metric.items()):
        plt.bar(
            positions + offset*bar_width,
            scores,
            width=bar_width,
            label=metric_name,
        )

    plt.xlabel('Models')
    plt.ylabel('Score')
    plt.title('Model Performance Comparison')
    # Centre each tick under its group of four bars.
    plt.xticks(positions + bar_width*1.5, model_names)
    plt.legend()
    plt.tight_layout()

    plt.savefig('model_comparison.png')
    print("Visualization saved as 'model_comparison.png'")

    # One heatmap figure per model.
    for model_name in results:
        matrix = results[model_name]['confusion_matrix']
        plt.figure(figsize=(8, 6))
        sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues')
        plt.title(f'Confusion Matrix - {model_name}')
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        plt.tight_layout()
        plt.savefig(f'confusion_matrix_{model_name}.png')
        print(f"Confusion matrix for {model_name} saved")


In [6]:

def main(data_path, target_column=None):
    """
    Run the full pipeline: load, preprocess, compare models, visualize.

    Stops early with a message when the data could not be loaded.

    Args:
        data_path: Path to the data file
        target_column: Name of the target column
    """
    features, target = load_data(data_path, target_column)

    # load_data signals a failed read with None values.
    if features is None or target is None:
        print("Failed to load data. Exiting.")
        return

    train_X, test_X, train_y, test_y = preprocess_data(features, target)

    model_results = compare_models(
        train_X, test_X, train_y, test_y, models=['logistic']
    )

    visualize_results(model_results)

    print("Analysis complete!")


In [None]:
import argparse
    
parser = argparse.ArgumentParser(description='Compare machine learning models')
parser.add_argument('data_path', type=str, help='Path to the CSV data file')
parser.add_argument('--target', type=str, default=None, 
                    help='Name of the target column (default: last column)')

args = parser.parse_args()

main(args.data_path, args.target)
