In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import LinearSegmentedColormap
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, KMeansSMOTE

In [11]:
# Palette anchors for the heatmaps, listed from the low end to the high end
# of the colour scale as RGB fractions.
colors = [
    (0.945, 0.980, 0.733),  # light yellow-green
    (0.263, 0.671, 0.702),  # teal
    (0.137, 0.294, 0.620),  # navy blue
]

# Interpolate the three anchors into a 256-entry colormap.
custom_cmap = LinearSegmentedColormap.from_list(
    name="custom_diverging",
    colors=colors,
    N=256,
)

# Custom formatter for correlation values
def custom_fmt(val):
    """Format a correlation-like number as a compact two-decimal label.

    The sign is discarded and the leading zero is dropped, so both ``0.29``
    and ``-0.29`` become ``".29"``. The function is meant for values in
    ``[-1, 1]``: any magnitude that is at least 1, or that rounds to 1.00,
    is rendered as ``"1.0"``.

    Args:
        val: The number to format.

    Returns:
        str: ``"1.0"``, a label such as ``".07"``, or ``"nan"`` when ``val``
        is NaN.
    """
    abs_val = abs(val)

    # NaN is the only value that compares unequal to itself.
    if abs_val != abs_val:
        return "nan"

    # Covers exact 1.0 as well as floating-point overshoot such as
    # 1.0000000000000002, which would otherwise be printed as ".100".
    if abs_val >= 1.0:
        return "1.0"

    # Round instead of truncating: 0.29 * 100 evaluates to 28.999999999999996,
    # so int() alone would mislabel the value as ".28".
    hundredths = int(round(abs_val * 100))
    if hundredths >= 100:
        return "1.0"

    return f".{hundredths:02d}"

# Function to create and save correlation heatmap
def plot_correlation_heatmap(df, title, filename):
    """Save a lower-triangle correlation heatmap of ``df`` and return the matrix.

    Args:
        df: DataFrame of features. A ``performance_class`` column, when
            present, is excluded before correlations are computed.
        title: Title drawn above the heatmap.
        filename: Destination handed to ``Figure.savefig``. When it is a
            path, missing parent directories are created first.

    Returns:
        The correlation matrix produced by ``DataFrame.corr()`` on the
        remaining columns.
    """
    import os  # local import: the block must not depend on a file-level one

    # Drop the target variable for correlation calculation
    data = df.drop(columns=["performance_class"], errors="ignore")

    # Calculate correlation matrix
    corr_matrix = data.corr()

    # Make sure the destination directory exists so savefig cannot fail on it.
    if isinstance(filename, (str, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(filename))
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    fig, ax = plt.subplots(figsize=(12, 12))
    try:
        # Mask the strict upper triangle; the diagonal and lower half remain.
        sns.heatmap(
            corr_matrix,
            mask=np.triu(np.ones_like(corr_matrix, dtype=bool), k=1),
            annot=True,
            fmt="",
            annot_kws={"size": 8},
            cmap=custom_cmap,
            vmin=-1, vmax=1,
            center=0,
            linewidths=0.2,
            cbar_kws={"shrink": 0.8},
            ax=ax,
        )

        # Re-label every annotation with the compact custom format. A Unicode
        # minus sign, if present in the label, is normalised before parsing.
        for text in ax.texts:
            text_value = float(text.get_text().replace('−', '-'))
            text.set_text(custom_fmt(text_value))

        ax.set_title(title, fontsize=14)
        fig.subplots_adjust(bottom=0.3)
        fig.savefig(filename, bbox_inches='tight', dpi=300)
    finally:
        # Always release the figure, even when drawing or saving fails, so
        # repeated calls do not accumulate open figures.
        plt.close(fig)

    return corr_matrix

# Function to calculate correlation difference metrics
def calculate_correlation_difference(original_corr, augmented_corr):
    """Summarise how far two correlation matrices are from each other.

    Args:
        original_corr: Square correlation matrix of the original data.
        augmented_corr: Square correlation matrix of the augmented data,
            covering the same features.

    Returns:
        dict with three entries:
            ``mean_abs_diff``: mean absolute off-diagonal difference (float).
            ``max_abs_diff``: largest absolute off-diagonal difference (float).
            ``diff_matrix``: the full element-wise absolute difference.
        Undefined (NaN) differences are left out of both summaries. If no
        off-diagonal value is defined at all, both summaries are NaN.
    """
    # Calculate absolute differences
    diff_matrix = np.abs(original_corr - augmented_corr)

    # Build the off-diagonal selector from the difference itself so that it
    # always matches the array being indexed.
    diff_values = np.asarray(diff_matrix, dtype=float)
    off_diagonal = ~np.eye(*diff_values.shape, dtype=bool)

    # A NaN entry (e.g. a correlation involving a constant column) would
    # otherwise turn both summary metrics into NaN.
    candidates = diff_values[off_diagonal]
    candidates = candidates[~np.isnan(candidates)]

    if candidates.size:
        mean_abs_diff = float(candidates.mean())
        max_abs_diff = float(candidates.max())
    else:
        # Nothing to summarise (single feature, or every entry undefined).
        mean_abs_diff = float("nan")
        max_abs_diff = float("nan")

    return {
        'mean_abs_diff': mean_abs_diff,
        'max_abs_diff': max_abs_diff,
        'diff_matrix': diff_matrix
    }

# Function to plot feature distributions
def plot_feature_distributions(original_df, augmented_df, method_name,
                               max_features=10, output_dir="../data/figures"):
    """Save side-by-side histograms of original vs. augmented features.

    One row is drawn per feature: the original distribution on the left and
    the augmented one on the right. The figure is written to
    ``<output_dir>/feature_distributions_<method_name>.pdf``.

    Args:
        original_df: DataFrame with the original samples.
        augmented_df: DataFrame with the augmented samples; it must contain
            the plotted feature columns.
        method_name: Label used in the right-hand titles and the file name.
        max_features: Upper bound on the number of leading feature columns
            of ``original_df`` that are plotted.
        output_dir: Directory for the PDF; created when missing.

    Raises:
        ValueError: If there is no feature column to plot.
    """
    import os  # local import: the block must not depend on a file-level one

    # Remove the target from each frame independently, so a frame that lacks
    # the column does not raise.
    original_features = original_df.drop(columns=['performance_class'], errors='ignore')
    augmented_features = augmented_df.drop(columns=['performance_class'], errors='ignore')

    # Select a subset of features if there are too many
    features_to_plot = list(original_features.columns[:max_features])
    if not features_to_plot:
        raise ValueError("No feature columns available to plot")

    # squeeze=False keeps `axes` two-dimensional even for a single feature,
    # so rows can always be unpacked as (left, right).
    fig, axes = plt.subplots(
        len(features_to_plot), 2,
        figsize=(15, 4 * len(features_to_plot)),
        squeeze=False,
    )
    try:
        for (left_ax, right_ax), feature in zip(axes, features_to_plot):
            # Original data distribution
            sns.histplot(original_features[feature], kde=True, color='royalblue', ax=left_ax)
            left_ax.set_title(f"{feature} (Original)")

            # Augmented data distribution
            sns.histplot(augmented_features[feature], kde=True, color='orange', ax=right_ax)
            right_ax.set_title(f"{feature} ({method_name})")

        fig.tight_layout()
        os.makedirs(output_dir, exist_ok=True)
        fig.savefig(f"{output_dir}/feature_distributions_{method_name}.pdf", dpi=300)
    finally:
        # Release the figure even if plotting or saving fails.
        plt.close(fig)

In [12]:
# Helper: derive the second-stage sampler from the first-stage one
def _derive_large_augmenter(augmenter, sampling_strategy, random_state=44):
    """Return a new, unfitted sampler of the same class and settings as
    ``augmenter``, with only the sampling strategy and seed replaced.

    Parameter values are shared with ``augmenter`` rather than copied.
    """
    params = augmenter.get_params(deep=False)
    params.update(sampling_strategy=sampling_strategy, random_state=random_state)
    return type(augmenter)(**params)


# Helper: draw and save the |original - augmented| correlation heatmap
def _save_difference_heatmap(diff_results, method_name, output_path):
    """Save the lower-triangle heatmap of ``diff_results['diff_matrix']``."""
    fig, ax = plt.subplots(figsize=(12, 10))
    try:
        sns.heatmap(
            diff_results['diff_matrix'],
            mask=np.triu(np.ones_like(diff_results['diff_matrix'], dtype=bool), k=1),
            annot=True,
            fmt="",
            annot_kws={"size": 8},
            cmap=custom_cmap,
            vmin=0,
            vmax=0.5,
            ax=ax,
        )
        # Apply custom formatter
        for text in ax.texts:
            text_value = float(text.get_text().replace('−', '-'))
            text.set_text(custom_fmt(text_value))
        ax.set_title(f'Correlation Difference: {method_name}\nMean: {diff_results["mean_abs_diff"]:.4f}, Max: {diff_results["max_abs_diff"]:.4f}')
        fig.tight_layout()
        fig.savefig(output_path, dpi=300)
    finally:
        # Release the figure even when drawing or saving fails.
        plt.close(fig)


# Main function to run the augmentation experiment
def run_augmentation_experiment(input_file, target_count_per_class=5000):
    """Compare oversampling methods by how well they preserve feature correlations.

    The CSV at ``input_file`` is loaded and split into features and the
    ``performance_class`` target. Each method (SMOTE, BorderlineSMOTE,
    KMeansSMOTE) first balances the classes, then a second sampler with the
    same settings requests ``target_count_per_class`` samples for every
    class. Correlation and difference heatmaps are saved per method, the
    summary metrics are written to CSV, and the dataset produced by the
    method with the smallest mean absolute correlation difference is saved
    together with its feature-distribution plot.

    Args:
        input_file: Path of the CSV to load; it must contain a
            ``performance_class`` column.
        target_count_per_class: Sample count requested per class in the
            second augmentation stage.

    Returns:
        tuple: ``(results, augmented_dfs)``. ``results`` maps each method
        name to the dict returned by ``calculate_correlation_difference``.
        ``augmented_dfs`` maps each method name to its enlarged DataFrame.
        Methods that raised are reported on stdout and absent from both.
    """
    import os  # local import: the block must not depend on a file-level one

    # Load dataset
    print("Loading dataset...")
    df = pd.read_csv(input_file)
    X = df.drop(columns=["performance_class"])
    y = df["performance_class"]

    # Every figure below is written here; create it up front.
    os.makedirs("../data/figures", exist_ok=True)

    # Save original correlation matrix
    original_corr = plot_correlation_heatmap(
        X,
        "Original Feature Correlation Matrix",
        "../data/figures/original_feature_correlation_heatmap.pdf"
    )

    # Define augmentation methods
    augmentation_methods = {
        'SMOTE_k5': SMOTE(sampling_strategy='auto', k_neighbors=5, random_state=42),
        'BorderlineSMOTE': BorderlineSMOTE(sampling_strategy='auto', random_state=42),
        'KMeansSMOTE': KMeansSMOTE(sampling_strategy='auto', cluster_balance_threshold=0.1, random_state=42)
    }

    results = {}
    augmented_dfs = {}

    # Run each augmentation method
    for method_name, augmenter in augmentation_methods.items():
        print(f"\nRunning {method_name}...")

        try:
            # Step 1: Initial augmentation to balance classes
            X_resampled_1, y_resampled_1 = augmenter.fit_resample(X, y)
            df_balanced = pd.DataFrame(X_resampled_1, columns=X.columns)
            df_balanced["performance_class"] = y_resampled_1

            print(f"Dataset size after initial {method_name}:", df_balanced.shape)
            print(y_resampled_1.value_counts())

            # Step 2: large-scale augmentation, same target for every class
            sampling_strategy_large = {
                cls: target_count_per_class for cls in y_resampled_1.unique()
            }

            # Derive the second-stage sampler from the first-stage one, so a
            # method added to the dict above needs no matching branch here.
            augmenter_large = _derive_large_augmenter(augmenter, sampling_strategy_large)

            X_balanced = df_balanced.drop(columns=["performance_class"])
            y_balanced = df_balanced["performance_class"]
            X_resampled_large, y_resampled_large = augmenter_large.fit_resample(X_balanced, y_balanced)

            df_large = pd.DataFrame(X_resampled_large, columns=X.columns)
            df_large["performance_class"] = y_resampled_large

            print(f"Large dataset size after {method_name}:", df_large.shape)
            print(y_resampled_large.value_counts())

            # Save the augmented correlation matrix
            augmented_corr = plot_correlation_heatmap(
                df_large,
                f"{method_name} Augmented Feature Correlation Matrix",
                f"../data/figures/{method_name}_correlation_heatmap.pdf"
            )

            # Calculate correlation difference metrics
            diff_results = calculate_correlation_difference(original_corr, augmented_corr)

            # Save results
            results[method_name] = diff_results
            augmented_dfs[method_name] = df_large

            # Plot the difference matrix
            _save_difference_heatmap(
                diff_results,
                method_name,
                f"../data/figures/{method_name}_diff_heatmap.pdf",
            )

        except Exception as e:
            # Best effort by design: one failing method must not abort the
            # comparison of the others.
            print(f"Error with {method_name}: {str(e)}")

    # Save correlation difference metrics to CSV
    metrics_df = pd.DataFrame({
        'Method': list(results.keys()),
        'Mean_Absolute_Difference': [results[method]['mean_abs_diff'] for method in results],
        'Max_Absolute_Difference': [results[method]['max_abs_diff'] for method in results]
    })

    metrics_df.to_csv("../data/correlation_difference_metrics.csv", index=False)
    print("\nCorrelation difference metrics saved to 'correlation_difference_metrics.csv'")

    # Find the best method
    if results:
        # Rank NaN metrics last: comparisons against NaN are always False,
        # which would make a plain min() depend on iteration order.
        def _ranking_key(name):
            score = results[name]['mean_abs_diff']
            return (bool(np.isnan(score)), score)

        best_method = min(results, key=_ranking_key)
        print(f"\nBest method based on correlation preservation: {best_method}")

        # Plot feature distributions for the best method
        if best_method in augmented_dfs:
            plot_feature_distributions(df, augmented_dfs[best_method], best_method)
            print(f"Feature distribution plot saved for {best_method}")

            # Save the best augmented dataset to CSV
            best_df = augmented_dfs[best_method]
            os.makedirs("../data/features", exist_ok=True)
            best_df.to_csv("../data/features/augmented_feature_matrix.csv", index=False)
            print(f"Best augmented dataset ({best_method}) saved to 'augmented_feature_matrix.csv'")

    return results, augmented_dfs

# Run the experiment on the filtered, labeled feature matrix; the metrics and
# the augmented frames stay bound at module level.
if __name__ == "__main__":
    results, augmented_dfs = run_augmentation_experiment(
        input_file="../data/features/filtered_labeled_feature_matrix.csv"
    )

Loading dataset...

Running SMOTE_k5...
Dataset size after initial SMOTE_k5: (604, 26)
performance_class
0    302
1    302
Name: count, dtype: int64
Large dataset size after SMOTE_k5: (10000, 26)
performance_class
0    5000
1    5000
Name: count, dtype: int64

Running BorderlineSMOTE...
Dataset size after initial BorderlineSMOTE: (604, 26)
performance_class
0    302
1    302
Name: count, dtype: int64
Large dataset size after BorderlineSMOTE: (10000, 26)
performance_class
0    5000
1    5000
Name: count, dtype: int64

Running KMeansSMOTE...
Dataset size after initial KMeansSMOTE: (607, 26)
performance_class
1    305
0    302
Name: count, dtype: int64
Large dataset size after KMeansSMOTE: (10007, 26)
performance_class
0    5004
1    5003
Name: count, dtype: int64

Correlation difference metrics saved to 'correlation_difference_metrics.csv'

Best method based on correlation preservation: SMOTE_k5
Feature distribution plot saved for SMOTE_k5
Best augmented dataset (SMOTE_k5) saved to 'augm