# Proteomics Data Analysis - Part 2: Python Script
## Description:
Loads preprocessed datasets from R, applies basic (mean, median, KNN) and advanced (MissForest, DAE) imputation, trains ML models (RF, XGBoost, MLP), evaluates their performance, and visualizes the results.

In [4]:
# Import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import time
import seaborn as sns
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import SimpleImputer
import logging

In [5]:
# Configure the root logger: INFO level, "timestamp - level - message" lines
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

## Initialization of Global Variables

In [29]:
# Specify paths and global variables
# NOTE: all paths are relative, so they resolve against the current working directory.

# Directory handed to load_data(); every .csv file in it is loaded as a dataset
DATA_DIR = "../R"
# Output directory for the imputed .csv files and the heatmap .png files
RESULTS_DIR = "../../results/Python"
# Sample metadata table, read with pd.read_csv
LABEL_FILE = "../../data/sample_metadata.csv"
# Presumably the metadata column holding the class label -- not used in this section; confirm
LABEL_COL = "Condition"
# Presumably the metadata column holding the sample identifier -- not used in this section; confirm
# NOTE(review): the name is not UPPER_SNAKE_CASE like the other constants
Sample_ID_COL = "SampleName"

# Presumably the held-out fraction for a train/test split -- confirm against the model-training code
TEST_SIZE = 0.3
# Presumably the seed used for reproducible splits/models -- confirm against the model-training code
RANDOM_STATE = 42

## Functions

Loading .csv files from directory

In [31]:
# Loads all .csv files in the data directory into pandas DataFrames
def load_data(data_dir):
    """
    Load every .csv file found directly in data_dir into a pandas DataFrame.

    The first column of each file becomes the DataFrame index. A file that
    cannot be read is logged as an error and skipped.

    Args:
        data_dir: Path of the directory to scan (not recursive).

    Returns:
        dict mapping dataset name (the file name without its final ".csv"
        extension) to the loaded DataFrame. Empty if no file could be loaded.
    """
    data = {}
    # List the directory once, sorted, so the load order (and therefore the
    # iteration order of the returned dict) is reproducible across platforms
    filenames = sorted(os.listdir(data_dir))
    logging.info(f"Number of files: {len(filenames)}")
    for filename in filenames:
        if not filename.endswith(".csv"):
            continue
        file_path = os.path.join(data_dir, filename)
        # splitext removes only the final extension: "a.v2.csv" -> "a.v2".
        # Splitting on the first "." would give "a" and overwrite "a.csv".
        dataset_name = os.path.splitext(filename)[0]
        try:
            # Assume first column is the index (protein IDs)
            data[dataset_name] = pd.read_csv(file_path, index_col=0)
        except Exception as e:
            # Best effort: report the unreadable file and keep loading the rest
            logging.error(f"Error loading {filename}: {e}")
            continue
    logging.info(f"Loaded {len(data)} datasets from {data_dir}")
    return data

Basic Imputation Functions

In [39]:
def impute_mean(df):
    """
    Replace missing values with the mean of their column.

    Returns a new DataFrame built with the index and column labels of df.

    NOTE(review): by default SimpleImputer drops columns that contain no
    observed values, which would make the column labels mismatch -- confirm
    the inputs never contain an all-NaN column.
    """
    filled_values = SimpleImputer(strategy="mean").fit_transform(df)
    result = pd.DataFrame(filled_values, index=df.index, columns=df.columns)
    logging.info("(impute_mean): Imputed missing values with mean")
    return result

def impute_median(df):
    """
    Replace missing values with the median of their column.

    Returns a new DataFrame built with the index and column labels of df.
    """
    median_imputer = SimpleImputer(strategy="median")
    result = pd.DataFrame(
        median_imputer.fit_transform(df),
        index=df.index,
        columns=df.columns,
    )
    logging.info("(impute_median): Imputed missing values with median")
    return result

def impute_knn(df, n_neighbors=5):
    """
    Replace missing values using a K-Nearest Neighbors imputer.

    n_neighbors is forwarded unchanged to KNNImputer. Returns a new DataFrame
    built with the index and column labels of df.
    """
    # Imported here rather than at the top of the file, as in the original cell
    from sklearn.impute import KNNImputer

    knn_values = KNNImputer(n_neighbors=n_neighbors).fit_transform(df)
    result = pd.DataFrame(knn_values, index=df.index, columns=df.columns)
    logging.info("(impute_knn): Imputed missing values using KNN")
    return result

# Imputation functions dictionary: method name -> function taking a DataFrame
impute_func = dict(
    mean=impute_mean,
    median=impute_median,
    knn=impute_knn,
)

## Basic Imputations

In [40]:
# Read every .csv dataset found in the data directory (name -> DataFrame)
datasets = load_data(data_dir=DATA_DIR)
# Read the sample metadata table
metadata = pd.read_csv(LABEL_FILE)


2025-04-08 23:43:56,977 - INFO - Number of files: 10
2025-04-08 23:43:57,242 - INFO - Loaded 10 datasets from ../R


In [42]:
# Run the imputation methods on each dataset
# Create the output directory once, before any work starts. exist_ok=True
# already makes the call a no-op when the directory is present, so no
# os.path.exists() pre-check is needed.
os.makedirs(RESULTS_DIR, exist_ok=True)

for name, dataset in datasets.items():
    logging.info(f"Processing dataset: {name}")

    # Check if the dataset is empty (nothing to impute)
    if dataset.empty:
        logging.warning(f"Dataset {name} is empty. Skipping.")
        continue

    # Report the total number of missing cells before imputation
    logging.info(f"Missing values in dataset {name}: {dataset.isnull().sum().sum()}")

    for method_name, impute_method in impute_func.items():
        logging.info(f"Imputing with method: {method_name}")
        # Impute on a copy so every method starts from the same loaded data
        imputed_dataset = impute_method(dataset.copy())
        # Save the imputed dataset as a .csv file, keeping the row index
        output_path = os.path.join(RESULTS_DIR, f"{name}_{method_name}_imputed.csv")
        imputed_dataset.to_csv(output_path, index=True)
        logging.info(f"Saved imputed dataset to {output_path}")


2025-04-08 23:45:13,745 - INFO - Processing dataset: missing_MAR_10
2025-04-08 23:45:13,747 - INFO - Missing values in dataset missing_MAR_10: 8811
2025-04-08 23:45:13,748 - INFO - Imputing with method: mean
2025-04-08 23:45:13,754 - INFO - (impute_mean): Imputed missing values with mean
2025-04-08 23:45:13,881 - INFO - Saved imputed dataset to ../../results/Python\missing_MAR_10_mean_imputed.csv
2025-04-08 23:45:13,882 - INFO - Imputing with method: median
2025-04-08 23:45:13,894 - INFO - (impute_median): Imputed missing values with median
2025-04-08 23:45:14,010 - INFO - Saved imputed dataset to ../../results/Python\missing_MAR_10_median_imputed.csv
2025-04-08 23:45:14,011 - INFO - Imputing with method: knn
2025-04-08 23:45:16,619 - INFO - (impute_knn): Imputed missing values using KNN
2025-04-08 23:45:16,720 - INFO - Saved imputed dataset to ../../results/Python\missing_MAR_10_knn_imputed.csv
2025-04-08 23:45:16,721 - INFO - Processing dataset: missing_MAR_20
2025-04-08 23:45:16,722

In [43]:
# Plotting the imputed datasets
def plot_imputed_data(imputed_data, method_name, dataset_name):
    """
    Save a heatmap of the cells that are still missing after imputation.

    The heatmap is drawn from imputed_data.isnull() and written to
    RESULTS_DIR as "<dataset_name>_<method_name>_imputed_heatmap.png".

    Args:
        imputed_data: DataFrame returned by an imputation function.
        method_name: Name of the imputation method (used in title and file name).
        dataset_name: Name of the dataset (used in title and file name).
    """
    # savefig does not create missing directories, so make sure the target exists
    os.makedirs(RESULTS_DIR, exist_ok=True)
    output_path = os.path.join(RESULTS_DIR, f"{dataset_name}_{method_name}_imputed_heatmap.png")

    fig = plt.figure(figsize=(10, 6))
    try:
        sns.heatmap(imputed_data.isnull(), cbar=False, cmap='viridis')
        plt.title(f"Missing Values After Imputation ({method_name}) - {dataset_name}")
        # NOTE(review): the heatmap puts DataFrame columns on x and rows on y;
        # if the frames keep protein IDs as the row index these two labels are
        # swapped -- confirm the orientation of the input data.
        plt.xlabel("Features")
        plt.ylabel("Samples")
        fig.savefig(output_path)
    finally:
        # Always release the figure, even when drawing or saving fails
        plt.close(fig)
    logging.info(f"Saved heatmap for {dataset_name} with method {method_name}")

In [44]:
# Plot the imputed datasets
for name, dataset in datasets.items():
    # Same guard as the imputation loop: scikit-learn imputers reject empty input
    if dataset.empty:
        logging.warning(f"Dataset {name} is empty. Skipping.")
        continue
    for method_name, impute_method in impute_func.items():
        # The imputation is recomputed here (on a copy) rather than read back
        # from the saved .csv files
        imputed_dataset = impute_method(dataset.copy())
        plot_imputed_data(imputed_dataset, method_name, name)

2025-04-08 23:50:35,552 - INFO - (impute_mean): Imputed missing values with mean
2025-04-08 23:50:36,513 - INFO - Saved heatmap for missing_MAR_10 with method mean
2025-04-08 23:50:36,523 - INFO - (impute_median): Imputed missing values with median
2025-04-08 23:50:37,216 - INFO - Saved heatmap for missing_MAR_10 with method median
2025-04-08 23:50:39,511 - INFO - (impute_knn): Imputed missing values using KNN
2025-04-08 23:50:40,191 - INFO - Saved heatmap for missing_MAR_10 with method knn
2025-04-08 23:50:40,196 - INFO - (impute_mean): Imputed missing values with mean
2025-04-08 23:50:40,886 - INFO - Saved heatmap for missing_MAR_20 with method mean
2025-04-08 23:50:40,895 - INFO - (impute_median): Imputed missing values with median
2025-04-08 23:50:41,580 - INFO - Saved heatmap for missing_MAR_20 with method median
2025-04-08 23:50:45,026 - INFO - (impute_knn): Imputed missing values using KNN
2025-04-08 23:50:45,714 - INFO - Saved heatmap for missing_MAR_20 with method knn
2025-04-