<p align="left">
  <a href="https://colab.research.google.com/github/YOUR_GITHUB_REPO/norm_correlation.ipynb" target="_blank">
    <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
  </a>
</p>

# 📊 Antimicrobial and Cytotoxicity Activity: Normality, Correlation, and Visualization Analysis

This notebook uses Google Colab widgets for file uploads and is designed for Colab. For best results, please open it in Google Colab.

# 1. 📁 Data Upload

Upload the CSV dataset. The file should contain numeric columns for analysis. The first row must be headers.

In [None]:
from google.colab import files
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn as sns
from scipy import stats
import os

# File upload
# The object returned by `files.upload()` is keyed by file name; each value is
# wrapped in BytesIO below so pandas can parse it.
uploaded = files.upload()
if not uploaded:
    raise ValueError("No file uploaded. Please upload a CSV file.")
# Only the first uploaded file is analysed.
filename = next(iter(uploaded))
try:
    df = pd.read_csv(io.BytesIO(uploaded[filename]))
except Exception as e:
    # Keep the original exception chained so the parser traceback stays visible.
    raise ValueError(f"Error reading CSV: {e}") from e
# A bare `df.head()` that is not the last expression of a cell is never rendered,
# so the preview has to be displayed explicitly.
display(df.head())
# Show missing values per column
display(df.isnull().sum())

# 2. 📊 Descriptive Statistics
Get an overview of the numeric columns in the dataset.

In [None]:
# Summarise the numeric columns only; report explicitly when there are none.
numeric_cols_desc = df.select_dtypes(include=np.number).columns
if numeric_cols_desc.empty:
    print("No numeric columns found for descriptive statistics.")
else:
    display(df[numeric_cols_desc].describe())

# 2b. 📦 MIC and IC50 Boxplots

Visualize the distribution of log₁₀-transformed MIC and IC50 values for each bacterial species and cell line.

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# --- Step 1: Identify MIC and IC50 columns for each species ---
# Accept both the "μM" and the ASCII "uM" unit spelling, as `is_activity` and
# `sort_corr_labels` do, so an ASCII-only header does not silently yield empty plots.
_MICROMOLAR_SPELLINGS = ("μM", "uM")


def _columns_with(marker):
    """Return the columns of `df` whose name contains `marker` and a micromolar unit."""
    return [
        col for col in df.columns
        if marker in col and any(unit in col for unit in _MICROMOLAR_SPELLINGS)
    ]


mic_cols = _columns_with("MIC")
ic50_cols = _columns_with("IC50")

# Extract species names for labels
def extract_species(col_list):
    """Return, for each column name, the text that precedes " MIC" / " IC50".

    Names containing neither marker are returned unchanged.
    """
    species = []
    for column in col_list:
        before_mic = column.partition(" MIC")[0]
        species.append(before_mic.partition(" IC50")[0])
    return species

# Axis labels, in the same order as the matching column lists.
mic_species, ic50_species = (extract_species(cols) for cols in (mic_cols, ic50_cols))

def italicize_labels(labels):
    r"""Wrap each label in Matplotlib mathtext italics.

    Mathtext ignores plain spaces, so "E. coli" would be drawn as "E.coli";
    every space is therefore written as the explicit spacing command ``\ ``.

    Parameters
    ----------
    labels : iterable of str
        Plain-text labels (e.g. species names).

    Returns
    -------
    list of str
        One ``$\it{...}$`` mathtext string per input label, in the same order.
    """
    return [r"$\it{" + label.replace(" ", r"\ ") + "}$" for label in labels]

# --- Step 2: Prepare log-transformed data ---
# The small offset keeps log10 finite for zero-valued measurements (0 -> -8).
mic_data_log = [np.log10(df[col].dropna() + 1e-8) for col in mic_cols]
ic50_data_log = [np.log10(df[col].dropna() + 1e-8) for col in ic50_cols]


def _plot_log_boxplot(data_log, tick_labels, title, ylabel):
    """Draw one boxplot figure, one box per entry of `data_log`.

    Nothing is drawn when `data_log` is empty; a short message is printed
    instead so a missing column match is not mistaken for a rendering problem.
    """
    if not data_log:
        print(f"No matching columns found; skipping plot: {title}")
        return
    plt.figure(figsize=(8, 5))
    plt.boxplot(data_log)
    # Boxes are positioned at 1..n, so the tick positions follow the same range.
    plt.xticks(range(1, len(data_log) + 1), tick_labels, fontsize=12)
    plt.title(title, fontsize=14)
    plt.ylabel(ylabel)
    plt.tight_layout()
    plt.show()


# --- Step 3: Plot MIC boxplot (log scale, italicized species) ---
_plot_log_boxplot(
    mic_data_log,
    italicize_labels(mic_species),
    "Distribution of log₁₀ MICs Across Bacterial Species",
    "log₁₀ MIC (μM)",
)

# --- Step 4: Plot IC50 boxplot (log scale, plain species) ---
_plot_log_boxplot(
    ic50_data_log,
    ic50_species,
    "Distribution of log₁₀ IC50 Values Across Cell Lines",
    "log₁₀ IC50 (μM)",
)

# 2c. 🧪 Normality Testing (Shapiro-Wilk)

This section performs the Shapiro-Wilk normality test on each numeric column (except ID columns). It also generates a histogram and Q-Q plot for each variable.

In [None]:
def is_id_column(col):
    """Return True when the column label looks like an identifier.

    A label qualifies when its lower-cased text starts with "id" or its
    upper-cased text is exactly "ID". Non-string labels are converted with str().
    """
    name = str(col)
    if name.lower().startswith('id'):
        return True
    return name.upper() == 'ID'

def normality_test(df, output_dir='normality_plots'):
    """Run the Shapiro-Wilk test on every non-ID numeric column and plot each one.

    For each tested column a histogram (with KDE) and a normal Q-Q plot are
    drawn side by side, saved as ``<column>_normality.png`` inside
    `output_dir`, and shown.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; only numeric columns not flagged by `is_id_column` are used.
    output_dir : str
        Directory for the PNG files; created if it does not exist.

    Returns
    -------
    pandas.DataFrame
        One row per tested column with the columns 'Variable', 'Statistic'
        and 'p-value'. The columns are present even when no column was tested.

    Notes
    -----
    Columns with fewer than three non-missing values are skipped with a
    message. An error while testing or plotting one column is printed and
    does not stop the remaining columns.
    """
    os.makedirs(output_dir, exist_ok=True)
    results = []
    numeric_cols = [col for col in df.select_dtypes(include=[np.number]).columns if not is_id_column(col)]
    for col in numeric_cols:
        data = df[col].dropna()
        if len(data) <= 2:
            print(f"Column {col} does not have enough data for the Shapiro-Wilk test.")
            continue
        fig = None
        try:
            stat, p = stats.shapiro(data)
            results.append({'Variable': col, 'Statistic': stat, 'p-value': p})
            # Plot
            fig, axes = plt.subplots(1, 2, figsize=(10, 4))
            sns.histplot(data, kde=True, ax=axes[0])
            axes[0].set_title(f'Histogram: {col}')
            stats.probplot(data, dist="norm", plot=axes[1])
            axes[1].set_title(f'Q-Q plot: {col}')
            plt.tight_layout()
            # Column names such as "IC50 (μg/mL)" contain characters that are path
            # separators or invalid in file names; replace only those characters.
            safe_name = ''.join('_' if ch in '/\\:*?"<>|' else ch for ch in str(col))
            plot_path = os.path.join(output_dir, f'{safe_name}_normality.png')
            plt.savefig(plot_path, dpi=150)
            plt.show()
        except Exception as e:
            print(f"Error processing {col}: {e}")
        finally:
            # Release the figure even when plotting or saving failed part-way.
            if fig is not None:
                plt.close(fig)
    # Explicit columns keep the result (and the exported CSV header) stable
    # when no column could be tested.
    results_df = pd.DataFrame(results, columns=['Variable', 'Statistic', 'p-value'])
    return results_df

# Test every eligible column (plots go to the default folder) and show the summary table.
normality_results = normality_test(df, output_dir='normality_plots')
display(normality_results)

# 3. 📈📉 Correlation Analysis (Spearman)

This section computes Spearman correlations for selected columns: those containing "PC_1", "PC_2", "PC_3", "MIC (μM)", "IC50", or "Hemolysis (max %)" in their names. Each pair is correlated using only the rows where both values are present.

In [None]:
def select_correlation_columns(df):
    """Return the columns of `df` that take part in the correlation analysis.

    A column is selected when its name contains any of the keywords below.
    Both the "μM" and the ASCII "uM" spelling of the MIC unit are accepted,
    matching what `is_activity` and `sort_corr_labels` recognise. Labels are
    compared via str() so a non-string column label cannot raise.

    Returns
    -------
    list
        Matching column labels, in the order they appear in `df.columns`.
    """
    keywords = ["PC_1", "PC_2", "PC_3", "MIC (μM)", "MIC (uM)", "IC50", "Hemolysis (max %)"]
    return [col for col in df.columns if any(k in str(col) for k in keywords)]

def spearman_correlation(df, cols):
    """Compute pairwise Spearman correlations between the columns in `cols`.

    Each pair uses only the rows where both columns are non-missing. A pair
    with fewer than two such rows is left as NaN in both matrices.

    Returns
    -------
    (corr_matrix, pval_matrix) : tuple of pandas.DataFrame
        Float matrices whose index and columns are both `cols`.
    """
    size = len(cols)
    rho_values = np.full((size, size), np.nan)
    p_values = np.full((size, size), np.nan)
    for row, first in enumerate(cols):
        for col, second in enumerate(cols):
            x, y = df[first], df[second]
            both_present = x.notna() & y.notna()
            if both_present.sum() <= 1:
                # Too few paired observations: keep the NaN placeholders.
                continue
            rho, p = stats.spearmanr(x[both_present], y[both_present])
            rho_values[row, col] = rho
            p_values[row, col] = p
    corr_matrix = pd.DataFrame(rho_values, index=cols, columns=cols)
    pval_matrix = pd.DataFrame(p_values, index=cols, columns=cols)
    return corr_matrix, pval_matrix

# Pick the analysis columns, stop early if none match, then correlate and show the matrix.
corr_cols = select_correlation_columns(df)
if len(corr_cols) == 0:
    raise ValueError("No columns found for correlation analysis.")
corr_matrix, pval_matrix = spearman_correlation(df, corr_cols)
display(corr_matrix)

# 4. 🔥 Visualization: Correlation Heatmap

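The heatmap below shows Spearman correlations between the biological activities (rows) and the principal components "PC_1", "PC_2", "PC_3" (columns). Significant correlations are annotated with asterisks (* p<0.05, ** p<0.01, *** p<0.001). Rows are sorted with the MIC columns first, followed by IC50 and hemolysis.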

In [None]:
from matplotlib.colors import ListedColormap

# Define custom color palette: ten discrete steps from red, through pale yellow, to dark green.
# NOTE(review): `plot_heatmap` passes cmap="RdYlGn" rather than `custom_cmap` — confirm
# whether this palette is still needed.
custom_colors = [
    '#d73027',
    '#f46d43',
    '#fdae61',
    '#fee08b',
    '#ffffbf',
    '#d9ef8b',
    '#a6d96a',
    '#66bd63',
    '#1a9850',
    '#006837',
]
custom_cmap = ListedColormap(custom_colors)

# Helper functions to identify PCs and activities
def is_pc(col):
    """Return True when the column name contains "PC_1", "PC_2" or "PC_3"."""
    for marker in ("PC_1", "PC_2", "PC_3"):
        if marker in col:
            return True
    return False

def is_activity(col):
    """Return True when the column name denotes a biological activity.

    That is: a MIC column (either "MIC (uM)" or "MIC (μM)" spelling), an IC50
    column that is not also a principal-component column, or the
    "Hemolysis (max %)" column.
    """
    # Accept both "MIC (uM)" and "MIC (μM)" spellings
    if "MIC (uM)" in col or "MIC (μM)" in col:
        return True
    if "IC50" in col and not is_pc(col):
        return True
    return "Hemolysis (max %)" in col

def sort_corr_labels(labels):
    """Return `labels` sorted by group, then alphabetically within each group.

    Group order: MIC columns, IC50 / hemolysis columns, PC_1, PC_2, PC_3,
    and finally everything else. The first matching group wins.
    """
    group_markers = (
        (("MIC (uM)", "MIC (μM)"), 0),
        (("IC50", "Hemolysis (max %)"), 1),
        (("PC_1",), 2),
        (("PC_2",), 3),
        (("PC_3",), 4),
    )

    def sort_key(label):
        for markers, rank in group_markers:
            if any(marker in label for marker in markers):
                return (rank, label)
        return (5, label)

    return sorted(labels, key=sort_key)

def annotate_significance(ax, corr, pvals):
    """Write each correlation value, with significance stars, at the centre of its cell.

    Stars: *** for p < 0.001, ** for p < 0.01, * for p < 0.05, none otherwise.
    Missing correlations get an empty label. Text is white when the absolute
    correlation exceeds 0.5 and black otherwise.
    """
    star_cutoffs = ((0.001, '***'), (0.01, '**'), (0.05, '*'))
    n_rows, n_cols = corr.shape
    for row in range(n_rows):
        for col in range(n_cols):
            rho = corr.iloc[row, col]
            p_value = pvals.iloc[row, col]
            if pd.isnull(rho):
                text = ''
            else:
                # First (strictest) cutoff the p-value falls below decides the stars.
                stars = next((mark for cutoff, mark in star_cutoffs if p_value < cutoff), '')
                text = f'{rho:.2f}' + stars
            fontcolor = 'white' if abs(rho) > 0.5 else 'black'
            # Cell (row, col) spans [col, col+1] x [row, row+1]; +0.5 centres the text.
            ax.text(col+0.5, row+0.5, text, ha='center', va='center', color=fontcolor, fontsize=10)

def _draw_group_bracket(ax, x_line, y_start, y_end, text):
    """Draw a vertical bracket with end caps at `x_line` and a rotated label beside it."""
    ax.plot([x_line, x_line], [y_start, y_end], color='black', linewidth=1, clip_on=False)
    for y_cap in (y_start, y_end):
        ax.plot([x_line-0.1, x_line+0.1], [y_cap, y_cap], color='black', linewidth=1, clip_on=False)
    plt.text(x_line-0.2, (y_start + y_end)/2, text, rotation=90, fontsize=10, va='center', ha='center')


def plot_heatmap(corr_matrix, pval_matrix, filename='correlation_heatmap.png'):
    r"""Plot an annotated Spearman correlation heatmap and save it to `filename`.

    Rows are reordered with `sort_corr_labels`. Each cell is annotated by
    `annotate_significance`. Brackets to the left of the axes mark the
    "Antimicrobial" rows (labels containing a MIC unit) and the remaining
    "Cytotoxicity" rows. Known bacterial species names in the y tick labels
    are set in italics.

    Parameters
    ----------
    corr_matrix : pandas.DataFrame
        Correlation coefficients (rows = activities, columns = PCs).
    pval_matrix : pandas.DataFrame
        P-values with the same labels as `corr_matrix`.
    filename : str
        Output image path (saved at 300 dpi).

    Returns
    -------
    None
        Returns early, after printing a message, when `corr_matrix` is empty
        or entirely NaN; nothing is plotted or saved in that case.
    """
    # Print debug info
    print("corr_matrix index (rows):", list(corr_matrix.index))
    print("corr_matrix columns:", list(corr_matrix.columns))
    if corr_matrix.empty or corr_matrix.isnull().all().all():
        print("The correlation DataFrame is empty or all values are NaN. Please check your data filtering and input.")
        return

    # Sort labels for grouping
    labels = sort_corr_labels(list(corr_matrix.index))
    corr_matrix = corr_matrix.loc[labels]
    pval_matrix = pval_matrix.loc[labels]

    plt.figure(figsize=(7, 6))
    ax = sns.heatmap(
        corr_matrix,
        annot=False,
        cmap="RdYlGn",
        center=0,
        vmin=-1, vmax=1,
        cbar_kws={'label': "Spearman's Rho", 'shrink': 0.3},
        square=True,
        linewidths=0.5
    )
    annotate_significance(ax, corr_matrix, pval_matrix)
    plt.xticks(rotation=45, ha='right', fontsize=10)
    plt.yticks(rotation=0, fontsize=10)
    plt.xlabel('Principal Components', fontweight="bold", fontsize=10)
    # The large labelpad leaves room for the group brackets drawn below.
    plt.ylabel('Biological Activities', fontweight="bold", fontsize=10, labelpad=40)

    # --- Add vertical lines and sublabels for groupings ---
    # Count bacteria as those with "MIC (uM)" or "MIC (μM)" in label
    bacteria_labels = [l for l in labels if ("MIC (uM)" in l or "MIC (μM)" in l)]
    n_bacteria = len(bacteria_labels)
    n_cytotox = corr_matrix.shape[0] - n_bacteria  # Cytotoxicity group includes Hemolysis

    # Bracket x-position in data coordinates, i.e. three cell widths left of the heatmap.
    x_line = -3

    # "Bacteria" group (first n_bacteria rows); 0.3 insets keep adjacent brackets apart.
    if n_bacteria > 0:
        _draw_group_bracket(ax, x_line, 0.3, n_bacteria - 0.3, "Antimicrobial")

    # "Cytotoxicity" group (remaining rows, including Hemolysis)
    if n_cytotox > 0:
        _draw_group_bracket(ax, x_line, n_bacteria + 0.3, n_bacteria + n_cytotox - 0.3, "Cytotoxicity")

    # --- Italicize only bacteria species names in y-tick labels ---
    bacteria_species = ["E. coli", "K. pneumoniae", "P. aeruginosa", "S. aureus"]
    yticklabels = [label.get_text() for label in ax.get_yticklabels()]
    new_labels = []
    for label in yticklabels:
        for species in bacteria_species:
            if species in label:
                # Mathtext ignores plain spaces ("E. coli" would render as "E.coli"),
                # so the space is written as the explicit spacing command "\ ".
                label = label.replace(species, r"$\it{" + species.replace(" ", r"\ ") + "}$")
        new_labels.append(label)
    ax.set_yticklabels(new_labels, fontsize=10)

    plt.tight_layout()
    plt.savefig(filename, dpi=300)
    plt.show()

# Split the full correlation matrix into PC columns (X) and activity rows (Y),
# keeping only labels that actually exist in the matrix.
all_cols = list(corr_matrix.columns)
pc_cols = [col for col in all_cols if is_pc(col) and col in corr_matrix.columns]
activity_cols = [col for col in all_cols if is_activity(col) and col in corr_matrix.index]

# Take the same activities-by-PCs slice from both matrices
corr_filtered, pval_filtered = (
    matrix.loc[activity_cols, pc_cols] for matrix in (corr_matrix, pval_matrix)
)

# Plot heatmap
plot_heatmap(corr_filtered, pval_filtered)

# 5. 💾 Export Results

Download the normality test results, the correlation and p-value matrices, and the heatmap image.

In [None]:
# Save and offer downloads
normality_results.to_csv('normality_results.csv', index=False)
corr_matrix.to_csv('spearman_correlation_matrix.csv')
pval_matrix.to_csv('spearman_pvalues_matrix.csv')
from google.colab import files

export_paths = [
    'normality_results.csv',
    'spearman_correlation_matrix.csv',
    'spearman_pvalues_matrix.csv',
    'correlation_heatmap.png',
]
for export_path in export_paths:
    # The heatmap image is only written when `plot_heatmap` had data to draw;
    # skip missing files instead of failing the whole export cell.
    if os.path.exists(export_path):
        files.download(export_path)
    else:
        print(f"Skipping download, file not found: {export_path}")

# 6. 📖 Summary & Usage Instructions

This notebook provides a complete workflow for normality testing and correlation analysis:

- **Data Upload:** Upload your CSV file with numeric columns.
- **Normality Testing:** Shapiro-Wilk test for each variable, with plots.
- **Correlation Analysis:** Spearman correlation for selected variables ("MIC (μM)", "IC50", "Hemolysis (max %)", "PC_1", "PC_2", "PC_3").
- **Visualization:** Heatmap with significance annotations.
- **Export:** Download the normality results, the correlation and p-value matrices, and the heatmap image.

**Tip:** For best results, use Google Colab. If you encounter errors, check your file format and column names.