# Volcano plots from ANCOMBC-2 differential abundance

In [5]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
import glob
from adjustText import adjust_text


In [6]:
# Set up input/output locations (paths are relative to the notebook's working directory)
input_dir = "../Data/Differential_Abundance"
outdir = "../Figures/Supplementary/Revisions/"
os.makedirs(outdir, exist_ok=True)

# Find all *_ancombc2_all.tsv files (Umtata, Cape Town, etc.).
# glob returns matches in arbitrary, OS-dependent order, so sort them to make
# the processing order (and the printed log) reproducible between runs.
files = sorted(glob.glob(os.path.join(input_dir, "*_ancombc2_all.tsv")))

In [7]:
def _volcano_colors(comparison_type):
    """Return ``(up_color, down_color, up_label, down_label)`` for a comparison."""
    color_h = "#87CEEB"      # light blue (sky blue)
    color_adnl = "#FFB366"   # light orange
    color_adl = "#FA8072"    # darker salmon

    # Rules are checked in order; the first rule with a matching substring wins.
    rules = (
        (("case-nonlesional_skin", "ADNL_vs_H"),
         (color_adnl, color_h, "ADNL", "H")),
        (("case-lesional_skin", "ADL_vs_H"),
         (color_adl, color_h, "ADL", "H")),
        (("ADL_vs_ADNL",),
         (color_adl, color_adnl, "ADL", "ADNL")),
    )
    for needles, mapping in rules:
        if any(needle in comparison_type for needle in needles):
            return mapping

    # Unrecognised (or default, empty) comparison: fall back to neutral colors
    # and generic labels instead of failing on unassigned variables.
    return ("#D62728", "#1F77B4", "Up", "Down")


def plot_volcano(df, feature_col, lfc_col, p_col, title, outfile,
                 comparison_type="",
                 lfc_thresh=1.0, p_thresh=0.05):
    """
    Make a volcano plot with annotated significant features and save it as JPEG.

    Rows with a missing feature name, LFC or p-value are dropped. A feature is
    coloured and labelled only when it passes BOTH thresholds
    (``p < p_thresh`` and ``|LFC| > lfc_thresh``); everything else is grey.

    Parameters:
    -----------
    df : pandas.DataFrame
        Table containing at least ``feature_col``, ``lfc_col`` and ``p_col``.
    feature_col : str
        Column holding feature names. ``"g__"`` is stripped and underscores
        are replaced with spaces before the names are used as labels.
    lfc_col : str
        Column holding log-fold changes (x axis).
    p_col : str
        Column holding p-values (y axis is ``-log10`` of this column).
    title : str
        Plot title.
    outfile : str
        Destination path; the figure is written in JPEG format at 600 dpi.
        A missing parent directory is created.
    comparison_type : str
        Type of comparison to determine color mapping (substring match):
        - "ADNL_vs_H" or "case-nonlesional_skin":
          ADNL (positive LFC) = light orange, H (negative LFC) = light blue
        - "ADL_vs_H" or "case-lesional_skin":
          ADL (positive LFC) = dark salmon, H (negative LFC) = light blue
        - "ADL_vs_ADNL":
          ADL (positive LFC) = dark salmon, ADNL (negative LFC) = light orange
        Any other value (including the default "") uses neutral red/blue
        with the labels "Up"/"Down".
    lfc_thresh : float
        Log-fold change threshold (default 1.0)
    p_thresh : float
        P-value threshold (default 0.05)
    """
    from matplotlib.lines import Line2D

    up_color, down_color, up_label, down_label = _volcano_colors(comparison_type)

    d = df[[feature_col, lfc_col, p_col]].dropna().copy()

    # Clean feature names: drop the 'g__' prefix and turn underscores into
    # spaces. astype(str) keeps the .str accessor usable for non-string IDs.
    d[feature_col] = (
        d[feature_col]
        .astype(str)
        .str.replace("g__", "", regex=False)
        .str.replace("_", " ", regex=False)
    )

    # A p-value of exactly 0 would give -log10(p) = inf, which cannot be
    # plotted or labelled; floor it at the smallest positive normal float.
    floored_p = d[p_col].clip(lower=np.finfo(float).tiny)
    d["neg_log10_p"] = -np.log10(floored_p)

    # Only mark as significant if BOTH thresholds are passed
    passes_p = d[p_col] < p_thresh
    d["sig"] = "ns"
    d.loc[passes_p & (d[lfc_col] > lfc_thresh), "sig"] = "up"
    d.loc[passes_p & (d[lfc_col] < -lfc_thresh), "sig"] = "down"

    palette = {
        "up": up_color,
        "down": down_color,
        "ns": "lightgrey"
    }

    # Make sure the destination directory exists before drawing anything
    out_parent = os.path.dirname(outfile)
    if out_parent:
        os.makedirs(out_parent, exist_ok=True)

    fig, ax = plt.subplots(figsize=(6, 6))
    try:
        sns.scatterplot(
            data=d,
            x=lfc_col,
            y="neg_log10_p",
            hue="sig",
            hue_order=["up", "down", "ns"],  # Explicit order
            palette=palette,
            alpha=0.8,
            legend=False,  # replaced by the custom legend below
            ax=ax
        )

        # Annotate significant taxa (only those passing both thresholds)
        sig_rows = d[d["sig"] != "ns"]
        texts = [
            ax.text(x, y, label, fontsize=8, color="black")
            for x, y, label in zip(
                sig_rows[lfc_col], sig_rows["neg_log10_p"], sig_rows[feature_col]
            )
        ]

        # Push labels apart to reduce overlap and add thin connector lines
        if texts:
            adjust_text(
                texts,
                ax=ax,
                expand_points=(2.0, 2.0),
                expand_text=(2.0, 2.0),
                force_points=2.0,
                force_text=2.0,
                only_move={'points': 'xy', 'text': 'xy'},
                lim=5000,
                arrowprops=dict(
                    arrowstyle="-",        # straight line, no arrowhead
                    color="grey",
                    lw=0.5,                # thin line
                    alpha=0.6
                )
            )

        # Add threshold lines
        for x in (-lfc_thresh, lfc_thresh):
            ax.axvline(x, ls="--", color="grey", lw=1)
        ax.axhline(-np.log10(p_thresh), ls="--", color="grey", lw=1)

        # Label axes and title
        ax.set_xlabel("Log₂ Fold Change", fontsize=12)
        ax.set_ylabel("-log₁₀(p-value)", fontsize=12)
        ax.set_title(title, fontsize=14)

        # Custom legend with circle markers for the two significant classes
        legend_elements = [
            Line2D([0], [0], marker='o', color='w', markerfacecolor=up_color,
                   markersize=8, alpha=0.8, label=up_label),
            Line2D([0], [0], marker='o', color='w', markerfacecolor=down_color,
                   markersize=8, alpha=0.8, label=down_label)
        ]
        ax.legend(
            handles=legend_elements,
            title="DA Association",
            loc="upper right",
            frameon=False,
            fontsize=9,
            title_fontsize=10
        )

        fig.tight_layout()
        fig.savefig(outfile, dpi=600, bbox_inches="tight", format="jpg")
    finally:
        # Always release the figure, even if plotting or saving failed
        plt.close(fig)

    print(f"Saved: {outfile}")

In [8]:
# ---------------------------------------------------------
# Iterate over all regions (Umtata, Cape Town, etc.)
# ---------------------------------------------------------
# Say so explicitly when there is nothing to process instead of finishing silently
if not files:
    print("No input files to process; nothing to plot.")

for fpath in files:
    region_name = os.path.basename(fpath).replace("_ancombc2_all.tsv", "")
    print(f"\nProcessing {region_name}...")

    df = pd.read_csv(fpath, sep="\t")

    # Per-group LFC columns; df.get falls back to the scalar 0 when a column
    # is absent from the table.
    lfc_adl = df.get("lfc_case_typecase-lesional_skin", 0)
    lfc_adnl = df.get("lfc_case_typecase-nonlesional_skin", 0)
    lfc_h = df.get("lfc_case_typecontrol-nonlesional_skin", 0)

    # Compute contrasts as differences of the per-group LFCs
    df["lfc_case-lesional_vs_control-nonlesional"] = lfc_adl - lfc_h
    df["lfc_case-nonlesional_vs_control-nonlesional"] = lfc_adnl - lfc_h
    df["lfc_case-lesional_vs_case-nonlesional"] = lfc_adl - lfc_adnl

    # p-values
    # NOTE(review): these are copied from single-group p columns, while the
    # LFCs above are differences of two columns. They describe the same test
    # only if the subtracted column is absent (df.get -> 0) — confirm against
    # the ANCOM-BC2 output. In particular the ADL-vs-ADNL column below reuses
    # the same source column as ADL-vs-H, so it is not a p-value for that
    # contrast; replace it before re-enabling the ADL-vs-ADNL plot.
    p_adl = df.get("p_case_typecase-lesional_skin", np.nan)
    p_adnl = df.get("p_case_typecase-nonlesional_skin", np.nan)
    df["p_case-lesional_vs_control-nonlesional"] = p_adl
    df["p_case-nonlesional_vs_control-nonlesional"] = p_adnl
    df["p_case-lesional_vs_case-nonlesional"] = p_adl

    # Identify feature name column
    feature_col = "taxon" if "taxon" in df.columns else df.columns[0]

    # Remove g__ASV-# and g___ASV-# features.
    # na=False keeps the mask boolean when a feature name is missing;
    # otherwise `~` fails on the resulting object-dtype Series.
    is_asv_placeholder = df[feature_col].str.match(r"^g___?ASV-\d+$", na=False)
    df = df[~is_asv_placeholder].copy()

    # ---------------------------------------------------------
    # SAVE ONLY ADL_vs_H plots
    # ---------------------------------------------------------

    if "Umtata" in region_name:
        # Umtata → Suppl_7A.jpg
        outfile = "../Figures/Supplementary/Suppl_Fig_7A.jpg"
    elif "Cape" in region_name:
        # Cape Town → Suppl_7B.jpg ("Cape" also matches "CapeTown")
        outfile = "../Figures/Supplementary/Suppl_Fig_7B.jpg"
    else:
        outfile = os.path.join(outdir, f"Suppl_{region_name}_ADL_vs_H.jpg")

    # ADL vs H
    plot_volcano(
        df, feature_col,
        "lfc_case-lesional_vs_control-nonlesional",
        "p_case-lesional_vs_control-nonlesional",
        f"{region_name}: ADL vs H",
        outfile,
        comparison_type="ADL_vs_H"
    )

    # ---------------------------------------------------------
    # OTHER PLOTS (disabled)
    # ---------------------------------------------------------

    # # ADNL vs H
    # plot_volcano(
    #     df, feature_col,
    #     "lfc_case-nonlesional_vs_control-nonlesional",
    #     "p_case-nonlesional_vs_control-nonlesional",
    #     f"{region_name}: ADNL vs H",
    #     os.path.join(outdir, f"Suppl_{region_name}_volcano_ADNL_vs_H_skin.jpg"),
    #     comparison_type="ADNL_vs_H"
    # )
    #
    # # ADL vs ADNL
    # plot_volcano(
    #     df, feature_col,
    #     "lfc_case-lesional_vs_case-nonlesional",
    #     "p_case-lesional_vs_case-nonlesional",
    #     f"{region_name}: ADL vs ADNL",
    #     os.path.join(outdir, f"Suppl_{region_name}_volcano_ADL_vs_ADNL_skin.jpg"),
    #     comparison_type="ADL_vs_ADNL"
    # )

print("\nAll done!")



Processing Cape Town...
Saved: ../Figures/Supplementary/Suppl_Fig_7B.jpg

Processing Umtata...
Saved: ../Figures/Supplementary/Suppl_Fig_7A.jpg

All done!
