In [None]:
# ============================================================
# Clinical AMR Analysis ‚Äî Global Imports
# ============================================================

# Core data handling
import pandas as pd
import numpy as np

# Statistics
from scipy.stats import chi2_contingency

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns

# Utilities
from collections import Counter
import itertools
import warnings

# Display & warnings
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 50)
warnings.filterwarnings("ignore")

print("‚úÖ All libraries loaded successfully.")


‚úÖ All libraries loaded successfully.


# ============================================================
# STEP 1 ‚Äî Clinical AMR data loading & validation
# ============================================================

In [None]:

import pandas as pd
import numpy as np
from google.colab import drive

# ---- MOUNT GOOGLE DRIVE ----
drive.mount("/content/drive")

# ---- LOAD FILE ----
# Load Excel file
file_path = "/content/drive/MyDrive/Gayathri_cleaned.xlsx"
df_raw = pd.read_excel(file_path)

# ---- STANDARDIZE COLUMN NAMES ----
df_raw.columns = (
    df_raw.columns
        .str.strip()
        .str.upper()
        .str.replace(" ", "_")
        .str.replace("/", "_")
)

# ---- RENAME COMMON VARIANTS (safety) ----
df_raw = df_raw.rename(columns={
    "S.NO": "SNO",
    "SAMPLE_TYPE": "SAMPLE_TYPE",
    "AMPICILLIN": "AMP",
    "AMOXYCLAV": "AMC",
    "TICARCILLIN": "TIC",
    "PIPERACILLIN_TAZOBACTAM": "PIP_TAZ",
    "CEFALOTIN": "CEFA",
    "CEFOXITIN": "CX",
    "CEFIXIME": "CFM",
    "CEFTAZIDIME": "CAZ",
    "CEFTRIAXONE": "CTRI",
    "ERTAPENAM": "ERT",
    "AMIKACIN": "AK",
    "GENTAMICIN": "GEN",
    "NALIDIXIC_ACID": "NA",
    "CIPROFLOXACIN": "CIP",
    "NORFLOXACIN": "NOR",
    "OFLOXACIN": "OF",
    "FOSFOMYCIN": "FOS",
    "NITROFURANTOIN": "NIT",
    "MAR": "MAR_INDEX"
})

# ---- IDENTIFY METADATA VS ANTIBIOTICS ----
metadata_cols = [
    "SNO", "SAMPLE_TYPE", "GENDER", "ESBL", "MDR", "MAR_INDEX"
]

antibiotic_cols = [
    c for c in df_raw.columns
    if c not in metadata_cols
]

# ---- NORMALIZE S / I / R VALUES ----
valid_vals = {"S", "I", "R"}

for col in antibiotic_cols:
    df_raw[col] = (
        df_raw[col]
            .astype(str)
            .str.strip()
            .str.upper()
            .replace({"": np.nan})
    )

    if not df_raw[col].dropna().isin(valid_vals).all():
        print(f"‚ö†Ô∏è Warning: unexpected values in {col}")

# ---- CLEAN METADATA ----
df_raw["GENDER"] = df_raw["GENDER"].astype(str).str.upper()
df_raw["ESBL"] = df_raw["ESBL"].astype(str).str.upper()
df_raw["MDR"] = df_raw["MDR"].astype(str).str.upper()
df_raw["MAR_INDEX"] = pd.to_numeric(df_raw["MAR_INDEX"], errors="coerce")

# ---- FINAL CANONICAL DATAFRAME ----
df = df_raw.copy()

print("‚úÖ AMR dataset loaded and validated")
print(f"Total isolates: {df.shape[0]}")
print(f"Antibiotics tested: {len(antibiotic_cols)}")

print("\nAntibiotics detected:")
print(antibiotic_cols)

display(df.head())


Mounted at /content/drive
‚úÖ AMR dataset loaded and validated
Total isolates: 218
Antibiotics tested: 19

Antibiotics detected:
[nan, 'AMP', 'AMC', 'TIC', 'PIP_TAZ', 'CEFA', 'CX', 'CFM', 'CAZ', 'CTRI', 'ERT', 'AK', 'GEN', 'NA', 'CIP', 'NOR', 'OF', 'FOS', 'NIT']


Unnamed: 0,NaN,SNO,SAMPLE_TYPE,GENDER,ESBL,AMP,AMC,TIC,PIP_TAZ,CEFA,CX,CFM,CAZ,CTRI,ERT,AK,GEN,NA,CIP,NOR,OF,FOS,NIT,MDR,MAR_INDEX
0,1,1,URINE,M,NO,R,I,R,S,R,S,R,S,R,S,S,S,R,R,R,R,S,S,YES,0.5
1,2,2,URINE,F,NO,S,S,S,S,S,S,S,S,S,S,S,S,R,I,S,S,S,S,NO,0.055556
2,3,3,URINE,M,NO,R,S,R,S,R,S,R,S,R,S,S,S,R,R,R,R,S,R,YES,0.555556
3,4,4,URINE,F,NO,R,I,R,S,R,S,R,S,R,S,S,S,R,R,R,R,S,S,YES,0.5
4,5,5,URINE,F,NO,S,S,S,S,S,S,S,S,S,S,S,S,S,S,S,S,S,S,NO,0.0


# ============================================================
# STEP 1.1 ‚Äî Fix Excel artifacts (safe cleanup)
# ============================================================

In [None]:
# 1Ô∏è‚É£ Drop unnamed / NaN columns
df = df.loc[:, df.columns.notna()]

# 2Ô∏è‚É£ Clean PIP_TAZ values explicitly
if "PIP_TAZ" in df.columns:
    df["PIP_TAZ"] = (
        df["PIP_TAZ"]
            .astype(str)
            .str.strip()
            .str.upper()
            .replace({
                "CARB YES": "R",   # clinically resistant
                "YES": "R",
                "NO": "S"
            })
    )

# 3Ô∏è‚É£ Final validation check
valid_vals = {"S", "I", "R"}

for col in df.columns:
    if col not in ["SNO", "SAMPLE_TYPE", "GENDER", "ESBL", "MDR", "MAR_INDEX"]:
        bad = df[col].dropna().unique()
        bad = [v for v in bad if v not in valid_vals]
        if bad:
            print(f"‚ö†Ô∏è Remaining unexpected values in {col}: {bad}")

print("‚úÖ Cleanup completed")
display(df.head())

if "PIP_TAZ" in df.columns:
    df["PIP_TAZ"] = (
        df["PIP_TAZ"]
            .replace({"SDD": "I"})
    )

print("‚úÖ SDD values converted to Intermediate (I)")

# Final verification
valid_vals = {"S", "I", "R"}
for col in df.columns:
    if col not in ["SNO", "SAMPLE_TYPE", "GENDER", "ESBL", "MDR", "MAR_INDEX"]:
        bad = df[col].dropna().unique()
        bad = [v for v in bad if v not in valid_vals]
        if bad:
            print(f"‚ö†Ô∏è Unexpected values remain in {col}: {bad}")

display(df.head())

‚ö†Ô∏è Remaining unexpected values in PIP_TAZ: ['SDD']
‚úÖ Cleanup completed


Unnamed: 0,SNO,SAMPLE_TYPE,GENDER,ESBL,AMP,AMC,TIC,PIP_TAZ,CEFA,CX,CFM,CAZ,CTRI,ERT,AK,GEN,NA,CIP,NOR,OF,FOS,NIT,MDR,MAR_INDEX
0,1,URINE,M,NO,R,I,R,S,R,S,R,S,R,S,S,S,R,R,R,R,S,S,YES,0.5
1,2,URINE,F,NO,S,S,S,S,S,S,S,S,S,S,S,S,R,I,S,S,S,S,NO,0.055556
2,3,URINE,M,NO,R,S,R,S,R,S,R,S,R,S,S,S,R,R,R,R,S,R,YES,0.555556
3,4,URINE,F,NO,R,I,R,S,R,S,R,S,R,S,S,S,R,R,R,R,S,S,YES,0.5
4,5,URINE,F,NO,S,S,S,S,S,S,S,S,S,S,S,S,S,S,S,S,S,S,NO,0.0


‚úÖ SDD values converted to Intermediate (I)


Unnamed: 0,SNO,SAMPLE_TYPE,GENDER,ESBL,AMP,AMC,TIC,PIP_TAZ,CEFA,CX,CFM,CAZ,CTRI,ERT,AK,GEN,NA,CIP,NOR,OF,FOS,NIT,MDR,MAR_INDEX
0,1,URINE,M,NO,R,I,R,S,R,S,R,S,R,S,S,S,R,R,R,R,S,S,YES,0.5
1,2,URINE,F,NO,S,S,S,S,S,S,S,S,S,S,S,S,R,I,S,S,S,S,NO,0.055556
2,3,URINE,M,NO,R,S,R,S,R,S,R,S,R,S,S,S,R,R,R,R,S,R,YES,0.555556
3,4,URINE,F,NO,R,I,R,S,R,S,R,S,R,S,S,S,R,R,R,R,S,S,YES,0.5
4,5,URINE,F,NO,S,S,S,S,S,S,S,S,S,S,S,S,S,S,S,S,S,S,NO,0.0


# ============================================================
# STEP 2 ‚Äî Encode S / I / R and reshape to long format
# ============================================================

In [None]:

# ---- Define encoding ----
sir_encoding = {
    "S": 0.0,
    "I": 0.5,   # includes SDD
    "R": 1.0
}

# ---- Identify antibiotic columns again (safety) ----
metadata_cols = ["SNO", "SAMPLE_TYPE", "GENDER", "ESBL", "MDR", "MAR_INDEX"]
antibiotic_cols = [c for c in df.columns if c not in metadata_cols]

# ---- Encode S / I / R ----
df_encoded = df.copy()

for col in antibiotic_cols:
    df_encoded[col] = df_encoded[col].map(sir_encoding)

# ---- Convert to long format ----
df_long = df_encoded.melt(
    id_vars=metadata_cols,
    value_vars=antibiotic_cols,
    var_name="ANTIBIOTIC",
    value_name="RESISTANCE_SCORE"
)

# ---- Sanity checks ----
print("‚úÖ STEP 2 completed")
print(f"Long-format rows: {df_long.shape[0]}")
print(f"Unique antibiotics: {df_long['ANTIBIOTIC'].nunique()}")

print("\nResistance score distribution:")
display(df_long["RESISTANCE_SCORE"].value_counts().sort_index())

display(df_long.head())



‚úÖ STEP 2 completed
Long-format rows: 3924
Unique antibiotics: 18

Resistance score distribution:


Unnamed: 0_level_0,count
RESISTANCE_SCORE,Unnamed: 1_level_1
0.0,2204
0.5,101
1.0,1619


Unnamed: 0,SNO,SAMPLE_TYPE,GENDER,ESBL,MDR,MAR_INDEX,ANTIBIOTIC,RESISTANCE_SCORE
0,1,URINE,M,NO,YES,0.5,AMP,1.0
1,2,URINE,F,NO,NO,0.055556,AMP,0.0
2,3,URINE,M,NO,YES,0.555556,AMP,1.0
3,4,URINE,F,NO,YES,0.5,AMP,1.0
4,5,URINE,F,NO,NO,0.0,AMP,0.0


# ============================================================
# STEP 3 ‚Äî Antibiotic-wise resistance rates (Plotly)
# ============================================================

In [None]:


import plotly.express as px

# ---- Label resistance states ----
def label_resistance(score):
    if score == 1.0:
        return "Resistant"
    elif score == 0.5:
        return "Intermediate"
    else:
        return "Sensitive"

df_long["RESISTANCE_LABEL"] = df_long["RESISTANCE_SCORE"].apply(label_resistance)

# ---- Calculate percentages per antibiotic ----
resistance_summary = (
    df_long
        .groupby(["ANTIBIOTIC", "RESISTANCE_LABEL"])
        .size()
        .reset_index(name="COUNT")
)

# Total isolates per antibiotic
totals = (
    df_long
        .groupby("ANTIBIOTIC")
        .size()
        .reset_index(name="TOTAL")
)

resistance_summary = resistance_summary.merge(totals, on="ANTIBIOTIC")
resistance_summary["PERCENT"] = (
    resistance_summary["COUNT"] / resistance_summary["TOTAL"] * 100
)

# ---- Sort antibiotics by % Resistant (descending) ----
resistant_order = (
    resistance_summary
        .query("RESISTANCE_LABEL == 'Resistant'")
        .sort_values("PERCENT", ascending=False)["ANTIBIOTIC"]
)

# ---- Plotly stacked bar chart ----
fig = px.bar(
    resistance_summary,
    x="ANTIBIOTIC",
    y="PERCENT",
    color="RESISTANCE_LABEL",
    category_orders={"ANTIBIOTIC": resistant_order},
    color_discrete_map={
        "Resistant": "#d62728",
        "Intermediate": "#ff7f0e",
        "Sensitive": "#2ca02c"
    },
    title="Antibiotic-wise Resistance Profile (%)",
    labels={"PERCENT": "Percentage of Isolates (%)", "ANTIBIOTIC": "Antibiotic"}
)

fig.update_layout(
    xaxis_tickangle=-45,
    yaxis_range=[0, 100],
    legend_title="Susceptibility",
    template="plotly_white"
)

fig.show()

# ---- Display summary table ----
print("üìä Resistance summary table:")
display(
    resistance_summary
        .pivot(index="ANTIBIOTIC", columns="RESISTANCE_LABEL", values="PERCENT")
        .fillna(0)
        .round(2)
)



üìä Resistance summary table:


RESISTANCE_LABEL,Intermediate,Resistant,Sensitive
ANTIBIOTIC,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AK,0.0,3.67,96.33
AMC,16.51,26.61,56.88
AMP,1.83,75.23,22.94
CAZ,1.38,34.86,63.76
CEFA,2.75,70.18,27.06
CFM,0.0,68.35,31.65
CIP,7.34,55.96,36.7
CTRI,0.0,62.84,37.16
CX,5.05,22.94,72.02
ERT,0.0,10.09,89.91


# ============================================================
# STEP 4A ‚Äî MDR & ESBL prevalence (Plotly)
# ============================================================

In [None]:
import plotly.express as px

# ---- Normalize MDR / ESBL labels ----
df_pop = df.copy()

df_pop["MDR"] = df_pop["MDR"].str.upper().replace({"YES": "MDR+", "NO": "MDR-"})
df_pop["ESBL"] = df_pop["ESBL"].str.upper().replace({"YES": "ESBL+", "NO": "ESBL-"})

# ---- MDR prevalence ----
mdr_counts = df_pop["MDR"].value_counts().reset_index()
mdr_counts.columns = ["STATUS", "COUNT"]

fig_mdr = px.pie(
    mdr_counts,
    names="STATUS",
    values="COUNT",
    hole=0.4,
    color="STATUS",
    color_discrete_map={"MDR+": "#d62728", "MDR-": "#2ca02c"},
    title="MDR Prevalence"
)

fig_mdr.update_traces(
    textinfo="percent+label",
    pull=[0.05, 0]  # slightly highlight MDR+
)

fig_mdr.update_layout(template="plotly_white")
fig_mdr.show()

# ---- ESBL prevalence ----
esbl_counts = df_pop["ESBL"].value_counts().reset_index()
esbl_counts.columns = ["STATUS", "COUNT"]
esbl_counts["PERCENT"] = esbl_counts["COUNT"] / esbl_counts["COUNT"].sum() * 100

fig_esbl = px.bar(
    esbl_counts,
    x="STATUS",
    y="PERCENT",
    text=esbl_counts["PERCENT"].round(1),
    color="STATUS",
    color_discrete_map={"ESBL+": "#9467bd", "ESBL-": "#7f7f7f"},
    title="ESBL Prevalence (%)"
)

fig_esbl.update_layout(
    yaxis_range=[0, 100],
    showlegend=False,
    template="plotly_white"
)

fig_esbl.show()

# ---- Optional: MDR by Gender ----
mdr_gender = (
    df_pop
        .groupby(["GENDER", "MDR"])
        .size()
        .reset_index(name="COUNT")
)

mdr_gender["PERCENT"] = (
    mdr_gender["COUNT"] /
    mdr_gender.groupby("GENDER")["COUNT"].transform("sum") * 100
)

fig_gender = px.bar(
    mdr_gender,
    x="GENDER",
    y="PERCENT",
    color="MDR",
    barmode="group",
    title="MDR Prevalence by Gender (%)",
    labels={"PERCENT": "Percentage (%)"},
    color_discrete_map={"MDR+": "#d62728", "MDR-": "#2ca02c"}
)

fig_gender.update_layout(template="plotly_white")
fig_gender.show()

# ---- Print key numbers ----
print("üìå Key population-level statistics")
print(
    f"Total isolates: {df_pop.shape[0]}\n"
    f"MDR prevalence: {(df_pop['MDR']=='MDR+').mean()*100:.1f}%\n"
    f"ESBL prevalence: {(df_pop['ESBL']=='ESBL+').mean()*100:.1f}%"
)

üìå Key population-level statistics
Total isolates: 218
MDR prevalence: 73.4%
ESBL prevalence: 25.2%


# ============================================================
# STEP 5A ‚Äî Co-resistance pattern analysis
# ============================================================

In [None]:
# ---- Prepare binary resistance matrix (R vs non-R) ----
# Resistant = 1, Sensitive/Intermediate = 0
df_binary = df_encoded.copy()

for col in antibiotic_cols:
    df_binary[col] = (df_binary[col] == 1.0).astype(int)

# ---- Compute correlation matrix ----
corr_matrix = df_binary[antibiotic_cols].corr()

# ---- Plot heatmap ----
fig_corr = px.imshow(
    corr_matrix,
    text_auto=".2f",
    color_continuous_scale="RdBu_r",
    zmin=-1,
    zmax=1,
    title="Antibiotic Co-Resistance Correlation Heatmap"
)

fig_corr.update_layout(
    xaxis_title="Antibiotic",
    yaxis_title="Antibiotic",
    template="plotly_white"
)

fig_corr.show()

# ---- Identify strongest co-resistance pairs ----
corr_pairs = (
    corr_matrix
        .where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
        .stack()
        .reset_index()
)

corr_pairs.columns = ["Antibiotic_1", "Antibiotic_2", "Correlation"]

top_pairs = corr_pairs.sort_values("Correlation", ascending=False).head(10)

print("üîó Top co-resistance pairs:")
display(top_pairs)

üîó Top co-resistance pairs:


Unnamed: 0,Antibiotic_1,Antibiotic_2,Correlation
1,AMP,TIC,0.987842
63,CEFA,CFM,0.95781
147,NOR,OF,0.953741
144,CIP,OF,0.944446
143,CIP,NOR,0.935432
88,CFM,CTRI,0.885012
65,CEFA,CTRI,0.847674
5,AMP,CFM,0.820379
3,AMP,CEFA,0.81068
36,TIC,CFM,0.808189


# ============================================================
# STEP 5B ‚Äî ESBL vs non-ESBL resistance comparison
# ============================================================

In [None]:
# ---- Prepare data (long format, resistant only) ----
df_esbl = df_long.copy()

# Label ESBL status
df_esbl["ESBL_STATUS"] = df_esbl["ESBL"].str.upper().replace({
    "ESBL+": "ESBL+",
    "YES": "ESBL+",
    "ESBL-": "ESBL-",
    "NO": "ESBL-"
})

# Keep only Resistant vs Non-resistant
df_esbl["IS_RESISTANT"] = (df_esbl["RESISTANCE_SCORE"] == 1.0).astype(int)

# ---- Calculate resistance % per antibiotic per ESBL group ----
esbl_summary = (
    df_esbl
        .groupby(["ANTIBIOTIC", "ESBL_STATUS"])["IS_RESISTANT"]
        .mean()
        .reset_index()
)

esbl_summary["PERCENT_RESISTANT"] = esbl_summary["IS_RESISTANT"] * 100

# ---- Compute difference (ESBL+ minus ESBL-) ----
diff_table = (
    esbl_summary
        .pivot(index="ANTIBIOTIC", columns="ESBL_STATUS", values="PERCENT_RESISTANT")
        .fillna(0)
)

diff_table["DIFFERENCE_%"] = diff_table.get("ESBL+", 0) - diff_table.get("ESBL-", 0)
diff_table = diff_table.sort_values("DIFFERENCE_%", ascending=False).reset_index()

# ---- Plot grouped bar chart ----
fig_esbl = px.bar(
    esbl_summary,
    x="ANTIBIOTIC",
    y="PERCENT_RESISTANT",
    color="ESBL_STATUS",
    barmode="group",
    title="Antibiotic Resistance in ESBL vs Non-ESBL Isolates",
    labels={"PERCENT_RESISTANT": "% Resistant"},
    color_discrete_map={"ESBL+": "#d62728", "ESBL-": "#2ca02c"}
)

fig_esbl.update_layout(
    xaxis_tickangle=-45,
    yaxis_range=[0, 100],
    template="plotly_white"
)

fig_esbl.show()

# ---- Display difference table ----
print("üìä Resistance difference (ESBL+ minus ESBL-):")
display(diff_table.round(2))

üìä Resistance difference (ESBL+ minus ESBL-):


ESBL_STATUS,ANTIBIOTIC,CARB YES,ESBL+,ESBL-,INT,NAN,YES (CARB),DIFFERENCE_%
0,CAZ,100.0,100.0,0.0,0.0,100.0,100.0,100.0
1,CTRI,100.0,100.0,42.14,100.0,100.0,100.0,57.86
2,CIP,100.0,90.91,35.71,50.0,100.0,100.0,55.19
3,NOR,100.0,89.09,35.0,0.0,100.0,100.0,54.09
4,OF,100.0,89.09,35.71,50.0,100.0,94.74,53.38
5,CFM,100.0,100.0,50.71,100.0,100.0,100.0,49.29
6,CEFA,100.0,100.0,53.57,100.0,100.0,100.0,46.43
7,TIC,100.0,100.0,60.71,100.0,100.0,100.0,39.29
8,AMP,100.0,100.0,61.43,100.0,100.0,100.0,38.57
9,AMC,100.0,47.27,10.0,0.0,0.0,89.47,37.27


# ============================================================
# STEP 5C ‚Äî MDR structure & dominant resistance combinations
# ============================================================

In [None]:
# ------------------------------------------------------------
# Normalize MDR labels (robust)
# ------------------------------------------------------------
df_profiles = df_encoded.copy()

df_profiles["MDR_NORM"] = (
    df_profiles["MDR"]
        .astype(str)
        .str.upper()
        .replace({
            "YES": "MDR+",
            "NO": "MDR-",
            "MDR+": "MDR+",
            "MDR-": "MDR-"
        })
)

# ------------------------------------------------------------
# 1Ô∏è‚É£ Antibiotics driving MDR
# ------------------------------------------------------------
df_mdr_long = df_long[df_long["MDR"] == "MDR+"].copy()

mdr_drivers = (
    df_mdr_long
        .assign(IS_RESISTANT=lambda x: (x["RESISTANCE_SCORE"] == 1.0).astype(int))
        .groupby("ANTIBIOTIC")["IS_RESISTANT"]
        .mean()
        .reset_index()
)

mdr_drivers["PERCENT_RESISTANT"] = mdr_drivers["IS_RESISTANT"] * 100
mdr_drivers = mdr_drivers.sort_values("PERCENT_RESISTANT", ascending=False)

fig_mdr_drivers = px.bar(
    mdr_drivers,
    x="ANTIBIOTIC",
    y="PERCENT_RESISTANT",
    color="PERCENT_RESISTANT",
    color_continuous_scale="Reds",
    title="Antibiotics Driving MDR (Resistance Frequency in MDR+ Isolates)",
    labels={"PERCENT_RESISTANT": "% Resistant among MDR isolates"}
)

fig_mdr_drivers.update_layout(
    xaxis_tickangle=-45,
    yaxis_range=[0, 100],
    template="plotly_white"
)

fig_mdr_drivers.show()

# ------------------------------------------------------------
# 2Ô∏è‚É£ MDR resistance profiles (core result)
# ------------------------------------------------------------
def resistance_profile(row):
    resistant_abx = [
        abx for abx in antibiotic_cols
        if row[abx] == 1.0
    ]
    return ",".join(sorted(resistant_abx)) if resistant_abx else "None"

df_profiles["RESISTANCE_PROFILE"] = df_profiles.apply(resistance_profile, axis=1)

mdr_profiles = (
    df_profiles[df_profiles["MDR_NORM"] == "MDR+"]
        .groupby("RESISTANCE_PROFILE")
        .size()
        .reset_index(name="COUNT")
        .sort_values("COUNT", ascending=False)
        .head(10)
)

print("üî¨ Top MDR resistance profiles (dominant combinations):")
display(mdr_profiles)

# ------------------------------------------------------------
# 3Ô∏è‚É£ Resistance burden comparison (MDR vs non-MDR)
# ------------------------------------------------------------
df_profiles["R_COUNT_COMPUTED"] = df_profiles[antibiotic_cols].sum(axis=1)

fig_burden = px.box(
    df_profiles,
    x="MDR_NORM",
    y="R_COUNT_COMPUTED",
    points="all",
    title="Resistance Burden in MDR vs Non-MDR Isolates",
    labels={"R_COUNT_COMPUTED": "Number of Resistant Antibiotics"},
    color="MDR_NORM",
    color_discrete_map={"MDR+": "#d62728", "MDR-": "#2ca02c"}
)

fig_burden.update_layout(template="plotly_white")
fig_burden.show()

# ------------------------------------------------------------
# 4Ô∏è‚É£ Key MDR structure statistics (text-ready)
# ------------------------------------------------------------
print("üìå MDR structural summary")
print(
    f"Total isolates: {df_profiles.shape[0]}\n"
    f"MDR isolates: {(df_profiles['MDR_NORM']=='MDR+').sum()}\n"
    f"Median resistant antibiotics (MDR+): "
    f"{df_profiles[df_profiles['MDR_NORM']=='MDR+']['R_COUNT_COMPUTED'].median():.0f}\n"
    f"Median resistant antibiotics (MDR-): "
    f"{df_profiles[df_profiles['MDR_NORM']=='MDR-']['R_COUNT_COMPUTED'].median():.0f}"
)

üî¨ Top MDR resistance profiles (dominant combinations):


Unnamed: 0,RESISTANCE_PROFILE,COUNT
45,"AMP,CEFA,CFM,CTRI,NA,TIC",20
40,"AMP,CEFA,CFM,CIP,CTRI,NA,NOR,OF,TIC",16
30,"AMP,CAZ,CEFA,CFM,CIP,CTRI,NA,NOR,OF,TIC",15
38,"AMP,CEFA,CFM,CIP,CTRI,GEN,NA,NOR,OF,TIC",9
6,"AMC,AMP,CAZ,CEFA,CFM,CIP,CTRI,CX,ERT,NA,NOR,OF...",8
27,"AMP,CAZ,CEFA,CFM,CIP,CTRI,GEN,NA,NOR,OF,TIC",8
10,"AMC,AMP,CAZ,CEFA,CFM,CIP,CTRI,CX,NA,NOR,OF,PIP...",6
49,"AMP,CIP,NA,NOR,OF,TIC",5
0,"AK,AMC,AMP,CAZ,CEFA,CFM,CIP,CTRI,CX,ERT,GEN,NA...",5
20,"AMC,AMP,CEFA,CFM,CIP,CX,NA,NOR,OF,TIC",4


üìå MDR structural summary
Total isolates: 218
MDR isolates: 160
Median resistant antibiotics (MDR+): 10
Median resistant antibiotics (MDR-): 1


# ============================================================
# STEP 5D ‚Äî Auto-generated clinical & surveillance summary
# ============================================================

In [None]:
# Basic counts
total_isolates = df.shape[0]
mdr_rate = (df["MDR"].str.upper() == "YES").mean() * 100
esbl_rate = (df["ESBL"].str.upper() == "YES").mean() * 100

high_mar_rate = (df["MAR_INDEX"] > 0.2).mean() * 100
median_mar = df["MAR_INDEX"].median()

# Most resistant antibiotics (top 3)
top_resistant = (
    df_long[df_long["RESISTANCE_SCORE"] == 1.0]
        .groupby("ANTIBIOTIC")
        .size()
        .sort_values(ascending=False)
        .head(3)
        .index
        .tolist()
)

# MDR profile summary
top_profile = mdr_profiles.iloc[0]["RESISTANCE_PROFILE"]
top_profile_count = mdr_profiles.iloc[0]["COUNT"]

# Print summary text
print("üßæ AUTO-GENERATED AMR SURVEILLANCE SUMMARY\n")

print(
    f"A total of {total_isolates} clinical isolates were analyzed. "
    f"Multidrug resistance (MDR) was observed in {mdr_rate:.1f}% of isolates, "
    f"while extended-spectrum beta-lactamase (ESBL) production was detected in "
    f"{esbl_rate:.1f}% of isolates.\n"
)

print(
    f"The median multiple antibiotic resistance (MAR) index was {median_mar:.2f}, "
    f"with {high_mar_rate:.1f}% of isolates exceeding the high-risk MAR threshold (>0.2), "
    f"indicating substantial resistance burden in the population.\n"
)

print(
    f"The antibiotics exhibiting the highest resistance frequencies were "
    f"{', '.join(top_resistant)}. "
    f"MDR isolates demonstrated complex resistance architectures, predominantly "
    f"involving concurrent resistance to multiple beta-lactams and fluoroquinolones.\n"
)

print(
    f"The most frequent MDR resistance profile involved resistance to "
    f"{top_profile}, observed in {top_profile_count} isolates. "
    f"Overall, MDR isolates exhibited a markedly higher resistance burden "
    f"compared to non-MDR isolates, underscoring the need for enhanced "
    f"antimicrobial stewardship and continuous surveillance."
)

üßæ AUTO-GENERATED AMR SURVEILLANCE SUMMARY

A total of 218 clinical isolates were analyzed. Multidrug resistance (MDR) was observed in 73.4% of isolates, while extended-spectrum beta-lactamase (ESBL) production was detected in 25.2% of isolates.

The median multiple antibiotic resistance (MAR) index was 0.50, with 72.9% of isolates exceeding the high-risk MAR threshold (>0.2), indicating substantial resistance burden in the population.

The antibiotics exhibiting the highest resistance frequencies were NA, AMP, TIC. MDR isolates demonstrated complex resistance architectures, predominantly involving concurrent resistance to multiple beta-lactams and fluoroquinolones.

The most frequent MDR resistance profile involved resistance to AMP,CEFA,CFM,CTRI,NA,TIC, observed in 20 isolates. Overall, MDR isolates exhibited a markedly higher resistance burden compared to non-MDR isolates, underscoring the need for enhanced antimicrobial stewardship and continuous surveillance.
