**Table 1**
---

In [48]:
import pandas as pd


def format_continuous(grp_data):
    """Render a numeric series as ``mean (SD); median [Q1-Q3]``.

    Entries equal to -1 are removed before any statistic is computed.
    Every number in the returned string is shown with one decimal place.
    """
    # Keep everything except entries equal to -1.
    kept = grp_data.loc[grp_data != -1]

    stats = {
        "mean": kept.mean(),
        "std": kept.std(),
        "median": kept.median(),
        "q1": kept.quantile(0.25),
        "q3": kept.quantile(0.75),
    }
    return "{mean:.1f} ({std:.1f}); {median:.1f} [{q1:.1f}-{q3:.1f}]".format(**stats)


def summarize_table1_with_n(df, groupby_col, continuous, categorical, group_names=None):
    """Build a descriptive "Table 1" with one column per group.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data; it is only read, never modified.
    groupby_col : str
        Column whose distinct non-null values define the groups. Groups
        appear as output columns in order of first appearance in ``df``.
    continuous : list of str
        Columns summarised with ``format_continuous``; one output row each,
        labelled with the column name.
    categorical : list of str
        Columns summarised as ``n (%)``; one output row per distinct
        non-null level, labelled ``"<column> = <level>"``. Only NaN/None is
        treated as missing here, so a sentinel code such as -1 is reported
        as a level of its own.
    group_names : dict, optional
        Maps a group value to its display name. Defaults to
        ``{0: "TB Negative", 1: "TB Positive"}``. Groups without an entry
        keep their raw value as the name.

    Returns
    -------
    pandas.DataFrame
        Formatted strings indexed by row label, with columns named
        ``"<group name> (N=<group size>)"``. N counts every row of the
        group, whereas each percentage uses the group's non-missing values
        of that categorical column as its denominator.
    """
    if group_names is None:
        group_names = {0: "TB Negative", 1: "TB Positive"}

    groups = df[groupby_col].dropna().unique()
    group_sizes = df[groupby_col].value_counts().to_dict()

    # Filter each group once instead of once per variable and level.
    subsets = {grp: df[df[groupby_col] == grp] for grp in groups}

    table = {}

    for col in continuous:
        table[col] = {grp: format_continuous(sub[col]) for grp, sub in subsets.items()}

    for col in categorical:
        for cat in df[col].dropna().unique():
            row = {}
            for grp, sub in subsets.items():
                values = sub[col]
                count = (values == cat).sum()
                # Denominator is the number of non-missing values only. The
                # previous isin(...) + (!= cat) sum also counted NaN rows,
                # because NaN != cat evaluates to True.
                total = values.notna().sum()
                pct = (count / total * 100) if total else 0
                row[grp] = f"{count} ({pct:.1f}%)"
            table[f"{col} = {cat}"] = row

    df_out = pd.DataFrame.from_dict(table, orient='index')
    df_out.columns = [
        f"{group_names.get(grp, grp)} (N={group_sizes.get(grp, 0)})"
        for grp in df_out.columns
    ]

    return df_out


# Load the normalized analysis dataset.
# NOTE(review): this is an absolute, user-specific path, so the cell only runs
# on a machine with this exact directory layout; consider a path relative to
# the notebook or a configurable data directory.
df_main = pd.read_csv('/Users/tylerstepaniak/Desktop/Jupyter Area/capstone-project/df_main_normalized.csv')
# Summarise Age and BMI as continuous rows and HIV Status as categorical rows,
# with one output column per 'TB Status' group.
table1_df = summarize_table1_with_n(df_main, 'TB Status', ['Age', 'BMI'], ['HIV Status'])


# Replace the raw row keys with display labels. Categorical rows arrive as
# "<column> = <level>", so these keys must match the stored level values
# exactly; rename() silently ignores keys that are not in the index.
table1_df = table1_df.rename(index={
    'Age': 'Age (years)',
    'BMI': 'Body Mass Index (kg/m²)',
    'HIV Status = 1': 'HIV Positive',
    'HIV Status = 0': 'HIV Negative',
    'HIV Status = -1': 'HIV Status Unknown'
})


# Describe, per row, which summary format its cells use.
summary_labels = {
    'Age (years)': 'mean (SD); median [IQR]',
    'Body Mass Index (kg/m²)': 'mean (SD); median [IQR]',
    'HIV Positive': 'n (%)',
    'HIV Negative': 'n (%)',
    'HIV Status Unknown': 'n (%)'
}
# Rows without an entry in summary_labels get an empty string.
summary_stat_col = [summary_labels.get(var, '') for var in table1_df.index]
# FIXME: decide whether this column should stay last (as assigned here) or sit
# directly after the row labels, before the first group column.
table1_df["Summary Statistic"] = summary_stat_col


# Export and display the table.
# No p-values are computed because Table 1 is purely descriptive; they can be
# added later if needed.
# The workbook is written to a relative path, i.e. the current working directory.
table1_df.to_excel("output_table1.xlsx")
table1_df

Unnamed: 0,TB Positive (N=156),TB Negative (N=76),Summary Statistic
Age (years),34.7 (10.6); 34.0 [26.0-42.0],39.3 (12.2); 39.0 [30.8-48.2],mean (SD); median [IQR]
Body Mass Index (kg/m²),19.9 (3.7); 19.3 [17.5-21.5],23.6 (4.6); 23.0 [20.9-25.0],mean (SD); median [IQR]
HIV Negative,80 (51.3%),44 (57.9%),n (%)
HIV Positive,67 (42.9%),32 (42.1%),n (%)
HIV Status Unknown,9 (5.8%),0 (0.0%),n (%)
