In [22]:
import pandas as pd
import numpy as np
from scipy import stats
from datetime import datetime
# Date stamp in YYYYMMDD form, taken from the clock when this cell runs.
timestamp = f"{datetime.now():%Y%m%d}"

In [None]:
# Load the dataset, derive the calendar year from the `date` column, then drop
# the columns that are not used further on (including `date` itself).
df = (
    pd.read_csv(
        "/Users/ayo/Desktop/_GSAIS_/Research/OralHealth_tokyo/paper_analysis/data/data_OnlyAbuse_N1235.csv",
        index_col=0,
    )
    .assign(Year=lambda frame: pd.to_datetime(frame["date"]).dt.year.astype(int))
    .drop(
        columns=[
            'abuse_1',
            "abuse_num",
            "date",
            "instruction_detail",
            "instruction",
            "memo",
            "dentists",
            "dental_hygienist",
        ]
    )
)

df.head()

Unnamed: 0,No_All,CGC,sex,age_year,age_month,age,abuse,U17,U16,U15,...,Present_Perm_Teeth,Present_Baby_Teeth,Healthy_Rate,C0_Count,Care_Index,Trauma_Count,RDT_Count,UTN_Score,life_stage,Year
2,3,tokyo,Female,15,1,15.083333,Physical Abuse,0.0,0.0,0.0,...,28.0,0.0,82.1,2.0,100.0,0.0,0.0,0.0,Adolescence,2016
3,4,tokyo,Female,14,5,14.416667,Emotional Abuse,0.0,0.0,0.0,...,28.0,0.0,100.0,0.0,,0.0,0.0,0.0,Adolescence,2016
4,5,tokyo,Female,15,10,15.833333,Neglect,3.0,1.0,0.0,...,28.0,0.0,60.7,3.0,62.5,0.0,0.0,37.5,Adolescence,2016
7,8,tokyo,Female,15,0,15.0,Physical Abuse,0.0,0.0,0.0,...,28.0,0.0,100.0,0.0,,0.0,0.0,0.0,Adolescence,2016
8,9,tokyo,Female,17,0,17.0,Physical Abuse,2.0,2.0,0.0,...,28.0,0.0,53.6,4.0,88.9,0.0,0.0,11.1,Adolescence,2016


In [24]:
# Output directory for result files (the Table 1 CSV is written under this path).
res_path = '/Users/ayo/Desktop/_GSAIS_/Research/OralHealth_tokyo/paper_analysis/result'

In [27]:
# Level order for each categorical variable.
abuse_order = ["Physical Abuse", "Neglect", "Emotional Abuse", "Sexual Abuse"]
occlusal_order = [
    "Normal Occlusion",
    "Crowding",
    "Anterior Crossbite",
    "Open Bite",
    "Maxillary Protrusion",
    "Crossbite",
    "Others",
]
need_treated_order = ["No Treatment Required", "Treatment Required"]
emergency_order = ["Urgent Treatment Required"]
gingivitis_order = ["No Gingivitis", "Gingivitis"]
oral_clean_order = ["Poor", "Fair", "Good"]
habits_order = [
    "None",
    "Digit Sucking",
    "Nail biting",
    "Tongue Thrusting",
    "Smoking",
    "Others",
]

# Column name -> level order. Columns are converted in this order.
ordered_levels = {
    'abuse': abuse_order,
    'occlusalRelationship': occlusal_order,
    'needTOBEtreated': need_treated_order,
    'emergency': emergency_order,
    'gingivitis': gingivitis_order,
    'OralCleanStatus': oral_clean_order,
    'habits': habits_order,
}

# Work on a copy, then rebind `df` to the converted frame.
# Note: pd.Categorical turns any value not listed in `categories` into NaN.
target_df = df.copy()
for column_name, level_order in ordered_levels.items():
    target_df[column_name] = pd.Categorical(
        target_df[column_name], categories=level_order, ordered=True
    )

df = target_df

print("Columns converted to ordered categories.")

Columns converted to ordered categories.


In [29]:
def _format_p_value(p_val):
    """Format a p-value for display in the table.

    Returns "n/a" when the value is missing or NaN, "<0.001" below that
    threshold, and a three-decimal string otherwise. The explicit NaN check
    matters because `nan >= 0.001` is False and would otherwise be shown as
    "<0.001".
    """
    if p_val is None or pd.isna(p_val):
        return "n/a"
    return f"{p_val:.3f}" if p_val >= 0.001 else "<0.001"


def _ordered_groups(group_series):
    """Return the observed group labels in display order.

    For a categorical column the declared category order is kept (only levels
    that actually occur are returned); any other dtype is sorted.
    """
    observed = group_series.dropna().unique()
    if isinstance(group_series.dtype, pd.CategoricalDtype):
        present = set(observed)
        return [level for level in group_series.cat.categories if level in present]
    return sorted(observed)


def generate_table1(df, group_col='abuse'):
    """
    Build a descriptive "Table 1" with one column per group and a p-value.

    - Numeric columns: one row with Mean ± SD (overall and per group), the
      overall Median [IQR], and a Kruskal-Wallis p-value across groups.
    - Other columns: a header row carrying a Chi-squared p-value, followed by
      one row per level (including "Missing") with n (%) overall and per group.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. Every column except `group_col` and 'No_All' is summarised.
    group_col : str, default 'abuse'
        Column defining the comparison groups. Rows with a missing group are
        counted in the overall N but belong to no group column.

    Returns
    -------
    pandas.DataFrame
        One row per numeric variable, and one header row plus one row per
        level for each non-numeric variable.

    Notes
    -----
    A p-value is reported as "n/a" when the test raises, returns NaN (e.g. a
    group has no non-missing values), or is degenerate (a contingency table
    with fewer than two rows or columns has zero degrees of freedom).
    """
    stats_list = []
    groups = _ordered_groups(df[group_col])
    # Filter each group once instead of re-filtering inside every inner loop.
    group_frames = {g: df[df[group_col] == g] for g in groups}
    n_groups = {g: len(sub_df) for g, sub_df in group_frames.items()}
    total_n = len(df)
    overall_header = 'Overall (N={})'.format(total_n)

    # Update group headers with sample size: "Abuse Type (N=...)"
    column_headers = {g: f"{g} (N={n_groups[g]})" for g in groups}

    for col in df.columns:
        if col == group_col or col in ['No_All']:  # Skip ID columns
            continue

        # 1. HANDLE CONTINUOUS DATA
        if pd.api.types.is_numeric_dtype(df[col]):
            values = df[col]
            row = {
                'Variable': col,
                'Statistic': 'Mean ± SD',
                overall_header: f"{values.mean():.1f} ± {values.std():.1f}",
                'Median [IQR]': (
                    f"{values.median():.1f} "
                    f"[{values.quantile(0.25):.1f} - {values.quantile(0.75):.1f}]"
                ),
            }

            # Group breakdown
            group_data = []
            for g in groups:
                sub = group_frames[g][col].dropna()
                group_data.append(sub)
                row[column_headers[g]] = f"{sub.mean():.1f} ± {sub.std():.1f}"

            # Statistical test (Kruskal-Wallis as dental data is often non-normal).
            # Best effort: a test that cannot be computed is shown as "n/a".
            try:
                _, p_val = stats.kruskal(*group_data)
                row['p-value'] = _format_p_value(p_val)
            except Exception:
                row['p-value'] = "n/a"

            stats_list.append(row)

        # 2. HANDLE CATEGORICAL DATA
        else:
            # Main variable header row
            row_header = {
                'Variable': str(col).upper(),
                'Statistic': 'n (%)',
                overall_header: '',
                'Median [IQR]': '',
                'p-value': ''
            }

            # p-value for the whole variable (Chi-squared on level x group counts)
            contingency = pd.crosstab(df[col], df[group_col])
            if min(contingency.shape) < 2:
                # Fewer than two levels or groups: zero degrees of freedom, so
                # the test would return a meaningless p = 1.
                row_header['p-value'] = "n/a"
            else:
                try:
                    _, p_val, _, _ = stats.chi2_contingency(contingency)
                    row_header['p-value'] = _format_p_value(p_val)
                except Exception:
                    row_header['p-value'] = "n/a"

            stats_list.append(row_header)

            # Level-specific rows
            levels = df[col].value_counts(dropna=False).index
            for lv in levels:
                is_missing = pd.isna(lv)
                lv_name = "Missing" if is_missing else str(lv)
                lv_row = {'Variable': f"  - {lv_name}", 'Statistic': ''}

                # Overall %
                cnt = df[col].isna().sum() if is_missing else (df[col] == lv).sum()
                lv_row[overall_header] = f"{cnt} ({cnt/total_n*100:.1f}%)"

                # Group %
                for g in groups:
                    sub_col = group_frames[g][col]
                    g_cnt = sub_col.isna().sum() if is_missing else (sub_col == lv).sum()
                    lv_row[column_headers[g]] = f"{g_cnt} ({g_cnt/n_groups[g]*100:.1f}%)"

                stats_list.append(lv_row)

    return pd.DataFrame(stats_list)

# Build Table 1 from the prepared frame and write it to a date-stamped CSV
# inside the results directory.
timestamp = f"{datetime.now():%Y%m%d}"
table1 = generate_table1(df)
table1.to_csv(
    f'{res_path}/Table1_Descriptive_Stats_{timestamp}.csv',
    index=False,
)

# Preview the first 20 rows of the table.
table1.head(20)

Unnamed: 0,Variable,Statistic,Overall (N=1235),Median [IQR],p-value,Emotional Abuse (N=201),Neglect (N=328),Physical Abuse (N=646),Sexual Abuse (N=60)
0,CGC,n (%),,,0.109,,,,
1,- tokyo,,1057 (85.6%),,,166 (82.6%),284 (86.6%),557 (86.2%),50 (83.3%)
2,- wakamatsu,,131 (10.6%),,,20 (10.0%),33 (10.1%),69 (10.7%),9 (15.0%)
3,- itabashi,,47 (3.8%),,,15 (7.5%),11 (3.4%),20 (3.1%),1 (1.7%)
4,SEX,n (%),,,<0.001,,,,
5,- Female,,685 (55.5%),,,120 (59.7%),171 (52.1%),336 (52.0%),58 (96.7%)
6,- Male,,550 (44.5%),,,81 (40.3%),157 (47.9%),310 (48.0%),2 (3.3%)
7,age_year,Mean ± SD,9.8 ± 4.1,10.0 [6.0 - 13.0],<0.001,9.7 ± 4.5,8.0 ± 4.1,10.5 ± 3.7,11.1 ± 4.1
8,age_month,Mean ± SD,5.3 ± 3.5,5.0 [2.0 - 8.0],0.040,4.6 ± 3.4,5.4 ± 3.4,5.4 ± 3.6,5.4 ± 3.6
9,age,Mean ± SD,10.2 ± 4.1,10.7 [6.8 - 13.5],<0.001,10.1 ± 4.4,8.5 ± 4.1,11.0 ± 3.7,11.5 ± 4.2
