In [10]:
import pandas as pd
import numpy as np
from scipy import stats
from statsmodels.stats.multitest import multipletests
from tabulate import tabulate

# Load the attrition workbook into a DataFrame.
file_name = '流失和追踪到.xlsx'
data = pd.read_excel(file_name)

# Display labels for the codes stored in the "Group" column.
group_names = {0: "追踪到的被试", 1: "流失的被试"}

# Continuous variables: pretest task measures first, then background measures.
pretest_measures = [
    "Incongruent_Pretest",
    "Interference_Effect_Pretest",
    "Backward_Span_Pretest",
    "1750ms_Pretest",
    "750ms_Pretest",
    "Switching_Cost_Pretest",
    "NoGo_Accuracy_Pretest",
]
background_measures = ["Age", "Education_Years", "BMI", "SES"]
continuous_variables = pretest_measures + background_measures

# Categorical variables compared between the two groups.
categorical_columns = [
    "Gender", "Ethnic", "Residence", "Only_Child",
    "Smoking_Status", "Alcohol_Consumption", "Employed", "Marital_Status",
]

# Split the rows by their "Group" code.
grouped = data.groupby(by="Group")

# ----------------------
# Continuous variables: descriptives, one-way ANOVA and FDR correction
# ----------------------
# Analyse only the variables that exist in the sheet, and build the descriptive
# cells and the test statistics in the same pass. Attaching statistics by position
# to a merged table is unsafe: an outer merge may reorder rows (pandas >= 2.2 sorts
# the keys lexicographically), which would pair F / p values with the wrong variable.
analysed_variables = [var for var in continuous_variables if var in data.columns]
skipped_variables = [var for var in continuous_variables if var not in data.columns]
if skipped_variables:
    print(f"警告：数据中缺少以下连续变量，已跳过：{skipped_variables}")

continuous_rows = []  # one dict per variable: name + "mean ± sd" cell per group
f_values = []         # F statistic per analysed variable (NaN when not testable)
p_values = []         # raw p-value per analysed variable (NaN when not testable)

for var in analysed_variables:
    # Non-missing observations of this variable for each group code in group_names.
    samples = {
        group: data.loc[data['Group'] == group, var].dropna()
        for group in group_names
    }

    row = {"Variable": var}
    for group, group_name in group_names.items():
        row[group_name] = f"{samples[group].mean():.2f} ± {samples[group].std():.2f}"
    continuous_rows.append(row)

    # The test needs at least two observations in every group.
    if all(len(sample) > 1 for sample in samples.values()):
        f_stat, p = stats.f_oneway(*samples.values())
    else:
        f_stat, p = np.nan, np.nan
    f_values.append(f_stat)
    p_values.append(p)

continuous_summary_df = pd.DataFrame(
    continuous_rows, columns=["Variable", *group_names.values()]
)

# Benjamini-Hochberg FDR correction over the testable variables only. NaN p-values
# are not excluded by multipletests: they count as extra tests and can propagate
# NaN into every corrected value, so they are masked out and reported as 'NA'.
raw_p = np.asarray(p_values, dtype=float)
testable = ~np.isnan(raw_p)
reject = np.zeros(raw_p.shape, dtype=bool)
pvals_corrected = np.full(raw_p.shape, np.nan)
if testable.any():
    rejected, corrected = multipletests(raw_p[testable], method='fdr_bh')[:2]
    reject[testable] = rejected
    pvals_corrected[testable] = corrected

# Append the formatted statistics; rows and statistics share the same order by construction.
continuous_summary_df['F值'] = [f"{f:.2f}" if not np.isnan(f) else 'NA' for f in f_values]
continuous_summary_df['校正p值'] = [f"{p:.4f}" if not np.isnan(p) else 'NA' for p in pvals_corrected]

# ----------------------
# Categorical variables: frequencies, chi-square test and FDR correction
# ----------------------
categorical_summary = []    # one [label, count cell per group] row per category level
row_variables = []          # source variable of each row, parallel to categorical_summary
chi2_values = []            # chi-square statistic per variable (NaN when not testable)
p_values_categorical = []   # raw p-value per variable (NaN when not testable)

# Fetch each group's rows once instead of once per category level.
group_frames = {group: grouped.get_group(group) for group in group_names}

for column in categorical_columns:
    # Chi-square test on the variable x Group contingency table; it needs at
    # least two levels on each axis.
    contingency = pd.crosstab(data[column], data['Group'])
    chi2, p = np.nan, np.nan
    if contingency.shape[0] > 1 and contingency.shape[1] > 1:
        try:
            chi2, p = stats.chi2_contingency(contingency)[:2]
        except ValueError as err:
            # Keep the best-effort behaviour (the variable is reported as 'NA'),
            # but say why instead of silently swallowing every exception.
            print(f"警告：变量 {column} 的卡方检验失败：{err}")
            chi2, p = np.nan, np.nan
    chi2_values.append(chi2)
    p_values_categorical.append(p)

    # One frequency row per observed (non-missing) category level.
    for category in data[column].dropna().unique():
        row = [f"{column} - {category}"]
        for group_frame in group_frames.values():
            count = int((group_frame[column] == category).sum())
            # The denominator is the whole group, including rows where this variable is missing.
            total = len(group_frame)
            proportion = (count / total) * 100 if total > 0 else 0
            row.append(f"{count} ({proportion:.1f}%)")
        categorical_summary.append(row)
        row_variables.append(column)

# Benjamini-Hochberg FDR correction over the testable variables only. NaN p-values
# are not excluded by multipletests: they count as extra tests and can propagate
# NaN into every corrected value, so they are masked out and reported as 'NA'.
raw_p_cat = np.asarray(p_values_categorical, dtype=float)
testable_cat = ~np.isnan(raw_p_cat)
reject_cat = np.zeros(raw_p_cat.shape, dtype=bool)
pvals_corrected_cat = np.full(raw_p_cat.shape, np.nan)
if testable_cat.any():
    rejected_cat, corrected_cat = multipletests(raw_p_cat[testable_cat], method='fdr_bh')[:2]
    reject_cat[testable_cat] = rejected_cat
    pvals_corrected_cat[testable_cat] = corrected_cat

# Frequency table: label column plus one column per group.
categorical_headers = ["变量"] + list(group_names.values()) + ["卡方值", "校正p值"]
categorical_summary_df = pd.DataFrame(categorical_summary, columns=categorical_headers[:3])

# Formatted statistics per variable.
stats_dict = {
    var: {
        'chi2': f"{chi2_value:.2f}" if not np.isnan(chi2_value) else 'NA',
        'p_corr': f"{p_corr:.4f}" if not np.isnan(p_corr) else 'NA',
    }
    for var, chi2_value, p_corr in zip(categorical_columns, chi2_values, pvals_corrected_cat)
}

# Attach the statistics to every row of the variable they belong to. The variable
# is tracked explicitly per row rather than recovered by splitting the display label.
categorical_summary_df['卡方值'] = [stats_dict[var]['chi2'] for var in row_variables]
categorical_summary_df['校正p值'] = [stats_dict[var]['p_corr'] for var in row_variables]

# ----------------------
# Print the final tables
# ----------------------
# The statistic columns hold pre-formatted strings such as "0.7080" or "NA".
# By default tabulate parses numeric-looking strings back into numbers, which
# drops trailing zeros ("0.708") and right-aligns those columns despite
# stralign="left". Number parsing is disabled so cells print exactly as formatted.
print("连续变量比较（均值 ± 标准差，带统计检验）：")
print(tabulate(continuous_summary_df, headers="keys", tablefmt="grid",
               stralign="left", disable_numparse=True))

print("\n分类变量比较（频数，百分比，带统计检验）：")
print(tabulate(categorical_summary_df, headers="keys", tablefmt="grid",
               stralign="left", disable_numparse=True))

连续变量比较（均值 ± 标准差，带统计检验）：
+----+-----------------------------+-----------------+-----------------+-------+-----------+
|    | Variable                    | 追踪到的被试    | 流失的被试      |   F值 |   校正p值 |
|  0 | Incongruent_Pretest         | 726.75 ± 173.75 | 740.65 ± 166.14 |  0.21 |    0.708  |
+----+-----------------------------+-----------------+-----------------+-------+-----------+
|  1 | Interference_Effect_Pretest | -51.50 ± 93.88  | -37.60 ± 74.38  |  0.82 |    0.5907 |
+----+-----------------------------+-----------------+-----------------+-------+-----------+
|  2 | Backward_Span_Pretest       | 4.87 ± 1.71     | 4.44 ± 1.24     |  2.43 |    0.5907 |
+----+-----------------------------+-----------------+-----------------+-------+-----------+
|  3 | 1750ms_Pretest              | 0.47 ± 0.30     | 0.42 ± 0.30     |  1.07 |    0.5907 |
+----+-----------------------------+-----------------+-----------------+-------+-----------+
|  4 | 750ms_Pretest               | 0.41 ± 0.31     | 0.36 ±

In [1]:
import numpy as np
from scipy.stats import chi2_contingency

# Contingency table: rows are completion status, columns are study arms
# (working-memory training, active control, treatment-as-usual).
groups = ["Working Memory", "Active Control", "Treatment-as-usual"]
observed = np.array([
    [32, 33, 29],   # completed
    [16, 15, 19],   # did not complete
])

# Chi-square test of independence between study arm and completion status.
chi2_stat, p_value, dof, expected = chi2_contingency(observed)

# Attrition rate (in percent) for each study arm.
attrition_counts = observed[1]
total_participants = observed.sum(axis=0)
attrition_rates = attrition_counts / total_participants * 100

# Report header with overall sample size and completion rate.
n_total = observed.sum()
n_completed = observed[0].sum()
print("临床试验流失分析报告")
print("="*40)
print(f"总样本量: {n_total} 参与者")
print(f"完成率: {n_completed/n_total:.1%}")

# Per-arm breakdown.
print("\n各组详细信息：")
for idx, arm in enumerate(groups):
    n_arm = total_participants[idx]
    n_lost = attrition_counts[idx]
    print(f"- {arm} (n={n_arm}):")
    print(f"  完成: {n_arm - n_lost} | 流失: {n_lost} ({attrition_rates[idx]:.1f}%)")

# Test statistics.
print("\n统计检验结果：")
print(f"卡方值(χ²): {chi2_stat:.3f}")
print(f"自由度(df): {dof}")
print(f"P值: {p_value:.4f}")

# Decision at the conventional 5% significance level.
alpha = 0.05
conclusion = (
    f"\n结论：三组间流失率存在统计学显著差异 (p < {alpha})"
    if p_value < alpha
    else f"\n结论：三组间流失率无统计学显著差异 (p ≥ {alpha})"
)
print(conclusion)

# Expected cell counts under independence.
print("\n期望频数表：")
print(np.round(expected, 2))

临床试验流失分析报告
总样本量: 144 参与者
完成率: 65.3%

各组详细信息：
- Working Memory (n=48):
  完成: 32 | 流失: 16 (33.3%)
- Active Control (n=48):
  完成: 33 | 流失: 15 (31.2%)
- Treatment-as-usual (n=48):
  完成: 29 | 流失: 19 (39.6%)

统计检验结果：
卡方值(χ²): 0.797
自由度(df): 2
P值: 0.6715

结论：三组间流失率无统计学显著差异 (p ≥ 0.05)

期望频数表：
[[31.33 31.33 31.33]
 [16.67 16.67 16.67]]
