In [19]:
import pandas as pd

# 文件路径
file_path = "/home/ubuntu/graduation-project/data/outcome_sex_v5.1/DeepSeek-R1-Distill-Qwen-1.5B_v5.1_sex_chinese_statistics.xlsx"

# 读取 Excel 文件
df = pd.read_excel(file_path)

# 输出列名
print(df)



  Sex_Chinese Gestational diabetes mellitus Type 1 diabetes Type 2 diabetes
0      Female                   668 (66.8%)     182 (18.2%)     202 (20.2%)
1        Male                   303 (30.3%)     778 (77.8%)     753 (75.3%)
2     Unknown                     29 (2.9%)       40 (4.0%)       45 (4.5%)


In [13]:
# 更新文件路径
file_path = "/home/ubuntu/graduation-project/Code/统计/性别/real-sex-v1.xlsx"

# 读取新的 Excel 文件
realdf = pd.read_excel(file_path)

# 输出列名
print(realdf)

  Unnamed: 0           T1DM             T2DM              GDM          Other  \
0      Total           4818           977880            22331           8534   
1     Female  2311 (47.97%)  465189 (47.57%)  22331 (100.00%)  4473 (52.41%)   
2       Male  2507 (52.03%)  512691 (52.43%)        0 (0.00%)  4061 (47.59%)   

               DM    Total  
0           20341  1033904  
1  16279 (80.03%)   510583  
2   4062 (19.97%)   523321  


In [18]:
real_observed = realdf[["T1DM", "T2DM", "GDM"]].values
print(real_observed)

# 模型生成数据
model_observed = df[['Gestational diabetes mellitus', 'Type 1 diabetes', 'Type 2 diabetes']].values
model_observed

[['4818' '977880' '22331']
 ['2311 (47.97%)' '465189 (47.57%)' '22331 (100.00%)']
 ['2507 (52.03%)' '512691 (52.43%)' '0 (0.00%)']]


array([['668 (66.8%)', '182 (18.2%)', '202 (20.2%)'],
       ['303 (30.3%)', '778 (77.8%)', '753 (75.3%)'],
       ['29 (2.9%)', '40 (4.0%)', '45 (4.5%)']], dtype=object)

In [20]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency
import re

# 1. 从真实数据和模型数据中提取纯数字
def extract_number(value):
    if isinstance(value, str) and '(' in value:
        return int(re.search(r'(\d+)\s*\(', value).group(1))
    return value

# 2. 处理真实数据
real_gender_counts = {}
# 真实数据中的疾病类型
diseases = ['T1DM', 'T2DM', 'GDM']
# 真实数据中的性别行索引
gender_indices = {'Female': 1, 'Male': 2}

for disease in diseases:
    real_gender_counts[disease] = {}
    for gender, idx in gender_indices.items():
        value = realdf.loc[idx, disease]
        real_gender_counts[disease][gender] = extract_number(value)

# 3. 处理模型生成数据
model_gender_counts = {}
# 模型数据中疾病类型的映射
disease_mapping = {
    'GDM': 'Gestational diabetes mellitus',
    'T1DM': 'Type 1 diabetes',
    'T2DM': 'Type 2 diabetes'
}
# 模型数据中的性别行索引
model_gender_indices = {'Female': 0, 'Male': 1}

for real_disease, model_disease in disease_mapping.items():
    model_gender_counts[real_disease] = {}
    for gender, idx in model_gender_indices.items():
        value = df.loc[idx, model_disease]
        model_gender_counts[real_disease][gender] = extract_number(value)

# 4. 对各种疾病类型分别进行卡方检验
results = {}
for disease in diseases:
    # 创建观测值表格
    observed = np.array([
        [real_gender_counts[disease]['Female'], real_gender_counts[disease]['Male']],
        [model_gender_counts[disease]['Female'], model_gender_counts[disease]['Male']]
    ])
    
    # 进行卡方检验
    chi2, p, dof, expected = chi2_contingency(observed)
    
    results[disease] = {
        'chi2': chi2,
        'p_value': p,
        'dof': dof,
        'expected': expected,
        'observed': observed
    }

# 5. 输出结果
for disease, result in results.items():
    print(f"\n卡方检验结果 - {disease}:")
    print(f"卡方统计量: {result['chi2']:.4f}")
    print(f"p值: {result['p_value']:.8f}")
    print(f"自由度: {result['dof']}")
    print("观测频数表:")
    print(f"  真实数据: Female={result['observed'][0][0]}, Male={result['observed'][0][1]}")
    print(f"  模型数据: Female={result['observed'][1][0]}, Male={result['observed'][1][1]}")
    
    # 判断结果显著性
    alpha = 0.05
    if result['p_value'] < alpha:
        print(f"结论: 在{alpha}显著性水平下，模型生成的性别分布与真实数据存在显著差异")
    else:
        print(f"结论: 在{alpha}显著性水平下，未发现模型生成的性别分布与真实数据存在显著差异")


卡方检验结果 - T1DM:
卡方统计量: 273.4072
p值: 0.00000000
自由度: 1
观测频数表:
  真实数据: Female=2311, Male=2507
  模型数据: Female=182, Male=778
结论: 在0.05显著性水平下，模型生成的性别分布与真实数据存在显著差异

卡方检验结果 - T2DM:
卡方统计量: 265.9541
p值: 0.00000000
自由度: 1
观测频数表:
  真实数据: Female=465189, Male=512691
  模型数据: Female=202, Male=753
结论: 在0.05显著性水平下，模型生成的性别分布与真实数据存在显著差异

卡方检验结果 - GDM:
卡方统计量: 7035.8875
p值: 0.00000000
自由度: 1
观测频数表:
  真实数据: Female=22331, Male=0
  模型数据: Female=668, Male=303
结论: 在0.05显著性水平下，模型生成的性别分布与真实数据存在显著差异


In [21]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency
import re

# 1. 从真实数据和模型数据中提取纯数字
def extract_number(value):
    if isinstance(value, str) and '(' in value:
        return int(re.search(r'(\d+)\s*\(', value).group(1))
    return value

# 2. 处理真实数据
real_gender_counts = {}
# 真实数据中的疾病类型
diseases = ['T1DM', 'T2DM', 'GDM']
# 真实数据中的性别行索引
gender_indices = {'Female': 1, 'Male': 2}

for disease in diseases:
    real_gender_counts[disease] = {}
    for gender, idx in gender_indices.items():
        value = realdf.loc[idx, disease]
        real_gender_counts[disease][gender] = extract_number(value)

# 3. 处理模型生成数据
model_gender_counts = {}
# 模型数据中疾病类型的映射
disease_mapping = {
    'GDM': 'Gestational diabetes mellitus',
    'T1DM': 'Type 1 diabetes',
    'T2DM': 'Type 2 diabetes'
}
# 模型数据中的性别行索引
model_gender_indices = {'Female': 0, 'Male': 1}

for real_disease, model_disease in disease_mapping.items():
    model_gender_counts[real_disease] = {}
    for gender, idx in model_gender_indices.items():
        value = df.loc[idx, model_disease]
        model_gender_counts[real_disease][gender] = extract_number(value)

# 4. 对各种疾病类型分别进行卡方检验
results = {}
for disease in diseases:
    # 创建观测值表格
    observed = np.array([
        [real_gender_counts[disease]['Female'], real_gender_counts[disease]['Male']],
        [model_gender_counts[disease]['Female'], model_gender_counts[disease]['Male']]
    ])
    
    # 进行卡方检验
    chi2, p, dof, expected = chi2_contingency(observed)
    
    results[disease] = {
        'chi2': chi2,
        'p_value': p,
        'dof': dof,
        'expected': expected,
        'observed': observed
    }

# 5. 输出结果
for disease, result in results.items():
    print(f"\n卡方检验结果 - {disease}:")
    print(f"卡方统计量: {result['chi2']:.4f}")
    print(f"p值: {result['p_value']:.4e}")  # 使用科学计数法
    print(f"自由度: {result['dof']}")
    print("观测频数表:")
    print(f"  真实数据: Female={result['observed'][0][0]}, Male={result['observed'][0][1]}")
    print(f"  模型数据: Female={result['observed'][1][0]}, Male={result['observed'][1][1]}")
    
    # 判断结果显著性
    alpha = 0.05
    if result['p_value'] < alpha:
        print(f"结论: 在{alpha}显著性水平下，模型生成的性别分布与真实数据存在显著差异")
    else:
        print(f"结论: 在{alpha}显著性水平下，未发现模型生成的性别分布与真实数据存在显著差异")


卡方检验结果 - T1DM:
卡方统计量: 273.4072
p值: 2.0527e-61
自由度: 1
观测频数表:
  真实数据: Female=2311, Male=2507
  模型数据: Female=182, Male=778
结论: 在0.05显著性水平下，模型生成的性别分布与真实数据存在显著差异

卡方检验结果 - T2DM:
卡方统计量: 265.9541
p值: 8.6442e-60
自由度: 1
观测频数表:
  真实数据: Female=465189, Male=512691
  模型数据: Female=202, Male=753
结论: 在0.05显著性水平下，模型生成的性别分布与真实数据存在显著差异

卡方检验结果 - GDM:
卡方统计量: 7035.8875
p值: 0.0000e+00
自由度: 1
观测频数表:
  真实数据: Female=22331, Male=0
  模型数据: Female=668, Male=303
结论: 在0.05显著性水平下，模型生成的性别分布与真实数据存在显著差异


In [22]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency
import re

# 1. 从真实数据和模型数据中提取纯数字
def extract_number(value):
    if isinstance(value, str) and '(' in value:
        return int(re.search(r'(\d+)\s*\(', value).group(1))
    return value

# 2. 处理真实数据
real_gender_counts = {}
# 真实数据中的疾病类型
diseases = ['T1DM', 'T2DM', 'GDM']
# 真实数据中的性别行索引
gender_indices = {'Female': 1, 'Male': 2}

for disease in diseases:
    real_gender_counts[disease] = {}
    for gender, idx in gender_indices.items():
        value = realdf.loc[idx, disease]
        real_gender_counts[disease][gender] = extract_number(value)

# 3. 处理模型生成数据
model_gender_counts = {}
# 模型数据中疾病类型的映射
disease_mapping = {
    'GDM': 'Gestational diabetes mellitus',
    'T1DM': 'Type 1 diabetes',
    'T2DM': 'Type 2 diabetes'
}
# 模型数据中的性别行索引
model_gender_indices = {'Female': 0, 'Male': 1}

for real_disease, model_disease in disease_mapping.items():
    model_gender_counts[real_disease] = {}
    for gender, idx in model_gender_indices.items():
        value = df.loc[idx, model_disease]
        model_gender_counts[real_disease][gender] = extract_number(value)

# 4. 对各种疾病类型分别进行卡方检验
results = {}
for disease in diseases:
    # 创建观测值表格
    observed = np.array([
        [real_gender_counts[disease]['Female'], real_gender_counts[disease]['Male']],
        [model_gender_counts[disease]['Female'], model_gender_counts[disease]['Male']]
    ])
    
    # 进行卡方检验
    chi2, p, dof, expected = chi2_contingency(observed)
    
    results[disease] = {
        'chi2': chi2,
        'p_value': p,
        'dof': dof,
        'expected': expected,
        'observed': observed
    }

# 5. 准备保存的数据
data_to_save = []
for disease, result in results.items():
    model_name = "DeepSeek-R1-Distill-Qwen-1.5B"  # 替换为你的模型名称
    male_p_value = result['p_value']
    female_p_value = result['p_value']  # 这里假设男女p值相同，因为卡方检验是对整体分布的检验
    data_to_save.append([model_name, f"{male_p_value:.4e}", f"{female_p_value:.4e}", disease])

# 6. 创建 DataFrame
df_to_save = pd.DataFrame(data_to_save, columns=['model', '男性_p值', '女性_p值', 'disease'])

# 7. 保存到 CSV 文件
output_file = "chi_square_results.csv"
df_to_save.to_csv(output_file, index=False, encoding='utf-8')

print(f"结果已保存到 {output_file}")

结果已保存到 chi_square_results.csv


In [24]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency
import re

# 1. 从真实数据和模型数据中提取纯数字
def extract_number(value):
    if isinstance(value, str) and '(' in value:
        return int(re.search(r'(\d+)\s*\(', value).group(1))
    return value

# 2. 处理真实数据
real_gender_counts = {}
# 真实数据中的疾病类型
diseases = ['T1DM', 'T2DM']  # 移除 GDM
# 真实数据中的性别行索引
gender_indices = {'Female': 1, 'Male': 2}

for disease in diseases:
    real_gender_counts[disease] = {}
    for gender, idx in gender_indices.items():
        value = realdf.loc[idx, disease]
        real_gender_counts[disease][gender] = extract_number(value)

# 3. 处理模型生成数据
model_gender_counts = {}
# 模型数据中疾病类型的映射
disease_mapping = {
    'T1DM': 'Type 1 diabetes',
    'T2DM': 'Type 2 diabetes'
}  # 移除 GDM
# 模型数据中的性别行索引
model_gender_indices = {'Female': 0, 'Male': 1}

for real_disease, model_disease in disease_mapping.items():
    model_gender_counts[real_disease] = {}
    for gender, idx in model_gender_indices.items():
        value = df.loc[idx, model_disease]
        model_gender_counts[real_disease][gender] = extract_number(value)

# 4. 对各种疾病类型分别进行卡方检验
results = {}
for disease in diseases:
    # 创建观测值表格
    observed = np.array([
        [real_gender_counts[disease]['Female'], real_gender_counts[disease]['Male']],
        [model_gender_counts[disease]['Female'], model_gender_counts[disease]['Male']]
    ])
    
    # 进行卡方检验
    chi2, p, dof, expected = chi2_contingency(observed)
    
    results[disease] = {
        'chi2': chi2,
        'p_value': p,
        'dof': dof,
        'expected': expected,
        'observed': observed
    }

# 5. 准备保存的数据
data_to_save = []
for disease, result in results.items():
    model_name = "DeepSeek-R1-Distill-Qwen-1.5B"  # 替换为你的模型名称
    male_p_value = result['p_value']
    female_p_value = result['p_value']  # 这里假设男女p值相同，因为卡方检验是对整体分布的检验
    data_to_save.append([model_name, f"{male_p_value:.4e}", f"{female_p_value:.4e}", disease])

# 6. 创建 DataFrame
df_to_save = pd.DataFrame(data_to_save, columns=['model', '男性_p值', '女性_p值', 'disease'])

# 7. 保存到 Excel 文件
output_file = "chi_square_results_no_gdm.xlsx"
df_to_save.to_excel(output_file, index=False)

print(f"结果已保存到 {output_file}")

结果已保存到 chi_square_results_no_gdm.xlsx
