In [33]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix
import statsmodels.api as sm
from scipy.stats import norm
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from scipy.stats import pointbiserialr, spearmanr, chi2_contingency, shapiro


# Read the CSV into `data`; the later cells in this notebook all work from this frame.
file_path = 'External_Dataset.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

In [34]:
# Split the rows by the 'group' label; later cells iterate over `groups`
# or pull a single group out with `groups.get_group(...)`.
groups = data.groupby('group')

# Continuous variables: used for normality tests, descriptive statistics
# and the two-sample comparisons.
# NOTE(review): the saved cell outputs label the second variable 'hamd17',
# not 'HAMD' — confirm which name the CSV actually uses before re-running.
columns_to_analyzes = ['age', 'HAMD', 'HAMA', 'SDS', 'PHQ', 'height', 'weight', 'BMI', 'prediction_score']
# Categorical variables: used for frequency tables and chi-square tests.
categorical_columns = ['nation', 'gender', 'marriage', 'smoking', 'drinking', 'family']

# Fail fast with one clear message if a listed column is absent, instead of
# hitting a KeyError part-way through a later analysis loop.
missing_columns = [
    name for name in columns_to_analyzes + categorical_columns
    if name not in data.columns
]
if missing_columns:
    raise KeyError(f"Columns not found in data: {missing_columns}")

In [35]:
# Per-group subsets; `healthy_data` is consumed by the next cell.
depressed_data = data[data['group'] == 'depressed']
healthy_data = data[data['group'] == 'healthy']

# Significance level for the normality decision (loop-invariant, so set once).
alpha = 0.05

# Shapiro-Wilk normality test for every continuous variable in the depressed group.
for columns_to_analyze in columns_to_analyzes:
    # Drop missing values first, consistent with the descriptive-statistics and
    # two-sample test cells; otherwise NaNs flow into the test and make its
    # result unusable.
    feature_data = depressed_data[columns_to_analyze].dropna()
    stat, p = shapiro(feature_data)
    if p > alpha:
        print(f'{columns_to_analyze}:Statistics={stat:.3f}, p={p:.3f},正态分布 (接受H0)')
    else:
        print(f'{columns_to_analyze}:Statistics={stat:.3f}, p={p:.3f},不是正态分布 (拒绝H0)')

age:Statistics=0.911, p=0.000,不是正态分布 (拒绝H0)
hamd17:Statistics=0.976, p=0.132,正态分布 (接受H0)
HAMA:Statistics=0.985, p=0.473,正态分布 (接受H0)
SDS:Statistics=0.969, p=0.047,不是正态分布 (拒绝H0)
PHQ:Statistics=0.977, p=0.168,正态分布 (接受H0)
height:Statistics=0.982, p=0.301,正态分布 (接受H0)
weight:Statistics=0.960, p=0.013,不是正态分布 (拒绝H0)
BMI:Statistics=0.983, p=0.359,正态分布 (接受H0)
prediction_score:Statistics=0.910, p=0.000,不是正态分布 (拒绝H0)


In [36]:
# Significance level for the normality decision (loop-invariant, so set once).
alpha = 0.05

# Shapiro-Wilk normality test for every continuous variable in the healthy group.
for columns_to_analyze in columns_to_analyzes:
    # Drop missing values first, consistent with the descriptive-statistics and
    # two-sample test cells; otherwise NaNs flow into the test and make its
    # result unusable.
    feature_data = healthy_data[columns_to_analyze].dropna()
    stat, p = shapiro(feature_data)
    if p > alpha:
        print(f'{columns_to_analyze}:Statistics={stat:.3f}, p={p:.3f},正态分布 (接受H0)')
    else:
        print(f'{columns_to_analyze}:Statistics={stat:.3f}, p={p:.3f},不是正态分布 (拒绝H0)')

age:Statistics=0.691, p=0.000,不是正态分布 (拒绝H0)
hamd17:Statistics=0.850, p=0.000,不是正态分布 (拒绝H0)
HAMA:Statistics=0.647, p=0.000,不是正态分布 (拒绝H0)
SDS:Statistics=0.950, p=0.004,不是正态分布 (拒绝H0)
PHQ:Statistics=0.886, p=0.000,不是正态分布 (拒绝H0)
height:Statistics=0.957, p=0.009,不是正态分布 (拒绝H0)
weight:Statistics=0.919, p=0.000,不是正态分布 (拒绝H0)
BMI:Statistics=0.942, p=0.001,不是正态分布 (拒绝H0)
prediction_score:Statistics=0.931, p=0.000,不是正态分布 (拒绝H0)


In [37]:
# 3. Per-group descriptive statistics: mean, standard deviation, median
#    and the 25th-75th percentile range of every continuous variable.
for group_name, group_data in groups:
    print(f"\n{group_name} Group Statistics:")
    for columns_to_analyze in columns_to_analyzes:
        # Missing values are excluded before any statistic is computed.
        column_data = group_data[columns_to_analyze].dropna()
        mean_value = column_data.mean()
        std_value = column_data.std()
        median_value = column_data.median()
        quartile_bounds = np.percentile(column_data, [25, 75])
        print(
            f"{columns_to_analyze}: mean={mean_value:.3f}, std={std_value:.3f}, "
            f"median={median_value:.3f}, 25%-75%={quartile_bounds}"
        )


depressed Group Statistics:
age: mean=32.288, std=11.986, median=29.000, 25%-75%=[23.   37.75]
hamd17: mean=19.938, std=4.610, median=20.000, 25%-75%=[16. 23.]
HAMA: mean=20.438, std=5.621, median=20.500, 25%-75%=[16.75 23.25]
SDS: mean=56.950, std=8.431, median=57.000, 25%-75%=[52. 63.]
PHQ: mean=17.462, std=4.888, median=17.500, 25%-75%=[13.75 22.  ]
height: mean=166.950, std=7.650, median=166.000, 25%-75%=[162.   171.25]
weight: mean=62.425, std=12.635, median=60.000, 25%-75%=[53.75 70.  ]
BMI: mean=22.308, std=3.682, median=22.130, 25%-75%=[19.4425 24.8625]
prediction_score: mean=0.690, std=0.235, median=0.720, 25%-75%=[0.59585011 0.87308854]

healthy Group Statistics:
age: mean=24.113, std=7.821, median=21.000, 25%-75%=[20. 27.]
hamd17: mean=2.475, std=2.595, median=2.000, 25%-75%=[0. 4.]
HAMA: mean=1.850, std=3.069, median=1.000, 25%-75%=[0. 3.]
SDS: mean=36.400, std=9.098, median=35.500, 25%-75%=[28.75 43.  ]
PHQ: mean=4.362, std=4.101, median=3.000, 25%-75%=[1. 7.]
height: mea

In [38]:
# Two-sample t-test (depressed vs. healthy) for each continuous variable.
# equal_var=False selects Welch's variant, which does not assume equal variances.
for columns_to_analyze in columns_to_analyzes:
    depressed_values = groups.get_group('depressed')[columns_to_analyze].dropna()
    healthy_values = groups.get_group('healthy')[columns_to_analyze].dropna()
    t_stat, p_val = stats.ttest_ind(depressed_values, healthy_values, equal_var=False)
    print(f"{columns_to_analyze} t-test: t={t_stat:.3f}, p={p_val:.3f}")

age t-test: t=5.109, p=0.000
hamd17 t-test: t=29.526, p=0.000
HAMA t-test: t=25.961, p=0.000
SDS t-test: t=14.818, p=0.000
PHQ t-test: t=18.363, p=0.000
height t-test: t=-3.001, p=0.003
weight t-test: t=-1.322, p=0.188
BMI t-test: t=-0.033, p=0.974
prediction_score t-test: t=10.710, p=0.000


In [39]:
# Mann-Whitney U test (depressed vs. healthy) for each continuous variable.
for columns_to_analyze in columns_to_analyzes:
    # The alternative is spelled out because SciPy's default for this argument
    # has differed between releases; an explicit two-sided test keeps the
    # reported p-values independent of the installed version.
    u_stat, p_val = stats.mannwhitneyu(groups.get_group('depressed')[columns_to_analyze].dropna(),
                                       groups.get_group('healthy')[columns_to_analyze].dropna(),
                                       alternative='two-sided')
    print(f"{columns_to_analyze} Mann-Whitney U test: U={u_stat:.3f}, p={p_val:.3f}")


age Mann-Whitney U test: U=4610.000, p=0.000
hamd17 Mann-Whitney U test: U=6399.500, p=0.000
HAMA Mann-Whitney U test: U=6366.500, p=0.000
SDS Mann-Whitney U test: U=6047.000, p=0.000
PHQ Mann-Whitney U test: U=6216.000, p=0.000
height Mann-Whitney U test: U=2462.000, p=0.012
weight Mann-Whitney U test: U=2931.500, p=0.360
BMI Mann-Whitney U test: U=3309.000, p=0.711
prediction_score Mann-Whitney U test: U=5563.000, p=0.000


In [40]:
# Count how often each value of every categorical variable occurs within each group.
# The per-(group, column) tables are collected in a list and concatenated once,
# instead of growing a DataFrame with pd.concat on every iteration.
result_columns = ['group', 'column', 'value', 'count']
count_frames = []
for group_name, group_data in groups:
    for column in categorical_columns:
        counts = group_data[column].value_counts().reset_index()
        # Rename positionally: first column is the category value, second its frequency.
        counts.columns = ['value', 'count']
        counts['group'] = group_name
        counts['column'] = column
        count_frames.append(counts)

if count_frames:
    # Reorder so the layout matches the intended group / column / value / count table.
    results = pd.concat(count_frames, ignore_index=True)[result_columns]
else:
    # No groups or no categorical columns: keep an empty table with the same schema.
    results = pd.DataFrame(columns=result_columns)

# Show the combined frequency table.
print(results)

        group    column value count
0   depressed    nation     1    75
1   depressed    nation     2     5
2   depressed    gender     2    53
3   depressed    gender     1    27
4   depressed  marriage     0    42
5   depressed  marriage     1    38
6   depressed   smoking     0    60
7   depressed   smoking     1    20
8   depressed  drinking     0    62
9   depressed  drinking     1    18
10  depressed    family     0    64
11  depressed    family     1    16
12    healthy    nation     1    66
13    healthy    nation     2    14
14    healthy    gender     2    42
15    healthy    gender     1    38
16    healthy  marriage     0    66
17    healthy  marriage     1    14
18    healthy   smoking     0    67
19    healthy   smoking     1    13
20    healthy  drinking     0    63
21    healthy  drinking     1    17
22    healthy    family     0    80


In [41]:
# Chi-square test of independence between 'group' and each categorical variable.
# One record per variable is collected and the result table is built once at the
# end; concatenating onto an initially empty DataFrame inside the loop is what
# triggers the pandas warning seen in this cell's saved output.
chi2_records = []
for categorical_column in categorical_columns:
    # Contingency table: rows are groups, columns are the variable's categories.
    contingency_table = pd.crosstab(data['group'], data[categorical_column])
    # With its default settings chi2_contingency applies Yates' continuity
    # correction whenever the table has one degree of freedom (2x2).
    chi2, p, dof, expected = chi2_contingency(contingency_table)
    chi2_records.append({
        'column': categorical_column,
        'chi2': chi2,
        'p-value': p,
        'dof': dof,
        'expected': expected.tolist(),
    })

chi2_results = pd.DataFrame(chi2_records, columns=['column', 'chi2', 'p-value', 'dof', 'expected'])

# Show the chi-square results.
print(chi2_results)

     column       chi2   p-value dof                      expected
0    nation   3.822322  0.050574   1    [[70.5, 9.5], [70.5, 9.5]]
1    gender   2.591093  0.107466   1  [[32.5, 47.5], [32.5, 47.5]]
2  marriage  15.071225  0.000104   1  [[54.0, 26.0], [54.0, 26.0]]
3   smoking   1.374374  0.241062   1  [[63.5, 16.5], [63.5, 16.5]]
4  drinking   0.000000  1.000000   1  [[62.5, 17.5], [62.5, 17.5]]
5    family  15.625000  0.000077   1    [[72.0, 8.0], [72.0, 8.0]]


  chi2_results = pd.concat([chi2_results, pd.DataFrame({
