## HAMD

In [1]:
import pandas as pd
from scipy.stats import ttest_ind
from statsmodels.stats.multitest import multipletests
import warnings
warnings.filterwarnings('ignore')

import numpy as np
np.random.seed(42)

# Load Sheet1 of the workbook.
df = pd.read_excel('rawdata.xlsx', sheet_name='Sheet1')

# Columns compared between the two groups.
# NOTE(review): the 'MAMD24' prefix looks like it may be a typo of 'HAMD24', but these
# strings must match the spreadsheet headers exactly, so they are left untouched.
variables = [
    'HAMD24(total)', 
    'MAMD24(Anxiety/Somatization)', 'MAMD24(Cognitive Impairment)', 'MAMD24(Diurnal Variation)', 
    'MAMD24(Retardation)', 'MAMD24(Sleep Disturbance)', 'MAMD24(Despair)'
]

# Keep only the rows whose Group value is 0 or 1.
# No imputation is performed in this cell; values are used as read from the sheet.
df_filtered = df[df['Group'].isin([0, 1])]

# One record per variable; the table is built once after the loop instead of
# concatenating onto an empty DataFrame on every iteration.
records = []
for var in variables:
    # Drop missing values so a single NaN cannot turn the t-test (and with it the
    # whole FDR family) into NaN; mean/std already skip NaN in pandas.
    Group1 = df_filtered.loc[df_filtered['Group'] == 0, var].dropna()  # Group I (0)
    Group2 = df_filtered.loc[df_filtered['Group'] == 1, var].dropna()  # Group II (1)

    # Independent two-sample t-test with SciPy's defaults (equal variances assumed).
    t_stat, p_value = ttest_ind(Group1, Group2)

    records.append({
        'Variable': var,
        'Group I Mean±SD': f"{Group1.mean():.2f}±{Group1.std():.2f}",
        'Group II Mean±SD': f"{Group2.mean():.2f}±{Group2.std():.2f}",
        't-statistic': t_stat,
        'p-value': p_value,
    })

results = pd.DataFrame(
    records,
    columns=['Variable', 'Group I Mean±SD', 'Group II Mean±SD', 't-statistic', 'p-value'],
)

# Benjamini-Hochberg FDR correction across all variables of this cell.
p_values = results['p-value'].astype(float)
_, p_values_corrected, _, _ = multipletests(p_values, method='fdr_bh')

# Format both p-value columns with three fixed decimals so trailing zeros are kept
# (rounding the floats would print 0.02 instead of 0.020).
results['p-value (FDR corrected)'] = [f"{p:.3f}" for p in p_values_corrected]
results['p-value'] = p_values.map(lambda x: f"{x:.3f}")

# Print the table followed by a closing rule as wide as the header line.
table_text = results.to_string(index=False)
print(table_text)
print('=' * len(table_text.split('\n')[0]))

                    Variable Group I Mean±SD Group II Mean±SD  t-statistic p-value  p-value (FDR corrected)
               HAMD24(total)      10.96±3.54       13.36±6.36    -2.746849   0.007                    0.024
MAMD24(Anxiety/Somatization)       2.19±1.06        2.51±1.86    -1.237609   0.218                    0.305
MAMD24(Cognitive Impairment)       2.61±1.40        3.71±2.45    -3.225192   0.002                    0.011
   MAMD24(Diurnal Variation)       0.00±0.00        0.06±0.29    -1.780625   0.077                    0.135
         MAMD24(Retardation)       3.24±1.66        3.95±1.85    -2.413340   0.017                    0.040
   MAMD24(Sleep Disturbance)       1.58±1.27        1.65±1.17    -0.353844   0.724                    0.724
             MAMD24(Despair)       1.33±1.74        1.45±1.91    -0.393545   0.695                    0.724


## PANSS

In [2]:
import pandas as pd
from scipy.stats import ttest_ind
from statsmodels.stats.multitest import multipletests
import warnings
warnings.filterwarnings('ignore')

import numpy as np
np.random.seed(42)

# Load Sheet1 of the workbook.
df = pd.read_excel('rawdata.xlsx', sheet_name='Sheet1')

# Columns compared between the two groups.
variables = ['PANSS-N', 'PANSS-P', 'PANSS-G', 'PANSS-T']

# Keep only the rows whose Group value is 0 or 1.
# No imputation is performed in this cell; values are used as read from the sheet.
df_filtered = df[df['Group'].isin([0, 1])]

# Accumulate plain dict records and build the result table once after the loop
# (avoids repeated concatenation onto an empty DataFrame).
records = []
for var in variables:
    # Missing values are dropped per group so the t-test does not return NaN.
    Group1 = df_filtered.loc[df_filtered['Group'] == 0, var].dropna()  # Group I (0)
    Group2 = df_filtered.loc[df_filtered['Group'] == 1, var].dropna()  # Group II (1)

    # Independent two-sample t-test with SciPy's defaults (equal variances assumed).
    t_stat, p_value = ttest_ind(Group1, Group2)

    records.append({
        'Variable': var,
        'Group I Mean±SD': f"{Group1.mean():.2f}±{Group1.std():.2f}",
        'Group II Mean±SD': f"{Group2.mean():.2f}±{Group2.std():.2f}",
        't-statistic': t_stat,
        'p-value': p_value,
    })

results = pd.DataFrame(
    records,
    columns=['Variable', 'Group I Mean±SD', 'Group II Mean±SD', 't-statistic', 'p-value'],
)

# Benjamini-Hochberg FDR correction across the variables of this cell.
p_values = results['p-value'].astype(float)
_, p_values_corrected, _, _ = multipletests(p_values, method='fdr_bh')

# Fixed three-decimal strings for both p-value columns (keeps trailing zeros).
results['p-value (FDR corrected)'] = [f"{p:.3f}" for p in p_values_corrected]
results['p-value'] = p_values.map(lambda x: f"{x:.3f}")

# Print the table followed by a closing rule as wide as the header line.
table_text = results.to_string(index=False)
print(table_text)
print('=' * len(table_text.split('\n')[0]))

Variable Group I Mean±SD Group II Mean±SD  t-statistic p-value  p-value (FDR corrected)
 PANSS-N      20.24±6.92       21.69±6.77    -1.275310   0.204                    0.204
 PANSS-P      20.28±4.86       22.78±4.21    -3.316551   0.001                    0.005
 PANSS-G      37.94±8.24       40.74±5.86    -2.383937   0.018                    0.025
 PANSS-T     78.46±16.08      85.22±12.88    -2.807073   0.006                    0.011


In [3]:
import pandas as pd
from scipy.stats import ttest_ind
from statsmodels.stats.multitest import multipletests
import warnings
warnings.filterwarnings('ignore')

import numpy as np
np.random.seed(42)

# Load Sheet1 of the workbook.
df = pd.read_excel('rawdata.xlsx', sheet_name='Sheet1')

# Columns compared between the two groups.
variables = ['Negative', 'Positive', 'Affective', 'Cognitive']

# Keep only the rows whose Group value is 0 or 1.
# No imputation is performed in this cell; values are used as read from the sheet.
df_filtered = df[df['Group'].isin([0, 1])]

# Rows of the result table are collected here and turned into a DataFrame once,
# rather than growing a DataFrame with pd.concat inside the loop.
records = []
for var in variables:
    # NaNs are removed per group; otherwise ttest_ind would propagate them.
    Group1 = df_filtered.loc[df_filtered['Group'] == 0, var].dropna()  # Group I (0)
    Group2 = df_filtered.loc[df_filtered['Group'] == 1, var].dropna()  # Group II (1)

    # Independent two-sample t-test with SciPy's defaults (equal variances assumed).
    t_stat, p_value = ttest_ind(Group1, Group2)

    records.append({
        'Variable': var,
        'Group I Mean±SD': f"{Group1.mean():.2f}±{Group1.std():.2f}",
        'Group II Mean±SD': f"{Group2.mean():.2f}±{Group2.std():.2f}",
        't-statistic': t_stat,
        'p-value': p_value,
    })

results = pd.DataFrame(
    records,
    columns=['Variable', 'Group I Mean±SD', 'Group II Mean±SD', 't-statistic', 'p-value'],
)

# Benjamini-Hochberg FDR correction across the variables of this cell.
p_values = results['p-value'].astype(float)
_, p_values_corrected, _, _ = multipletests(p_values, method='fdr_bh')

# Fixed three-decimal strings for both p-value columns (keeps trailing zeros).
results['p-value (FDR corrected)'] = [f"{p:.3f}" for p in p_values_corrected]
results['p-value'] = p_values.map(lambda x: f"{x:.3f}")

# Print the table followed by a closing rule as wide as the header line.
table_text = results.to_string(index=False)
print(table_text)
print('=' * len(table_text.split('\n')[0]))

 Variable Group I Mean±SD Group II Mean±SD  t-statistic p-value  p-value (FDR corrected)
 Negative       7.73±2.70        8.29±2.59    -1.266398   0.207                    0.207
 Positive       5.58±1.77        6.62±1.73    -3.550212   0.001                    0.002
Affective       5.35±0.93        5.72±0.76    -2.612012   0.010                    0.020
Cognitive       9.19±2.05        9.90±1.57    -2.356608   0.020                    0.026


## HAMA

In [4]:
import pandas as pd
from scipy.stats import ttest_ind
from statsmodels.stats.multitest import multipletests
import warnings
warnings.filterwarnings('ignore')

import numpy as np
np.random.seed(42)

# Load Sheet1 of the workbook.
df = pd.read_excel('rawdata.xlsx', sheet_name='Sheet1')

# Columns compared between the two groups.
variables = ['HAMA14(total)', 
             'HAMA14(Psychic Anxiety)', 'HAMA14(Somatic Anxiety)']

# Keep only the rows whose Group value is 0 or 1.
# No imputation is performed in this cell; values are used as read from the sheet.
df_filtered = df[df['Group'].isin([0, 1])]

# Build the table from a list of records after the loop instead of concatenating
# a one-row DataFrame onto an empty frame on each iteration.
records = []
for var in variables:
    # Drop NaNs per group so they cannot propagate into the test statistic.
    Group1 = df_filtered.loc[df_filtered['Group'] == 0, var].dropna()  # Group I (0)
    Group2 = df_filtered.loc[df_filtered['Group'] == 1, var].dropna()  # Group II (1)

    # Independent two-sample t-test with SciPy's defaults (equal variances assumed).
    t_stat, p_value = ttest_ind(Group1, Group2)

    records.append({
        'Variable': var,
        'Group I Mean±SD': f"{Group1.mean():.2f}±{Group1.std():.2f}",
        'Group II Mean±SD': f"{Group2.mean():.2f}±{Group2.std():.2f}",
        't-statistic': t_stat,
        'p-value': p_value,
    })

results = pd.DataFrame(
    records,
    columns=['Variable', 'Group I Mean±SD', 'Group II Mean±SD', 't-statistic', 'p-value'],
)

# Benjamini-Hochberg FDR correction across the variables of this cell.
p_values = results['p-value'].astype(float)
_, p_values_corrected, _, _ = multipletests(p_values, method='fdr_bh')

# Format as fixed three-decimal strings: rounding the float column printed
# values such as 0.02 / 0.12 instead of 0.020 / 0.120.
results['p-value (FDR corrected)'] = [f"{p:.3f}" for p in p_values_corrected]
results['p-value'] = p_values.map(lambda x: f"{x:.3f}")

# Print the table followed by a closing rule as wide as the header line.
table_text = results.to_string(index=False)
print(table_text)
print('=' * len(table_text.split('\n')[0]))

               Variable Group I Mean±SD Group II Mean±SD  t-statistic p-value  p-value (FDR corrected)
          HAMA14(total)       3.91±1.94        5.55±5.05    -2.505318   0.013                     0.02
HAMA14(Psychic Anxiety)       3.72±1.72        4.62±2.21    -2.698276   0.008                     0.02
HAMA14(Somatic Anxiety)       0.30±0.58        0.94±3.29    -1.565182   0.120                     0.12


## BRMS

In [5]:
import pandas as pd
from scipy.stats import ttest_ind
from statsmodels.stats.multitest import multipletests
import warnings
warnings.filterwarnings('ignore')

import numpy as np
np.random.seed(42)

# Load Sheet1 of the workbook.
df = pd.read_excel('rawdata.xlsx', sheet_name='Sheet1')

# Column compared between the two groups.
variables =  ['BRMS(total)']

# Keep only the rows whose Group value is 0 or 1.
# No imputation is performed in this cell; values are used as read from the sheet.
df_filtered = df[df['Group'].isin([0, 1])]

# Records are gathered in a list and converted to a DataFrame once.
records = []
for var in variables:
    # Drop NaNs per group so they cannot propagate into the test statistic.
    Group1 = df_filtered.loc[df_filtered['Group'] == 0, var].dropna()  # Group I (0)
    Group2 = df_filtered.loc[df_filtered['Group'] == 1, var].dropna()  # Group II (1)

    # Independent two-sample t-test with SciPy's defaults (equal variances assumed).
    t_stat, p_value = ttest_ind(Group1, Group2)

    records.append({
        'Variable': var,
        'Group I Mean±SD': f"{Group1.mean():.2f}±{Group1.std():.2f}",
        'Group II Mean±SD': f"{Group2.mean():.2f}±{Group2.std():.2f}",
        't-statistic': t_stat,
        'p-value': p_value,
    })

results = pd.DataFrame(
    records,
    columns=['Variable', 'Group I Mean±SD', 'Group II Mean±SD', 't-statistic', 'p-value'],
)

# Benjamini-Hochberg FDR correction; with a single variable the corrected
# p-value equals the raw one.
p_values = results['p-value'].astype(float)
_, p_values_corrected, _, _ = multipletests(p_values, method='fdr_bh')

# Fixed three-decimal strings for both p-value columns (keeps trailing zeros).
results['p-value (FDR corrected)'] = [f"{p:.3f}" for p in p_values_corrected]
results['p-value'] = p_values.map(lambda x: f"{x:.3f}")

# Print the table followed by a closing rule as wide as the header line.
table_text = results.to_string(index=False)
print(table_text)
print('=' * len(table_text.split('\n')[0]))

   Variable Group I Mean±SD Group II Mean±SD  t-statistic p-value  p-value (FDR corrected)
BRMS(total)       7.69±3.06        7.79±4.30      -0.1721   0.864                    0.864


## 血液

### 免疫

In [6]:
import pandas as pd
from scipy.stats import ttest_ind
from statsmodels.stats.multitest import multipletests
import warnings
warnings.filterwarnings('ignore')

import numpy as np
np.random.seed(42)

# Load Sheet2 of the workbook.
df = pd.read_excel('rawdata.xlsx', sheet_name='Sheet2')

# Immune-related columns compared between the two groups.
variables = ['WBC', 'NEUT', 'LYMPH', 'MONO', 'PLT', 'CRP', 'NLR', 'PLR', 'MLR', 'SIII']

# Keep only the rows whose Group value is 0 or 1.
# No imputation is performed in this cell; values are used as read from the sheet.
df_filtered = df[df['Group'].isin([0, 1])]

# Collect one dict per marker, then create the DataFrame in a single step
# (no per-iteration pd.concat onto an empty frame).
records = []
for var in variables:
    # Drop NaNs per group: a missing lab value would otherwise make the
    # t-test, and then every FDR-corrected p-value, NaN.
    Group1 = df_filtered.loc[df_filtered['Group'] == 0, var].dropna()  # Group I (0)
    Group2 = df_filtered.loc[df_filtered['Group'] == 1, var].dropna()  # Group II (1)

    # Independent two-sample t-test with SciPy's defaults (equal variances assumed).
    t_stat, p_value = ttest_ind(Group1, Group2)

    records.append({
        'Variable': var,
        'Group I Mean±SD': f"{Group1.mean():.2f}±{Group1.std():.2f}",
        'Group II Mean±SD': f"{Group2.mean():.2f}±{Group2.std():.2f}",
        't-statistic': t_stat,
        'p-value': p_value,
    })

results = pd.DataFrame(
    records,
    columns=['Variable', 'Group I Mean±SD', 'Group II Mean±SD', 't-statistic', 'p-value'],
)

# Benjamini-Hochberg FDR correction across the markers of this cell.
p_values = results['p-value'].astype(float)
_, p_values_corrected, _, _ = multipletests(p_values, method='fdr_bh')

# Fixed three-decimal strings for both p-value columns (keeps trailing zeros).
results['p-value (FDR corrected)'] = [f"{p:.3f}" for p in p_values_corrected]
results['p-value'] = p_values.map(lambda x: f"{x:.3f}")

# Print the table followed by a closing rule as wide as the header line.
table_text = results.to_string(index=False)
print(table_text)
print('=' * len(table_text.split('\n')[0]))

Variable Group I Mean±SD Group II Mean±SD  t-statistic p-value  p-value (FDR corrected)
     WBC       5.95±1.47        6.18±1.57    -0.882227   0.379                    0.421
    NEUT      54.97±9.37       59.31±9.36    -2.785969   0.006                    0.015
   LYMPH      36.32±8.77       31.75±9.13     3.062366   0.003                    0.009
    MONO       6.23±1.76        6.29±1.63    -0.192420   0.848                    0.848
     PLT    204.15±55.00     217.08±57.81    -1.372592   0.172                    0.215
     CRP       5.24±3.12        7.11±5.37    -2.501847   0.013                    0.022
     NLR       1.68±0.74        2.23±1.56    -2.663546   0.009                    0.017
     PLR       5.96±2.15        7.44±3.01    -3.334760   0.001                    0.008
     MLR       0.18±0.08        0.23±0.16    -2.209554   0.029                    0.041
    SIII   341.53±167.44    459.22±254.79    -3.228350   0.002                    0.008


### 代谢

In [7]:
import pandas as pd
from scipy.stats import ttest_ind
from statsmodels.stats.multitest import multipletests
import warnings
warnings.filterwarnings('ignore')

import numpy as np
np.random.seed(42)

# Load Sheet2 of the workbook.
df = pd.read_excel('rawdata.xlsx', sheet_name='Sheet2')

# Metabolism-related columns compared between the two groups.
variables = ['AST/ALT', 'LDH', 'CK', 'TG', 'CHOL', 'HDL-C', 'LDL-C']

# Keep only the rows whose Group value is 0 or 1.
# No imputation is performed in this cell; values are used as read from the sheet.
df_filtered = df[df['Group'].isin([0, 1])]

# The result rows are accumulated as dicts and materialised once, replacing
# the quadratic concat-in-a-loop pattern.
records = []
for var in variables:
    # Drop NaNs per group so they cannot propagate into the test statistic.
    Group1 = df_filtered.loc[df_filtered['Group'] == 0, var].dropna()  # Group I (0)
    Group2 = df_filtered.loc[df_filtered['Group'] == 1, var].dropna()  # Group II (1)

    # Independent two-sample t-test with SciPy's defaults (equal variances assumed).
    t_stat, p_value = ttest_ind(Group1, Group2)

    records.append({
        'Variable': var,
        'Group I Mean±SD': f"{Group1.mean():.2f}±{Group1.std():.2f}",
        'Group II Mean±SD': f"{Group2.mean():.2f}±{Group2.std():.2f}",
        't-statistic': t_stat,
        'p-value': p_value,
    })

results = pd.DataFrame(
    records,
    columns=['Variable', 'Group I Mean±SD', 'Group II Mean±SD', 't-statistic', 'p-value'],
)

# Benjamini-Hochberg FDR correction across the markers of this cell.
p_values = results['p-value'].astype(float)
_, p_values_corrected, _, _ = multipletests(p_values, method='fdr_bh')

# Fixed three-decimal strings for both p-value columns (keeps trailing zeros).
results['p-value (FDR corrected)'] = [f"{p:.3f}" for p in p_values_corrected]
results['p-value'] = p_values.map(lambda x: f"{x:.3f}")

# Print the table followed by a closing rule as wide as the header line.
table_text = results.to_string(index=False)
print(table_text)
print('=' * len(table_text.split('\n')[0]))

Variable Group I Mean±SD Group II Mean±SD  t-statistic p-value  p-value (FDR corrected)
 AST/ALT       1.04±0.44        1.03±0.51     0.186108   0.853                    0.889
     LDH    146.20±26.66     155.09±24.61    -2.085747   0.039                    0.136
      CK     83.67±75.27    108.71±108.14    -1.592643   0.113                    0.265
      TG       1.46±0.68        1.60±0.77    -1.206831   0.229                    0.402
    CHOL       3.98±0.83        3.96±0.82     0.139884   0.889                    0.889
   HDL-C       1.22±0.30        1.10±0.22     2.927639   0.004                    0.028
   LDL-C       2.48±0.70        2.51±0.65    -0.317632   0.751                    0.889


## 电子病历

In [8]:
import pandas as pd
from scipy.stats import ttest_ind, chi2_contingency
from statsmodels.stats.multitest import multipletests
import warnings
warnings.filterwarnings('ignore')

# Load Sheet1 of the workbook.
df = pd.read_excel('rawdata.xlsx', sheet_name='Sheet1')


continuous_vars = ['Dose Equivalent to Olanzapine (mg/d)', 'Frequency of episodes', 'Age of onset', 'Illness of duration (years)']
categorical_vars = ['First episode', 'Family history of psychiatric disorders']

# Keep only the rows whose Group value is 0 or 1.
df_filtered = df[df['Group'].isin([0, 1])]

# Continuous variables: independent two-sample t-test per variable.
# Rows are collected as dicts and the table is built once after the loop.
continuous_records = []
for var in continuous_vars:
    # Drop NaNs per group so a missing value cannot make the t-test return NaN.
    Group1 = df_filtered.loc[df_filtered['Group'] == 0, var].dropna()
    Group2 = df_filtered.loc[df_filtered['Group'] == 1, var].dropna()

    # SciPy defaults: equal variances assumed.
    t_stat, p_value = ttest_ind(Group1, Group2)

    continuous_records.append({
        'Variable': var,
        'Group I Mean±SD': f"{Group1.mean():.2f}±{Group1.std():.2f}",
        'Group II Mean±SD': f"{Group2.mean():.2f}±{Group2.std():.2f}",
        't-statistic': t_stat,
        'p-value': p_value,
    })

results_continuous = pd.DataFrame(
    continuous_records,
    columns=['Variable', 'Group I Mean±SD', 'Group II Mean±SD', 't-statistic', 'p-value'],
)

# Categorical variables: chi-square test on the Group x category contingency table.
categorical_records = []
for var in categorical_vars:
    contingency_table = pd.crosstab(df_filtered['Group'], df_filtered[var])
    # SciPy defaults are used; per the SciPy docs this applies Yates' continuity
    # correction when the table has one degree of freedom (2x2).
    chi2, p_value, _, _ = chi2_contingency(contingency_table)

    categorical_records.append({'Variable': var, 'Chi-square': chi2, 'p-value': p_value})

results_categorical = pd.DataFrame(
    categorical_records,
    columns=['Variable', 'Chi-square', 'p-value'],
)

# Benjamini-Hochberg FDR correction, applied separately to each family of tests.
_, p_values_continuous_corrected, _, _ = multipletests(results_continuous['p-value'].astype(float), method='fdr_bh')
_, p_values_categorical_corrected, _, _ = multipletests(results_categorical['p-value'].astype(float), method='fdr_bh')

# Format raw and corrected p-values as fixed three-decimal strings.
results_continuous['p-value (FDR corrected)'] = [f"{p:.3f}" for p in p_values_continuous_corrected]
results_categorical['p-value (FDR corrected)'] = [f"{p:.3f}" for p in p_values_categorical_corrected]
results_continuous['p-value'] = results_continuous['p-value'].apply(lambda x: f"{x:.3f}")
results_categorical['p-value'] = results_categorical['p-value'].apply(lambda x: f"{x:.3f}")

print("Continuous Variables:")
print(results_continuous)
print("\nCategorical Variables:")
print(results_categorical)

Continuous Variables:
                               Variable Group I Mean±SD Group II Mean±SD  \
0  Dose Equivalent to Olanzapine (mg/d)      13.86±5.79       15.95±7.09   
1                 Frequency of episodes       6.51±4.30        5.99±4.50   
2                          Age of onset      27.58±9.18       27.13±7.80   
3           Illness of duration (years)      10.00±6.83        9.96±7.47   

   t-statistic p-value p-value (FDR corrected)  
0    -1.924265   0.056                   0.225  
1     0.708701   0.480                   0.959  
2     0.322084   0.748                   0.974  
3     0.032461   0.974                   0.974  

Categorical Variables:
                                  Variable  Chi-square p-value  \
0                            First episode    0.051907   0.820   
1  Family history of psychiatric disorders    0.174368   0.676   

  p-value (FDR corrected)  
0                   0.820  
1                   0.820  


## 人口学信息

In [9]:
import pandas as pd
from scipy.stats import ttest_ind, chi2_contingency
from statsmodels.stats.multitest import multipletests
import warnings
warnings.filterwarnings('ignore')

# Load Sheet1 of the workbook.
df = pd.read_excel('rawdata.xlsx', sheet_name='Sheet1')

continuous_vars = ['RPM', 'SES', 'Age', 'Education_years', 'BMI']
categorical_vars = ['Gender', 'Ethnic', 'Residence', 'Only_child', 'Smoking_status',
                    'Alcohol_consumption', 'Employed', 'Marital_status']

# Keep only the rows whose Group value is 0 or 1.
df_filtered = df[df['Group'].isin([0, 1])]

# Continuous variables: independent two-sample t-test per variable.
# Rows are collected as dicts and the table is built once after the loop.
continuous_records = []
for var in continuous_vars:
    # Drop NaNs per group so a missing value cannot make the t-test return NaN.
    Group1 = df_filtered.loc[df_filtered['Group'] == 0, var].dropna()
    Group2 = df_filtered.loc[df_filtered['Group'] == 1, var].dropna()

    # SciPy defaults: equal variances assumed.
    t_stat, p_value = ttest_ind(Group1, Group2)

    continuous_records.append({
        'Variable': var,
        'Group I Mean±SD': f"{Group1.mean():.2f}±{Group1.std():.2f}",
        'Group II Mean±SD': f"{Group2.mean():.2f}±{Group2.std():.2f}",
        't-statistic': t_stat,
        'p-value': p_value,
    })

results_continuous = pd.DataFrame(
    continuous_records,
    columns=['Variable', 'Group I Mean±SD', 'Group II Mean±SD', 't-statistic', 'p-value'],
)

# Categorical variables: chi-square test on the Group x category contingency table.
# Each table is kept so the count/percentage listing below can reuse it.
contingency_tables = {}
categorical_records = []
for var in categorical_vars:
    contingency_table = pd.crosstab(df_filtered['Group'], df_filtered[var])
    contingency_tables[var] = contingency_table

    # SciPy defaults are used; per the SciPy docs this applies Yates' continuity
    # correction when the table has one degree of freedom (2x2).
    chi2, p_value, _, _ = chi2_contingency(contingency_table)

    categorical_records.append({'Variable': var, 'Chi-square': chi2, 'p-value': p_value})

results_categorical = pd.DataFrame(
    categorical_records,
    columns=['Variable', 'Chi-square', 'p-value'],
)

# Benjamini-Hochberg FDR correction, applied separately to each family of tests.
_, p_values_continuous_corrected, _, _ = multipletests(results_continuous['p-value'].astype(float), method='fdr_bh')
_, p_values_categorical_corrected, _, _ = multipletests(results_categorical['p-value'].astype(float), method='fdr_bh')

# Format raw and corrected p-values as fixed three-decimal strings.
results_continuous['p-value (FDR corrected)'] = [f"{p:.3f}" for p in p_values_continuous_corrected]
results_categorical['p-value (FDR corrected)'] = [f"{p:.3f}" for p in p_values_categorical_corrected]
results_continuous['p-value'] = results_continuous['p-value'].apply(lambda x: f"{x:.3f}")
results_categorical['p-value'] = results_categorical['p-value'].apply(lambda x: f"{x:.3f}")

# Print the result tables.
print("Continuous Variables:")
print(results_continuous)
print("\nCategorical Variables:")
print(results_categorical)


# Print the count and percentage of every Group x category cell.
for var in categorical_vars:
    contingency_table = contingency_tables[var]
    # Denominator is the grand total of the table (both groups together).
    # NOTE(review): group-comparison tables usually report percentages within each
    # group (row totals); confirm that grand-total percentages are intended.
    total_count = contingency_table.sum().sum()

    print(f"\n{var} Counts and Percentages:")
    for group in contingency_table.index:
        for category in contingency_table.columns:
            count = contingency_table.loc[group, category]
            percentage = (count / total_count) * 100
            print(f"Group {group}, {category}: {count} ({percentage:.1f}%)")

Continuous Variables:
          Variable Group I Mean±SD Group II Mean±SD  t-statistic p-value  \
0              RPM     33.64±11.78      31.59±10.68     1.099809   0.273   
1              SES      26.79±8.31       20.23±5.62     5.632791   0.000   
2              Age     35.70±10.25       34.92±8.88     0.490173   0.625   
3  Education_years      11.31±4.82       10.51±4.29     1.057888   0.292   
4              BMI      24.28±3.79       23.40±3.57     1.441557   0.152   

  p-value (FDR corrected)  
0                   0.365  
1                   0.000  
2                   0.625  
3                   0.365  
4                   0.365  

Categorical Variables:
              Variable  Chi-square p-value p-value (FDR corrected)
0               Gender    4.758109   0.029                   0.047
1               Ethnic    7.371214   0.007                   0.018
2            Residence   23.699022   0.000                   0.000
3           Only_child    0.000000   1.000                   

## 数据分析全部完成