### Load Packages

In [1]:
import pandas as pd
from scipy.stats import chi2_contingency, ks_2samp, levene, ttest_ind, mannwhitneyu, shapiro, f_oneway, kruskal

### 2 Class Analysis

In [5]:
import pandas as pd
from scipy.stats import chi2_contingency, ks_2samp, levene, ttest_ind, mannwhitneyu


df = pd.read_csv('../subject_finalized.csv')


classes = df['class'].unique()
print(f'Classes: {classes}')


comparisons = [('MDD', 'BD'), ('MDD', 'HC'), ('BD', 'HC')]


for class1, class2 in comparisons:
    print(f'\nComparing {class1} vs {class2}')
    

    group1 = df[df['class'] == class1]
    group2 = df[df['class'] == class2]
    

    stats1 = {
        'mean_age': group1['age'].mean(),
        'std_age': group1['age'].std(),
        'male_count': group1['sex'].value_counts().get(1, 0),
        'female_count': group1['sex'].value_counts().get(2, 0)
    }
    stats2 = {
        'mean_age': group2['age'].mean(),
        'std_age': group2['age'].std(),
        'male_count': group2['sex'].value_counts().get(1, 0),
        'female_count': group2['sex'].value_counts().get(2, 0)
    }
    
    print(f'Statistics for {class1}: Mean Age = {stats1["mean_age"]:.2f}, SD = {stats1["std_age"]:.2f}, '
          f'Males = {stats1["male_count"]}, Females = {stats1["female_count"]}')
    print(f'Statistics for {class2}: Mean Age = {stats2["mean_age"]:.2f}, SD = {stats2["std_age"]:.2f}, '
          f'Males = {stats2["male_count"]}, Females = {stats2["female_count"]}')
    
    # sex comparison (Chi-squared test)
    sex_counts1 = group1['sex'].value_counts()
    sex_counts2 = group2['sex'].value_counts()
  
    sex_table = pd.DataFrame({
        class1: [sex_counts1.get(1, 0), sex_counts1.get(2, 0)],
        class2: [sex_counts2.get(1, 0), sex_counts2.get(2, 0)]
    }, index=['Male', 'Female'])
    
    if sex_table.size == 0 or sex_table.isnull().values.any():
        print(f'Empty or missing when comparing between {class1} and {class2}')
    else:
        chi2, p_chi, _, _ = chi2_contingency(sex_table)
        print(f'Chi-squared test for sex: chi2 = {chi2:.2f}, p-value = {p_chi:.4f}')

    # Kolmogorov-Smirnov test
    age1 = group1['age']
    age2 = group2['age']
    ks_stat, p_ks = ks_2samp(age1, age2)
    print(f'Kolmogorov-Smirnov test for age: KS-statistic = {ks_stat:.2f}, p-value = {p_ks:.4f}')
    
    # Levene’s test
    lev_stat, p_lev = levene(age1, age2)
    print(f'Levene’s test: Levene-statistic = {lev_stat:.2f}, p-value = {p_lev:.4f}')


    if p_ks > 0.05 and p_lev > 0.05:
        # use Student’s t-test
        t_stat, p_t = ttest_ind(age1, age2)
        print(f'Student’s t-test for age: t-statistic = {t_stat:.2f}, p-value = {p_t:.4f}')
    else:
        # use Mann Whitney U test
        u_stat, p_u = mannwhitneyu(age1, age2, alternative='two-sided')
        print(f'Mann Whitney U test for age: U-statistic = {u_stat:.2f}, p-value = {p_u:.4f}')


Classes: ['BD' 'HC' 'MDD']

Comparing MDD vs BD
Statistics for MDD: Mean Age = 38.25, SD = 15.18, Males = 22, Females = 43
Statistics for BD: Mean Age = 34.89, SD = 11.18, Males = 21, Females = 43
Chi-squared test for sex: chi2 = 0.00, p-value = 1.0000
Kolmogorov-Smirnov test for age: KS-statistic = 0.24, p-value = 0.0306
Levene’s test: Levene-statistic = 9.42, p-value = 0.0026
Mann Whitney U test for age: U-statistic = 2264.00, p-value = 0.3870

Comparing MDD vs HC
Statistics for MDD: Mean Age = 38.25, SD = 15.18, Males = 22, Females = 43
Statistics for HC: Mean Age = 37.69, SD = 9.99, Males = 29, Females = 42
Chi-squared test for sex: chi2 = 0.44, p-value = 0.5061
Kolmogorov-Smirnov test for age: KS-statistic = 0.33, p-value = 0.0008
Levene’s test: Levene-statistic = 18.28, p-value = 0.0000
Mann Whitney U test for age: U-statistic = 2124.50, p-value = 0.4263

Comparing BD vs HC
Statistics for BD: Mean Age = 34.89, SD = 11.18, Males = 21, Females = 43
Statistics for HC: Mean Age = 37.

In [None]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency, ks_2samp, levene, ttest_ind, mannwhitneyu


df = pd.read_csv('../subject_finalized.csv')


classes = df['class'].unique()
print(f'Classes: {classes}')


comparisons = [('MDD', 'BD'), ('MDD', 'HC'), ('BD', 'HC')]

for class1, class2 in comparisons:
    print(f'\nComparing {class1} vs {class2}')
    

    group1 = df[df['class'] == class1]
    group2 = df[df['class'] == class2]
    

    stats1 = {
        'mean_age': group1['age'].mean(),
        'std_age': group1['age'].std(),
        'male_count': group1['sex'].value_counts().get(1, 0),
        'female_count': group1['sex'].value_counts().get(2, 0)
    }
    stats2 = {
        'mean_age': group2['age'].mean(),
        'std_age': group2['age'].std(),
        'male_count': group2['sex'].value_counts().get(1, 0),
        'female_count': group2['sex'].value_counts().get(2, 0)
    }
    
    print(f'Statistics for {class1}: Mean Age = {stats1["mean_age"]:.2f}, SD = {stats1["std_age"]:.2f}, '
          f'Males = {stats1["male_count"]}, Females = {stats1["female_count"]}')
    print(f'Statistics for {class2}: Mean Age = {stats2["mean_age"]:.2f}, SD = {stats2["std_age"]:.2f}, '
          f'Males = {stats2["male_count"]}, Females = {stats2["female_count"]}')
    

    # 性別比較：使用卡方檢定
    sex_counts1 = group1['sex'].value_counts()
    sex_counts2 = group2['sex'].value_counts()
  
    sex_table = pd.DataFrame({
        class1: [sex_counts1.get(1, 0), sex_counts1.get(2, 0)],
        class2: [sex_counts2.get(1, 0), sex_counts2.get(2, 0)]
    }, index=['Male', 'Female'])
    
    if sex_table.size == 0 or sex_table.isnull().values.any():
        print(f'Empty or missing when comparing between {class1} and {class2}')
    else:
        chi2, p_chi, _, _ = chi2_contingency(sex_table)
        print(f'Chi-squared test for sex: chi2 = {chi2:.2f}, p-value = {p_chi:.4f}')
        
        # 計算 Cramér's V 作為卡方檢定效應大小
        n = sex_table.values.sum()
        min_dim = min(sex_table.shape) - 1  # 對於2x2表，min_dim = 1
        cramers_v = np.sqrt(chi2 / (n * min_dim))
        print(f"Cramér's V for sex: {cramers_v:.4f}")



    # 年齡資料檢定
    age1 = group1['age']
    age2 = group2['age']
    
    # Kolmogorov-Smirnov 檢定：檢查兩組分佈是否相似
    ks_stat, p_ks = ks_2samp(age1, age2)
    print(f'Kolmogorov-Smirnov test for age: KS-statistic = {ks_stat:.2f}, p-value = {p_ks:.4f}')
    
    # Levene’s 檢定：檢查變異數齊性
    lev_stat, p_lev = levene(age1, age2)
    print(f'Levene’s test: Levene-statistic = {lev_stat:.2f}, p-value = {p_lev:.4f}')

    # 根據正態性與變異數齊性結果選擇適當檢定方法
    if p_ks > 0.05 and p_lev > 0.05:
        # 使用 Student’s t-test
        t_stat, p_t = ttest_ind(age1, age2)
        print(f"Student’s t-test for age: t-statistic = {t_stat:.2f}, p-value = {p_t:.4f}")
        
        # 計算 Cohen's d 作為 t-test 的效應大小
        n1 = len(age1)
        n2 = len(age2)
        s1 = age1.std(ddof=1)
        s2 = age2.std(ddof=1)
        pooled_std = np.sqrt(((n1 - 1) * s1**2 + (n2 - 1) * s2**2) / (n1 + n2 - 2))
        cohen_d = (age1.mean() - age2.mean()) / pooled_std
        print(f"Cohen's d for age: {cohen_d:.4f}")
    else:
        # 使用 Mann Whitney U test
        u_stat, p_u = mannwhitneyu(age1, age2, alternative='two-sided')
        print(f'Mann Whitney U test for age: U-statistic = {u_stat:.2f}, p-value = {p_u:.4f}')
        
        # 計算 Rank Biserial Correlation 作為 Mann Whitney U 的效應大小
        n1 = len(age1)
        n2 = len(age2)
        rank_biserial = 1 - (2 * u_stat) / (n1 * n2)
        print(f"Rank Biserial Correlation for age: {rank_biserial:.4f}")


Classes: ['BD' 'HC' 'MDD']

Comparing MDD vs BD
Statistics for MDD: Mean Age = 38.25, SD = 15.18, Males = 22, Females = 43
Statistics for BD: Mean Age = 34.89, SD = 11.18, Males = 21, Females = 43
Chi-squared test for sex: chi2 = 0.00, p-value = 1.0000
Cramér's V for sex: 0.0000
Kolmogorov-Smirnov test for age: KS-statistic = 0.24, p-value = 0.0306
Levene’s test: Levene-statistic = 9.42, p-value = 0.0026
Mann Whitney U test for age: U-statistic = 2264.00, p-value = 0.3870
Rank Biserial Correlation for age: -0.0885

Comparing MDD vs HC
Statistics for MDD: Mean Age = 38.25, SD = 15.18, Males = 22, Females = 43
Statistics for HC: Mean Age = 37.69, SD = 9.99, Males = 29, Females = 42
Chi-squared test for sex: chi2 = 0.44, p-value = 0.5061
Cramér's V for sex: 0.0570
Kolmogorov-Smirnov test for age: KS-statistic = 0.33, p-value = 0.0008
Levene’s test: Levene-statistic = 18.28, p-value = 0.0000
Mann Whitney U test for age: U-statistic = 2124.50, p-value = 0.4263
Rank Biserial Correlation for 

: 

### 3 Class Analysis

In [6]:
df = pd.read_csv('../subject_finalized.csv')

required_columns = ['class', 'sex', 'age']
if not all(column in df.columns for column in required_columns):
    print("Could not found columns：'class', 'sex', 'age'")
else:
    group_MDD = df[df['class'] == 'MDD']['age']
    group_BD = df[df['class'] == 'BD']['age']
    group_HC = df[df['class'] == 'HC']['age']

    if group_MDD.empty or group_BD.empty or group_HC.empty:
        print("Empty data for one or more groups.")
    else:
        all_ages = pd.concat([group_MDD, group_BD, group_HC])

        ks_stat, p_ks = ks_2samp(group_MDD, group_BD)
        ks_stat2, p_ks2 = ks_2samp(group_MDD, group_HC)
        ks_stat3, p_ks3 = ks_2samp(group_BD, group_HC)
        print(f'Kolmogorov-Smirnov test for normality between MDD and BD: KS-statistic = {ks_stat}, p-value = {p_ks}')
        print(f'Kolmogorov-Smirnov test for normality between MDD and HC: KS-statistic = {ks_stat2}, p-value = {p_ks2}')
        print(f'Kolmogorov-Smirnov test for normality between BD and HC: KS-statistic = {ks_stat3}, p-value = {p_ks3}')


        lev_stat, p_lev = levene(group_MDD, group_BD, group_HC)
        print(f'Levene’s test for homoscedasticity: Levene-statistic = {lev_stat}, p-value = {p_lev}')


        if p_ks > 0.05 and p_ks2 > 0.05 and p_ks3 > 0.05 and p_lev > 0.05:
            # use ANOVA
            anova_stat, p_anova = f_oneway(group_MDD, group_BD, group_HC)
            print(f'ANOVA test: F-statistic = {anova_stat}, p-value = {p_anova}')
        else:
            # use Kruskal-Wallis H test
            kw_stat, p_kw = kruskal(group_MDD, group_BD, group_HC)
            print(f'Kruskal-Wallis H test: H-statistic = {kw_stat}, p-value = {p_kw}')


Kolmogorov-Smirnov test for normality between MDD and BD: KS-statistic = 0.24375, p-value = 0.030644587376791994
Kolmogorov-Smirnov test for normality between MDD and HC: KS-statistic = 0.3308775731310943, p-value = 0.0008131353492979057
Kolmogorov-Smirnov test for normality between BD and HC: KS-statistic = 0.22799295774647887, p-value = 0.04853633720796107
Levene’s test for homoscedasticity: Levene-statistic = 10.592748167155916, p-value = 4.271439921751591e-05
Kruskal-Wallis H test: H-statistic = 3.0544533120722437, p-value = 0.2171370287074705


### Assessment Score

In [4]:
df = pd.read_csv('../subject_finalized.csv')


MDD_BDI2 = df[df['class'] == 'MDD']['BDI2']
BD_HAMD = df[df['class'] == 'BD']['HAMD']
BD_YMRS = df[df['class'] == 'BD']['YMRS']
HC_BDI2 = df[df['class'] == 'HC']['BDI2']


print(f'MDD group: BDI2 mean = {MDD_BDI2.mean():.2f}, BDI2 std = {MDD_BDI2.std():.2f}')
print(f'HC group: BDI2 mean = {HC_BDI2.mean():.2f}, BDI2 std = {HC_BDI2.std():.2f}')
print(f'BD group: HAMD mean = {BD_HAMD.mean():.2f}, HAMD std = {BD_HAMD.std():.2f}')
print(f'BD group: YMRS mean = {BD_YMRS.mean():.2f}, YMRS std = {BD_YMRS.std():.2f}')



MDD group: BDI2 mean = 28.71, BDI2 std = 13.23
HC group: BDI2 mean = 2.55, BDI2 std = 7.01
BD group: HAMD mean = 10.73, HAMD std = 6.08
BD group: YMRS mean = 4.90, YMRS std = 3.99
