In [1]:
'''
patientCharacteristicAnalysis V01
Version Discription: dataset analysis using Kruskal-Wallis Test & Chi-square Test
Last Update: 20240115
'''

'\npatientCharacteristicAnalysis V01\nVersion Discription: dataset analysis using Kruskal-Wallis Test & Chi-square Test\nLast Update: 20240115\n'

In [12]:
import pandas as pd
import numpy as np
from scipy.stats import kruskal, chi2_contingency
import pickle

In [13]:
def data_loader(filename):
    df = pd.read_csv("ICH_Data.csv")
    with open(filename, 'rb') as file:
        data = pickle.load(file)
    
    ##### Newer version of pandas doesn't support df.append. #####
    new_df = pd.DataFrame(columns=df.columns)
    for idx, tesnor_, label in data:
        if label == 1:
            new_df = new_df.append(df[df['去識別化編號'] == idx+1])
            # new_df = pd.concat([new_df, df[df['去識別化編號'] == idx+1]])
    return new_df
    
    ##### New Way #####
    # Initialize a list to store DataFrames to concatenate
    # rows_to_concat = []
    # for idx, tesnor_, label in data:
    #     rows_to_concat.append(df[df['去識別化編號'] == str(int(idx)+1)])
    # if rows_to_concat:
    #     new_df = pd.concat(dfs_to_concat, ignore_index=True)
    #     return new_df
    # else:
    #     print(f'Now data available')


In [14]:
# Load data from files
train_set = '20240110_1713_train_dataset.pkl'
val_set = '20240110_1713_val_dataset.pkl'
test_set = '20240110_1713_test_dataset.pkl'

# train_df = data_loader(train_set)
# validation_df = data_loader(val_set)
test_df = data_loader(test_set)
# combined_ICH = pd.concat([train_df, validation_df, test_df], ignore_index=True).sort_values(by='去識別化編號', ascending=True)
# combined_ICH
test_df

Unnamed: 0,去識別化編號,年月,心電圖有無,姓名,病歷編號,缺血,TIA,出血,無中風,年齡,...,檢傷SBP,檢傷DBP,檢傷HR,"rhythm(SR=0, Afib=1, aFLUTTER=2, pacing=3, junctional=4, others=5)","Bunddle branch block(non=0, RBBB=1, LBBB=2, LAFB=3, LPFB=4, other=5)",ventricular rate,PR interval,QRS duration,QTC,qrs axis
8,186,21-Jul,1,駱李阿色,22256413,,,1,,90,...,237.0,109.0,84.0,0,0,81,168.0,98,460,29
103,1054,19-Jun,1,何木榮,38657362,,,1,,54,...,246.0,125.0,51.0,0,0,50,180.0,88,405,59
137,1331,18-Oct,1,徐趙寶貴,18235499,,,1,,60,...,235.0,112.0,102.0,0,0,101,176.0,84,493,8
1,75,21-Nov,1,郭若瑟,38983221,,,1,,70,...,216.0,96.0,78.0,0,0,76,184.0,90,459,36
25,366,21-Jan,1,羅明宏,43065425,,,1,,71,...,138.0,60.0,74.0,0,0,82,149.0,104,542,-9
62,673,20-May,1,陳虹君,43017681,,,1,,38,...,199.0,104.0,86.0,0,0,68,144.0,94,447,37
72,773,20-Feb,1,喬淑梅,46113079,,,1,,53,...,182.0,105.0,63.0,0,0,59,176.0,106,468,-2
115,1179,19-Mar,1,楊黃含春,34846496,,,1,,83,...,256.0,125.0,108.0,0,0,109,164.0,72,458,-15
46,529,20-Sep,1,謝進卿,43440717,,,1,,79,...,132.0,81.0,94.0,0,0,84,204.0,88,450,-39
139,1351,18-Oct,1,李培忠,4630619,,,1,,95,...,230.0,95.0,56.0,4,0,58,,102,492,-52


In [20]:
df = pd.read_csv("label_sheet.csv")
df[df['去識別化編號'] == 1331]

Unnamed: 0.1,Unnamed: 0,去識別化編號,年月,心電圖有無,姓名,病歷編號,缺血,TIA,出血,無中風,...,"Bunddle branch block(non=0, RBBB=1, LBBB=2, LAFB=3, LPFB=4, other=5)",ventricular rate,PR interval,QRS duration,QTC,qrs axis,files,files_after_PID,files_after_date,files_after time
1330,1330,1331,Oct-18,1.0,徐趙寶貴,18235499,,,1,,...,0,101.0,176.0,84.0,493.0,8.0,,['PageWriterTouchECG2018105143725859.svg'],['PageWriterTouchECG2018105143725859.svg'],['PageWriterTouchECG2018105143725859.svg']


In [8]:
# Save DataFrame "combined_ICH"
# combined_ICH.to_csv('dataset/combined_ICH.csv', index=False)

In [9]:
# Columns for Kruskal-Wallis Test
columns_to_test_kruskal = ['年齡', 'onset time to door', '執行電腦斷層距離到院時間(分鐘)', 'door to 心電圖費時', '檢傷SBP', '檢傷DBP', '檢傷HR', 'ventricular rate', 'PR interval', 'QRS duration', 'QTC', 'qrs axis']

# Perform Kruskal-Wallis Test
kruskal_results = {}
for column in columns_to_test_kruskal:
    if column in train_df.columns and column in validation_df.columns and column in test_df.columns:
        stat, p_value = kruskal(train_df[column], validation_df[column], test_df[column])
        kruskal_results[column] = p_value
    else:
        print(f"Kruskal-Wallis: Column {column} not found in all dataframes")
        
# Print Kruskal-Wallis results
print("Kruskal-Wallis Test Results:")
for characteristic, p_value in kruskal_results.items():
    print(f"{characteristic}: {'Different distributions' if p_value < 0.05 else 'Similar distributions'} (p-value: {p_value})")

Kruskal-Wallis Test Results:
年齡: Similar distributions (p-value: 0.5022910203718699)
onset time to door: Similar distributions (p-value: 0.8577522798072414)
執行電腦斷層距離到院時間(分鐘): Similar distributions (p-value: nan)
door to 心電圖費時: Similar distributions (p-value: 0.9678103525777492)
檢傷SBP: Similar distributions (p-value: nan)
檢傷DBP: Similar distributions (p-value: nan)
檢傷HR: Similar distributions (p-value: nan)
ventricular rate: Similar distributions (p-value: 0.2837950786478635)
PR interval: Similar distributions (p-value: nan)
QRS duration: Similar distributions (p-value: 0.8792847826165547)
QTC: Similar distributions (p-value: 0.42400867454557556)
qrs axis: Similar distributions (p-value: 0.46102552735119473)


In [10]:
import itertools

def perform_chi2_test(df1, df2, df1_name, df2_name, column):
    # Combine the data from the two dataframes
    combined_df = pd.concat([
        df1[[column]].assign(dataset=df1_name),
        df2[[column]].assign(dataset=df2_name)
    ])

    # Create a contingency table
    contingency_table = pd.crosstab(index=combined_df[column], columns=combined_df['dataset'])

    # Perform the Chi-square test if the table is not empty
    if not contingency_table.empty:
        chi2, p_value, dof, expected = chi2_contingency(contingency_table)
        return p_value
    else:
        return None

In [11]:
# Columns for Chi-square Test
columns_to_test_chi2 = ['性別', 'rhythm(SR=0, Afib=1, aFLUTTER=2, pacing=3, junctional=4, others=5)', 'Bunddle branch block(non=0, RBBB=1, LBBB=2, LAFB=3, LPFB=4, other=5)']

# Dataframe pairs
dataframes = [('train', train_df), ('validation', validation_df), ('test', test_df)]

# Pairwise Chi-square Tests
chi2_results = {}
for column in columns_to_test_chi2:
    for (df1_name, df1), (df2_name, df2) in itertools.combinations(dataframes, 2):
        p_value = perform_chi2_test(df1, df2, df1_name, df2_name, column)
        if p_value is not None:
            key = f"{column} ({df1_name} vs {df2_name})"
            chi2_results[key] = p_value
        else:
            print(f"Chi-square: Empty contingency table for column {column} in {df1_name} vs {df2_name}")

# Print Chi-square results
print("\nChi-square Test Results:")
for characteristic, p_value in chi2_results.items():
    print(f"{characteristic}: {'Significant association' if p_value < 0.05 else 'No significant association'} (p-value: {p_value})")


Chi-square Test Results:
性別 (train vs validation): No significant association (p-value: 0.6955673052635036)
性別 (train vs test): No significant association (p-value: 0.4424757650615364)
性別 (validation vs test): No significant association (p-value: 0.3603410063766622)
rhythm(SR=0, Afib=1, aFLUTTER=2, pacing=3, junctional=4, others=5) (train vs validation): No significant association (p-value: 0.1098588756857652)
rhythm(SR=0, Afib=1, aFLUTTER=2, pacing=3, junctional=4, others=5) (train vs test): No significant association (p-value: 0.26282522755904864)
rhythm(SR=0, Afib=1, aFLUTTER=2, pacing=3, junctional=4, others=5) (validation vs test): No significant association (p-value: 0.13464105988442565)
Bunddle branch block(non=0, RBBB=1, LBBB=2, LAFB=3, LPFB=4, other=5) (train vs validation): No significant association (p-value: 0.6593636470552597)
Bunddle branch block(non=0, RBBB=1, LBBB=2, LAFB=3, LPFB=4, other=5) (train vs test): No significant association (p-value: 0.7618396662400171)
Bund