In [None]:
import os
import json
import pandas as pd

# Source folders; every *.json file found directly inside each one is loaded as one record.
folders = [r'\\YOUR_ROOT\Stroke', r'\\YOUR_ROOT\Cerebral_Palsy', r'\\YOUR_ROOT\Peripheral_Neuropathy']

all_data = []

for folder in folders:
    # os.path.join discards the cwd prefix when `folder` is an absolute path on the
    # current platform; relative entries are resolved against the working directory.
    folder_path = os.path.join(os.getcwd(), folder)
    if not os.path.isdir(folder_path):
        print(f"No Root: {folder_path}")
        continue

    # os.listdir() returns entries in arbitrary order. Sorting keeps the row order of
    # the resulting frame identical on every run; the patient numbering further down
    # numbers groups by order of first appearance, so it depends on this order.
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith('.json'):
            continue
        file_path = os.path.join(folder_path, file_name)
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            # Record where each entry came from.
            data['folder'] = folder
            data['file_name'] = file_name
            all_data.append(data)
        except Exception as e:
            # Best effort: report the file that could not be loaded and carry on.
            print(f"{file_path} Failed: {e}")

try:
    df = pd.DataFrame(all_data)
    display(df.head())
except Exception as e:
    print(f"Failed: {e}")

In [None]:
# Integer class label assigned to each source folder.
folder_to_disease = dict([
    (r'\\YOUR_ROOT\Stroke', 0),
    (r'\\YOUR_ROOT\Cerebral_Palsy', 1),
    (r'\\YOUR_ROOT\Peripheral_Neuropathy', 2),
])

# Folders that have no entry in the mapping end up as NaN.
df['Disease'] = df['folder'].map(folder_to_disease)

# One row per distinct (folder, label) pair, as a quick check of the mapping.
display(df.loc[:, ['folder', 'Disease']].drop_duplicates())



In [None]:
# Number of records per disease label, listed in label order.
print(df['Disease'].value_counts(sort=False).sort_index())

In [None]:
# Show the current column names as a plain list.
print(list(df.columns))

In [None]:
# Columns removed from the frame at this step.
columns_to_drop = [
    'Disease_info',
    'Meta_info',
    'FileSize',
    'file_name',
    'folder',
    'playTime',
    'Test_info',
]

# Raises KeyError if any listed column is absent.
df = df.drop(columns_to_drop, axis=1)

print(df.columns)

In [None]:
# Print up to ten distinct values per column; columns whose first non-null
# value is a dict are expanded and reported per sub-field instead.
for col in df.columns:
    non_null = df[col].dropna()
    first_valid = None if non_null.empty else non_null.iloc[0]

    if not isinstance(first_valid, dict):
        print(f"\n{col}")
        print(df[col].unique()[:10])
        continue

    print(f"\n {col}")
    # One column per dict key.
    expanded = df[col].apply(pd.Series)
    for sub_col in expanded.columns:
        print(f"{sub_col}")
        print(expanded[sub_col].unique()[:10])

In [None]:
# Expand the Patient_info values into one column per key.
patient_info_df = df['Patient_info'].apply(pd.Series)

# Copy the selected fields onto the main frame; .get() yields None
# (so the column is filled with None) when a field is absent.
for field in ('Sex', 'Age', 'Area'):
    df[field] = patient_info_df.get(field)

display(df.loc[:, ['Sex', 'Age', 'Area']].head())

In [None]:
# Initial is the fifth '-'-separated field of File_id (NaN when there are fewer fields).
df['Initial'] = df['File_id'].str.split('-').str.get(4)
print(df['Initial'])

In [None]:
# Column names after the patient fields and Initial were added.
print([*df.columns])

In [None]:
# Remove the nested Patient_info column.
columns_to_drop = ['Patient_info']

df = df.drop(labels=columns_to_drop, axis='columns')

print(df.columns)

In [None]:
# Records per Initial, most frequent first; missing Initials are not counted.
initial_counts = df['Initial'].value_counts(sort=True, ascending=False, dropna=True)
print(initial_counts)

In [None]:
# Initials that occur on more than one record.
duplicates = initial_counts.loc[initial_counts.gt(1)]
print(f"duplicated Initial: {duplicates.shape[0]}")
print(duplicates)

In [None]:
compare_cols = ['Disease', 'Sex', 'Age', 'Area']

# True for every Initial that has more than one distinct (non-null) value
# in at least one of the compared columns.
conflicting_initials = (
    df.groupby('Initial')[compare_cols]
    .nunique()
    .gt(1)
    .any(axis=1)
)

conflict_ids = conflicting_initials.loc[conflicting_initials].index.tolist()

conflict_rows = df.loc[df['Initial'].isin(conflict_ids)]

print(f"Initial: {len(conflict_ids)}")
display(conflict_rows[['Initial', *compare_cols]].sort_values(by='Initial'))

In [None]:
# List every File_id, one per line, followed by the total count.
file_ids = df['File_id']
for file_id in file_ids:
    print(file_id)
print(f"file_id: {file_ids.size}")

In [None]:
import pandas as pd

# Limit rendered frames to ten rows.
pd.options.display.max_rows = 10

display(df.loc[:, ['File_id', 'Area']])

In [None]:
id_columns = ['Initial', 'Disease', 'Sex', 'Age', 'Area']

# Grouping key used only for numbering; the columns of df itself are not modified.
# Area strings are upper-cased so that 'KK' and 'kk' (which receive the same code
# when Area is encoded later in this notebook) do not split one patient into two IDs.
id_key = df[id_columns].assign(
    Area=df['Area'].map(lambda v: v.upper() if isinstance(v, str) else v)
)

# Groups are numbered in order of first appearance (sort=False).
# dropna=False (pandas >= 1.1) keeps rows with a missing key value in a group of
# their own; with the default, ngroup() returns -1 for such rows and every one of
# them would be labelled 'P000'.
group_number = id_key.groupby(id_columns, sort=False, dropna=False).ngroup()

df['ID_num'] = group_number.apply(lambda x: f'P{str(x+1).zfill(3)}')  # P001, P002

In [None]:
# Keep only the identifier and demographic columns, in this order.
# .copy() makes the result an independent frame, so assigning to its columns
# afterwards does not trigger pandas' SettingWithCopyWarning.
df = df[['ID_num', 'Initial', 'Sex', 'Age', 'Area','Disease', 'File_id']].copy()

In [None]:
# Limit rendered frames to ten rows, then show the frame as the cell output.
pd.options.display.max_rows = 10

df

In [None]:
pd.options.display.max_rows = 10

# Number of distinct ID_num values found under each Initial.
initial_id_counts = df.groupby('Initial')['ID_num'].nunique()

# Initials that map to more than one ID_num.
ambiguous_initials = initial_id_counts.loc[initial_id_counts.gt(1)].index.tolist()

confused_rows = (
    df.loc[df['Initial'].isin(ambiguous_initials)]
    .sort_values(by=['Initial', 'ID_num'])
)

print(f"🌀 Same Initial but ID_num different: {len(ambiguous_initials)}")
display(confused_rows)



In [None]:
# Print up to ten distinct values for every column; a column whose first
# non-null value is a dict is expanded and reported per sub-field.
for col in df.columns:
    non_null = df[col].dropna()
    first_valid = non_null.iloc[0] if len(non_null) else None

    if isinstance(first_valid, dict):
        print(f"\n{col}")
        expanded = df[col].apply(pd.Series)
        for sub_col in expanded.columns:
            print(f"{sub_col}")
            print(expanded[sub_col].unique()[:10])
        continue

    print(f"\n{col}")
    print(df[col].unique()[:10])

In [None]:
def _normalize_label(value):
    """Strip and upper-case string labels; return any other value (e.g. NaN) unchanged."""
    return value.strip().upper() if isinstance(value, str) else value


sex_map = {'M': 0, 'F': 1}

# The lower-case 'kk' key is redundant now that labels are normalised before
# lookup; it is kept so the dict still contains every key it had before.
area_map = {
    'KK': 0, 'kk': 0,
    'SU': 1,
    'GW': 2,
    'JJ': 3,
    'JL': 4,
    'GS': 5,
    'CC': 6
}

for column, mapping in (('Sex', sex_map), ('Area', area_map)):
    labels = df[column].map(_normalize_label)
    encoded = labels.map(mapping)

    # Labels missing from the mapping become NaN; report them instead of losing
    # them silently. Re-running this cell on already encoded values shows up
    # here as well, because the integer codes are not keys of the mapping.
    unmapped = labels[encoded.isna() & labels.notna()].unique()
    if len(unmapped):
        print(f"Unmapped {column} values set to NaN: {list(unmapped)}")

    df[column] = encoded

print(df.head())

In [None]:
cols = ['ID_num', 'Initial', 'Sex', 'Age', 'Area', 'Disease', 'File_id']
patient_keys = ['ID_num', 'Initial', 'Sex', 'Age', 'Area', 'Disease']

# One row per patient with all of that patient's File_ids joined by commas.
merged_df = (
    df[cols]
    # dropna=False (pandas >= 1.1): by default groupby discards every row that has
    # NaN in any key column, which would silently remove patients with a missing
    # Sex, Age or Area from the exported table.
    .groupby(patient_keys, dropna=False)['File_id']
    # Missing File_ids are skipped so the join only ever sees strings.
    .apply(lambda x: ','.join(sorted(x.dropna().astype(str))))
    .reset_index(name='Merged_File_ids')
)

print(merged_df.head())

In [None]:
# Write the per-patient table to CSV without the index column.
merged_df.to_csv(path_or_buf='preprocessing.csv', index=False)