In [None]:
# Root directory of the project; the input files and the output folder used by
# the cells below are all resolved relative to it. "." is the current working
# directory of the notebook kernel.
PROJECT_HOME = "."

# # For Colab: uncomment the lines below to point PROJECT_HOME at Google Drive.

# PROJECT_HOME = "/content/drive/My Drive/Projects/LLM-MCI-detection"

# # Google Drive storage setup
# from google.colab import drive
# drive.mount('/content/drive', force_remount=True)

In [None]:
import os
import re
import pandas as pd
from tqdm import tqdm

In [None]:
# Folder that receives the generated dataset; exist_ok=True makes this cell
# safe to re-run when the folder is already there.
output_dir = os.path.join(PROJECT_HOME, "data")
os.makedirs(output_dir, exist_ok=True)

In [None]:
def isEnglish(s):
    """Return True when ``s`` consists only of ASCII characters.

    Despite its name this is a character-set check, not language detection:
    any string made purely of code points 0-127 (including the empty string)
    is accepted.

    Args:
        s: The text to inspect (a ``str``).

    Returns:
        bool: ``True`` if every character is ASCII, ``False`` otherwise.
    """
    try:
        # Encoding straight to ASCII fails on the first non-ASCII character.
        # This also covers lone surrogates, which cannot be encoded to UTF-8
        # and would otherwise escape as an unhandled UnicodeEncodeError.
        s.encode('ascii')
    except UnicodeEncodeError:
        return False
    return True

In [None]:
# Load the subject metadata sheet; skiprows=2 drops the two lines that precede
# the header row in the CSV.
data = pd.read_csv(os.path.join(PROJECT_HOME, "Pitt", "PItt-data.csv"), skiprows=2)

sample_records = []
dx_data = data[['id', 'basedx', 'dx1', 'dx2', 'dx3', 'entryage', 'sex', 'race', 'educ', 'mms', 'mmse2', 'mmse3', 'mmse4']]

# IDs of the subjects that contributed at least one sample to each class.
# A subject whose label differs between visits appears in more than one set.
unique_subjects = {
    'AD': set(),
    'MCI': set(),
    'NC': set()
}

# Diagnosis code -> class label. Visits with any other code are skipped.
DX_CODE_TO_LABEL = {
    '1': "AD", '100': "AD", '101': "AD",
    '6': "MCI", '7': "MCI", '600': "MCI", '610': "MCI", '611': "MCI", '720': "MCI", '740': "MCI",
    '8': "NC", '800': "NC", '821': "NC",
}

# (visit number, diagnosis column, MMSE column) for every visit, kept in one
# table so the diagnosis and MMSE columns cannot drift out of sync.
VISITS = [
    (0, 'basedx', 'mms'),
    (1, 'dx1', 'mmse2'),
    (2, 'dx2', 'mmse3'),
    (3, 'dx3', 'mmse4'),
]

# Column layout of the resulting frame; passing it explicitly keeps the columns
# present even when no sample is found.
SAMPLE_COLUMNS = ["label", "text", "age", "gender", "race", "education", "MMSE"]

for idx, row in tqdm(dx_data.iterrows(), desc="Loading data", total=len(dx_data)):
    # Transcript files are named "<3-digit id>-<visit number>.txt".
    id_3_digits = str(int(row['id'])).zfill(3)
    for visit_number, visit_code, mmse_column in VISITS:

        dx_code = row[visit_code]
        if pd.isnull(dx_code):
            continue
        # Normalise through int so that a float value such as 100.0 yields "100".
        dx_code = str(int(dx_code))

        label = DX_CODE_TO_LABEL.get(dx_code)
        if label is None:
            continue

        transcript_file_name = "%s-%d.txt" % (id_3_digits, visit_number)
        transcript_path = os.path.join(PROJECT_HOME, "transcripts", transcript_file_name)

        if not os.path.exists(transcript_path):
            continue

        # Decode explicitly as UTF-8 instead of relying on the platform default,
        # so the result is the same on every machine. Undecodable bytes become
        # U+FFFD, which the non-ASCII clean-up below turns into a space.
        with open(transcript_path, encoding="utf-8", errors="replace") as transcript_f:
            text = transcript_f.read().strip()
        if not isEnglish(text):
            # Collapse every run of non-ASCII characters into a single space.
            text = re.sub(r'[^\x00-\x7F]+', ' ', text)

        unique_subjects[label].add(id_3_digits)

        sample_records.append({
            "label": label,
            "text": text,
            "age": row['entryage'],
            "gender": row['sex'],
            "race": row['race'],
            "education": row['educ'],
            "MMSE": row[mmse_column]
        })

sample_df = pd.DataFrame.from_records(sample_records, columns=SAMPLE_COLUMNS)

In [None]:
# Number of distinct subject IDs collected for each class, one line per class.
print(
    "# of NC subjects: %d" % len(unique_subjects['NC']),
    "# of MCI subjects: %d" % len(unique_subjects['MCI']),
    "# of AD subjects: %d" % len(unique_subjects['AD']),
    sep="\n",
)

In [None]:
# Number of samples (rows of sample_df) carrying each class label.
print(
    "# of NC samples: %d" % (sample_df['label'] == 'NC').sum(),
    "# of MCI samples: %d" % (sample_df['label'] == 'MCI').sum(),
    "# of AD samples: %d" % (sample_df['label'] == 'AD').sum(),
    sep="\n",
)

In [None]:
# Write the assembled samples into output_dir, the folder created earlier in
# this notebook, so the target always matches the directory that exists.
# index=False leaves the row index out of the file.
sample_df.to_csv(os.path.join(output_dir, 'original.csv'), index=False)