In [None]:
import numpy as np
import matplotlib.pyplot as plt
import mne
import pandas as pd
from braindecode.datasets import TUHAbnormal, BaseConcatDataset
from braindecode.preprocessing import (
    preprocess, Preprocessor, create_fixed_length_windows, create_windows_from_events, scale as multiply)
import torch
from braindecode.util import set_random_seeds

from braindecode.models import ShallowFBCSPNet, deep4
from skorch.callbacks import LRScheduler
from skorch.helper import predefined_split
from pytorch_lightning.callbacks.progress import TQDMProgressBar
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning import Trainer
from braindecode import EEGClassifier


mne.set_log_level('ERROR')  # avoid messages everytime a window is extracted

TUHAbnormal_PATH = '/home/jovyan/mne_data/TUH/tuh_eeg_abnormal/v2.0.0'
N_JOBS = 8  # specify the number of jobs for loading and windowing
N_SAMPLES = 1

tuh = TUHAbnormal(
    path=TUHAbnormal_PATH,
    #recording_ids=list(range(N_SAMPLES)),
    target_name=('report','pathological'),
    preload=False,
    add_physician_reports=True,
    n_jobs=N_JOBS, 
)

print("length of dataset : ", len(tuh))


In [10]:
df = tuh.description

In [14]:
df.to_csv("tuh_description.csv")

In [12]:
df[df.duplicated(["subject"],keep=False)]

Unnamed: 0,path,year,month,day,subject,session,segment,age,gender,report,version,train,pathological
2,/home/jovyan/mne_data/TUH/tuh_eeg_abnormal/v2....,2009,9,4,929,3,2,39,F,"CLINICAL HISTORY: Epilepsy, currently seizure...",v2.0.0,True,False
4,/home/jovyan/mne_data/TUH/tuh_eeg_abnormal/v2....,2009,9,9,5909,2,0,32,M,CLINICAL HISTORY: 32 year old male with episod...,v2.0.0,True,False
7,/home/jovyan/mne_data/TUH/tuh_eeg_abnormal/v2....,2009,9,11,5928,1,0,70,F,CLINICAL HISTORY: 71 year old woman with recu...,v2.0.0,True,True
8,/home/jovyan/mne_data/TUH/tuh_eeg_abnormal/v2....,2009,9,15,4526,3,1,71,F,\nCLINICAL HISTORY: 71 year old woman with epi...,v2.0.0,False,True
10,/home/jovyan/mne_data/TUH/tuh_eeg_abnormal/v2....,2009,9,16,5931,1,1,69,M,\n\nHISTORY: 69 year old male with psychiatric...,v2.0.0,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2974,/home/jovyan/mne_data/TUH/tuh_eeg_abnormal/v2....,2013,9,25,10033,3,0,64,M,CLINICAL HISTORY: A 64-year-old male status p...,v2.0.0,True,True
2983,/home/jovyan/mne_data/TUH/tuh_eeg_abnormal/v2....,2013,10,4,10782,1,1,85,F,CLINICAL HISTORY: 84-year-old woman with 3-ves...,v2.0.0,False,True
2986,/home/jovyan/mne_data/TUH/tuh_eeg_abnormal/v2....,2013,10,5,10782,2,1,85,F,CLINICAL HISTORY: 84-year-old woman with 3-ve...,v2.0.0,False,True
2987,/home/jovyan/mne_data/TUH/tuh_eeg_abnormal/v2....,2013,10,7,6091,4,0,40,M,"CLINICAL 40-year-old man With LD, Childhood TB...",v2.0.0,False,True


In [64]:
categoy_pattern = r"^([A-Z\s]{2,}):{1}"
content_pattern = r":(.*)"
def remove_exccessive_white_space(string):
    replaces = re.sub(r'\s+', ' ', string)
    return replaces.strip()

In [72]:
df = tuh.description

In [74]:
def extract_categories(content):
    #content = b'\n'.join([line.strip() for line in content]).strip().decode("latin-1")
    categories = re.findall(categoy_pattern, content, re.MULTILINE)   
    assert len(categories) > 0, "no categories found"

    df_row = {}
    
    
    # go through all subsequent pairs of categories, extract text inbetween and assign it to start category
    for j in range(len(categories) - 1):
        start = categories[j]
        stop = categories[j + 1]
        match = re.findall(start + content_pattern + stop, content, re.DOTALL)
        assert len(match) == 1, "found more than one match!"
        # remove multiple spaces and newlines
        start = ' '.join(start.split())
        df_row.update({start: remove_exccessive_white_space(match[0])})
        
    # take all text that appears after last category and assign 
    match = re.findall(stop + content_pattern, content, re.DOTALL)
    assert len(match) == 1, "found more than one match!"
    # remove multiple spaces and newlines
    stop = ' '.join(stop.split())
    df_row.update({stop: remove_exccessive_white_space(match[0])})
    return df_row

In [75]:
categories = df['report'].apply(extract_categories)

In [76]:
df = pd.json_normalize(categories)
df = df.join(categories)

In [77]:
df

Unnamed: 0,CLINICAL HISTORY,MEDICATIONS,INTRODUCTION,DESCRIPTION OF THE RECORD,IMPRESSION,CLINICAL CORRELATION,CLINICAL INTERPRETATION,HR,HISTORY,EKG,...,SEIZURES OR PUSHBUTTON EVENTS,TECHNICAL CONSIDERATIONS,FEATURES,RECOMMENDATIONS,TOTAL LENGTH OF THE RECORDING,REASON,DESCRIPTION RECORD,CLINICAL CORRELATE,DATE OF STUDY,report
0,Seizures.,"Dilantin, Lipitor.",Digital video EEG is performed in the lab usin...,"In wakefulness, there is a 9.8 Hz alpha rhythm...",Normal electroencephalogram.,This is the third normal EEG for this individu...,,,,,...,,,,,,,,,,"{'CLINICAL HISTORY': 'Seizures.', 'MEDICATIONS..."
1,"Schizophrenia, memory loss.","Geodon, simvastatin, benztropine, Norvasc.",Digital video EEG is performed in the lab usin...,"In wakefulness, background EEG is well organiz...",Normal EEG.,This prolonged outpatient sleep deprived EEG w...,,,,,...,,,,,,,,,,"{'CLINICAL HISTORY': 'Schizophrenia, memory lo..."
2,"Epilepsy, currently seizure-free.","Lamictal, Keppra.",A 38-year-old right-handed woman with epilepsy...,Digital video EEG is performed using standard ...,EEG within normal limits but excessively drows...,No epileptiform features are noted. The excess...,,,,,...,,,,,,,,,,"{'CLINICAL HISTORY': 'Epilepsy, currently seiz..."
3,Sixty-five-year-old woman with a previous eval...,Norvasc.,Digital video EEG is performed in the lab usin...,The background EEG is appropriately organized ...,Normal EEG in wakefulness.,,No focal nor epileptiform features are identif...,,,,...,,,,,,,,,,{'CLINICAL HISTORY': 'Sixty-five-year-old woma...
4,32 year old male with episodes of bilateral ha...,Depakote,Digital video EEG was performed in lab using s...,In wakefulness there is a 9.5 Hz posterior dom...,Normal EEG,No focal nor epileptiform features were identi...,,66 bpm,,,...,,,,,,,,,,{'CLINICAL HISTORY': '32 year old male with ep...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2988,50-year-old woman with a history of light MCA ...,LEV,Digital video EEG was performed in the lab usi...,,This Is an abnormal EEG due to: 1. A somewhat ...,This EEG supports 8 complex epileptiform proce...,,,,,...,,,,,,,,,,{'CLINICAL HISTORY': '50-year-old woman with a...
2989,"A 38-year-old woman, with unclear epilepsy his...","Tegretol, Depakote, others.",Digital video EEG is performed in the lab usin...,"In wakefulness, there is a 10.5 Hz alpha rhyth...",Normal electroencephalogram in wakefulness and...,No focal or epileptiform features were identif...,,,,,...,,,,,,,,,,"{'CLINICAL HISTORY': 'A 38-year-old woman, wit..."
2990,"A 22-year-old with refractory epilepsy, 13 sei...",Keppra.,Digital video EEG is performed in he lab using...,"In wakefulness, there is a 10Hz alpha rhythm w...",Abnormal EEG due to: 1. Right temporal focal s...,,,,,,...,,,,,,,,,,{'CLINICAL HISTORY': 'A 22-year-old with refra...
2991,A 55-year-old woman with restless legs syndrom...,"Keppra, Zoloft, Requip, insulin, others.",Digital video EEG was performed in the lab usi...,The background EEG demonstrates an awake patte...,This is a normal EEG primarily in wakefulness.,No focal or epileptiform features were identif...,,,,,...,,,,,,,,,,{'CLINICAL HISTORY': 'A 55-year-old woman with...


In [80]:
def merge_columns(df, merge_to, merge_from, drop=True):
    # TODO: check that no data is overwritten by merging?
    assert merge_to in df.columns, "column {} not found in dataframe".format(merge_to)
    assert merge_from in df.columns, "column {} not found in dataframe".format(merge_from)
    df[merge_to][pd.isna(df[merge_to])] = df[merge_from][pd.isna(df[merge_to])]
    if drop:
        df = df.drop(merge_from, axis=1)
    return df
def merge_several_columns(df, merge_to, merge_from_several, drop=True):
    for column in merge_from_several:
        df = merge_columns(df, merge_to, column, drop=drop)

df = merge_several_columns(df, "CLINICAL CORRELATION", ["CORRELATION", "CLINICAL COURSE", "CLINICAL CORRELATIONS", "CLINICAL CORRELATE",
                                                        "CLINICAL INTERPRETATION", "CLINICAL CORR ELATION", "NOTE"])  # not sure about merging note

# %%
df = merge_several_columns(df, "CLINICAL HISTORY", ["HISTORY", "CLINICAL", "M CLINICAL HISTORY", "EEG REPORT CLINICAL HISTORY", 
                                                    "HOSPITAL COURSE", "BASELINE EEG CLINICAL HISTORY", "EEG NUMBER",
                                                    "ORIGINAL CLINICAL HISTORY"])

# %%
df = merge_several_columns(df, "DESCRIPTION OF THE RECORD", ["DESCRIPTION OF RECORD", "DESCRIPTION RECORD", "DESCRIPTION OF RECORDING", 
                                                             "DESCRIPTION OF THE RECORDING", "DESCRIPTION OF THE PROCEDURE", 
                                                             "DESCRIPTION OF BACKGROUND", "DESCRIPTION OF PROCEDURE", "OF THE RECORD",
                                                             "DESCRIPTION THE RECORD"])

# %%
df = merge_several_columns(df, "MEDICATIONS", ["MEDICATION", "CURRENT MEDICATIONS", "MEDICINES"])

# %%
df = merge_several_columns(df, "HEART RATE", ["HEAR RATE", "HR"])

# %%
df = merge_several_columns(df, "IMPRESSION", ["CLINICAL IMPRESSION"])

# %%
df = merge_several_columns(df, "FINDINGS", ["ABNORMAL FINDINGS"])

# %%
df = merge_several_columns(df, "EVENTS", ['SEIZURE EVENTS', 'SEIZURES OR EPISODES', 'EVENT', 'EPISODES', 'CLINICAL EVENTS',
                                          "EPISODES OR EVENTS", "EPISODES DURING THE RECORDING", "REFERRING FOR STUDY",
                                          "EVENTS OF PUSHBUTTON", "SEIZURES", "SEIZURE ACTIVITY"])

# %%
df = merge_several_columns(df, "TECHNICAL DIFFICULTIES", ["TECHNICAL PROBLEMS", "TECHNICAL DIFFICULTY", "CLINICAL DIFFICULTIES", "TECHNICAL DISCHARGES", 
                                                          "TECHNICAL NOTES", "TECHNICAL ISSUES", "TECHNIQUE DIFFICULTIES", "TECHNICAL CONSIDERATIONS",
                                                          "TECHNICAL QUALITY", "TECHNICAL", "ARTIFACTS"])

# %%
df = merge_several_columns(df, "CONDITION OF THE RECORDING", ["CONDITIONS OF THE RECORDING", "CONDITION OF RECORDING", 
                                                              "CONDITIONS OF RECORDING"])

# %%
df = merge_several_columns(df, "REASON FOR STUDY", ["REASON", "REASON FOR STUDIES", "REASON FOR EGG", "REASON FOR THE STUDY"])

# %%
df = merge_several_columns(df, "FINDINGS", ["DIAGNOSES", "DIAGNOSIS", "ABNORMAL DISCHARGES", "ABNORMAL DISCHARGE", 
                                            "EEG", "RECOMMENDATIONS"])  # not sure about merging recommendations

# %%
df = merge_several_columns(df, "PAST MEDICAL HISTORY", ["PAST HISTORY"])

# %%
df = merge_several_columns(df, "ACTIVATION PROCEDURES", ["ACTIVATION PROCEDURE", "ACTIVATING PROCEDURES", ])

# %%
df = merge_several_columns(df, "REASON FOR STUDY", ["REASON FOR EEG", "REASON FOR PROCEDURE"])

# %%
for drop_column in ["RECORDING TIMES", "RECORDING START TIME", "RECORDING END TIME", "RECORD FINISH TIME", "RECORD START TIME", 
                    "TOTAL LENGTH OF THE RECORDING", "RECORDING LENGTH", "TIME OF RECORDING", "LENGTH OF ELECTROENCEPHALOGRAM", 
                    "EEG LENGTH", "LENGTH OF EEG", "LENGTH OF PROCEDURE", "LENGTH OF THE RECORDING", "LENGTH OF THE EEG", 
                    "LENGTH OF RECORDING", "STUDY DATE", "DATE OF RECORDING", "EGG LENGTH", "TIME", "DURATION OF STUDY", 
                    "STUDY DURATION", "DATE OF THE RECORDING", "DATE OF STUDY", "DATES OF STUDY", "DT", "DD", "DENTAL PROBLEMS", 
                    "STAGES", "REASON FOR SEIZURES", "SEIZURES OR PUSHBUTTON EVENTS", "FEATURES", "INPATIENT ROOM", "EKG",
                    "DATE", "SLEEP"]:
    df = df.drop(drop_column, axis=1)

# %%
df.head(3)

AttributeError: 'NoneType' object has no attribute 'columns'

In [66]:
df_row

Unnamed: 0,path,year,month,day,subject,session,segment,age,gender,report,version,train,pathological
0,/home/jovyan/mne_data/TUH/tuh_eeg_abnormal/v2....,2009,9,3,4196,3,0,53,F,CLINICAL HISTORY: Seizures.\nMEDICATIONS: Dil...,v2.0.0,False,False
1,/home/jovyan/mne_data/TUH/tuh_eeg_abnormal/v2....,2009,9,3,5864,1,0,30,M,"CLINICAL HISTORY: Schizophrenia, memory loss.\...",v2.0.0,False,False
2,/home/jovyan/mne_data/TUH/tuh_eeg_abnormal/v2....,2009,9,4,929,3,2,39,F,"CLINICAL HISTORY: Epilepsy, currently seizure...",v2.0.0,True,False
3,/home/jovyan/mne_data/TUH/tuh_eeg_abnormal/v2....,2009,9,4,5851,1,1,65,F,CLINICAL HISTORY: Sixty-five-year-old woman w...,v2.0.0,False,False
4,/home/jovyan/mne_data/TUH/tuh_eeg_abnormal/v2....,2009,9,9,5909,2,0,32,M,CLINICAL HISTORY: 32 year old male with episod...,v2.0.0,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2988,/home/jovyan/mne_data/TUH/tuh_eeg_abnormal/v2....,2013,10,7,6881,2,1,51,F,CLINICAL HISTORY: 50-year-old woman with a his...,v2.0.0,True,True
2989,/home/jovyan/mne_data/TUH/tuh_eeg_abnormal/v2....,2013,10,7,7757,3,1,40,F,"CLINICAL HISTORY: A 38-year-old woman, with u...",v2.0.0,True,False
2990,/home/jovyan/mne_data/TUH/tuh_eeg_abnormal/v2....,2013,10,7,9289,4,1,22,M,CLINICAL HISTORY: A 22-year-old with refractor...,v2.0.0,True,True
2991,/home/jovyan/mne_data/TUH/tuh_eeg_abnormal/v2....,2013,10,10,3240,2,1,55,F,CLINICAL HISTORY: A 55-year-old woman with re...,v2.0.0,True,False


In [59]:
reports = tuh.description["report"]

In [35]:
reports = reports.to_list()

In [None]:
reports

In [48]:
import re

pattern = r'([A-Z][A-Z\s]+): (.*)'
categories = []
for report in reports:
    matches = re.findall(pattern, report)
    categories.append(matches)

In [None]:
categories

In [49]:
category_names = []  
category_content = []
for report_cats in categories:
    names = [c[0] for c in report_cats]
    content = [c[1] for c in report_cats]
    category_names.extend(names)
    category_content.extend(content)

In [56]:
from collections import Counter
category_counts = Counter(category_names)

categories = list(category_counts.keys())
counts = list(category_counts.values())

frequent_categories = {cat:count for cat, count in category_counts.items() if count > 50}

In [57]:
frequent_categories

{'CLINICAL HISTORY': 2887,
 'MEDICATIONS': 2869,
 'INTRODUCTION': 2850,
 'DESCRIPTION OF THE RECORD': 2864,
 'IMPRESSION': 2999,
 'CLINICAL CORRELATION': 2667,
 'HR': 1370,
 'HISTORY': 68,
 'DESCRIPTION OF RECORD': 58,
 'ABNORMAL DISCHARGES': 495,
 'REASON FOR STUDY': 647,
 'TECHNICAL DIFFICULTIES': 641,
 'SEIZURES': 556,
 'REASON FOR EEG': 63,
 'DESCRIPTION OF THE RECORDING': 53,
 'LENGTH OF RECORDING': 63,
 'LENGTH OF THE RECORDING': 77,
 'CONDITION OF THE RECORDING': 100,
 'HEART RATE': 106}

In [46]:
category_content

[' Seizures.',
 'Dilantin,  Lipitor.',
 ' Digital video EEG is performed in the lab using standard 10-20 system of electrode placement with 1 channel EKG. Hyperventilation and photic stimulation are performed. This is an awake and asleep record.',
 ' In wakefulness, there is a 9.8 Hz alpha rhythm with a small amount of low voltage frontal central beta activity. Hyperventilation produces a small amount of slowing. Photic stimulation does not activate the record.',
 'Normal electroencephalogram.',
 ' This is the third normal EEG for this individual with epilepsy. If appropriate, additional recording strategies may be helpful to characterize epileptiform activity.',
 'Schizophrenia, memory loss.',
 'Geodon, simvastatin, benztropine, Norvasc.',
 'Digital video EEG is performed in the lab using standard 10-20',
 'In wakefulness, background EEG is well organized',
 'Normal EEG.',
 'This prolonged outpatient sleep deprived EEG was',
 ' Epilepsy, currently seizure-free.',
 '  Lamictal, Keppra.

In [23]:
tuh.set_description(desc,overwrite=True)

In [26]:
tuh.description.path

0    /home/jovyan/mne_data/TUH/tuh_eeg_abnormal/v2....
Name: path, dtype: object