# Metadata Extraction and Analysis

## Imports and dataloading

In [5]:
import numpy as np
import pandas as pd
import json
import seaborn as sns
import matplotlib.pyplot as plt
from ieeg.auth import Session
from scipy import signal as sig
import os
from os.path import join as ospj
from os.path import exists as ospe
import pathlib
from tqdm import tqdm

# Statistical imports
from scipy.stats import chi2_contingency, fisher_exact
from statsmodels.stats.proportion import proportions_ztest
import statsmodels.api as sm

from utils import *

In [6]:
import sys
sys.path.append('/users/wojemann/iEEG_processing')
from pioneer import Pioneer

In [7]:
# Read config.json through load_config with the 'CHOP' selector and unpack the
# nine values it returns into notebook-level names.
(
    usr, passpath, datapath, prodatapath, metapath,
    figpath, patient_table, rid_hup, pt_list,
) = load_config(
    ospj('/mnt/leif/littlab/users/wojemann/stim-seizures/code', 'config.json'),
    'CHOP',
)

In [8]:
# Display the patient table returned by load_config for a quick visual check.
patient_table

Unnamed: 0,ptID,ieeg_ids,lf_stim,hf_stim,typical,mtle,focality,laterality,interictal_training
0,CHOP005,"[CHOPCCEP_005, CHOP005]",1,0,,,,,"[CHOP005, 14190.17]"
1,CHOP010,"[CHOPCCEP_010, CHOP010a, CHOP010b, CHOP010c]",1,0,,,,,[]
2,CHOP024,"[CHOPCCEP_024, CHOP024]",1,0,,,,,"[CHOP024, 112138.27]"
3,CHOP026,"[CHOPCCEP_026, CHOP026]",1,0,,,,,"[CHOP026, 76411.33]"
4,CHOP028,"[CHOPCCEP_028, CHOP028]",1,0,,,,,"[CHOP028, 7517.56]"
5,CHOP035,"[CHOPCCEP_035, CHOP035]",1,0,,,,,"[CHOP035, 82282.0]"
6,CHOP037,"[CHOPCCEP_037, CHOP037]",1,0,,,,,"[CHOP037, 58173.01]"
7,CHOP038,"[CHOPCCEP_038, CHOP038]",1,0,,,,,[]
8,CHOP041,"[CHOPCCEP_041, CHOP041]",1,1,,,,,"[CHOP041, 112959.7]"
9,CHOP044,"[CHOPCCEP_044, CHOP044]",1,0,,,,,"[CHOP044, 4070.79]"


## Creating annotation assignments

In [9]:
# Load the LF seizure annotation sheet and reduce it to the seizures that still
# need to be annotated.
seizures_df = pd.read_csv(ospj(metapath,'stim_seizure_information - LF_seizure_annotation.csv'))

# Drop the excluded patients and keep only rows flagged with to_annotate == 1.
# .copy() makes the result an independent frame, so the column writes below
# modify it directly rather than a slice of the freshly loaded sheet.
seizures_df = seizures_df[
    ~seizures_df.Patient.isin(["HUP235","HUP238","HUP246","HUP261"])
    & (seizures_df['to_annotate'] == 1)
].copy()

# Placeholder column (kept last); filled in when annotators are assigned.
seizures_df['annotators'] = ""

# Fill missing onsets from UEO, then EEC, then Other_onset_description.
# The result is assigned back explicitly: fillna(..., inplace=True) called on a
# column selection is deprecated chained assignment and leaves the frame
# unchanged when pandas Copy-on-Write is enabled.
seizures_df['approximate_onset'] = (
    seizures_df['approximate_onset']
    .fillna(seizures_df['UEO'])
    .fillna(seizures_df['EEC'])
    .fillna(seizures_df['Other_onset_description'])
)

# Remove bookkeeping columns that are not part of the annotation sheets.
seizures_df = seizures_df.drop(
    columns=['to_annotate','Notes','source','EEC onset channels','UEO onset channels',
             'EEC','UEO','Other_onset_description'],
).reset_index(drop=True)
seizures_df.head()

Unnamed: 0,Patient,IEEGname,approximate_onset,end,stim,stim_channels,typical,LVFA,Summaries,annotators
0,HUP224,HUP224_phaseII,71156.59,71190.99,1.0,LB1-LB2,0.0,0.0,456.0,
1,HUP224,HUP224_phaseII,339143.6435,339234.2,0.0,,,,,
2,HUP224,HUP224_phaseII,491467.8046,491541.43,0.0,,,,,
3,HUP224,HUP224_phaseII,519177.95,519258.16,0.0,,,,,
4,HUP225,HUP225_phaseII,159834.14,159913.05,1.0,RC1-RC2,1.0,0.0,,


In [6]:
# Seed NumPy's global RNG so the random annotator draw is reproducible.
np.random.seed(10)
# Initials of the annotators that seizures can be assigned to.
annotators = ['CK','EC','DZ','JJ','JK']
# Running tally of seizures handed to each annotator; everyone starts at zero.
annotation_counts = dict.fromkeys(annotators, 0)
def calc_weights(annotation_counts):
    """Turn per-annotator counts into selection probabilities.

    Each count ``c`` gets a raw weight of ``1 / (1 + c)``, so larger counts
    receive smaller weights; the raw weights are then divided by their sum.

    Parameters
    ----------
    annotation_counts : dict
        Mapping whose values are numeric counts.

    Returns
    -------
    list of float
        One normalized weight per entry, in the mapping's iteration order.
        An empty mapping yields an empty list.
    """
    raw = []
    for count in annotation_counts.values():
        raw.append(1/(1+count))
    total = sum(raw)
    return [value/total for value in raw]

# Group the seizures by the 'Patient' column so that all seizures of one
# patient receive the same set of annotators.
grouped_seizures = seizures_df.groupby('Patient')

# Patient groups collected per annotator; concatenated once after the loop
# instead of growing a DataFrame with pd.concat on every iteration.
groups_by_annotator = {}

for patient_id, patient_group in grouped_seizures:
    num_seizures = len(patient_group)
    # Draw 3 distinct annotators; calc_weights gives annotators with fewer
    # seizures so far a higher probability of being picked.
    weights = calc_weights(annotation_counts)
    assigned_annotators = np.random.choice(annotators, size=3, replace=False,p=weights)
    annot_str = str(assigned_annotators)

    # Record the assignment on every seizure of this patient. Label-based
    # .loc with the column name is used instead of .iloc[mask, -1]: .iloc is
    # not meant to take a boolean Series mask, and the positional -1 silently
    # targets the wrong column if 'annotators' is ever not the last one.
    seizures_df.loc[seizures_df.Patient == patient_id, 'annotators'] = annot_str

    # NOTE(review): patient_group is produced by the groupby before the write
    # above, so the per-annotator frames presumably still carry the empty
    # 'annotators' placeholder - confirm this is intended.
    for annotator in assigned_annotators:
        annotation_counts[annotator] += num_seizures
        groups_by_annotator.setdefault(annotator, []).append(patient_group)

# Dictionary to store DataFrames for each annotator (one independent frame per
# annotator holding all seizures of the patients assigned to them).
annotator_dfs = {
    annotator: pd.concat(groups) for annotator, groups in groups_by_annotator.items()
}
print(annotation_counts)

{'CK': 34, 'EC': 42, 'DZ': 33, 'JJ': 35, 'JK': 36}


In [18]:
# Write one sheet per annotator with three blank columns for them to fill in,
# then save the master table that records which annotators got which seizure.
for key, assigned in annotator_dfs.items():
    # assign() returns a new frame instead of writing columns into `assigned`,
    # which may be a groupby slice of seizures_df (SettingWithCopyWarning).
    # Replacing the value of an existing key is safe while iterating items().
    annotator_dfs[key] = assigned.assign(**{"UEO_time": "", "UEO_ch": "", "10sec_ch": ""})
    annotator_dfs[key].to_csv(ospj(prodatapath,f"stim_seizure_annotations_{key}.csv"),index=False)
seizures_df.to_csv(ospj(prodatapath,"LF_seizure_annotations_wannotator.csv"),index=False)

## Extracting seizure annotations from iEEG

In [13]:
# For the selected patient, fetch the annotations of each of its iEEG datasets
# through Pioneer, filter them, and save the result as a CSV under datapath.
for i,pt in patient_table[patient_table.ptID == 'CHOP032'].iterrows():
    # All datasets of one patient share an output folder. exist_ok=True
    # replaces the separate "does it exist?" check, which could race with
    # another process creating the folder in between.
    save_path = ospj(datapath,pt.ptID)
    os.makedirs(save_path, exist_ok=True)
    for ieeg_pt in pt.ieeg_ids:
        print(ieeg_pt,save_path)
        wagon = Pioneer(usr,passpath,ieeg_pt)
        wagon.pull_annotations()
        wagon.filter_seizure_annotations()
        wagon.seizure_annotations.to_csv(ospj(save_path,f'seizure_annotations_{ieeg_pt}.csv'))

CHOPCCEP_032 /mnt/sauce/littlab/users/wojemann/stim-seizures/RAW_DATA/CHOP032
got 100 annotations on call # 1 covering 22925780761 usec to 25072323242 usec
got 100 annotations on call # 2 covering 25073326171 usec to 25969639160 usec
got 100 annotations on call # 3 covering 25971192871 usec to 26801372070 usec
got 100 annotations on call # 4 covering 26802388671 usec to 27668140625 usec
got 100 annotations on call # 5 covering 27673568847 usec to 28484145996 usec
got 100 annotations on call # 6 covering 28485167480 usec to 29377541015 usec
got 100 annotations on call # 7 covering 29377543945 usec to 30479178711 usec
got 100 annotations on call # 8 covering 30481079589 usec to 31405335937 usec
got 72 annotations on call # 9 covering 31520166015 usec to 33032063964 usec
Filtered       9.97% of all annotations
CHOP032 /mnt/sauce/littlab/users/wojemann/stim-seizures/RAW_DATA/CHOP032
got 100 annotations on call # 1 covering 49212890 usec to 32032358398 usec
got 100 annotations on call # 2 c

## StimSz metadata analysis

In [10]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency, f_oneway
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Load the two metadata sheets and recode them into a shared layout.
# stim_df: sex codes 1/2 become 'M'/'F'; mtle is 1 when the localization text
# contains 'MTLE' (0 otherwise, including non-string values); lesional is 1
# only where the raw value equals 2; every row is tagged with center 'HUP'.
stim_df = pd.read_csv(ospj(metapath,'stim_seizure_information - metadata-4.csv')).assign(
    sex=lambda df: df['sex'].map({1: 'M', 2: 'F'}),
    mtle=lambda df: df['localization'].apply(lambda x: 1 if isinstance(x, str) and 'MTLE' in x else 0),
    lesional=lambda df: df['lesional'].apply(lambda x: 1 if x == 2 else 0),
    center='HUP',
)

# chop_df: the sex map keeps 'M'/'F' and turns any other value into NaN; rows
# are tagged with center 'CHOP' and columns renamed to match stim_df.
chop_df = (
    pd.read_csv(ospj(metapath,'CHOP_metadata.csv'))
    .assign(
        sex=lambda df: df['sex'].map({'M': 'M', 'F': 'F'}),
        center='CHOP',
    )
    .rename(columns={
        'ptID': 'record_id',
        'age at epilepsy onset': 'age_at_onset',
        'duration of epilepsy prior to stim (y)': 'duration'
    })
)

# Both sheets contribute the same set of columns to the combined table.
stim_cols = ['record_id', 'sex', 'age_at_onset', 'duration', 'lesional', 'mtle', 'unifocal', 'stim_sz', 'center']
chop_cols = list(stim_cols)

combined_df = pd.concat([stim_df[stim_cols], chop_df[chop_cols]], ignore_index=True)

# Variables to analyze
categorical_vars = ['sex', 'center', 'lesional', 'mtle', 'unifocal']
continuous_vars = ['age_at_onset', 'duration']
response_var = 'stim_sz'

summary_rows = []

# Categorical variables: per-level counts of the response, a chi-square test
# of the variable-by-response contingency table, and Cohen's w
# (sqrt(chi2 / N)) as the effect size.
for var in categorical_vars:
    group_counts = combined_df.groupby(var)[response_var].agg(['count', 'sum'])
    group_counts['percent'] = group_counts['sum'].div(group_counts['count']).mul(100).round(1)
    contingency_table = pd.crosstab(index=combined_df[var], columns=combined_df[response_var])
    chi2, p, dof, expected = chi2_contingency(contingency_table)
    n = contingency_table.to_numpy().sum()
    w = np.sqrt(chi2 / n)
    # One summary row per level; the test statistics repeat on each row.
    for group, level_stats in group_counts.iterrows():
        summary_rows.append({
            'Variable or Group': var,
            'Group': group,
            'Total Group': int(level_stats['count']),
            'With Stim Seizure': int(level_stats['sum']),
            'Response Rate, %': level_stats['percent'],
            'P Value': round(p, 3),
            'Effect Size': round(w, 3)
        })

# Continuous variables: compare each variable across the response groups with
# a one-way ANOVA and report eta squared as the effect size.

# Response levels in order of first appearance. Missing responses are dropped:
# a NaN level would yield an empty ANOVA group and a KeyError in means[...].
response_levels = combined_df[response_var].dropna().unique()

for var in continuous_vars:
    means = combined_df.groupby(response_var)[var].mean()
    groups = [combined_df[combined_df[response_var] == val][var].dropna() for val in response_levels]
    f_stat, p = f_oneway(*groups)
    # Eta squared = sum of squares of the first table row over the total.
    model = ols(f'{var} ~ C({response_var})', data=combined_df).fit()
    anova_table = sm.stats.anova_lm(model, typ=2)
    # The ANOVA table is indexed by term name, so the first row is taken by
    # position with .iloc; anova_table['sum_sq'][0] relies on the deprecated
    # integer-key-as-position fallback of Series.__getitem__.
    eta_sq = anova_table['sum_sq'].iloc[0] / anova_table['sum_sq'].sum()
    for stim_val in response_levels:
        summary_rows.append({
            'Variable or Group': var,
            'Group': f"stim_sz={stim_val}",
            # Counts every row in the response group, including rows where
            # this variable is missing.
            'Total Group': len(combined_df[combined_df[response_var] == stim_val]),
            'With Stim Seizure': np.nan,
            # NOTE: for continuous variables this column holds the group mean
            # of the variable, not a percentage.
            'Response Rate, %': round(means[stim_val], 2),
            'P Value': round(p, 3),
            'Effect Size': round(eta_sq, 3)
        })

# Final summary table
summary_df = pd.DataFrame(summary_rows)

# Display table
print(summary_df)


   Variable or Group      Group  Total Group  With Stim Seizure  \
0                sex          F           54               20.0   
1                sex          M           48               18.0   
2             center       CHOP           50               17.0   
3             center        HUP           55               21.0   
4           lesional        0.0           52               22.0   
5           lesional        1.0           50               16.0   
6               mtle        0.0           61               15.0   
7               mtle        1.0           38               21.0   
8           unifocal        0.0           34               13.0   
9           unifocal        1.0           65               23.0   
10      age_at_onset  stim_sz=0           67                NaN   
11      age_at_onset  stim_sz=1           38                NaN   
12          duration  stim_sz=0           67                NaN   
13          duration  stim_sz=1           38                Na

In [12]:
import pandas as pd
import numpy as np
import scipy.io
from scipy.stats import chi2_contingency, f_oneway
import statsmodels.api as sm
from statsmodels.formula.api import ols

# --- Load metadata ---
stim_df = pd.read_csv(ospj(metapath,'stim_seizure_information - metadata-4.csv'))
chop_df = pd.read_csv(ospj(metapath,'CHOP_metadata.csv'))

# --- Format metadata ---
# stim_df: sex codes 1/2 become 'M'/'F'; mtle is 1 when the localization text
# contains 'MTLE'; lesional is 1 only where the raw value equals 2; rows are
# tagged with center 'HUP' and record_id is stored as a string.
stim_df = stim_df.assign(
    sex=lambda df: df['sex'].map({1: 'M', 2: 'F'}),
    mtle=lambda df: df['localization'].apply(lambda x: 1 if isinstance(x, str) and 'MTLE' in x else 0),
    lesional=lambda df: df['lesional'].apply(lambda x: 1 if x == 2 else 0),
    center='HUP',
    record_id=lambda df: df['record_id'].astype(str),
)

# chop_df: the sex map keeps 'M'/'F' (anything else becomes NaN); rows are
# tagged with center 'CHOP'; the original columns are kept and copied under
# the names used by stim_df.
chop_df = chop_df.assign(
    sex=lambda df: df['sex'].map({'M': 'M', 'F': 'F'}),
    center='CHOP',
    record_id=lambda df: df['ptID'].astype(str),
    duration=lambda df: df['duration of epilepsy prior to stim (y)'],
    age_at_onset=lambda df: df['age at epilepsy onset'],
)

# --- Load and extract percent channels stimulated from .mat files ---
def extract_percent_stim(matfile, center):
    """Build a per-patient table of the percentage of channels stimulated.

    Parameters
    ----------
    matfile : mapping
        Must contain a ``'pt'`` entry whose first element converts to a
        DataFrame with ``name``, ``nchs`` and ``nstim`` columns; each cell is
        unwrapped by taking its element ``[0]``.
        # assumes this is the dict returned by scipy.io.loadmat - TODO confirm
    center : str
        Stored in the ``center`` column. When it is ``'HUP'`` the name is
        reduced to the integer formed by its last three characters; for any
        other value the unwrapped name is kept as is.

    Returns
    -------
    pandas.DataFrame
        Columns ``name``, ``nchs``, ``nstim``,
        ``percent_channels_stimulated`` (``nstim / nchs * 100``) and
        ``center``.
    """
    def first(cell):
        return cell[0]

    def hup_number(cell):
        return int(cell[0][-3:])

    parse_name = hup_number if center == 'HUP' else first

    pts_df = pd.DataFrame(matfile['pt'][0])
    pts_df['name'] = pts_df['name'].apply(parse_name)
    for col in ('nchs', 'nstim'):
        pts_df[col] = pts_df[col].apply(first)
    pts_df['percent_channels_stimulated'] = pts_df['nstim']/pts_df['nchs']*100
    pts_df['center'] = center
    wanted = ['name','nchs','nstim','percent_channels_stimulated','center']
    return pts_df[wanted]

# Read the two stimulation-info .mat files and derive the per-patient percent
# of channels stimulated for each center.
stim_info = scipy.io.loadmat(ospj(metapath,'stim_info.mat'))
stim_info_chop = scipy.io.loadmat(ospj(metapath,'stim_info_chop.mat'))
stim_stim = extract_percent_stim(stim_info, 'HUP')
stim_chop = extract_percent_stim(stim_info_chop, 'CHOP')
# Remove every 'CCEP_' occurrence from the CHOP names so they can be matched
# against the ptID column when merged.
stim_chop['name'] = stim_chop['name'].apply(lambda x: x.replace('CCEP_', ''))
# --- Merge percent_channels_stimulated into metadata ---
def merge_percent_stim(meta_df, stim_df, id_col):
    """Left-join ``percent_channels_stimulated`` onto a metadata table.

    Parameters
    ----------
    meta_df : pandas.DataFrame
        Metadata table; every one of its rows is kept (left join).
    stim_df : pandas.DataFrame
        Must provide ``name`` and ``percent_channels_stimulated`` columns;
        any other columns are ignored.
    id_col : str
        Column of ``meta_df`` matched exactly against ``stim_df['name']``.

    Returns
    -------
    pandas.DataFrame
        ``meta_df`` plus the ``name`` and ``percent_channels_stimulated``
        columns; the latter is cast to float and is NaN where no match exists.
    """
    percent_lookup = stim_df.loc[:, ['name', 'percent_channels_stimulated']]
    merged = pd.merge(meta_df, percent_lookup, how='left', left_on=id_col, right_on='name')
    return merged.astype({'percent_channels_stimulated': float})

# Attach percent_channels_stimulated to each center's metadata: stim_df is
# matched on 'hupsubjno', chop_df on 'ptID'.
stim_df = merge_percent_stim(stim_df, stim_stim, 'hupsubjno')
chop_df = merge_percent_stim(chop_df, stim_chop, 'ptID')

# --- Combine all data ---
# Both centers contribute the same columns, stacked under a fresh index.
stim_cols = ['name', 'sex', 'age_at_onset', 'duration', 'lesional', 'mtle', 'unifocal', 'stim_sz', 'center', 'percent_channels_stimulated']
chop_cols = list(stim_cols)

frames_to_stack = [stim_df[stim_cols], chop_df[chop_cols]]
combined_df = pd.concat(frames_to_stack, ignore_index=True)

# --- Summary Table Construction ---
summary_rows = []

# Categorical variables: per-level stim_sz counts, a chi-square test of the
# variable-by-stim_sz contingency table, and Cohen's w (sqrt(chi2 / N)).
for var in ['sex', 'center', 'lesional', 'mtle', 'unifocal']:
    group_counts = combined_df.groupby(var)['stim_sz'].agg(['count', 'sum'])
    group_counts['percent'] = group_counts['sum'].div(group_counts['count']).mul(100).round(1)
    contingency_table = pd.crosstab(index=combined_df[var], columns=combined_df['stim_sz'])
    chi2, p, dof, expected = chi2_contingency(contingency_table)
    n = contingency_table.to_numpy().sum()
    w = np.sqrt(chi2 / n)
    # One summary row per level; the test statistics repeat on each row.
    for group, level_stats in group_counts.iterrows():
        summary_rows.append({
            'Variable or Group': var,
            'Group': group,
            'Total Group': int(level_stats['count']),
            'With Stim Seizure': int(level_stats['sum']),
            'Response Rate, %': level_stats['percent'],
            'P Value': round(p, 3),
            'Effect Size': round(w, 3)
        })

# Continuous variables: describe each variable within stim_sz == 0 and
# stim_sz == 1, and compare the two groups with a one-way ANOVA (eta squared
# as effect size).
for var in ['age_at_onset', 'duration', 'percent_channels_stimulated']:
    # Non-missing values of the variable in each response group.
    groups = [combined_df[combined_df['stim_sz'] == val][var].dropna() for val in [0, 1]]

    # Compute the statistics first so each row is written once, rather than
    # appending placeholders and patching summary_rows[-2] / [-1] afterwards.
    # They stay blank when either group has fewer than two observations.
    p_value, effect_size = '', ''
    if all(len(g) > 1 for g in groups):
        f_stat, p = f_oneway(*groups)
        model = ols(f'{var} ~ C(stim_sz)', data=combined_df).fit()
        anova_table = sm.stats.anova_lm(model, typ=2)
        # The ANOVA table is indexed by term name, so the first row is taken
        # by position with .iloc; ['sum_sq'][0] relies on the deprecated
        # integer-key-as-position fallback of Series.__getitem__.
        eta_sq = anova_table['sum_sq'].iloc[0] / anova_table['sum_sq'].sum()
        p_value, effect_size = round(p, 3), round(eta_sq, 3)

    for stim_val, subset in zip([0, 1], groups):
        summary_rows.append({
            'Variable or Group': var,
            'Group': f"stim_sz={stim_val}",
            'Total Group': len(subset),
            # NOTE: for continuous variables these two columns hold the group
            # mean and standard deviation, not a count and a percentage.
            'With Stim Seizure': round(subset.mean(), 2),
            'Response Rate, %': round(subset.std(), 2),
            'P Value': p_value,
            'Effect Size': effect_size
        })

summary_df = pd.DataFrame(summary_rows)

# --- Display the summary table ---
print(summary_df)


              Variable or Group      Group  Total Group  With Stim Seizure  \
0                           sex          F           54              20.00   
1                           sex          M           48              18.00   
2                        center       CHOP           50              17.00   
3                        center        HUP           55              21.00   
4                      lesional        0.0           52              22.00   
5                      lesional        1.0           50              16.00   
6                          mtle        0.0           61              15.00   
7                          mtle        1.0           38              21.00   
8                      unifocal        0.0           34              13.00   
9                      unifocal        1.0           65              23.00   
10                 age_at_onset  stim_sz=0           63              15.82   
11                 age_at_onset  stim_sz=1           37         