# Prepare Data
1. load power df
2. reindex with subject and condition
3. load behavioral df
4. melt behavioral on condition
5. join power and behavioral
6. drop unnecessary rows and columns

In [4]:
# setup 
import pandas as pd
 
# Load the behavioral table and the behavioral-id -> bids_id lookup sheet.
bh = pd.read_csv('data/behavioral_data/archived/plb_hyp_data.csv', index_col='index')
ids_map = pd.read_excel('docs/ids_map.xlsx', header=1, index_col='behavioral_id')

# One row per bids_id, keeping only that column as a zero-padded string.
ids_map = ids_map.drop_duplicates('bids_id')[['bids_id']]
ids_map['bids_id'] = ids_map['bids_id'].map(lambda raw_id: str(raw_id).zfill(2))

# Right join: every id listed in the lookup sheet is kept.
bh = bh.join(ids_map, how='right')

# Wide -> long: one row per (participant, hypnosis_depth_<n>) pair. The
# per-session procedure/description/freetext columns ride along as id_vars so
# the matching one can be picked for each row below.
session_numbers = ['1', '2', '3', '4']
melt_id_vars = (
    [f'procedure_type_{n}' for n in session_numbers]
    + ['bids_id']
    + [f'description_type_{n}' for n in session_numbers]
    + [f'experiences_freetext_{n}' for n in session_numbers]
)
melt_value_vars = [f'hypnosis_depth_{n}' for n in session_numbers]
bh = bh.melt(id_vars=melt_id_vars, value_vars=melt_value_vars)

# 'hypnosis_depth_3' -> '3'
bh['session'] = bh['variable'].map(lambda name: name.split('_')[2])

# For each long row, pull the value from the wide column that belongs to its session.
per_session_sources = [
    ('procedure', 'procedure_type_'),
    ('description', 'description_type_'),
    ('freetext', 'experiences_freetext_'),
]
for target, prefix in per_session_sources:
    bh[target] = bh.apply(lambda row, prefix=prefix: row[prefix + row['session']], axis=1)

bh = (
    bh[['bids_id', 'value', 'procedure', 'description', 'freetext', 'session']]
    .sort_values(by=['bids_id', 'session'])
    .set_index('bids_id')
    .rename(columns={'value': 'hypnosis_depth'})
    .reset_index()
)

# bh.to_csv('data/behavioral_data/behavioral_data_with_freetext.csv', index=0)

In [4]:
# Load the power table; its row labels are split on '-' into bids_id and condition.
power = pd.read_excel('docs/psds_new.xlsx', index_col='Unnamed: 0', header=1)
power.columns = power.columns.str.replace('lower_gamma', 'lowergamma')

label_parts = power.index.to_series().apply(lambda label: label.split('-')).apply(pd.Series)
power[['bids_id', 'condition']] = label_parts

# The session is taken from the last character of the condition label.
power['session'] = power['condition'].map(lambda cond: cond[-1])
power = power.reset_index(drop=True)

# Attach the behavioral columns; the right join keeps every power row.
power = bh.merge(power, how='right', on=['session', 'bids_id'])
power = power.sort_values(by=['bids_id', 'session', 'condition']).reset_index(drop=True)

# Move 'condition' to the second column position.
condition_col = power.pop('condition')
power.insert(1, 'condition', condition_col)
# power.to_csv('data/classification_dataset.csv')
power.head()

In [None]:
# debug
# power = power.query('condition.str.contains("experience")', engine='python')

In [6]:
# Load the PLV source dataset, order rows by subject and session, then drop
# the identifier columns and the leftover unnamed index columns.
DATA = (
    pd.read_csv('data/classification_datasets/tobedeleted/plv_source.csv')
    .sort_values(by=['bids_id', 'session'])
    .drop(columns=['bids_id', 'session', 'condition', 'Unnamed: 0.1', 'Unnamed: 0'])
)
# DATA.to_csv(f'data/classification_datasets/plv_source.csv', index=False)

[]

## Data for PyEEG workshop

In [120]:
import pandas as pd
import re
 
# Open the master sheet and keep the hypnotizability score per participant.
# NOTE(review): .loc[9:] keeps rows from index label 9 onwards — presumably the
# earlier rows are not study participants; confirm.
hypnotizability = pd.read_excel('data/behavioral_data/archived/PLB_HYP_data_MASTER.xlsx', index_col=0).loc[9:][['bids_id', 'hypnotizability_total', 'unique_id']]
hypnotizability['bids_id'] = hypnotizability['bids_id'].apply(lambda x:str(int(x)).zfill(2))
hypnotizability.set_index('bids_id', inplace=True)

# Read bids_id as text. Left to type inference, read_csv parses zero-padded
# ids such as "01" as the integer 1, which can never match the zero-padded
# string index of `hypnotizability`, so the join below would come back with
# every trial column empty.
data = pd.read_csv('data/behavioral_data/behavioral_data_with_freetext.csv',
                   dtype={'bids_id': str})
data['bids_id'] = data['bids_id'].str.zfill(2)

# Keep only trials described as hypnosis that used one of the two procedures.
data = data.query('description == "hypnosis"')
data = data.query('procedure == "confusion" or procedure == "relaxation"')
data.set_index('bids_id', inplace=True)

# append hypnotizability (right join: every participant in `hypnotizability`
# is kept, with empty trial columns when no trial matched)
data = data.join(hypnotizability, how='right')

# Depth of 5 or more counts as deep hypnosis; a missing depth compares False.
data['deep_hypnosis'] = data['hypnosis_depth'] >= 5

# Standardise the hypnotizability score.
# NOTE(review): this runs after the join, so a participant with two matching
# trials contributes twice to the mean and std — confirm that is intended.
total = data['hypnotizability_total']
data['z_score'] = (total - total.mean()) / total.std()

data = data.drop(columns=['description', 'session'])

# data.dropna().iloc[10:]['z_score'].plot(kind='hist')
# data.to_csv('data/behavioral_data/PyEEG_behavioral_data.csv', index=0)

In [8]:
# Open the master behavioral file to get the ids, the hypnotizability score
# and the per-session description types.
data = pd.read_excel('data/behavioral_data/archived/PLB_HYP_data_MASTER.xlsx', index_col=0)
data = data[['unique_id', 'bids_id', 'hypnotizability_total',
             'description_type_1', 'description_type_2', 'description_type_3', 'description_type_4']]

# open translations
data_en = pd.read_excel('../text-analysis/Translation of experience reports.xlsx')

# Two columns arrive as 'blinded_anonymized_text2_ENG' and
# 'blinded_anonymized_text2_ENG.1' (presumably the sheet repeats the header
# and pandas suffixes the second one — confirm); map them to sessions 1 and 2.
data_en.rename(columns=
               {'blinded_anonymized_text2_ENG': 'blinded_anonymized_text1_ENG',
                'blinded_anonymized_text2_ENG.1': 'blinded_anonymized_text2_ENG'},
               inplace=True)

# merge two dataframes
data_en = data_en.merge(data, on='unique_id')

# unpivot: one row per (participant, hypnosis_depth_<n>); the per-session text
# and type columns are carried as id_vars so the matching one can be selected
# for each row below.
data_en = data_en.melt(id_vars=['original_text_1', 'original_text_2', 'original_text_3', 'original_text_4',
                                'blinded_anonymized_text1_HUN', 'blinded_anonymized_text2_HUN',
                                'blinded_anonymized_text3_HUN', 'blinded_anonymized_text4_HUN',
                                'blinded_anonymized_text1_ENG', 'blinded_anonymized_text2_ENG',
                                'blinded_anonymized_text3_ENG', 'blinded_anonymized_text4_ENG',
                                'description_type_1', 'description_type_2', 'description_type_3', 'description_type_4',
                                'procedure_type_1', 'procedure_type_2', 'procedure_type_3', 'procedure_type_4',
                                'unique_id', 'bids_id'],
                       value_vars=['hypnosis_depth_1', 'hypnosis_depth_2', 'hypnosis_depth_3', 'hypnosis_depth_4'])

# 'hypnosis_depth_3' -> '3'
data_en['session'] = data_en['variable'].apply(lambda x:x.split('_')[2])

# For each row, pick the value from the wide column belonging to its session.
data_en['procedure'] = data_en.apply(lambda r: r['procedure_type_'+r['session']], axis=1)
data_en['description'] = data_en.apply(lambda r: r['description_type_'+r['session']], axis=1)
data_en['original_text'] = data_en.apply(lambda r: r['original_text_'+r['session']], axis=1)
data_en['blinded_anonymized_HUN'] = data_en.apply(lambda r: r['blinded_anonymized_text'+r['session']+'_HUN'], axis=1)
data_en['blinded_anonymized_ENG'] = data_en.apply(lambda r: r['blinded_anonymized_text'+r['session']+'_ENG'], axis=1)

# Keep the ids, the melted value and the per-row columns built above. Named
# explicitly (rather than sliced by position) so the selection does not shift
# if a column is added earlier in the cell.
data_en = data_en[['unique_id', 'bids_id', 'value', 'session', 'procedure', 'description',
                   'original_text', 'blinded_anonymized_HUN', 'blinded_anonymized_ENG']]

# only choose real trials that are described as hypnosis
# data_en = data_en.query('description == "hypnosis"')
# data_en = data_en.query('procedure == "confusion" or procedure == "relaxation"')
data_en = data_en.rename(columns={'value': 'score'})
data_en = data_en.dropna()

# Later cells address rows with labels 0..n-1, so rebuild the index after dropna.
data_en.reset_index(drop=True, inplace=True)

# data_en.to_csv('behavioral_data_with_freetext.csv', index=0)

data = data_en
del data_en

In [261]:
def _bracket_spans(text):
    """Return (start, stop) slice bounds for each bracket pair found in `text`.

    Every '[' or ']' position is collected in order and consecutive positions
    are paired as (open, close); `stop` is one past the closing character so
    `text[start:stop]` includes both brackets. A trailing unpaired bracket is
    ignored.
    """
    # Raw string: '\[' and '\]' are not valid escapes in a plain string literal.
    positions = [m.start() for m in re.finditer(r'[\[\]]', text)]
    return [(positions[k], positions[k + 1] + 1) for k in range(0, len(positions) - 1, 2)]


# Collect, for every row whose Hungarian blinded text contains a bracketed
# passage, the bracketed passages of each text version, concatenated.
redacted_dict = {}

for i in range(len(data)):
    text_hun = data.loc[i, 'blinded_anonymized_HUN']
    if '[' not in text_hun:
        continue

    text_original = data.loc[i, 'original_text']
    text_eng = data.loc[i, 'blinded_anonymized_ENG']

    # The original text is sliced with the offsets found in the Hungarian
    # blinded text; the English text is searched for its own brackets.
    hun_spans = _bracket_spans(text_hun)
    redacted_dict[f'{i}_original'] = ''.join(text_original[a:b] for a, b in hun_spans)
    redacted_dict[f'{i}_Hun'] = ''.join(text_hun[a:b] for a, b in hun_spans)
    redacted_dict[f'{i}_Eng'] = ''.join(text_eng[a:b] for a, b in _bracket_spans(text_eng))


In [264]:
# One row per '<row id>_<language>' key with its extracted text.
redacted_df = pd.DataFrame.from_dict(redacted_dict, orient='index', columns=['text']).reset_index()

# Split the key into the numeric row id and the language tag, then pivot so
# each row id has one column per language.
key_parts = redacted_df['index'].map(lambda key: key.split('_'))
redacted_df = (
    redacted_df
    .assign(id=key_parts.map(lambda parts: parts[0]).astype('int'),
            language=key_parts.map(lambda parts: parts[1]))
    .drop(columns='index')
    .pivot(index='id', columns='language', values='text')
    .sort_values(by='id')
)

# Put the full texts of the same rows next to the extracted passages.
text_columns = ['original_text', 'blinded_anonymized_HUN', 'blinded_anonymized_ENG']
redacted_df[text_columns] = data.iloc[redacted_df.index][text_columns]
# redacted_df.to_csv('redacted_texts.csv')

In [9]:
# Load the redacted-text sheet, indexed by row id, and overwrite the English
# text of each listed row with the version from the sheet.
data_redacted = pd.read_excel('../redacted_text.xlsx', header=1).set_index('id')

corrected_eng = data_redacted['blinded_anonymized_ENG']
for row_id in corrected_eng.index:
    data.loc[row_id, 'blinded_anonymized_ENG'] = corrected_eng.loc[row_id]

data.to_csv('data_with_english_text.csv', index=False)

### Create data for Workshop

In [59]:
# Keep only trials described as hypnosis, rename the columns for the workshop
# and put them in the order used in the exported file.
new_data = (
    data.query('description == "hypnosis"')
    .loc[:, ['bids_id', 'score', 'procedure', 'original_text', 'blinded_anonymized_ENG']]
    .rename(columns={'bids_id': 'id',
                     'score': 'hypnosis_depth',
                     'blinded_anonymized_ENG': 'eng_text'})
    .loc[:, ['id', 'procedure', 'hypnosis_depth', 'eng_text', 'original_text']]
)
# new_data.sort_values('id', inplace=True)
new_data.to_csv('data/behavioral_data/PyEEGW_behavioral_data.csv', index=False)