In [1]:
import pandas as pd
import numpy as np
import glob
import matplotlib.pyplot as plt

## Baseline

In [27]:
n_trials = 0
file_names = []

root_dir = '/mlodata1/lugeon/dream_data/chuv/baseline'

for file in glob.glob(f'{root_dir}/*/*.mat'):
    file_names.append(file.split('/')[-1].split('.')[0])
    n_trials += 1
    
print(f'Number of trials: {n_trials}')

Number of trials: 161


In [28]:
df_files = pd.DataFrame({'file_name': file_names})
df_files['label'] = -1
df_files['sleep_cycle'] = df_files.file_name.apply(lambda f: int(f.split('_c')[-1]))
df_files['sleep_stage'] = df_files.file_name.apply(lambda f: 2 if 'NREM' in f else 4)
df_files['elapsed_time'] = -1

In [29]:
df_files.head()

Unnamed: 0,file_name,label,sleep_cycle,sleep_stage,elapsed_time
0,NREM_c5,-1,5,2,-1
1,NREM_c6,-1,6,2,-1
2,REM_c1,-1,1,4,-1
3,REM_c2,-1,2,4,-1
4,REM_c3,-1,3,4,-1


In [30]:
df_files.to_csv(
    '/mlodata1/lugeon/dream_data/chuv/metadata/baseline_metadata.txt', 
    header=True, 
    sep=' ', 
    index=None,
    float_format='%.0f')

In [31]:
pd.read_csv('/mlodata1/lugeon/dream_data/chuv/metadata/baseline_metadata.txt', sep=' ', header=0)

Unnamed: 0,file_name,label,sleep_cycle,sleep_stage,elapsed_time
0,NREM_c5,-1,5,2,-1
1,NREM_c6,-1,6,2,-1
2,REM_c1,-1,1,4,-1
3,REM_c2,-1,2,4,-1
4,REM_c3,-1,3,4,-1
...,...,...,...,...,...
156,REM_c2,-1,2,4,-1
157,REM_c3,-1,3,4,-1
158,REM_c4,-1,4,4,-1
159,NREM_c1,-1,1,2,-1


## Healthy

In [71]:
n_trials = 0
file_names = []

root_dir = '/mlodata1/lugeon/dream_data/chuv/healthy'


for file in glob.glob(f'{root_dir}/*/H*.mat'):
    file_names.append(file.split('/')[-1].split('.')[0])
    n_trials += 1

print(f'Number of H trials: {n_trials}')

Number of H trials: 525


In [72]:
df_files = pd.DataFrame({'file_name': file_names})

df_files['split'] = df_files.file_name.apply(lambda f: f.split('_'))
df_files['subject'] = df_files.split.apply(lambda l: l[0])
df_files['experiment'] = df_files.split.apply(lambda l: l[1])
df_files['trial'] = df_files.split.apply(lambda l: int(l[3][1:]))
df_files['sleep_cycle'] = -1

df_files.split.apply(len).value_counts()

df_files.head()

Unnamed: 0,file_name,split,subject,experiment,trial,sleep_cycle
0,H009_E1_NREM_S03,"[H009, E1, NREM, S03]",H009,E1,3,-1
1,H009_E1_NREM_S05,"[H009, E1, NREM, S05]",H009,E1,5,-1
2,H009_E1_NREM_S06,"[H009, E1, NREM, S06]",H009,E1,6,-1
3,H009_E1_NREM_S07,"[H009, E1, NREM, S07]",H009,E1,7,-1
4,H009_E1_NREM_S09,"[H009, E1, NREM, S09]",H009,E1,9,-1


In [73]:
df_trial = pd.read_csv(
    '/mlodata1/lugeon/dream_data/chuv/metadata/original/time_since_light_off.csv',
    names=['subject_id', 'trial', 'elapsed_time', 'sleep_stage', 'label'],
    header=0)

df_trial['subject'] = df_trial.subject_id.apply(lambda s: s.split('_')[0])
df_trial['experiment'] = df_trial.subject_id.apply(lambda s: s.split('_')[-1])

In [74]:
df_trial

Unnamed: 0,subject_id,trial,elapsed_time,sleep_stage,label,subject,experiment
0,M001_E1,2,38.0,2,2.0,M001,E1
1,M001_E1,4,86.0,2,2.0,M001,E1
2,M001_E1,8,170.0,2,2.0,M001,E1
3,M001_E1,9,193.0,2,2.0,M001,E1
4,M001_E1,10,208.0,2,2.0,M001,E1
...,...,...,...,...,...,...,...
1172,H100_E2,15,376.0,2,1.0,H100,E2
1173,H100_E2,16,394.0,2,2.0,H100,E2
1174,H100_E2,1,19.0,3,2.0,H100,E2
1175,H100_E2,2,40.0,3,2.0,H100,E2


In [75]:
df_merged = df_files.merge(df_trial, how='outer')
df_merged.head()

Unnamed: 0,file_name,split,subject,experiment,trial,sleep_cycle,subject_id,elapsed_time,sleep_stage,label
0,H009_E1_NREM_S03,"[H009, E1, NREM, S03]",H009,E1,3,-1.0,H009_E1,50.0,3,1.0
1,H009_E1_NREM_S05,"[H009, E1, NREM, S05]",H009,E1,5,-1.0,H009_E1,154.0,2,1.0
2,H009_E1_NREM_S06,"[H009, E1, NREM, S06]",H009,E1,6,-1.0,H009_E1,169.0,2,1.0
3,H009_E1_NREM_S07,"[H009, E1, NREM, S07]",H009,E1,7,-1.0,H009_E1,193.0,2,1.0
4,H009_E1_NREM_S09,"[H009, E1, NREM, S09]",H009,E1,9,-1.0,H009_E1,323.0,2,2.0


In [76]:
df_valid = df_merged[(df_merged.label.notna()) & df_merged.file_name.notna()]

# Number of trials with both data + label
df_valid.shape

(524, 10)

In [77]:
df_valid.isna().sum()

file_name       0
split           0
subject         0
experiment      0
trial           0
sleep_cycle     0
subject_id      0
elapsed_time    0
sleep_stage     0
label           0
dtype: int64

In [78]:
df_valid.sleep_stage.value_counts()

2    224
3    177
4    123
Name: sleep_stage, dtype: int64

In [79]:
features_name = ['file_name', 'label', 'sleep_cycle', 'sleep_stage', 'elapsed_time']

df_valid[features_name].to_csv(
    '/mlodata1/lugeon/dream_data/chuv/metadata/healthy_metadata.txt', 
    sep=' ', 
    index=None,
    float_format='%.0f')

In [80]:
n_samples = 10

df_valid[features_name].sample(n_samples, random_state=42).to_csv(
    '/mlodata1/lugeon/dream_data/chuv/metadata/healthy_metadata_small.txt', 
    sep=' ', 
    index=None,
    float_format='%.0f')

In [81]:
mask = df_valid.label.apply(lambda l: l in {0, 2}) & df_valid.sleep_stage.apply(lambda s: s in {2, 3})
df_binary = df_valid[mask]
df_binary.label.replace(2, 1, inplace=True)
df_binary.groupby(['sleep_stage', 'label']).file_name.count()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_binary.label.replace(2, 1, inplace=True)


sleep_stage  label
2            0.0      43
             1.0      81
3            0.0      40
             1.0      76
Name: file_name, dtype: int64

In [83]:
df_binary[features_name].to_csv(
    '/mlodata1/lugeon/dream_data/chuv/metadata/healthy_metadata_nrem_binary.txt', 
    sep=' ', 
    index=None,
    float_format='%.0f')

In [86]:
# weight for balanced training
df_binary.shape[0] / (2 * np.bincount(df_binary.label))

array([1.44578313, 0.76433121])

## Madison

In [87]:
n_trials = 0
file_names = []

root_dir = '/mlodata1/lugeon/dream_data/chuv/madison'


for file in glob.glob(f'{root_dir}/*/M*.mat'):
    file_names.append(file.split('/')[-1].split('.')[0])
    n_trials += 1

print(f'Number of M trials: {n_trials}')

Number of M trials: 784


In [88]:
df_files = pd.DataFrame({'file_name': file_names})

df_files['split'] = df_files.file_name.apply(lambda f: f.split('_'))
df_files['subject'] = df_files.split.apply(lambda l: l[0])
df_files['experiment'] = df_files.split.apply(lambda l: l[1])
df_files['trial'] = df_files.split.apply(lambda l: int(l[3][1:]))
df_files['sleep_cycle'] = -1

df_files.split.apply(len).value_counts()

df_files.head()

Unnamed: 0,file_name,split,subject,experiment,trial,sleep_cycle
0,M001_E10_NREM_S1,"[M001, E10, NREM, S1]",M001,E10,1,-1
1,M001_E10_NREM_S2,"[M001, E10, NREM, S2]",M001,E10,2,-1
2,M001_E10_NREM_S4,"[M001, E10, NREM, S4]",M001,E10,4,-1
3,M001_E10_NREM_S6,"[M001, E10, NREM, S6]",M001,E10,6,-1
4,M001_E10_REM_S3,"[M001, E10, REM, S3]",M001,E10,3,-1


In [89]:
df_trial = pd.read_csv(
    '/mlodata1/lugeon/dream_data/chuv/metadata/original/time_since_light_off.csv',
    names=['subject_id', 'trial', 'elapsed_time', 'sleep_stage', 'label'],
    header=0)

df_trial['subject'] = df_trial.subject_id.apply(lambda s: s.split('_')[0])
df_trial['experiment'] = df_trial.subject_id.apply(lambda s: s.split('_')[-1])

In [90]:
df_merged = df_files.merge(df_trial, how='outer')

In [91]:
df_valid = df_merged[(df_merged.label.notna()) & df_merged.file_name.notna()]

# Number of trials with both data + label
df_valid.shape

(593, 10)

In [92]:
df_valid.isna().sum()

file_name       0
split           0
subject         0
experiment      0
trial           0
sleep_cycle     0
subject_id      0
elapsed_time    0
sleep_stage     0
label           0
dtype: int64

In [93]:
df_valid.groupby(['sleep_stage', 'label']).file_name.count()

sleep_stage  label
2.0          0.0       73
             1.0      129
             2.0      133
3.0          0.0       95
             1.0      100
             2.0       63
Name: file_name, dtype: int64

In [94]:
features_name = ['file_name', 'label', 'sleep_cycle', 'sleep_stage', 'elapsed_time']

df_valid[features_name].to_csv(
    '/mlodata1/lugeon/dream_data/chuv/metadata/madison_metadata.txt', 
    sep=' ', 
    index=None,
    float_format='%.0f')

In [95]:
mask = df_valid.label.apply(lambda l: l in {0, 2}) & df_valid.sleep_stage.apply(lambda s: s in {2, 3})
df_binary = df_valid[mask]
df_binary.label.replace(2, 1, inplace=True)
df_binary.groupby(['sleep_stage', 'label']).file_name.count()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_binary.label.replace(2, 1, inplace=True)


sleep_stage  label
2.0          0.0       73
             1.0      133
3.0          0.0       95
             1.0       63
Name: file_name, dtype: int64

In [96]:
df_binary[features_name].to_csv(
    '/mlodata1/lugeon/dream_data/chuv/metadata/madison_metadata_nrem_binary.txt', 
    sep=' ', 
    index=None,
    float_format='%.0f')

In [97]:
# weight for balanced training
df_binary.shape[0] / (2 * np.bincount(df_binary.label))

array([1.08333333, 0.92857143])