# Merging the fNIRS data with the text columns

In [None]:
import os
import sys
import numpy as np
import pandas as pd
from tqdm import tqdm

# Root directory; every path below is built relative to it.
data_path = '../data'


def _in_data(relative_path):
    """Return `relative_path` joined onto `data_path`."""
    return os.path.join(data_path, relative_path)


# NIRS data files (sub1 / sub2).
nirs_data_sub1 = _in_data('neuro/nirs_sub1.csv')
nirs_data_sub2 = _in_data('neuro/nirs_sub2.csv')

# Text data: the CEDA results table.
ceda_data_path = _in_data('results/ceda-results.csv')
# TODO: this metadata file may need to change -- a dyads -> speakers
# mapping is what is actually needed here.
meta_data_path = _in_data('meta_data/CC_Post_105dyads.csv')

### Import main data

In [None]:
# Load the CEDA results table; the cells below add their columns to `df`.
df = pd.read_csv(filepath_or_buffer=ceda_data_path)

# Preview the first 20 rows.
df.head(n=20)

Correcting/setting up temporal differences

In [None]:
def to_NIRs_freq(t, freq=5.0863):
    """Convert a colon-separated timestamp into a position on the NIRS time axis.

    The timestamp is read right-to-left as seconds, minutes and hours
    (``'SS'``, ``'MM:SS'`` or ``'HH:MM:SS'``); any field left of the hours is
    ignored. The total number of seconds is then multiplied by ``freq``.

    Args:
        t: Timestamp string; each field must be parseable by ``float``.
        freq: Multiplier applied to the number of seconds. Defaults to
            5.0863 (presumably the NIRS sampling rate in Hz -- confirm).

    Returns:
        The elapsed seconds scaled by ``freq``, as a float.

    Raises:
        AttributeError: If ``t`` is not a string (e.g. a missing value).
        ValueError: If a field cannot be parsed as a number.
    """
    pieces = t.split(':')

    # Walk the fields from the right: seconds, then minutes, then hours.
    seconds = 0.0
    for unit_seconds, piece in zip((1, 60, 60 * 60), reversed(pieces)):
        seconds += float(piece) * unit_seconds

    return seconds * freq

In [None]:
import warnings

# NOTE: this silences every warning for the rest of the session, not just
# the ones raised in this cell.
warnings.filterwarnings('ignore')

# Give every turn a timestamp: within each dyad, a turn without a marker
# inherits the most recent earlier marker of the same dyad.
for dyad in tqdm(df['file'].unique()):
    rows = df.index[df['file'].isin([dyad])]
    stamps = df.loc[rows, 'timestamp'].copy()

    # A dyad whose first turn has no marker starts at time zero, so that
    # every later turn has something to inherit.
    if pd.isna(stamps.iloc[0]):
        stamps.iloc[0] = '00:00'

    # Write back through a single `.loc` call; chained assignment such as
    # `df['timestamp'].loc[i] = ...` may modify a temporary copy instead.
    df.loc[rows, 'timestamp'] = stamps.ffill()

df.head(20)

In [None]:
# Convert every timestamp string to the NIRS time axis, in row order.
df['NIRS_time'] = list(map(to_NIRs_freq, tqdm(df['timestamp'].values)))

In [None]:
# Quick look at the converted times next to their conversation file.
df.loc[:, ['file', 'NIRS_time']].head()

In [None]:
# For every turn, record the NIRS time at which the following window starts.
df['next_NIRS_time'] = None

for dyad in tqdm(df['file'].unique()):
    # NIRS_time of every turn of this dyad, in row order.
    times = df.loc[df['file'].isin([dyad]), 'NIRS_time']
    last_row = times.index[-1]
    latest = times.max()

    for row, start in times.items():
        # Times of this dyad that are strictly later, still in row order.
        later = times[times > start].values

        if len(later) > 0:
            upcoming = later[0]
        elif row == last_row:
            # Last row of the dyad with nothing later: open-ended sentinel.
            upcoming = 1e9
        else:
            # Nothing later but not the last row: fall back to the dyad's
            # maximum time.
            upcoming = latest

        # Single `.loc` write; chained `df[col].loc[row] = ...` may update a
        # temporary copy and leave `df` untouched.
        df.loc[row, 'next_NIRS_time'] = upcoming

df[['file', 'NIRS_time', 'next_NIRS_time']].head(20)

In [None]:
# For every turn, record the second distinct later NIRS time of its dyad.
df['next_next_NIRS_time'] = None

for dyad in tqdm(df['file'].unique()):
    # NIRS_time of every turn of this dyad, in row order.
    times = df.loc[df['file'].isin([dyad]), 'NIRS_time']
    last_row = times.index[-1]

    for row, start in times.items():
        # Distinct strictly-later times, in order of first appearance.
        later = times[times > start].unique()

        if len(later) > 1:
            boundary = later[1]
        elif row == last_row:
            # Last row of the dyad: open-ended sentinel.
            boundary = 1e9
        else:
            # Fewer than two later times: reuse the next-turn boundary.
            boundary = df.loc[row, 'next_NIRS_time']

        # Single `.loc` write; chained `df[col].loc[row] = ...` may update a
        # temporary copy and leave `df` untouched.
        df.loc[row, 'next_next_NIRS_time'] = boundary

df[['file', 'NIRS_time', 'next_NIRS_time', 'next_next_NIRS_time']].head(20)

In [None]:
# Snap the window bounds to whole numbers: the start is rounded up and both
# end bounds are rounded down.
rounding_rules = (
    ('NIRS_time', np.ceil),
    ('next_NIRS_time', np.floor),
    ('next_next_NIRS_time', np.floor),
)
for column, rounder in rounding_rules:
    df[column] = [rounder(value) for value in tqdm(df[column].values)]

df[['file', 'NIRS_time', 'next_NIRS_time', 'next_next_NIRS_time']].head(20)

### Process

For each conversation ID, stream the fNIRS data and grab the data for the appropriate speakers.

In [None]:
# Directory holding the unified fNIRS recordings.
unified_neuro_home = os.path.join(data_path, 'unified_neuro')

# Full path of every entry in that directory, skipping names that start
# with '._'.
fnirs_files = []
for entry in os.listdir(unified_neuro_home):
    if entry.startswith('._'):
        continue
    fnirs_files.append(os.path.join(unified_neuro_home, entry))

In [None]:
# (region name, first channel number, last channel number), both inclusive.
_region_channel_spans = (
    ("L_lPFC", 1, 6),
    ("mPFC", 7, 14),
    ("R_lPFC", 15, 20),
    ("L_SPL", 21, 23),
    ("L_TPJ", 24, 30),
    ("R_SPL", 31, 33),
    ("R_TPJ", 34, 40),
)

# Region name -> list of the 'ch_<n>' column names that belong to it.
channel_indexes = {
    region: ['ch_' + str(number) for number in range(first, last + 1)]
    for region, first, last in _region_channel_spans
}

In [None]:
# Map each 'ch_<n>' column name to '<region>_ch_<k>', where k is the
# channel's 1-based position inside its region.
channel_to_region_names = {
    channel: region + '_ch_' + str(position)
    for region, channels in channel_indexes.items()
    for position, channel in enumerate(channels, start=1)
}

In [None]:
# One output column per brain region, filled turn by turn below.
df[list(channel_indexes.keys())] = None

for dyad in tqdm(df['file'].unique()):
    sub = df.loc[df['file'].isin([dyad])].index

    # The recording is found by substring match on the full path.
    # NOTE(review): a dyad id that is a substring of another id (or of a
    # directory name) would match the wrong file -- confirm ids are unique.
    matches = [f for f in fnirs_files if str(dyad) in f]
    if not matches:
        raise FileNotFoundError(
            'no fNIRS file found for dyad {!r}'.format(dyad)
        )
    fnirs_data = pd.read_csv(matches[0])

    # For the last turn of the conversation, let the extended window run to
    # the end of the recording (its number of rows).
    df.loc[sub[-1], 'next_next_NIRS_time'] = len(fnirs_data)

    sample_time = fnirs_data['Time']
    for i in sub:
        # Samples with NIRS_time <= Time < next_NIRS_time; the selection is
        # the same for every region, so it is made once per turn.
        window = fnirs_data.loc[
            (sample_time >= df.loc[i, 'NIRS_time'])
            & (sample_time < df.loc[i, 'next_NIRS_time'])
        ]

        for k, v in channel_indexes.items():
            # Missing readings count as zero.
            values = np.nan_to_num(window[v].values)

            # Sum the region's channels at each sample, then average those
            # sums over the samples in the window (NaN for an empty window).
            df.loc[i, k] = values.sum(axis=-1).mean()
    

In [None]:
# One 'next_<region>' output column per brain region, filled turn by turn.
df[['next_' + region for region in channel_indexes]] = None

for dyad in tqdm(df['file'].unique()):
    sub = df.loc[df['file'].isin([dyad])].index

    # The recording is found by substring match on the full path.
    # NOTE(review): a dyad id that is a substring of another id (or of a
    # directory name) would match the wrong file -- confirm ids are unique.
    matches = [f for f in fnirs_files if str(dyad) in f]
    if not matches:
        raise FileNotFoundError(
            'no fNIRS file found for dyad {!r}'.format(dyad)
        )
    fnirs_data = pd.read_csv(matches[0])

    # For the last turn of the conversation, let the window run to the end
    # of the recording (its number of rows).
    df.loc[sub[-1], 'next_next_NIRS_time'] = len(fnirs_data)

    sample_time = fnirs_data['Time']
    for i in sub:
        # Samples with NIRS_time <= Time < next_next_NIRS_time; the
        # selection is the same for every region, so it is made once per turn.
        window = fnirs_data.loc[
            (sample_time >= df.loc[i, 'NIRS_time'])
            & (sample_time < df.loc[i, 'next_next_NIRS_time'])
        ]

        for k, v in channel_indexes.items():
            # Missing readings count as zero.
            values = np.nan_to_num(window[v].values)

            # Sum the region's channels at each sample, then average those
            # sums over the samples in the window (NaN for an empty window).
            df.loc[i, 'next_' + k] = values.sum(axis=-1).mean()

In [None]:
# Preview the identifying columns next to the 'next_<region>' columns.
preview_columns = ['file', 'nx', 'ny', 'Hxy']
preview_columns.extend('next_' + region for region in channel_indexes)
df[preview_columns].head()

In [None]:
# Save the merged table under the input's name with a '-with_fNIRs' suffix.
output_path = ceda_data_path.replace('.csv', '-with_fNIRs.csv')
df.to_csv(output_path, index=False, encoding='utf-8')