In [1]:
%load_ext autoreload
%autoreload 2
import warnings
warnings.filterwarnings("ignore")
    
import os, sys, glob
import json
import re
import numpy as np
import pandas as pd
from natsort import natsorted

import seaborn as sns
from matplotlib import pyplot as plt
from scipy import stats
from scipy.stats import pearsonr, spearmanr, kendalltau
import matplotlib.colors as clr

import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.stats.multitest import multipletests

sys.path.append('/dartfs/rc/lab/F/FinnLab/tommy/isc_asynchrony_behavior/code/utils/')

from config import *

import analysis_utils as utils

2024-12-08 09:35:04.879289: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-08 09:35:04.879361: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-08 09:35:04.885404: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-08 09:35:05.483506: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Stories (tasks) analysed in this notebook.
task_list = ['black', 'wheretheressmoke', 'howtodraw']

# Project sub-directories, each resolved against BASE_DIR.
# NOTE(review): BASE_DIR is not defined in this notebook; presumably it arrives
# through `from config import *` -- confirm.
gentle_dir, results_dir, preproc_dir, prosody_dir = (
    os.path.join(BASE_DIR, subpath)
    for subpath in (
        'stimuli/gentle',
        'derivatives/results/behavioral/',
        'stimuli/preprocessed',
        'stimuli/prosody',
    )
)

## Load selected words and prosody for all tasks

In [6]:
# Non-word tokens to drop from the prosody table: eight two-letter tags, each
# in lower and upper case, bare and wrapped in curly braces, plus "pause".
# The generated order is: lower bare, lower braced, upper bare, upper braced.
REMOVE_WORDS = [
    wrapper.format(recase(tag))
    for recase in (str.lower, str.upper)
    for wrapper in ("{}", "{{{}}}")
    for tag in ("sp", "br", "lg", "cg", "ls", "ns", "sl", "ig")
] + ["pause"]

def get_prosody_metrics(index, prosody_raw, boundary_raw, n_prev):
    """Summarise prosody over the ``n_prev`` words that precede one word.

    Parameters
    ----------
    index : int
        Position of the target word in ``prosody_raw`` / ``boundary_raw``.
    prosody_raw : array-like of float
        Per-word prominence values.
    boundary_raw : array-like of float
        Per-word boundary values, aligned with ``prosody_raw``.
    n_prev : int
        Number of preceding words in the window.

    Returns
    -------
    pandas.DataFrame
        A single row, labelled ``index``, with the columns
        ``prominence_mean``, ``prominence_std``, ``boundary_mean`` and
        ``boundary_std``, each suffixed with ``_nprev-{n_prev}``. Every value
        is NaN when fewer than ``n_prev`` words precede ``index`` (or when
        ``n_prev`` is not positive).
    """
    columns = ['prominence_mean', 'prominence_std', 'boundary_mean', 'boundary_std']
    columns = [f'{col}_nprev-{n_prev}' for col in columns]

    # Default row: no complete window of preceding words is available.
    values = [np.nan] * len(columns)

    if n_prev > 0 and index - n_prev >= 0:
        # The window covers the n_prev words *before* `index`; the target word
        # itself is excluded. This matches the `index - n_prev >= 0` guard.
        window = slice(index - n_prev, index)
        n_prev_prosody = np.asarray(prosody_raw, dtype=float)[window]
        n_prev_boundary = np.asarray(boundary_raw, dtype=float)[window]

        # numpy's default (population, ddof=0) standard deviation.
        values = [
            n_prev_prosody.mean(),
            n_prev_prosody.std(),
            n_prev_boundary.mean(),
            n_prev_boundary.std(),
        ]

    return pd.DataFrame([values], columns=columns, index=[index])

def calculate_prosody_metrics(df_prosody, n_prev=3, remove_characters=None, zscore=False):
    """Add trailing-window prosody statistics to a word-level prosody table.

    For every word, the prominence and boundary values of the ``n_prev`` words
    immediately before it are summarised (mean and population standard
    deviation), and the word's own prominence is expressed relative to that
    window.

    Parameters
    ----------
    df_prosody : pandas.DataFrame
        One row per token, with at least the columns ``word``, ``prominence``
        and ``boundary``. The frame passed in is not modified.
    n_prev : int, default 3
        Number of preceding words in each window; must be at least 1.
    remove_characters : iterable of str, optional
        Tokens to drop from the result. They are removed *after* the metrics
        are computed, so they still count as words inside the windows.
    zscore : bool, default False
        If true, z-score the prominence values over the whole table before
        the windows are computed. Boundary values are left untouched.

    Returns
    -------
    pandas.DataFrame
        A new frame with the added columns ``prominence_mean``,
        ``prominence_std``, ``relative_prominence``,
        ``relative_prominence_norm``, ``boundary_mean`` and ``boundary_std``,
        filtered by ``remove_characters`` and with a fresh 0..n-1 index.
        The first ``n_prev`` words have no complete window and get NaN.
        A window with zero spread gives ``inf``/NaN in
        ``relative_prominence_norm`` (plain numpy division).

    Raises
    ------
    ValueError
        If ``n_prev`` is smaller than 1.
    """
    if n_prev < 1:
        raise ValueError(f"n_prev must be >= 1, got {n_prev}")

    # Extract raw values
    prosody_raw = df_prosody['prominence'].to_numpy(dtype=float)
    boundary_raw = df_prosody['boundary'].to_numpy(dtype=float)

    if zscore:
        prosody_raw = stats.zscore(prosody_raw)

    prominence_mean, prominence_std = _trailing_window_stats(prosody_raw, n_prev)
    boundary_mean, boundary_std = _trailing_window_stats(boundary_raw, n_prev)

    # Prominence of the current word relative to its preceding window.
    relative_prominence = prosody_raw - prominence_mean
    with np.errstate(divide='ignore', invalid='ignore'):
        relative_prominence_norm = relative_prominence / prominence_std

    # `assign` builds a new frame, so the caller's table is left as it was and
    # re-running a notebook cell on the same input gives the same output.
    df_metrics = df_prosody.assign(
        prominence_mean=prominence_mean,
        prominence_std=prominence_std,
        relative_prominence=relative_prominence,
        relative_prominence_norm=relative_prominence_norm,
        boundary_mean=boundary_mean,
        boundary_std=boundary_std,
    )

    # remove non-words
    non_words = list(remove_characters) if remove_characters is not None else []
    keep = ~df_metrics['word'].isin(non_words)

    return df_metrics[keep].reset_index(drop=True)


def _trailing_window_stats(values, n_prev):
    """Return (mean, std) arrays over the ``n_prev`` items before each position."""
    n_items = len(values)
    means = np.full(n_items, np.nan)
    stds = np.full(n_items, np.nan)

    # Positions 0 .. n_prev-1 have fewer than n_prev predecessors and stay NaN.
    for idx in range(n_prev, n_items):
        window = values[idx - n_prev:idx]
        means[idx] = window.mean()
        stds[idx] = window.std()

    return means, stds

In [8]:
# Story to process in this cell, and the root folder of the stimulus files.
task = 'black'
stim_dir = os.path.join(BASE_DIR, 'stimuli/')

# ---------------------------------------------
# Load the word-level prosody file for `task`
# ---------------------------------------------

# Names for the six tab-separated fields of the .prom file. Because `names` is
# passed, pandas treats the file as header-less.
prosody_columns = ['stim', 'start', 'end', 'word', 'prominence', 'boundary']

df_prosody = pd.read_csv(
    os.path.join(stim_dir, 'prosody', f'{task}.prom'),
    names=prosody_columns,
    sep='\t',
)

# Windowed prosody metrics (average over the past n words) are currently disabled:
# df_prosody = calculate_prosody_metrics(df_prosody, n_prev=p.n_words, remove_characters=REMOVE_WORDS)