# Task data preprocessing notebook

In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
from pathlib import Path
import os

# Recording session to process (subject ID and block ID)
subject, block = 'NP122', 'B1'
subject_block = f'{subject}_{block}'

# Directories shared by the cells below, all under this session's preproc folder
preproc_root = Path('/data_store2/neuropixels/preproc')
label_dir = preproc_root.joinpath(subject_block, 'labels')
chunk_wav_dir = preproc_root.joinpath(subject_block, 'audio_files', 'clips')


## **1. Extract and denoise audio files**
### **1.1. Read mic and speaker channels from NIDQ and save them as WAV files**

In [None]:
from audio import save_ni_audio

# Per the section heading: read the mic and speaker channels from the NIDQ
# recording of this session and save them as WAV files.
save_ni_audio(preproc_root, subject_block)

### **1.2. USER INTERVENTION: Denoise microphone audio in Audacity**

This part IS MANDATORY

1) Open the *_mic.wav file in Audacity.
2) Select a period of noise.
3) In the menu bar, go to Effect (-> Noise Removal and Repair) -> Noise Reduction. There's a slight difference in menu organization in different versions of Audacity.
4) Click Get Noise Profile. (Click the question mark button at the bottom right corner of the dialog for more instruction.)
5) Select the entire audio track.
6) Go to Effect -> Noise Reduction, and click OK to apply noise reduction.
7) File -> Export -> Export as WAV, and save the audio file as *_mic_denoised.wav in the same folder


## **2. Extract speech and task event labels**
### **2.1. Create the task_timing.csv file**
#### This spreadsheet has four columns
1) Task name - we currently support sentgen and LMV, but can add additional task extractors flexibly. 
2) Task start - start time in seconds
3) Task end - end time in seconds
4) Curated - if the labels have already been corrected. If so, will skip the task extraction. Useful for adding additional features.

In [None]:
# Edit and specify task timing
# Lookup table: subject_block -> list of (task name, start [s], end [s]) windows.
# Some 'natstim' windows carry a fourth element (a list of ints).
TASK_TIMING_BY_BLOCK = {
    # Blocks where a single lmv run spans the whole recording
    'NP32_B2': [('lmv', 0, np.inf)],
    'NP32_B3': [('lmv', 0, np.inf)],
    'NP41_B1': [('lmv', 0, np.inf)],
    'NP41_B2': [('lmv', 0, np.inf)],
    'NP38_B6': [('lmv', 0, np.inf)],
    'NP35_B1': [('lmv', 32, 400)],
    'NP35_B2': [('lmv', 110, 730)],
    'NP40_B1': [('semsr', 425, 770)],
    'NP40_B2': [('semsr', 156, 476)],
    'NP43_B1': [('lmv', 0, 7*60+50), ('timit', 8*60+35, 13*60+31)],
    'NP44_B2': [('lmv', 7*60+20, 18*60)],
    'NP44_B3': [('lmv', 2*60+15, 10*60+13)],
    'NP45_B1': [('lmv', 3*60+7, 11*60+3)],
    'NP45_B2': [('lmv', 2*60+20, 10*60+42)],
    'NP46_B1': [('lmv', 4*60+45, 12*60+30)],
    'NP47_B3': [('lmv', 2*60, 10*60+40)],
    'NP50_B2': [('timit', 2*60+45, 6*60+9)],
    'NP50_B3': [('timit', 82, 554), ('lmv', 637, 1164)],
    'NP51_B1': [('lmv', 2*60+34, 10*60)],
    'NP52_B1': [('lmv', 197, 771)],
    'NP53_B1': [('lmv', 94, 512), ('semsr', 668, 1164)],
    'NP54_B1': [('lmv', 285, 852)],
    'NP55_B1': [('lmv', 150, 632), ('semsr', 767, 1134)],
    'NP55_B2': [('semsr', 147, 750)],
    'NP56_B1': [('lmv', 271, 1537)],
    'NP57_B1': [('semsr', 580, 18*60+30), ('timit', 20*60, 28*60)],
    'NP57_B2': [('timit', 12, 467)],
    'NP58_B2': [('lmv', 350, 16*60), ('cv', 16*60+50, 19*60+10)],
    'NP59_B1': [('semsr', 239, 764)],  # , ('dimex', 842, 1262)
    'NP59_B2': [('semsr', 154, 662)],
    'NP60_B1': [('lmv', 239, 975)],
    'NP61_B1': [('semsr', 1.5, 770)],
    'NP62_B1': [('semsr', 560, 794)],
    'NP62_B2': [('semsr', 224, 806)],
    'NP64_B1': [('semsr', 191, 552), ('lmv', 689, 1238)],
    'NP65_B2': [('semsr', 11, 637)],
    'NP66_B1': [
        # ('dimex_intraop_s1', 383, 561),
        # ('dimex_intraop_s5', 568, 794),
        ('timit', 839, 1283),
    ],
    'NP66_B2': [
        # ('dimex_intraop_s1', 327, 505),
        # ('dimex_intraop_s5', 509, 735),
        ('timit', 745, 1187),
        # ('dimex_s2', 1244, 1509)
    ],
    'NP67_B1': [('timit', 423, 868), ('semsr', 1118, 1855), ('timit', 2020, 2465)],
    'NP68_B1': [('timit', 161, 470)],
    'NP69_B1': [('lmv', 295, 777), ('semsr', 870, 1259)],
    'NP69_B2': [('lmv', 136, 628), ('semsr', 672, 1014)],
    'NP70_B2': [('semsr', 29, 660)],
    'NP71_B2': [('timit', 0, 8*60)],  # ('semsr', ?, ?)
    'NP72_B1': [('timit', 280, 750), ('natstim', 810, 1550, [1, 2])],
    'NP72_B2': [('timit', 3*60+2, 10*60+24), ('natstim', 11*60+22, 18*60+5, [3])],
    'NP73_B1': [('timit', 6*60+40, 14*60+3)],
    'NP74_B1': [('lmv', 225, 694), ('cv', 783, 887)],
    'NP76_B1': [
        # ('dimex', 3*60, 10*60+15),
        ('timit', 890, 1380),
    ],
    'NP77_B2': [('arithmetic', 26, 1031)],
    'NP78_B1': [('lmv', 892, 1363), ('semsr', 1445, 2141)],
    'NP79_B1': [('timit', 162, 606)],
    'NP79_B2': [('timit', 291, 731), ('semsr', 852, 1956)],
    'NP80_B1': [('semsr', 207, 958), ('lmv', 1048, 1555)],
    'NP81_B1': [('lmv', 315, 777)],
    'NP85_B1': [('semsr', 367, 961)],
    'NP85_B3': [('semsr', 111, 649), ('arithmetic', 700, 1060)],
    'NP86_B1': [('semsr', 282, 752)],
    'NP87_B1': [('natstim', 233, 937, [1, 2]), ('semsr', 1093, 1560)],
    'NP88_B1': [('semsr', 66, 520)],
    'NP88_B2': [('semsr', 175, 1031)],
    'NP89_B1': [('semsr', 308, 804), ('arithmetic', 972, 1434)],
    'NP90_B1': [('semsr', 719, 1220)],
    'NP90_B2': [('semsr', 428, 831)],
    'NP90_B4': [('semsr', 10, 401)],
    'NP91_B1': [('semsr', 398, 961), ('arithmetic', 1030, 1542)],
    'NP92_B2': [('arithmetic', 339, 933)],
    'NP93_B1': [('timit', 321, 764), ('semsr', 855, 1261)],
    'NP94_B1': [('lmv', 257, 508), ('lmv', 533, 764), ('arithmetic', 844, 1393), ('semsr', 1515, 2065)],
    'NP95_B1': [('semsr', 268, 1162)],
    'NP96_B1': [('sentgen', 645, 1014)],
    'NP97_B1': [('sentgen', 266, 1117), ('semsr', 1300, 1838)],
    'NP98_B1': [('timit', 205, 650)],
    'NP101_B3': [('timit', 147, 584)],
    'NP102_B2': [('nbd_listen', 800, 880), ('ptb_read', 936, 1066), ('nbd_listen', 1088, 1170), ('ptb_read', 1188, 1325)],
    'NP104_B2': [('nbd_listen', 204, 281), ('ptb_read', 314, 483), ('nbd_listen', 525, 611), ('ptb_read', 641, 802)],
    'NP105_B1': [('semsr', 330, 940)],
    'NP106_B1': [('lmv', 176, 881), ('lmv', 979, 1445)],  # ('bdg', 1473, 1512)
    'NP108_B1': [('nbd', 250, 250 + 534.706)],
    'NP111_B1': [('ptb_read', 357, 2008)],  # , ('dimex', 2087, 2272)
    'NP112_B1': [('lmv', 260, 890), ('ptb_read', 1060, 1470)],
    'NP113_B1': [('lmv', 140, 1043), ('custom_cv', 1043, 1538), ('mocha', 1538, 1837)],
    'NP114_B1': [('nbd', 0, 1136), ('ptb_read', 1490, 1550)],  # , ('lmv', 1688, 2602)
    'NP116_B1': [('nbd', 0, 750), ('semsr', 1380, 1864)],
    'NP117_B1': [('nbd', 419, 960), ('ptb_read', 1017, 2195)],
    'NP118_B2': [('semsr', 182, 763), ('sentgen', 1044, 1350)],
    'NP119_B1': [('lmv', 254, 1085), ('sentgen', 1890, 2208), ('mocha', 1320, 1725), ('custom_cv', 2310, 2550)],
    'NP120_B1': [('lmv', 400, 953), ('nbd', 1128, 1774), ('ptb_read', 1935, 2571), ('nbd', 2747, 3060)],
    'NP122_B1': [('sentgen', 600, 1015), ('semsr', 1305, 1718), ('lmv', 1783, 2564)],
}

# Blocks without an entry fall back to one unnamed window over the whole recording
task_timing = TASK_TIMING_BY_BLOCK.get(subject_block, [('', 0, np.inf)])

# Create dataframe: one row per task window, every row initially marked as not curated
task_timing_df = pd.DataFrame({
    'Task name': [k[0] for k in task_timing],
    'Task start': [k[1] for k in task_timing],
    'Task end': [k[2] for k in task_timing],
    'Curated': [False for k in task_timing]
})

# Save dataframe
# With overwrite_task_timing = True the CSV is rewritten on every run, which
# resets the Curated column to False. Set it to False to keep an existing
# (e.g. hand-edited) task_timing.csv and only create the file when it is missing.
overwrite_task_timing = True
label_dir.mkdir(parents=True, exist_ok=True)
task_timing_file = label_dir / 'task_timing.csv'
if overwrite_task_timing or not task_timing_file.exists():
    task_timing_df.to_csv(task_timing_file)
    print('Saved', task_timing_file)
else:
    print('Not overwriting the existing task_timing.csv file')

# Read the task_timing.csv file
task_timing_df = pd.read_csv(task_timing_file)
print(task_timing_df)

### **2.2. Run task extractor on each task**

Go through each task in the task_timing.csv and get out event timing using that task's timing extractor

If the Curated column of a task is True, then we skip extracting that task's timing.

In [None]:
from scipy.io.wavfile import read
from task_extractor.lmv_extractor import LMVExtractor
from task_extractor.timit_extractor import TIMITExtractor
from task_extractor.semsr_extractor import SemsrExtractor
from task_extractor.rep_extractor import RepExtractor
from task_extractor.natstim_extractor import NatStimExtractor
from task_extractor.sentgen_extractor import SentGenExtractor
from task_extractor.mocha_extractor import MOCHAExtractor

# Reload the task_timing.csv file
task_timing_df = pd.read_csv(task_timing_file)

audio_dir = preproc_root / subject_block / 'audio_files'

# Read mic audio: use the denoised recording when any listed task is one of
# PROD_TASKS, otherwise the raw mic recording.
PROD_TASKS = ["mocha", "cv", "semsr", "lmv", "sentgen"]
# Compare case-insensitively, because the extractor loop below lower-cases task
# names before dispatching. str() guards against a blank task name, which pandas
# reads back from the CSV as NaN.
listed_tasks = {str(task_name).lower() for task_name in task_timing_df['Task name'].unique()}
if listed_tasks & set(PROD_TASKS):
    mic_wav_file = audio_dir / (subject_block + '_mic_denoised.wav')
else:
    mic_wav_file = audio_dir / (subject_block + '_mic.wav')

sample_rate, mic = read(mic_wav_file)

# Read speaker audio
# NOTE(review): this overwrites sample_rate with the speaker file's rate; the
# extractors below receive that single rate for both mic and speaker - confirm
# the two files always share a sample rate.
speaker_wav_file = audio_dir / (subject_block + '_speaker.wav')
sample_rate, speaker = read(speaker_wav_file)

# Read photodiode audio (keeps its own sample rate)
pdiode_wav_file = audio_dir / (subject_block + '_pdiode.wav')
sample_rate_pdiode, pdiode = read(pdiode_wav_file)

# Iterate through tasks
for task_name in task_timing_df['Task name'].unique():
    # A blank task name is read back from the CSV as NaN (a float), which has no
    # .lower() and cannot be concatenated into the messages below.
    if not isinstance(task_name, str):
        print('Skip a task_timing.csv entry without a task name')
        continue

    # Get time window(s): all (start, end) pairs listed for this task
    is_task = task_timing_df['Task name'] == task_name
    t_win = list(zip(task_timing_df['Task start'][is_task], task_timing_df['Task end'][is_task]))
    
    # Skip extraction if results are already curated
    if np.any(task_timing_df['Curated'][is_task] == True):
        print('Skip '+task_name+' since the results have been curated')
        continue
    
    # Run extractor
    task_name = task_name.lower()
    if task_name == 'lmv':
        print('Run '+task_name+' extractor')
        lmv = LMVExtractor(preproc_root, subject_block, t_win)
        lmv.extract_cue(speaker, sample_rate)
        lmv.extract_stim(speaker, sample_rate)
        # NP58_B2 uses an explicit th=10.0; all other blocks use the extractor's default
        if subject_block in ['NP58_B2']: lmv.extract_prod(mic, sample_rate, th=10.0)
        else: lmv.extract_prod(mic, sample_rate)
        lmv.write_timing_files()
        
    elif task_name == 'timit':
        print('Run '+task_name+' extractor')
        timit = TIMITExtractor(preproc_root, subject_block, t_win)
        timit.extract_cue(speaker, sample_rate)
        timit.extract_stim(speaker, sample_rate, max_gap=0.31)
        timit.write_timing_files()
        
    elif task_name == 'semsr':
        print('Run '+task_name+' extractor')
        semsr = SemsrExtractor(preproc_root, subject_block, t_win)
        semsr.extract_stim(speaker, sample_rate)
        semsr.extract_prod(mic, sample_rate)
        semsr.write_timing_files()
        
    elif task_name == 'cv':
        print('Run rep extractor for '+task_name)
        rep = RepExtractor(preproc_root, subject_block, t_win, task_name=task_name)
        rep.extract_stim(speaker, sample_rate)
        rep.extract_prod(mic, sample_rate)
        rep.write_timing_files()
        
    elif task_name == "natstim":
        print('Run '+task_name+' extractor')
        # Note this should be the block order run in the OR!
        # PLease change as appropriate!
        # NOTE(review): some natstim entries in the task_timing list carry a fourth
        # element (e.g. [1, 2] or [3]) - presumably this block order; confirm and
        # keep the two in sync.
        block_order = [1, 2]
        natstim = NatStimExtractor(preproc_root, subject_block, t_win, block_order)
        natstim.extract_cue(speaker, sample_rate)
        natstim.extract_stim(speaker, sample_rate, max_gap=2)
        natstim.write_timing_files()
        
    elif task_name == 'sentgen':
        print('Run '+task_name+' extractor')
        sentgen = SentGenExtractor(preproc_root, subject_block, t_win)
        # If using the pdiode (speaker fails) set to True + provide the guess time of where the first image appears
        sentgen.extract_fixation(speaker, sample_rate, pdiode=False, guess=600.852)
        sentgen.extract_prod(mic, sample_rate)
        # Modify ending as kissing ('ing') or kisses ('s')
        # Modify the presence of 'the' at the beginning (The boy kisses vs Boy kisses)
        # Modify the presence of 'being' (The boy is being kicked vs The boy is kicked)
        sentgen.write_timing_files(ending='s', the='The', being='')
        
    elif task_name == 'mocha':
        print('Skip '+task_name+' since it requires manual intervention')
        continue # Requires manual intervention, skip for now
        # The lines below are intentionally unreachable; remove the `continue`
        # above (and adjust `guess`) to run the MOCHA extraction.
        mocha = MOCHAExtractor(preproc_root, subject_block, t_win)
        mocha.extract_events(pdiode, sample_rate_pdiode, mode='log', guess=1331.698)
        mocha.extract_prod(mic, sample_rate)
        mocha.write_timing_files()
        mocha.process_stim_timing_mocha(label_dir / 'mocha' / 'stim_timing.txt')

    elif task_name == 'dimex':
        print('DIMEX has not been implemented as a taskextractor')
        
    elif task_name == 'arithmetic':
        print('ARITHMETIC has not been implemented as a taskextractor')

    else:
        # Make it visible that nothing was extracted for this task instead of
        # silently moving on.
        print('No task extractor available for '+task_name+', skipping')

### **2.3. USER INTERVENTION: Refine speech onsets and offsets in Audacity**

Only files that end with auto.txt need to be refined. The other files should already be correct. 

1) Add labels to soundtrack via File -> Import -> Labels, selecting the speech_*_timing_auto.txt file
2) Examine and adjust speech onsets and offsets
3) Correct labels if what's spoken was different
4) Add new labels for additional utterances that may be useful for analysis
5) Save the curated labels via File -> Export -> Export Labels. The name should now be speech_*_timing.txt without the word 'auto'

### **2.4. Combine labels across tasks**

In [None]:
from file_org import label_combine

# Distinct task names from the task timing table
task_names = {name for name in task_timing_df['Task name'].values}

# Combine the per-task label files found under label_dir
label_combine(label_dir, task_names)

## **3. Cut speech audio into clips based on onset and offset times**

Depending on the Source, audio clips are extracted from either speaker (for 'stim') or microphone (for other types) audio file.

In [None]:
from audio import chunk_audio

speech_df = pd.read_csv(label_dir / 'combined_speech_labels.csv')
speaker_df = speech_df.loc[speech_df['Source'] =='stim'] # Labels to use from the speaker
mic_df = speech_df.loc[speech_df['Source'] != 'stim']

speaker_wav_file = preproc_root / subject_block / 'audio_files' / (subject_block + '_speaker.wav')
mic_wav_file = preproc_root / subject_block / 'audio_files' / (subject_block + '_mic_denoised.wav')

!rm -rf $chunk_wav_dir # clear the directory

if len(speaker_df) > 0:
    chunk_audio(speaker_df,
                speaker_wav_file,
                chunk_wav_dir,
                label_offset=0,
                audio_offset=0,
                plot_interval=30)

if len(mic_df) > 0:
    chunk_audio(mic_df,
                mic_wav_file,
                chunk_wav_dir,
                label_offset=0,
                audio_offset=0,
                plot_interval=30)

## **4. Extract features from audio**

Most of the following subsections can be executed on-demand and in parallel.
The exception is the pitch extraction in 4.4, which requires extracted phonetic labels from 4.3.

### **4.1. Estimate articulatory trajectories**

This uses the bootphon articulatory inversion package. For more information check out their github. https://github.com/bootphon/articulatory_inversion

In [None]:
from aai import invert

# Output directory for the estimated articulatory trajectories
artics_dir = preproc_root / subject_block / 'artics'

# Delete any previous output before running the inversion
!rm -rf $artics_dir

# Run the articulatory inversion on the audio clips in chunk_wav_dir
invert(chunk_wav_dir, artics_dir)

### **4.2. Get peakRate, peakEnv, and env using Yulia's peakRate code**

Run a script that does the matlab peakRate extraction. The results will come out at 100 samples/sec.

In [None]:
from intensity.oganian_2019 import extract_intensity

intens_dir = preproc_root / subject_block / 'intensity'

!rm -rf intens_dir

extract_intensity(chunk_wav_dir, intens_dir)

### **4.3. Extract phonetic labels**

This uses montreal-forced-aligner. Please ask Many for installation instructions if you need them.

NOTE (03/14/2024): You don't need to install MFA, Quinn's environment will work. But, you do need to copy the pretrained model to your `$MFA_ROOT_DIR`. This directory is by default `~/Documents/MFA`. Copy the files in `/home/qgreicius/Documents/MFA/pretrained_models/acoustic` to `$MFA_ROOT_DIR/pretrained_models/acoustic`.

In [None]:
from phoneme.mfa import run_mfa

# Reload the combined speech labels and set the output directory for the alignment
speech_df = pd.read_csv(label_dir / 'combined_speech_labels.csv')
phone_dir = preproc_root / subject_block / 'phones'
# Show the inputs that are about to be handed to MFA
print(speech_df, phone_dir, chunk_wav_dir)
# Delete any previous alignment output before re-running
!rm -rf $phone_dir
run_mfa(chunk_wav_dir, speech_df, phone_dir, language='english')

In [None]:
# MFA writes its logs under the home directory; print each one that exists.
# If this fails, check the note in the subsection description above.
mfa_log_dir = Path(os.path.expanduser("~")) / 'Documents/MFA/phones_validate_pretrained'
log_files = [
    mfa_log_dir / log_name
    for log_name in (
        'validate_pretrained.log',
        'oovs_found_librispeech_lex.txt',
        'unalignable_files.csv',
    )
]

for log_file in log_files:
    try:
        log_text = log_file.read_text()
    except FileNotFoundError:
        print(f"File '{log_file}' not found.")
    else:
        print(log_text)

In [None]:
# Use this cell to generate reading textgrids for SentGen and MOCHA tasks
from genTextGrid import generate_textgrid
for task in task_timing_df['Task name'].unique():
    # Only the MOCHA and SentGen tasks have a reading_stim_timing.txt to convert
    if task not in ('mocha', 'sentgen'):
        continue
    stimLabels = label_dir / task / 'reading_stim_timing.txt'
    # Tab-separated, headerless label file: start time, end time, text
    reading_df = pd.read_csv(
        stimLabels,
        sep='\t',
        header=None,
        names=['start_time', 'end_time', 'text'],
    )
    # One textgrid per labelled reading interval
    for _, row in reading_df.iterrows():
        generate_textgrid(subject_block, row['text'], row['start_time'], row['end_time'], preproc_root)

In [10]:
# When the task timing list contains a `natstim` entry, swap the MFA textgrids for
# the manually corrected ones.
# NOTE: Atm, this only replaces the `speaker` dirs and not the `all` dir in `{phone_dir}/results`.
# NOTE(review): `natstim` is the extractor object created by the natstim branch of
# the task extractor cell, so that cell must have run in this session.
if any(tt[0] == "natstim" for tt in task_timing):
    natstim.set_corrected_textgrids(speech_df, phone_dir)

#### **Before proceeding**
#### Always check if there are any unaligned clips. If there are that usually means some labels were off. Also check and fix any OOVs.

In [18]:
from phoneme.mfa import pool_speakers

# NOTE(review): presumably pools the per-speaker alignment results under phone_dir
# (the pitch step reads 'phones/results/all') - confirm against phoneme.mfa.
pool_speakers(phone_dir)

### **4.4. Extract pitch features**
#### NOTE: THIS STEP REQUIRES THE COMPLETION OF 4.3

In [None]:
from pitch.straight import extract_pitch

# Input textgrids (pooled phone alignment results) and output directory for pitch features
tg_dir = preproc_root / subject_block / 'phones/results/all'
pitch_dir = preproc_root / subject_block / 'pitch'

# Delete any previous pitch output before re-running
!rm -rf $pitch_dir

extract_pitch(chunk_wav_dir, tg_dir, pitch_dir)

## SEGUE - Change permissions on all dirs created above.

In [None]:
# For everything under this session's preproc directory that is owned by the
# current user: set the group to `neuropixels` and the mode to 775.
# The directory is derived from preproc_root so it stays in sync with the shared
# paths defined at the top of the notebook.
session_dir = preproc_root / subject_block
cmd = f"find {session_dir} -user $(whoami)" + " -exec chgrp neuropixels {} + -exec chmod 775 {} +"
os.system(cmd)

### **4.5. Extract formants using PRAAT**
#### WARNING: THIS STEP REQUIRES USER INTERVENTION AS THE SCRIPT HAS TO BE RUN LOCALLY
it'd be great if it didnt :sob:

In [None]:
# Step 1: Fill in the praat template for this subject/block.
# Set the Mount point (this will need to be run locally)
# NOTE(review): `numbered_wavdir` and `mount_point` are not defined in the visible
# part of this notebook - they must be set before running this cell.
numbered_wav_fp = str(numbered_wavdir) + '/'
with open('./preproc_func/formants/praat_template.txt', 'r') as template_file:
    # Substitute both placeholders in every template line
    newls = [
        template_line.replace('{filepath}', numbered_wav_fp).replace('{mount_point}', mount_point)
        for template_line in template_file
    ]

with open(f'./preproc_func/formants/praat_scripts/praat_{subject}_{block}.txt', 'w') as script_file:
    script_file.writelines(newls)

#### Instructions for formant extraction from here: 
1. Open Praat
2. On mac, click on Praat, and hit Open Praat Script
3. Load up the script
4. Hit run script. 

In [None]:
# Recreate an empty output directory for the formant results
formant_fp = preproc_root /  subject_block / 'formants'
!rm -rf $formant_fp
formant_fp.mkdir(parents=True, exist_ok=True)

In [None]:
from formants.postprocess_formants import postprocess_formants

In [None]:
from file_org import rename_files
# NOTE(review): `numbered_wavdir` and `ind_to_time` are not defined in the visible
# part of this notebook - they must be set before running this cell.
postprocess_formants(numbered_wavdir, formant_fp)
rename_files(str(formant_fp), ind_to_time)

### **4.6. Get spectrograms**
- Please note that this can be pretty slow

In [None]:
# Output directory for the spectrograms; created if missing. Existing content is
# kept because the rm line below is commented out.
spec_dir = preproc_root / subject_block / 'spectrograms' 
# !rm -rf $spec_dir
spec_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Run the TIMIT spectrogram extraction code on the audio clips in chunk_wav_dir.
# The following cell reads its output from results.mat in spec_dir.
from spectrograms.run_spectrogram_extraction import setup_spectrogram_script
setup_spectrogram_script(str(chunk_wav_dir),str(spec_dir))

In [None]:
# Process the results: 
from scipy.io import loadmat

# Load the spectrogram extraction output written to results.mat in spec_dir
outstruct = loadmat(os.path.join(spec_dir, 'results.mat'))

import matplotlib.pyplot as plt

data = outstruct['out'][0]

# Save one .npy file per clip, named after the clip's start and end times
for entry in data:
    if entry[0].shape[-1] == 0:
        # Empty spectrogram for this clip: nothing to save
        continue
    spec = entry[0].T
    file = entry[1][0]
    name_parts = file.split('_')
    start_time = float(name_parts[1])
    end_time = float(name_parts[-1].replace('.wav', ''))
    out_name = 'stim_%.3f_%.3f' % (start_time, end_time)
    # Drop the first and last 100 rows of the transposed spectrogram before saving
    np.save(os.path.join(spec_dir, out_name), spec[100:-100])

### **4.7 Timitification of the features (get them to match the TIMIT output files; place and manner labels mostly)**

## **5. Collect all the files you created, and read them out!**

In [None]:
import os

In [None]:
def adjust_timings(file, start, end, expected_offset):
    """
    Rename a feature file whose length disagrees with the times in its name.

    The array stored in ``file`` is loaded and its longest dimension is compared
    with the number of samples implied by ``end - start`` at 100 samples per
    second. When they differ, the file is renamed to ``stim_<start>_<end>.npy``
    in the same directory, where the new start is ``start + expected_offset`` and
    the new end is the new start plus the array length divided by 100.

    Inputs: file - the full filepath
    start - the start time that the file currently has
    end - the end time that the file currently has
    expected_offset - how far off we routinely expect the file to be - a single
        number (callers look it up per feature, e.g. ``expected_offsets.get(k, 0)``).

    Returns None. Files that cannot be loaded, or whose times could not be
    parsed (``start``/``end`` is None), are reported and left untouched.
    """
    try:
        arr = np.load(file)
    except Exception:
        # Deliberately best-effort: result directories may hold files that are
        # not numpy arrays; report them and move on.
        print('could not load file', file)
        return
    if start is None or end is None:
        print('could not parse times for file', file)
        return
    # Round to the nearest sample count. Truncating with int() alone is wrong
    # here: e.g. 0.29 * 100 == 28.999999999999996 would become 28, not 29.
    expected_shape = int(round((end - start) * 100))

    if np.max(arr.shape) != expected_shape:
        start = start + expected_offset
        new_end = np.round(start, 3) + np.max(arr.shape)/100
        newfile = 'stim_%.3f_%.3f.npy' %(start, new_end)
        newfile = str(os.path.join(os.path.dirname(file), newfile))
        os.rename(str(file),  newfile)

In [None]:
# Make a mapping from the feature names, to the dir within that feature names output that has results!
# For each feature: sub-directory (inside that feature's output folder) holding the
# result files, or None when the results sit directly in the feature folder.
resdir_dict = dict(
    acoustic_feats=None,
    artics='artic_files/F01_indep_Haskins_loss_90_filter_fix_bn_False_0_setting2',
    formants=None,
    phones='results/speaker1',
    pitch=None,
    spectrograms=None,
)

# For each feature: names of the individual feature channels.
resfeat_dict = dict(
    acoustic_feats=['peakRate', 'peakEnv', 'env', 'F0_raw'],
    # NOTE(review): the original comment said "the last one is lip aperture";
    # presumably that refers to 'la' - confirm.
    artics=[
        'ttx', 'tty', 'tdx', 'tdy', 'tbx', 'tby',
        'lix', 'liy', 'ulx', 'uly', 'llx', 'lly',
        'la', 'pro', 'tbcl', 'vx', 'vy',
    ],
    formants=['f1', 'f2', 'f3'],
    pitch=['rel-pitch', 'pitchMin', 'pitchMax', 'pitchUp', 'pitchDown'],
    spectrograms=[f'spec_{k}' for k in range(80)],  # Not sure the frequency bins as of now.
    phones=None,  # need to use textgrids still for these :D
)

# Per-feature time offset passed to adjust_timings
expected_offsets = dict(formants=0.025)

def time_from_resfile(file):
    """
    Parse the start and end times out of a result file name.

    Two layouts are recognised: ``<prefix>_<t0>_<t1>.<ext>`` (three
    underscore-separated parts, e.g. stim_t0_t1.mat) and ``<t0>_<t1>.<ext>``
    (two parts). Returns the (t0, t1) tuple as floats, or (None, None) when the
    name has any other number of underscore-separated parts.
    """
    parts = file.split('_')
    if len(parts) not in (2, 3):
        return None, None

    # In both layouts the times are the last two parts of the name
    start_text, end_text = parts[-2], parts[-1]
    t0 = float(start_text)
    # Keep only the first two dot-separated pieces of the last part, i.e. the
    # number without the trailing file extension
    t1 = float('.'.join(end_text.split('.')[:2]))
    return t0, t1
    
basefp = str(preproc_root / subject_block)

# Housekeeping - correct filetimes
# Walk every feature's result directory and let adjust_timings rename files whose
# length does not match the times in their name ('phones' files are left alone).
for k, v in resdir_dict.items():
    resfp = os.path.join(basefp, k) if v is None else os.path.join(basefp, k, v)
    files = [f for f in sorted(os.listdir(resfp)) if f != 'results.mat']
    features_loaded = resfeat_dict[k]
    if k == 'phones':
        continue
    feature_offset = expected_offsets.get(k, 0)
    for file in files:
        t0, t1 = time_from_resfile(file)
        adjust_timings(os.path.join(resfp, file), t0, t1, feature_offset)

    ### Adjust the timings based on whats in the files

In [None]:
# Debug cell: print how many result files the pitch output directory holds.
for k, v in resdir_dict.items():
    # Only look at the 'pitch' feature
    if not k == 'pitch':
        continue
    if v is None: 
        resfp = os.path.join(basefp, k)
    else: 
        resfp = os.path.join(basefp, k, v)
    files = sorted(os.listdir(resfp))
    files = [f for f in files if not f == 'results.mat']
    print(len(files))
    # The loop stops here; everything below the break is unreachable.
    # NOTE(review): remove the break to re-run the timing adjustment for pitch,
    # or delete the dead code if it is no longer needed.
    break
    features_loaded = resfeat_dict[k]
    if not k == 'phones':
        for file in files:
            adjust_timings(os.path.join(resfp,  file), 
                           time_from_resfile(file)[0], time_from_resfile(file)[1], 
                          expected_offsets.get(k, 0))