In [None]:
from pydub import AudioSegment
from pydub.playback import play
import os, sys
import numpy as np
import pandas as pd

In [None]:
EXPERIMENT_NAME = 'pilot-version-04'
TASK = 'black'

percent_sampled = 0.25 # number of items to sample for each subject
n_counts_per_item = 25 # number of times items are seen across subjects

# set directories
base_dir = '/dartfs/rc/lab/F/FinnLab/tommy/isc_asynchrony_behavioral/'
preproc_dir = os.path.join(base_dir, 'stimuli', 'preprocessed')
task_out_dir = os.path.join(base_dir, 'stimuli', 'presentation_orders', EXPERIMENT_NAME, TASK)

if not os.path.exists(task_out_dir):
    os.makedirs(task_out_dir)

# load preprocessed transcript
df_task_preproc_fn = os.path.join(preproc_dir, TASK, f'{TASK}_transcript_preprocessed')
df_preproc = pd.read_csv(f'{df_task_preproc_fn}.csv')

# find indices for presentation and set number of items each subject sees
nwp_indices = np.where(df_preproc['NWP_Candidate'])[0]
n_items_per_subject = round(len(nwp_indices) * percent_sampled)

In [None]:
def get_cut_times(df, start_idx, end_idx):
    
    onset = df.iloc[start_idx]['Onset']
    offset = df.iloc[end_idx]['Onset']
    
    duration = offset - onset
    
    return onset, offset, duration

In [None]:
stim = AudioSegment.from_file(stim_fn)
stim_length = stim.duration_seconds

In [None]:
task = 'black'

base_dir = '/dartfs/rc/lab/F/FinnLab/tommy/isc_asynchrony_behavioral/'
stim_dir = os.path.join(base_dir, 'stimuli')
out_dir = os.path.join(stim_dir, 'cut_audio', task)

stim_fn = os.path.join(stim_dir, 'audio', f'{task}_audio.wav')

if not os.path.exists(out_dir):
    os.makedirs(out_dir)

In [None]:
for i, curr_idx in enumerate(nwp_indices):
    # if we're on the first index we use the start of the file
    if i == 0:
        _, offset, _ = get_cut_times(df_preproc, 0, curr_idx)
        onset = 0
        duration = offset
    elif i == len(nwp_indices):
        onset, _, _ = get_cut_times(df_preproc, curr_idx, curr_idx)
        duration = stim_length - onset
    else:
        prev_idx = nwp_indices[i-1]
        onset, _, duration = get_cut_times(df_preproc, prev_idx, curr_idx)
    
    out_fn = os.path.join(out_dir, f'{task}_segment-{str(i+1).zfill(5)}.wav')
    cmd = f'ffmpeg -y -ss {onset} -t {duration} -i {stim_fn} {out_fn}'
    subprocess.run(cmd, shell=True)

In [None]:
cmd = f'ffmpeg -ss 792.350000 -t 2 -i {stim_fn} end.wav'

subprocess.run(cmd, shell=True)

In [None]:
df_preproc.iloc[nwp_indices]

In [None]:
import subprocess

In [None]:
cmd = f'ffmpeg -ss 10 -t 10 -i {stim_fn} test.mp4'

subprocess.run(cmd, shell=True)

In [None]:
audio = AudioSegment.from_wav(stim_fn)

In [None]:
# pydub does things in milliseconds
ten_seconds = 10 * 1000
twenty_seconds = 20 * 1000

cut = audio[ten_seconds:twenty_seconds]


In [None]:
cut.export("test.mp4", format="mp4")

# Test consecutive constraint

In [1]:
import sys, os
import numpy as np
import pandas as pd
from collections import Counter
import argparse

sys.path.append('../utils/')

from config import *
from randomization_utils import create_balanced_orders, get_consecutive_list_idxs, sort_consecutive_constraint


In [2]:
parser = argparse.ArgumentParser()
parser.add_argument('-n', '--experiment_name', type=str)
parser.add_argument('-t', '--task', type=str)
parser.add_argument('-p', '--percent_sampled', type=float, default=0.25) # percentage of items to sample for each subject
parser.add_argument('-c', '--n_participants_per_item', type=int, default=25) # number of times items are seen across subjects
parser.add_argument('-i', '--consecutive_spacing', type=int, default=2) # number of times items are seen across subjects

p = parser.parse_args([f'-ntest', f'-tblack'])
                   
                   

In [3]:
# set directories
preproc_dir = os.path.join(BASE_DIR, 'stimuli', 'preprocessed')
task_out_dir = os.path.join(BASE_DIR, 'stimuli', 'presentation_orders', p.experiment_name, p.task, 'preproc')

if not os.path.exists(task_out_dir):
    os.makedirs(task_out_dir)

# load preprocessed transcript
df_task_preproc_fn = os.path.join(preproc_dir, p.task, f'{p.task}_transcript-preprocessed')
df_preproc = pd.read_csv(f'{df_task_preproc_fn}.csv')

In [16]:
# find indices for presentation and set number of items each subject sees
nwp_indices = np.where(df_preproc['NWP_Candidate'])[0]
n_items_per_subject = round(len(nwp_indices) * p.percent_sampled)

# create experiment structure for subjects --> sort the indices
subject_experiment_orders = create_balanced_orders(items=nwp_indices, n_elements_per_subject=n_items_per_subject, use_each_times=p.n_participants_per_item)
subject_experiment_orders = list(map(sorted, subject_experiment_orders))


Creating orders for 100 subjects


In [19]:
%load_ext autoreload
%autoreload 2
from preproc_utils import sort_consecutive_constraint, check_consecutive_spacing

orders = sort_consecutive_constraint(subject_experiment_orders, consecutive_spacing=2)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [23]:
import os, sys
from praatio import textgrid as tgio

# sys.path.append('../utils/')

from config import *
from preproc_utils import update_dataframe_from_praat, dataframe_to_textgrid, get_cut_times, cut_audio_segments

[nltk_data] Downloading package tagsets to /dartfs-
[nltk_data]     hpc/rc/home/w/f003rjw/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!
[nltk_data] Downloading package stopwords to /dartfs-
[nltk_data]     hpc/rc/home/w/f003rjw/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to /dartfs-
[nltk_data]     hpc/rc/home/w/f003rjw/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /dartfs-
[nltk_data]     hpc/rc/home/w/f003rjw/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /dartfs-
[nltk_data]     hpc/rc/home/w/f003rjw/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /dartfs-
[nltk_data]     hpc/rc/home/w/f003rjw/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [24]:
task = 'black'

# set directories
stim_dir = os.path.join(BASE_DIR, 'stimuli')
preproc_dir = os.path.join(stim_dir, 'preprocessed')

# load preprocessed transcript and find indices that are to be predicted
df_preproc_fn = os.path.join(preproc_dir, task, f'{task}_transcript-preprocessed.csv')
df_preproc = pd.read_csv(df_preproc_fn)

## Segments are defined as follows
##  - Start = where a previous segment left of --> will contain the prior segment's predicted word
##  - Stop = ending right before a word prediction
## Therefore adjusting the end of one will cause a shift in the subsequent segment time

# create dataframe that accompanies written audio segments
# get segment file if it exists
df_segments_fn = os.path.join(preproc_dir, task, f'{task}_transcript-segments.csv')
praat_fn = os.path.join(preproc_dir, task, f'{task}_transcript-praat.TextGrid')

# if a textgrid file exists, we open it and use it in to adjust the times
if os.path.exists(praat_fn):
    tg = tgio.openTextgrid(praat_fn, False)
#     df_preproc = update_dataframe_from_praat(df_preproc, tg)
# else:

In [34]:
df = df_preproc.copy()

for idx in range(len(df)):

    word = tg.getTier('word').entries[idx]

    df.loc[idx, 'Onset'] = word.start
    df.loc[idx, 'Offset'] = word.end
    df.loc[idx, 'Duration'] = word.end - word.start

In [35]:
df.loc[:25]

Unnamed: 0,Word_Written,Case,POS,POS_Definition,Punctuation,Stop_Word,Word_Vocab,Onset,Offset,Duration,Named_Entity,NWP_Candidate
0,So,success,RB,adverb,,True,So,0.24,0.63,0.39,False,False
1,I,success,PRP,"pronoun, personal",,True,I,0.68,1.26,0.58,False,False
2,was,success,VBD,"verb, past tense",,True,was,1.96,2.273722,0.313722,False,False
3,a,success,DT,determiner,,True,a,2.273722,2.45,0.176278,False,False
4,junior,success,JJ,"adjective or numeral, ordinal",,False,junior,2.46,3.14,0.68,False,True
5,in,success,IN,"preposition or conjunction, subordinating",,True,in,3.14,3.41,0.27,False,False
6,college,success,NN,"noun, common, singular or mass",,False,college,3.41,4.2,0.79,False,True
7,when,success,WRB,Wh-adverb,,True,when,4.79,5.02,0.23,False,False
8,I,success,PRP,"pronoun, personal",,True,I,5.02,5.09,0.07,False,False
9,got,success,VBD,"verb, past tense",,False,got,5.09,5.33,0.24,False,True


In [175]:
df_segments = pd.read_csv(df_segments_fn)

In [36]:
prediction_idxs = np.where(df_preproc['NWP_Candidate'])[0]


1540

In [192]:
last_idx = df_preproc.index[-1]
if last_idx not in prediction_idxs:
    print ('Here')
    prediction_idxs = np.append(prediction_idxs, last_idx)

In [193]:
prediction_idxs

array([   4,    6,    9,   11,   12,   13,   16,   19,   24,   26,   28,
         32,   34,   35,   36,   39,   40,   46,   49,   51,   52,   53,
         56,   58,   60,   61,   64,   70,   75,   77,   81,   85,   87,
         90,   91,   92,   95,   99,  100,  101,  102,  104,  106,  111,
        113,  116,  117,  118,  119,  123,  125,  129,  131,  133,  135,
        136,  143,  144,  148,  150,  154,  156,  157,  159,  163,  164,
        166,  167,  171,  172,  174,  175,  176,  179,  180,  181,  184,
        185,  187,  190,  192,  194,  196,  198,  201,  203,  208,  212,
        214,  216,  217,  220,  221,  222,  224,  225,  226,  227,  235,
        236,  239,  244,  248,  253,  255,  257,  259,  263,  265,  266,
        268,  269,  272,  276,  279,  281,  285,  295,  296,  297,  299,
        302,  305,  308,  312,  315,  317,  319,  323,  324,  325,  327,
        328,  329,  331,  332,  334,  337,  340,  341,  343,  344,  346,
        347,  349,  354,  355,  356,  358,  361,  3

In [40]:
split_dfs = np.split(df_preproc, prediction_idxs)

In [147]:
sample_columns = ['Word_Written', 'Punctuation', 'Onset', 'Offset', 'Duration']

sampled = split_dfs[0][sample_columns]

In [156]:
for df in split_dfs:
    if df.empty:
        print (df)

In [165]:
df['Onset'].iloc[0]

792.35

In [148]:
sampled[['Norm_Onset', 'Norm_Offset']] = sampled[['Onset', 'Offset']] - sampled['Onset'][0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [170]:
df = pd.DataFrame(columns=['filename', 'critical_word', 'transcript_index', 'clip_words'])

In [171]:
df.loc[len(df)] = {
    'filename': 'test',
    'critical_word': 'test',
    'transcript_index': 1,
    'clip_words': sample_json
}

In [174]:
df['clip_words'].iloc[0]

'[{"Word_Written":"So","Punctuation":" ","Onset":0.24,"Offset":0.63,"Duration":0.39,"Norm_Onset":0.0,"Norm_Offset":0.39},{"Word_Written":"I","Punctuation":" ","Onset":0.68,"Offset":1.26,"Duration":0.58,"Norm_Onset":0.44,"Norm_Offset":1.02},{"Word_Written":"was","Punctuation":" ","Onset":1.96,"Offset":2.3,"Duration":0.34,"Norm_Onset":1.72,"Norm_Offset":2.06},{"Word_Written":"a","Punctuation":" ","Onset":2.3,"Offset":2.45,"Duration":0.15,"Norm_Onset":2.06,"Norm_Offset":2.21}]'

In [143]:
split_dfs[0].loc[0, 'test'] = 1

In [145]:
split_dfs[0]

Unnamed: 0,Word_Written,Case,POS,POS_Definition,Punctuation,Stop_Word,Word_Vocab,Onset,Offset,Duration,Named_Entity,NWP_Candidate,test
0,So,success,RB,adverb,,True,So,0.24,0.63,0.39,False,False,1.0
1,I,success,PRP,"pronoun, personal",,True,I,0.68,1.26,0.58,False,False,
2,was,success,VBD,"verb, past tense",,True,was,1.96,2.3,0.34,False,False,
3,a,success,DT,determiner,,True,a,2.3,2.45,0.15,False,False,


In [134]:
sample_json = sampled.to_json(orient='records')

In [168]:
sample_json

'[{"Word_Written":"So","Punctuation":" ","Onset":0.24,"Offset":0.63,"Duration":0.39,"Norm_Onset":0.0,"Norm_Offset":0.39},{"Word_Written":"I","Punctuation":" ","Onset":0.68,"Offset":1.26,"Duration":0.58,"Norm_Onset":0.44,"Norm_Offset":1.02},{"Word_Written":"was","Punctuation":" ","Onset":1.96,"Offset":2.3,"Duration":0.34,"Norm_Onset":1.72,"Norm_Offset":2.06},{"Word_Written":"a","Punctuation":" ","Onset":2.3,"Offset":2.45,"Duration":0.15,"Norm_Onset":2.06,"Norm_Offset":2.21}]'