# Wrangle Buckeye Ratings

## Goal

We need to find monologues from speakers that are in range with Anna's Corpus (16.855625 - 77.089875 secs).
This entails iterating through each speaker, their tracks, and their turns, to identify segments that might fit the time restriction. Let's find their turns first. 

In [1]:
import buckeye
import pandas as pd

## Find specific speaker

In [2]:
from tqdm import tqdm
import librosa
import soundfile

In [3]:
def trim(turn):
    '''
    Trim the <> tokens from beginning and end of turns
    return: trimmed_txt, front_trim, end_trim
    '''
        
    front_cursor = 0
    back_cursor = -1
        
    turn_list = turn.split(' ')
        
    try:## trim front
        while ("<" in turn_list[front_cursor]):
            front_cursor += 1
            
        while ("<" in turn_list[back_cursor]):
            back_cursor -= 1
    
    except:
        return [], turn_list[:front_cursor], []
            
    front_trim = turn_list[:front_cursor]    
        
    if back_cursor == -1:
        trimmed_txt = turn_list[front_cursor:]
        end_trim = []
    else:
        trimmed_txt = turn_list[front_cursor:back_cursor+1]
        end_trim = turn_list[back_cursor+1:]
        
    #print(trimmed_txt, front_trim, end_trim)
        
    if trimmed_txt[-1] == '':
        return trimmed_txt[:-1], front_trim, end_trim
        
    return trimmed_txt, front_trim, end_trim

In [4]:
def turn_segments(track):
    '''
    Find segments of valid turns in a track
    
    input: 
    track
    
    output:
    df containing turns and their time markers
    
    '''
    
    def get_symbol(word):
        '''
        Get a word and find entry/orthography
        '''
        try:
            return word.orthography
        except:
            return word.entry
    
    def five_symbol_match(match_target, words):
        '''
        given a match_target list, find the positions of a three word match
        '''
        
        pos = 0
        
        while True:
            
            try:
                assert get_symbol(words[pos]) == match_target[0]
                assert get_symbol(words[pos+1]) == match_target[1]
                assert get_symbol(words[pos+2]) == match_target[2]
                assert get_symbol(words[pos+3]) == match_target[3]
                assert get_symbol(words[pos+4]) == match_target[4]
                return {'strt':pos, 'end':pos+4}
                
            except:
                pos += 1
                if pos > len(words):
                    raise Exception('No Match')
            
    turns = {
        'words':[],
        'strt':[],
        'strt_t':[],
        'end':[],
        'end_t':[],
        'utt':[],
        'dur':[],
        'word_max':[]
    }
    
    ### iterate through each turn
    for turn in track.txt:
        
        ## for each turn, trim it first
        trimmed_turn, front_trim, end_trim = trim(turn)
        #print(trimmed_turn)
        
        ## if turn has less than 10 symbols after trim, continue to next turn
        if len(trimmed_turn) < 10:
            continue
            
        else:
            
            try:
                ## attempt to start 3 word symbol match
                strt = five_symbol_match(trimmed_turn[:5], track.words)['strt'] ## make sure strt right place
                end = five_symbol_match(trimmed_turn[-5:], track.words)['end'] ## make sure end right place

                ## that the strt pointer is the first symbol of trimmed turn
                assert get_symbol(track.words[strt]) == trimmed_turn[0] 
                ## that the end pointer is the end symbol of the trimmed turn
                assert get_symbol(track.words[end]) == trimmed_turn[-1]

                word_list = track.words[strt:end + 1]

                assert trimmed_turn[0] == get_symbol(word_list[0])
                assert trimmed_turn[-1] == get_symbol(word_list[-1])
                
                turns['words'].append(' '.join([get_symbol(word) for word in word_list]))
                ## end symbol of trim should be the end symbol of whatever we added to words
                turns['strt'].append(strt)
                turns['strt_t'].append(track.words[strt].beg)
                turns['end'].append(end)
                turns['end_t'].append(track.words[end].end)
                turns['utt'].append(len(word_list))
                turns['dur'].append(track.words[end].end - track.words[strt].beg)
                turns['word_max'].append(max([word.dur for word in word_list]))
                
            except Exception as e:
                print(e)
                continue      
    
    return pd.DataFrame(turns)

In [5]:
from tqdm import tqdm

In [6]:
track_segs = []
corpus = buckeye.corpus('/mnt/cube/Datasets/buckeye_zips/', load_wavs = True)

for speaker in corpus:
    print(speaker)
    track_id = 0
    for track in speaker:
        #print(track)
        track_df = turn_segments(track)
        track_df['speaker'] = speaker.name
        track_df['track'] = track.name
        track_df['track_id'] = track_id
        track_segs.append(track_df)
        
        track_id += 1
        
track_segs = pd.concat(track_segs)

<Speaker s01 (f, y)>
<Speaker s02 (f, o)>
<Speaker s03 (m, o)>
<Speaker s04 (f, y)>
<Speaker s05 (f, o)>
<Speaker s06 (m, y)>
No Match
<Speaker s07 (f, o)>
No Match
<Speaker s08 (f, y)>
No Match
<Speaker s09 (f, y)>
<Speaker s10 (m, o)>
No Match
No Match
No Match
No Match
<Speaker s11 (m, y)>
No Match
<Speaker s12 (f, y)>
<Speaker s13 (m, y)>
No Match
No Match
No Match
No Match
No Match
No Match
No Match
No Match
No Match
No Match
No Match
No Match
<Speaker s14 (f, o)>
<Speaker s15 (m, y)>
No Match
<Speaker s16 (f, o)>
No Match
<Speaker s17 (f, o)>
<Speaker s18 (f, o)>
No Match
list index out of range
<Speaker s19 (m, o)>
list index out of range
<Speaker s20 (f, o)>
<Speaker s21 (f, y)>
<Speaker s22 (m, o)>
<Speaker s23 (m, o)>
<Speaker s24 (m, o)>
No Match
No Match
No Match
<Speaker s25 (f, o)>
<Speaker s26 (f, y)>
<Speaker s27 (f, o)>
No Match
No Match
No Match
No Match
<Speaker s28 (m, y)>
list index out of range
list index out of range
list index out of range
<Speaker s29 (m, o)>
l

## Sort through turns

In [7]:
valid_turns = track_segs[
    (track_segs.dur > 16.855625) & ## within anna's range
    (track_segs.dur < 77.089875) & ## within anna's range
    (track_segs.utt > 10) & ## make sure there are enough tokens in the turn
    (track_segs.word_max < 2) ## max word length isn't over 2 sec
]

In [8]:
valid_turns.head()

Unnamed: 0,words,strt,strt_t,end,end_t,utt,dur,word_max,speaker,track,track_id
1,um i'm a um <VOCNOISE> <SIL> it's kind of a un...,40,56.598353,142,84.512061,103,27.913708,1.435416,s01,s0101a,0
2,um <VOCNOISE> working on three different <SIL>...,144,90.45715,214,110.163138,71,19.705988,0.788701,s01,s0101a,0
5,a nurse practitioner has more authority i gues...,270,144.551487,350,168.275562,81,23.724075,0.813652,s01,s0101a,0
8,yes <VOCNOISE> i uh <SIL> um <SIL> uh <VOCNOIS...,441,225.26718,532,247.699562,92,22.432382,0.855013,s01,s0101a,0
9,for me personally <SIL> um <SIL> family being ...,540,266.802112,659,299.150227,120,32.348115,1.640866,s01,s0101a,0


In [9]:
len(valid_turns)

1132

In [10]:
valid_turns.to_pickle('/mnt/cube/j8xing/buckeye_ser/data/interim/candidate_turns.pkl')

## Save Candidates

In [11]:
from tqdm import tqdm
import librosa
import soundfile

In [12]:
for i, row in tqdm(valid_turns.iterrows()):
    if i > 10:
        pass
    speaker = buckeye.Speaker.from_zip(f'/mnt/cube/Datasets/buckeye_zips/{row.speaker}.zip', load_wavs = True)
    speaker[row.track_id].clip_wav(
        f'/mnt/cube/j8xing/buckeye_ser/data/interim/buckeye_candidates/{row.track}_{row.strt}_{row.end}.wav', 
        row.strt_t, 
        row.end_t
    )

1132it [14:50,  1.27it/s]
