# Notebook purpose:
    - read in raw sampling data and preprocess into dataframes with error rates that will be used for analyses

In [None]:
import os, sys
import itertools
import numpy as np
import pandas as pd
import statistics
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
from daylongtranscript import*

loaded


## Data Processing

In [None]:
# read in transcripts 
## NOTE: need to create a tab-delimited txt file of the transcripts used to run this section of code (use clean_transcript.py to do this)
transcript_fpath = "<insert transcript file path here>"
A787_files = ["A787_001107_cleaned.txt", "A787_001109_cleaned.txt", "A787_001111_cleaned.txt"]

transcriptA1 = DaylongTranscript(fpath = transcript_fpath+A787_files[0], fname = A787_files[0], isVanDam=False)
transcriptA1.describe()
transcriptA2 = DaylongTranscript(fpath = transcript_fpath+A787_files[1], fname = A787_files[1], isVanDam=False)
transcriptA2.describe()
transcriptA3 = DaylongTranscript(fpath = transcript_fpath+A787_files[2], fname = A787_files[2], isVanDam=False)
transcriptA3.describe()

Transcript:  A787_001107_cleaned.txt ---
	Audio Length:  45371456 ms //  756.191  min
	Total Word Count:  17411
	Intervals of silence: [[2895030, 37024008]]
			 34128978 ms // 568.816 min total silent intervals
			 187.375 min total speaking interval
Transcript:  A787_001109_cleaned.txt ---
	Audio Length:  40999367 ms //  683.323  min
	Total Word Count:  28123
	Intervals of silence: [[10515823, 15926622], [18771272, 33774998]]
			 20414525 ms // 340.242 min total silent intervals
			 343.081 min total speaking interval
Transcript:  A787_001111_cleaned.txt ---
	Audio Length:  42204520 ms //  703.409  min
	Total Word Count:  38531
	Intervals of silence: [[12545915, 14697605], [26935872, 34656022]]
			 9871840 ms // 164.531 min total silent intervals
			 538.878 min total speaking interval


In [None]:
transcript_fpath = "<insert transcript file path here>"
B895_files = ["B895_010002_cleaned.txt", "B895_010004_cleaned.txt"]
transcriptB1 = DaylongTranscript(fpath = transcript_fpath+B895_files[0], fname = B895_files[0], isVanDam=False)
transcriptB1.describe()

transcriptB2 = DaylongTranscript(fpath = transcript_fpath+B895_files[1], fname = B895_files[1], isVanDam=False)
transcriptB2.describe()

Transcript:  B895_010002_cleaned.txt ---
	Audio Length:  44793268 ms //  746.554  min
	Total Word Count:  47192
	Intervals of silence: []
			 0 ms // 0.0 min total silent intervals
			 746.554 min total speaking interval
Transcript:  B895_010004_cleaned.txt ---
	Audio Length:  41213604 ms //  686.893  min
	Total Word Count:  29978
	Intervals of silence: [[13917905, 23559004], [25764448, 28517580]]
			 12394231 ms // 206.571 min total silent intervals
			 480.323 min total speaking interval


In [None]:
transcript_fpath = "<insert transcript file path here>"
fname = "BN32_clean.txt"
transcriptC = DaylongTranscript(fpath = transcript_fpath+fname, fname = fname, isVanDam = True)
transcriptC.describe()

Transcript:  BN32_clean.txt ---
	Audio Length:  50397134 ms //  839.952  min
	Total Word Count:  28345
	Intervals of silence: [[5629249, 11571991], [17158458, 22717854], [27965209, 32094759], [47756032, 50330885]]
			 18206541 ms // 303.442 min total silent intervals
			 536.51 min total speaking interval


In [7]:
transcripts = [transcriptA1, transcriptA2, transcriptA3, transcriptB1, transcriptB2, transcriptC]

In [8]:
TRANSCRIPTA1_TRUEWC = transcriptA1.get_total_word_count()
TRANSCRIPTA2_TRUEWC = transcriptA2.get_total_word_count()
TRANSCRIPTA3_TRUEWC = transcriptA3.get_total_word_count()
TRANSCRIPTB1_TRUEWC = transcriptB1.get_total_word_count()
TRANSCRIPTB2_TRUEWC = transcriptB2.get_total_word_count()
TRANSCRIPTC_TRUEWC = transcriptC.get_total_word_count()
SIM_TYPE = ["30 seconds", "1 minute", "5 minutes", "10 minutes", "30 minutes", "60 minutes"]
TOTAL_TS = ["30", "40", "50", "60", "70", "80", "90", "100", "110", "120"] # in minutes 
TOTAL_TS_3HR = ["130", '140', '150', '160', '170', '180'] # in minutes
PROP_TTS = ["0.05","0.1", "0.2", "0.3", "0.4", "0.5", "0.6", "0.7", "0.8", "0.9"] 
TRANSCRIPT_LABELS = ["A1", "A2", "A3", "B1", "B2", "C"]
#ALGORITHMS_VARIATIONS = ["1:Conservative", "2:Overlapping", "3:Overlap + Silence", "4:Sample Half"]

In [9]:
#returns correct variable name given interval length
def get_simulation_type(interval_length):
    if interval_length == 30000.0: return 0
    if interval_length == 60000.0: return 1
    if interval_length == 300000.0: return 2
    if interval_length == 600000.0: return 3
    if interval_length == 1800000.0: return 4
    if interval_length == 3600000.0: return 5 

# process data into a list of lists 
def process_data(data):
    sim_30000 = []   # 30 seconds
    sim_60000 = []  # 1 minute
    sim_300000 = []  # 5 minutes
    sim_600000 = []   # 10 minutes
    sim_1800000 = [] # 30 minutes --> will only have 4 data points (30 min, 60 min, 90 min, 120 min)
    sim_3600000 = []  # 60 minutes --> will only have 2 data points (60 min, 120 min)
    all_simulations = [sim_30000, sim_60000, sim_300000, sim_600000, sim_1800000, sim_3600000]
    for sim_type in all_simulations: sim_type = [] # resest all lists to empty 
    sim_type = None 
    estimates = []
    for index, line in enumerate(data):
        #print(line)
        tokens = line.split() 
        if len(tokens) == 0: pass
        elif tokens[0] == "Transcript:": continue #go to next line 
        elif len(tokens) == 2:
            sim_index = get_simulation_type(float(tokens[0]))
            sim_type = all_simulations[sim_index]
        else: #have line with estimates
            estimates = [float(i) for i in tokens] 
            sim_type.append(estimates)
    
    # sim 30 min & 60 min do not have all data points: rearrange data points so that:
        # sim 30 min = [esimate, Na, Na, estimate, Na, Na, estimate, Na, Na, estimate]
        # sim 60 min = [Na, Na, Na, estimate, Na, Na, Na, Na, Na, estimate]
    none_list = [None] * len(sim_30000)
    temp = [none_list] * 10
    temp[0] = sim_1800000[0]
    temp[3] = sim_1800000[1]
    temp[6] = sim_1800000[2]
    temp[9] = sim_1800000[3]
    all_simulations[4] = temp 
    
    none_list = [None] * len(sim_30000)
    temp = [none_list] * 10
    temp[3] = sim_3600000[0]
    temp[9] = sim_3600000[1]
    all_simulations[5] = temp 
    
    return all_simulations

In [10]:
COLOR_PAL_DICT = {'blue':'#0077BB', 'cyan': '#33BBEE', "teal": '#009988', 'orange':'#EE7733', 'red':'#CC3311', 'magenta':'#EE3377'}
COLOR_PAL_LIST = list(COLOR_PAL_DICT.values())
COLOR_PAL_DEF = ['#EE7733', '#0077BB', '#33BBEE', '#EE3377', '#CC3311', '#009988']

In [11]:
def make_dataframes(data):
    all_df_raw = []
    all_means = []
    all_stdevs = []
    for simtype in data:
        df = simtype_dataframe(simtype)
        means = get_means(df)
        stdevs = get_sd(df)

        all_df_raw.append(df)
        all_means.append(means)
        all_stdevs.append(stdevs)

    all_df_means = stats_df(all_means)
    all_df_sd = stats_df(all_stdevs)
    return all_df_raw, all_df_means, all_df_sd

# input: raw datapoints from simulation type (interval length)
# returns: dataframe; columns: total time sampled 
                    # rows:    data points (ex. daylong estimate)
def simtype_dataframe(simtype):
    #ts = ["30", "40", "50", "60", "70", "80", "90", "100", "110", "120"]
    df = pd.DataFrame(simtype)
    df = df.T 
    df.columns = TOTAL_TS  
    return df 

def convert_to_min(value):
    return (value / (60 * 1000))

def get_means(raw_df):
    means = []
    for col in raw_df.columns:
        means.append(statistics.mean(raw_df[col]))
    return means 

def get_sd(raw_df):
    stdevs = []
    for col in raw_df.columns:
        stdevs.append(statistics.pstdev(raw_df[col]))
    return stdevs 

def stats_df(list):
    df = pd.DataFrame(list)
    df = df.T
    df.columns = SIM_TYPE
    df.index = TOTAL_TS
    return df  

# Analysis: Daylong Estimate Accuracy as Percent Error  

## Preprocessing

In [12]:
def make_percent_error_data(filenamepath, true_WC):
    file = open(filenamepath, 'r')
    data = file.readlines()
    all_simulations = process_data(data)
    all_df_raw, all_df_means, all_df_sd =  make_dataframes(all_simulations)
    return get_percent_error(all_df_raw, true_WC)

def get_percent_error(data_raw, true_WC):
    all_df_perr = []
    perr_means = []
    perr_std = [] 
    for df in data_raw:
        df_err = abs( (df - true_WC ) / true_WC )
        #df_err = (df - true_WC ) / true_WC 
        df_perr = round((df_err * 100), 3) 
        all_df_perr.append(df_perr)
        perr_means.append(df_perr.mean())
        perr_std.append(df_perr.std())
        
    df_perr_means = (pd.DataFrame(perr_means)).T
    df_perr_means.columns = SIM_TYPE
    
    df_perr_std = (pd.DataFrame(perr_std)).T
    df_perr_std.columns = SIM_TYPE
    
    return [all_df_perr, df_perr_means, df_perr_std]

In [None]:
A1_perr_err = make_percent_error_data('Results/TotalTimeSampled_Raw/A787_001107_cleaned.txt_word count_TEST_daylong_estimates.txt',
                                                            true_WC = TRANSCRIPTA1_TRUEWC)

A2_perr_err = make_percent_error_data('Results/TotalTimeSampled_Raw/A787_001109_cleaned.txt_word count_TEST_daylong_estimates.txt',
                                                            true_WC = TRANSCRIPTA2_TRUEWC)

A3_perr_err = make_percent_error_data('Results/TotalTimeSampled_Raw/A787_001111_cleaned.txt_word count_TEST_daylong_estimates.txt',
                                                            true_WC = TRANSCRIPTA3_TRUEWC)

B1_perr_err = make_percent_error_data('Results/TotalTimeSampled_Raw/B895_010002_cleaned.txt_word count_TEST_daylong_estimates.txt',
                                                            true_WC = TRANSCRIPTB1_TRUEWC)

B2_perr_err = make_percent_error_data('Results/TotalTimeSampled_Raw/B895_010004_cleaned.txt_word count_TEST_daylong_estimates.txt',
                                                            true_WC = TRANSCRIPTB2_TRUEWC)

C_perr_err = make_percent_error_data('Results/TotalTimeSampled_Raw/BN32_clean.txt_word count_TEST_daylong_estimates.txt',
                                                            true_WC = TRANSCRIPTC_TRUEWC)

In [64]:
perr_err_data_all = [A1_perr_err, A2_perr_err, A3_perr_err, B1_perr_err, B2_perr_err, C_perr_err]

#### Combine results all into one csv 

In [None]:
#make the bigone 
samplingint_dict ={}
combined_dfs = []
for i, sampling_int in enumerate(SIM_TYPE): 
    per_sampling_int_dfs = []
    for t, transcripts in enumerate(perr_err_data_all): 
        df = transcripts[0][i]
        df["Transcript"] = TRANSCRIPT_LABELS[t]
        per_sampling_int_dfs.append(df)
    combined_df = pd.concat(per_sampling_int_dfs, ignore_index=True)
    combined_df["Sampling Interval Size"] = sampling_int
    samplingint_dict[sampling_int] = combined_df
    combined_dfs.append(combined_df)

thebigone = pd.concat(combined_dfs, ignore_index=True)
thebigone.to_csv("/Results/CSVS/allcombos_120min_percenterror.csv", sep = ",", header = True, index = False)

In [18]:
thebigone = pd.read_csv('DaylongSampling/Results/TotalTimeSampled_Raw/thebigone_percenterror.csv', header = 0 )
thebigone

Unnamed: 0,30,40,50,60,70,80,90,100,110,120,Transcript,Sampling Interval Size
0,7.654,8.103,10.352,11.690,10.092,7.457,13.274,1.034,6.773,0.659,A1,30 seconds
1,6.435,3.502,1.979,5.215,5.835,2.686,2.345,9.965,3.715,0.982,A1,30 seconds
2,8.802,1.206,11.385,9.520,3.329,6.381,4.593,4.820,3.324,1.924,A1,30 seconds
3,3.574,18.515,3.507,17.914,1.300,10.174,6.300,0.322,1.973,10.130,A1,30 seconds
4,3.502,1.511,4.047,3.233,6.681,4.968,10.859,13.387,1.533,5.690,A1,30 seconds
...,...,...,...,...,...,...,...,...,...,...,...,...
3595,,,,24.104,,,,,,11.659,C,60 minutes
3596,,,,9.750,,,,,,25.429,C,60 minutes
3597,,,,42.901,,,,,,30.282,C,60 minutes
3598,,,,2.427,,,,,,25.282,C,60 minutes


In [18]:
thebigone[thebigone["Sampling Interval Size"] == "30 seconds" ].describe()

Unnamed: 0,30,40,50,60,70,80,90,100,110,120
count,600.0,600.0,600.0,600.0,600.0,600.0,600.0,600.0,600.0,600.0
mean,8.655332,7.399945,6.598867,5.94132,5.4496,5.567018,5.15341,4.93913,4.694667,4.354495
std,7.314723,6.037646,5.243022,5.025619,4.155191,4.392495,3.902589,3.818919,3.915816,3.492593
min,0.022,0.005,0.022,0.002,0.028,0.014,0.001,0.03,0.007,0.006
25%,3.313,2.711,2.66,2.202,2.143,2.21175,2.021,1.9375,1.80725,1.63675
50%,6.6285,5.8805,5.3035,4.839,4.6455,4.444,4.3785,4.1325,3.437,3.6535
75%,11.955,10.1545,9.1725,8.2615,7.719,8.06325,7.3245,7.15875,6.96775,6.32425
max,41.734,32.8,28.407,29.411,21.895,22.743,23.23,20.587,22.891,19.264


#### Process non-overlapping samples data

In [15]:
def make_perr_data_nonoverlapping(filenamepath, true_WC, tts = TOTAL_TS + TOTAL_TS_3HR, prop_tts = False):
    keys = [float(key) for key in tts]
    dict = {}

    file = open(filenamepath, 'r')
    lines = file.readlines()
 
    curr_key = ""
    for line in lines:       
        tokens = line.split()
        if tokens[0] == "Transcript:": ## new TTS key 
            if prop_tts: curr_key = float(tokens[5])
            else: curr_key = int(float(tokens[7]) / 60000)
            continue
        elif len(tokens) == 0: 
            continue
        elif len(tokens) == 2: 
            continue # don't need this line
        else: 
            tokens = [float(token) for token in tokens]
            dict.setdefault(curr_key, []).extend(tokens)
            curr_key = ""
    
    df = pd.DataFrame(dict)
    df_err = abs( (df - true_WC ) / true_WC )
    df_perr = round((df_err * 100), 3) 
    #perrs = get_percent_error(dict, true_WC)
    return df_perr
        

In [None]:
A1_perr_err_nonoverlapping = make_perr_data_nonoverlapping("Results/NonOverlappingIntervals/TTS_Proportion/A787_001107_cleaned.txt_word count_TEST_daylong_estimates.txt",
                                                     true_WC=TRANSCRIPTA1_TRUEWC, tts = PROP_TTS, prop_tts=True)
A1_perr_err_nonoverlapping["Transcript"] = "A1"

A2_perr_err_nonoverlapping = make_perr_data_nonoverlapping("Results/NonOverlappingIntervals/TTS_Proportion/A787_001109_cleaned.txt_word count_TEST_daylong_estimates.txt",
                                                     true_WC=TRANSCRIPTA2_TRUEWC, tts = PROP_TTS,prop_tts=True)
A2_perr_err_nonoverlapping["Transcript"] = "A2"

A3_perr_err_nonoverlapping = make_perr_data_nonoverlapping("Results/NonOverlappingIntervals/TTS_Proportion/A787_001111_cleaned.txt_word count_TEST_daylong_estimates.txt",
                                                     true_WC=TRANSCRIPTA3_TRUEWC, tts = PROP_TTS,prop_tts=True)
A3_perr_err_nonoverlapping["Transcript"] = "A3"

B1_perr_err_nonoverlapping = make_perr_data_nonoverlapping("Results/NonOverlappingIntervals/TTS_Proportion/B895_010002_cleaned.txt_word count_TEST_daylong_estimates.txt",
                                                     true_WC=TRANSCRIPTB1_TRUEWC, tts = PROP_TTS,prop_tts=True)
B1_perr_err_nonoverlapping["Transcript"] = "B1"

B2_perr_err_nonoverlapping = make_perr_data_nonoverlapping("Results/NonOverlappingIntervals/TTS_Proportion/B895_010004_cleaned.txt_word count_TEST_daylong_estimates.txt",
                                                     true_WC=TRANSCRIPTB2_TRUEWC, tts = PROP_TTS,prop_tts=True)
B2_perr_err_nonoverlapping["Transcript"] = "B2"

C_perr_err_nonoverlapping = make_perr_data_nonoverlapping("Results/NonOverlappingIntervals/TTS_Proportion/BN32_clean.txt_word count_TEST_daylong_estimates.txt",
                                                     true_WC=TRANSCRIPTC_TRUEWC, tts = PROP_TTS,prop_tts=True)
C_perr_err_nonoverlapping["Transcript"] = "C"

df_perr_err_nonoverlapping = pd.concat([A1_perr_err_nonoverlapping, A2_perr_err_nonoverlapping,A3_perr_err_nonoverlapping,
                                        B1_perr_err_nonoverlapping,B2_perr_err_nonoverlapping,C_perr_err_nonoverlapping])

In [17]:
df_perr_err_nonoverlapping

Unnamed: 0,0.05,0.1,0.2,0.3,Transcript,0.4,0.5,0.6
0,18.692,9.256,4.790,35.502,A1,,,
1,15.134,7.947,4.989,37.025,A1,,,
2,0.789,10.182,2.428,35.127,A1,,,
3,1.728,0.008,7.131,34.563,A1,,,
4,0.094,4.873,3.445,36.124,A1,,,
...,...,...,...,...,...,...,...,...
95,1.029,1.067,2.854,0.797,C,0.533,5.378,
96,4.154,10.036,1.247,3.794,C,2.554,3.557,
97,1.495,2.982,6.070,0.301,C,2.256,5.315,
98,17.749,4.724,1.653,2.374,C,0.949,4.170,


In [None]:
### write to file 
df_perr_err_nonoverlapping.to_csv("CSVS/daylongwc_nonoverlapping_samples_proptts.csv", sep = ",", header = True, index = False)

### Analysis: Total Sampled Time

In [20]:
def grp_means_stds(group):
    return group.mean(), group.std()

def indiv_figure(group, title):
    values_means, values_stds = grp_means_stds(group)
    values_stds = group.std()
    fig,ax = plt.subplots()
    ax.errorbar(x=values_means.index, y = values_means, yerr=values_stds)
    plt.xlabel("proportion time sampled (from entire recording)")
    plt.ylabel("mean percent error of estimate")
    plt.title(title)
    filename = title + "_propproptimesampled_V2"
    plt.savefig(filename, dpi=300)
    plt.show()

In [21]:
index = 0 # raw percent error data (all)
sampling_in = 0 # sampling interval = 30 seconds
perr_dfs = [A1_perr_err[index][sampling_in], A2_perr_err[index][sampling_in], A3_perr_err[index][sampling_in], B1_perr_err[index][sampling_in], B2_perr_err[index][sampling_in], C_perr_err[index][sampling_in]]
for i, df in enumerate(perr_dfs):
    df["Transcript"] = TRANSCRIPT_LABELS[i]
combined_perr_df = pd.concat(perr_dfs, ignore_index=True)
combined_perr_df

Unnamed: 0,30,40,50,60,70,80,90,100,110,120,Transcript
0,7.654,8.103,10.352,11.690,10.092,7.457,13.274,1.034,6.773,0.659,A1
1,6.435,3.502,1.979,5.215,5.835,2.686,2.345,9.965,3.715,0.982,A1
2,8.802,1.206,11.385,9.520,3.329,6.381,4.593,4.820,3.324,1.924,A1
3,3.574,18.515,3.507,17.914,1.300,10.174,6.300,0.322,1.973,10.130,A1
4,3.502,1.511,4.047,3.233,6.681,4.968,10.859,13.387,1.533,5.690,A1
...,...,...,...,...,...,...,...,...,...,...,...
595,0.440,7.159,1.870,3.441,4.820,2.640,6.753,1.670,2.590,0.933,C
596,17.790,10.803,4.982,0.282,1.602,6.398,0.334,3.384,0.817,8.551,C
597,9.588,2.826,3.014,7.636,5.185,8.129,1.054,0.961,4.206,1.165,C
598,3.788,7.463,14.703,4.761,4.374,14.091,5.786,12.677,0.059,6.938,C


In [None]:
# ADD 3 HOUR TTS DATA 
def perr_3hr(filenamepath, true_WC, transcript_name):
    file = open(filenamepath, 'r')
    data = file.readlines()
    dict_perr = {}
    for index, line in enumerate(data):
            #print(line)
            tokens = line.split() 
            if len(tokens) == 0: pass
            elif tokens[0] == "Transcript:": # get tts key 
                curr_key = convert_to_min(int(tokens[7]))
                continue #go to next line 
            elif len(tokens) == 2: continue
            else: 
                tokens_raw = [float(token) for token in tokens]
                error = [abs( (token - true_WC ) / true_WC ) for token in tokens_raw]
                percent_error = [round((err * 100), 3) for err in error]
                dict_perr.setdefault(str(int(curr_key)), []).extend(percent_error)
                curr_key = ""

    df = pd.DataFrame(data = dict_perr)
    df["Transcript"] = transcript_name
    return df


A1_perr_err_3hr = perr_3hr('Results/TotalTimeSampled_Raw_3hrs/A787_001107_cleaned.txt_word count_TEST_daylong_estimates.txt',
                                                            true_WC = TRANSCRIPTA1_TRUEWC, transcript_name="A1")

A2_perr_err_3hr = perr_3hr('Results/TotalTimeSampled_Raw_3hrs/A787_001109_cleaned.txt_word count_TEST_daylong_estimates.txt',
                                                            true_WC = TRANSCRIPTA2_TRUEWC, transcript_name = "A2")

A3_perr_err_3hr = perr_3hr('Results/TotalTimeSampled_Raw_3hrs/A787_001111_cleaned.txt_word count_TEST_daylong_estimates.txt',
                                                            true_WC = TRANSCRIPTA3_TRUEWC, transcript_name = "A3")

B1_perr_err_3hr = perr_3hr('Results/TotalTimeSampled_Raw_3hrs/B895_010002_cleaned.txt_word count_TEST_daylong_estimates.txt',
                                                            true_WC = TRANSCRIPTB1_TRUEWC, transcript_name = "B1")

B2_perr_err_3hr = perr_3hr('Results/TotalTimeSampled_Raw_3hrs/B895_010004_cleaned.txt_word count_TEST_daylong_estimates.txt',
                                                            true_WC = TRANSCRIPTB2_TRUEWC, transcript_name = "B2")

C_perr_err_3hr = perr_3hr('Results/TotalTimeSampled_Raw_3hrs/BN32_clean.txt_word count_TEST_daylong_estimates.txt',
                                                            true_WC = TRANSCRIPTC_TRUEWC, transcript_name = "C")


In [24]:
df_3hr = pd.concat([A1_perr_err_3hr,A2_perr_err_3hr,A3_perr_err_3hr,B1_perr_err_3hr,B2_perr_err_3hr,C_perr_err_3hr], ignore_index = True)
df_3hr

Unnamed: 0,130,140,150,160,170,180,Transcript
0,2.685,6.412,3.981,0.547,1.712,4.976,A1
1,1.749,3.935,0.073,2.157,1.592,0.746,A1
2,1.484,8.057,5.459,10.302,0.041,0.925,A1
3,0.466,0.668,3.680,1.018,3.662,8.898,A1
4,4.291,4.582,0.804,0.758,1.244,4.506,A1
...,...,...,...,...,...,...,...
595,8.427,2.292,2.017,4.130,4.470,6.543,C
596,4.351,7.686,4.898,0.392,0.751,6.875,C
597,2.551,1.521,3.876,1.962,1.142,2.568,C
598,2.764,1.413,4.541,1.560,9.592,3.777,C


In [None]:
combined_perr_df2 = pd.concat([combined_perr_df.reset_index(drop=True), df_3hr.drop(columns='Transcript').reset_index(drop=True)], axis=1)
combined_perr_df2.to_csv(path_or_buf='CSVS/PercentError_TTS_180min_30sec.csv', 
                         sep = ",", header = True, index = False)
combined_perr_df2

Unnamed: 0,30,40,50,60,70,80,90,100,110,120,Transcript,130,140,150,160,170,180
0,7.654,8.103,10.352,11.690,10.092,7.457,13.274,1.034,6.773,0.659,A1,2.685,6.412,3.981,0.547,1.712,4.976
1,6.435,3.502,1.979,5.215,5.835,2.686,2.345,9.965,3.715,0.982,A1,1.749,3.935,0.073,2.157,1.592,0.746
2,8.802,1.206,11.385,9.520,3.329,6.381,4.593,4.820,3.324,1.924,A1,1.484,8.057,5.459,10.302,0.041,0.925
3,3.574,18.515,3.507,17.914,1.300,10.174,6.300,0.322,1.973,10.130,A1,0.466,0.668,3.680,1.018,3.662,8.898
4,3.502,1.511,4.047,3.233,6.681,4.968,10.859,13.387,1.533,5.690,A1,4.291,4.582,0.804,0.758,1.244,4.506
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595,0.440,7.159,1.870,3.441,4.820,2.640,6.753,1.670,2.590,0.933,C,8.427,2.292,2.017,4.130,4.470,6.543
596,17.790,10.803,4.982,0.282,1.602,6.398,0.334,3.384,0.817,8.551,C,4.351,7.686,4.898,0.392,0.751,6.875
597,9.588,2.826,3.014,7.636,5.185,8.129,1.054,0.961,4.206,1.165,C,2.551,1.521,3.876,1.962,1.142,2.568
598,3.788,7.463,14.703,4.761,4.374,14.091,5.786,12.677,0.059,6.938,C,2.764,1.413,4.541,1.560,9.592,3.777


### Analysis: Sampling Interval Size

In [65]:
def get_across_intsize_results(perr_err_data_all, tts = "120"):
    cross_samplingint_dfs = []
    for i, transcript in enumerate(TRANSCRIPT_LABELS):
        transcript_data = perr_err_data_all[i][0]
        data_dict = {}
        for j, sampling_int in enumerate(SIM_TYPE):
            add = list(transcript_data[j][tts])
            data_dict[sampling_int] = add
        temp_df = pd.DataFrame(data_dict)
        temp_df["Transcript"] = [transcript] * 100
        cross_samplingint_dfs.append(temp_df)

    combined_perr_df_samplingint = pd.concat(cross_samplingint_dfs, ignore_index=True)
    return combined_perr_df_samplingint

In [68]:
combined_samplingint = get_across_intsize_results(perr_err_data_all)
combined_samplingint


Unnamed: 0,30 seconds,1 minute,5 minutes,10 minutes,30 minutes,60 minutes,Transcript
0,0.659,4.695,6.309,6.399,1.126,14.784,A1
1,0.982,4.291,9.044,1.233,14.515,17.268,A1
2,1.924,4.085,3.376,9.762,7.627,9.143,A1
3,10.130,1.448,4.247,8.524,4.390,14.094,A1
4,5.690,6.704,4.946,8.085,10.829,10.201,A1
...,...,...,...,...,...,...,...
595,0.933,11.859,1.149,3.756,0.077,11.659,C
596,8.551,1.686,15.077,3.819,9.493,25.429,C
597,1.165,1.733,24.735,22.301,17.163,30.282,C
598,6.938,4.687,10.192,41.202,13.831,25.282,C


In [None]:
combined_samplingint.to_csv("CSVS/allsamplingints_120min.csv", sep = ",", header = True, index = False)

# Analysis: Proportion Total Time Sampled

## Preprocessing

In [76]:
def process_data_proptts(filenamepath):
    """
    - given lines from raw file, read in line by line and extract appropriate data
    - expected format: 
        \nTranscript: [transcript name] Sampling_interval: [value] Prop_TTS: [value] TTS:[value] Feature: [value]
        \n[sampling interval size] [# of samples]
        \n[estimate value]
    - process as proportion total sampled time
    - returns dict of <tts key> <values>
    """
    file = open(filenamepath, 'r')
    lines = file.readlines()

    keys = [float(key) for key in PROP_TTS]
    dict = {}
    #dict = {key:[] for key in keys}

    curr_key = ""
    for line in lines:       
        tokens = line.split()
        if tokens[0] == "Transcript:": ## new TTS key 
            curr_key = float(tokens[5])
            continue
        elif len(tokens) == 0: 
            continue
        elif len(tokens) == 2: 
            continue # don't need this line
        else: 
            tokens = [float(token) for token in tokens]
            dict.setdefault(curr_key, []).extend(tokens)
            curr_key = ""
    return dict

def get_estimate_accuracy(estimate, groundtruth):
    return round(estimate / groundtruth, 4)
    
def get_estimate_accuracies(raw_data, groundtruth):
    '''
    - given dict of raw values, calculate estimate accuracy for raw values ==> raw estimate/ground truth value
    - return dict of <tts key> <estimate accuracies as a ratio>'''
    keys = [float(key) for key in PROP_TTS]
    dict = {}
    dict = {key: [] for key in keys}

    for key, values in raw_data.items():
        estimate_accuracy = map(get_estimate_accuracy, values, itertools.repeat(groundtruth, len(values)) )
        dict.setdefault(key, []).extend(estimate_accuracy)
    return dict

def get_percent_error(estimate, groundtruth):
    error = abs( ( estimate - groundtruth ) / groundtruth )
    return round( ( error * 100), 3)

def get_percent_errors(raw_data, groundtruth):
    keys = [float(key) for key in PROP_TTS]
    dict = {}
    dict = {key: [] for key in keys}

    for key, values in raw_data.items():
        percent_errors = map(get_percent_error, values,itertools.repeat(groundtruth, len(values)) )
        dict.setdefault(key, []).extend(percent_errors)
    return dict

In [77]:
def get_dataframes_proptts(filepath, groundtruth):
    """
    - wrapper method to get dataframes for means & stds of estimate accuracies and percent error of estimates
    - input: filepath to raw data file & ground truth value for feature
    - returns: 2 dataframes for each 
    """
    raw_data = process_data_proptts(filepath)
    estimate_accuracies_all = get_estimate_accuracies(raw_data, groundtruth)
    percent_errors_all = get_percent_errors(raw_data, groundtruth)
    estimate_accuracies_all = pd.DataFrame(estimate_accuracies_all)
    percent_errors_all = pd.DataFrame(percent_errors_all)
    
    return  estimate_accuracies_all, percent_errors_all

In [None]:
A1_ratio_proptts, A1_perr_proptts = get_dataframes_proptts(filepath = "Results/TotalTimeSampled_ProportionbyTotalAudioTime/A787_001107_cleaned.txt_word count_TEST_daylong_estimates.txt",
                                                           groundtruth = TRANSCRIPTA1_TRUEWC)

A2_ratio_proptts, A2_perr_proptts = get_dataframes_proptts(filepath = "Results/TotalTimeSampled_ProportionbyTotalAudioTime/A787_001109_cleaned.txt_word count_TEST_daylong_estimates.txt",
                                                           groundtruth = TRANSCRIPTA2_TRUEWC)

A3_ratio_proptts, A3_perr_proptts = get_dataframes_proptts(filepath = "Results/TotalTimeSampled_ProportionbyTotalAudioTime/A787_001111_cleaned.txt_word count_TEST_daylong_estimates.txt",
                                                           groundtruth = TRANSCRIPTA3_TRUEWC)

B1_ratio_proptts, B1_perr_proptts = get_dataframes_proptts(filepath = "Results/TotalTimeSampled_ProportionbyTotalAudioTime/B895_010002_cleaned.txt_word count_TEST_daylong_estimates.txt",
                                                           groundtruth = TRANSCRIPTB1_TRUEWC)

B2_ratio_proptts, B2_perr_proptts = get_dataframes_proptts(filepath = "Results/TotalTimeSampled_ProportionbyTotalAudioTime/B895_010004_cleaned.txt_word count_TEST_daylong_estimates.txt",
                                                           groundtruth = TRANSCRIPTB2_TRUEWC)

C_ratio_proptts, C_perr_proptts = get_dataframes_proptts(filepath = "Results/TotalTimeSampled_ProportionbyTotalAudioTime/BN32_clean.txt_word count_TEST_daylong_estimates.txt",
                                                           groundtruth = TRANSCRIPTC_TRUEWC)

In [79]:
perr_proptts_all = [A1_perr_proptts,A2_perr_proptts,A3_perr_proptts,B1_perr_proptts,B2_perr_proptts,C_perr_proptts]

In [None]:
for i, df in enumerate(perr_proptts_all):
    df["Transcript"] = TRANSCRIPT_LABELS[i]
combined_perr_proptts_df = pd.concat(perr_proptts_all, ignore_index=True)
combined_perr_proptts_df.to_csv(path_or_buf="CSVS/PercentError_propTTS.csv", 
                                sep = ",", header = True, index = False)
combined_perr_proptts_df

Unnamed: 0,0.05,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,Transcript
0,5.729,7.506,3.977,5.374,3.770,2.790,0.967,3.214,1.860,5.626,A1
1,6.384,10.295,10.872,5.869,9.797,2.607,2.520,4.631,3.715,4.707,A1
2,4.432,6.083,1.695,8.891,0.273,8.650,1.635,6.914,4.240,4.245,A1
3,0.419,0.720,5.428,3.282,2.358,0.302,1.047,5.087,2.210,5.713,A1
4,3.566,2.084,2.378,1.360,1.908,6.251,4.204,3.253,0.218,3.311,A1
...,...,...,...,...,...,...,...,...,...,...,...
595,3.794,3.395,5.112,1.735,3.564,4.756,0.897,4.096,1.386,3.886,C
596,9.727,5.056,1.833,1.885,2.937,2.795,2.543,4.650,2.970,0.007,C
597,4.815,3.546,7.162,1.758,0.832,0.918,0.917,0.632,0.223,0.474,C
598,12.672,1.022,4.984,5.543,2.989,0.447,4.135,1.541,2.033,2.631,C


# Analysis: Select Linguistic Features

### Preprocessing

In [12]:
def get_tts_keys(raw_tts = True):
    if raw_tts is True: return [float(key) for key in TOTAL_TS + TOTAL_TS_3HR]
    else: return [float(key) for key in PROP_TTS]

def process_data_selectfeature(filenamepath, raw_tts = True):
    """
    - given lines from raw file, read in line by line and extract appropriate data
    - expected format: 
        \nTranscript: [transcript name] Sampling_interval: [value] Prop_TTS: [value] TTS:[value] Feature: [value]
        \n[sampling interval size] [# of samples]
        \n[estimate value]
    - returns dict of <tts key> <values>
    """
    file = open(filenamepath, 'r')
    lines = file.readlines()

    keys = get_tts_keys(raw_tts = raw_tts)
    dict = {}
    #dict = {key:[] for key in keys}

    curr_key = ""
    for line in lines:       
        tokens = line.split()
        if tokens[0] == "Transcript:": ## new TTS key 
            if raw_tts is True: curr_key = round( (float(tokens[7]) / (60 *1000)), 3)
            else: curr_key = float(tokens[5])
            continue
        elif len(tokens) == 0: 
            continue
        elif len(tokens) == 2: 
            continue # don't need this line
        else: 
            tokens = [float(token) for token in tokens]
            dict.setdefault(curr_key, []).extend(tokens)
            curr_key = ""
    return dict

def get_estimate_accuracy(estimate, groundtruth):
    return round(estimate / groundtruth, 4)
    
def get_estimate_accuracies(raw_data, groundtruth, raw_tts = True):
    '''
    - given dict of raw values, calculate estimate accuracy for raw values ==> raw estimate/ground truth value
    - return dict of <tts key> <estimate accuracies as a ratio>'''
    keys = get_tts_keys(raw_tts = raw_tts)
    dict = {}
    dict = {key: [] for key in keys}

    for key, values in raw_data.items():
        estimate_accuracy = map(get_estimate_accuracy, values, itertools.repeat(groundtruth, len(values)) )
        dict.setdefault(key, []).extend(estimate_accuracy)
    return dict

def get_percent_error(estimate, groundtruth):
    error = abs( ( estimate - groundtruth ) / groundtruth )
    return round( ( error * 100), 3)

def get_percent_errors(raw_data, groundtruth, raw_tts = True):
    keys = get_tts_keys(raw_tts=raw_tts)
    dict = {}
    dict = {key: [] for key in keys}

    for key, values in raw_data.items():
        percent_errors = map(get_percent_error, values,itertools.repeat(groundtruth, len(values)) )
        dict.setdefault(key, []).extend(percent_errors)
    return dict

def make_df_select_feature(filenamepath, transcript, feature_dict, raw_tts = True, return_perr_only = True):
    groundtruth = transcript.feature_count(feature_dict = feature_dict)
    raw_data = process_data_selectfeature(filenamepath, raw_tts = raw_tts)
    estimate_accuracies_all = get_estimate_accuracies(raw_data, groundtruth, raw_tts = raw_tts)
    percent_errors_all = get_percent_errors(raw_data, groundtruth, raw_tts=raw_tts)
    estimate_accuracies_all = pd.DataFrame(estimate_accuracies_all)
    percent_errors_all = pd.DataFrame(percent_errors_all)
    
    if return_perr_only is True: return  percent_errors_all
    else: return percent_errors_all, estimate_accuracies_all

def make_perr_csv(perr_data, write = False, filepath = "", transcript_labels = None):
    if transcript_labels is None: transcript_labels = TRANSCRIPT_LABELS
    for i, df in enumerate(perr_data):
        df["Transcript"] = transcript_labels[i]
    combined_df = pd.concat(perr_data, ignore_index=True)
    
    if write is True:
        combined_df.to_csv(filepath, sep = ",", header = True, index = False)
    
    return combined_df


## Prop TTS

### Eat

In [None]:
select_word = {"select_word": "eat"}
# read in files
A1_perr_err = make_df_select_feature('Results/Select_Features/TTS_Prop/TTS_Prop_Select_Word/A787_001107_cleaned.txt_select_word_TEST_daylong_estimates.txt',
                                                            transcript=transcriptA1, feature_dict = select_word, raw_tts = False)

A2_perr_err = make_df_select_feature('Results/Select_Features/TTS_Prop/TTS_Prop_Select_Word/A787_001109_cleaned.txt_select_word_TEST_daylong_estimates.txt',
                                                            transcript=transcriptA2, feature_dict = select_word,raw_tts = False)

A3_perr_err = make_df_select_feature('Results/Select_Features/TTS_Prop/TTS_Prop_Select_Word/A787_001111_cleaned.txt_select_word_TEST_daylong_estimates.txt',
                                                            transcript=transcriptA3, feature_dict = select_word,raw_tts = False)

B1_perr_err = make_df_select_feature('Results/Select_Features/TTS_Prop/TTS_Prop_Select_Word/B895_010002_cleaned.txt_select_word_TEST_daylong_estimates.txt',
                                                            transcript=transcriptB1, feature_dict = select_word,raw_tts = False)

B2_perr_err = make_df_select_feature('Results/Select_Features/TTS_Prop/TTS_Prop_Select_Word/B895_010004_cleaned.txt_select_word_TEST_daylong_estimates.txt',
                                                            transcript=transcriptB2, feature_dict = select_word,raw_tts = False)

C_perr_err = make_df_select_feature('Results/Select_Features/TTS_Prop/TTS_Prop_Select_Word/BN32_clean.txt_select_word_TEST_daylong_estimates.txt',
                                                            transcript=transcriptC, feature_dict = select_word,raw_tts = False)

perr_err_data_all = [A1_perr_err, A2_perr_err, A3_perr_err, B1_perr_err, B2_perr_err, C_perr_err]

In [None]:
make_perr_csv(perr_data=perr_err_data_all, write = False, filepath = "CSVS/SelectFeatures/Prop_TTS/percenterror_selectword.csv")

Unnamed: 0,0.05,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,Transcript
0,57.830,10.723,5.447,1.936,12.021,3.000,13.021,22.021,35.106,9.532,A1
1,58.170,15.979,71.340,31.468,5.447,3.000,24.766,12.213,18.617,10.383,A1
2,47.277,42.340,34.447,19.511,1.149,14.936,6.319,9.617,14.000,3.340,A1
3,68.702,31.468,10.383,51.128,2.468,15.979,14.234,9.213,23.894,3.106,A1
4,68.362,10.383,42.340,7.191,25.213,13.872,19.511,15.979,8.404,6.851,A1
...,...,...,...,...,...,...,...,...,...,...,...
595,100.000,63.769,31.026,20.103,22.846,5.000,17.385,15.769,5.821,38.308,C
596,1.744,18.103,1.744,34.487,22.205,53.949,23.564,2.949,0.308,11.000,C
597,31.026,34.487,22.846,14.641,31.026,8.103,34.487,22.795,14.641,11.000,C
598,100.000,80.154,14.641,34.487,84.256,1.538,7.179,28.692,12.590,9.179,C


### Imit

In [None]:
select_feat = {"utterance_annotation": "imit"}
# read in files
A1_perr_err = make_df_select_feature('Results/Select_Features/TTS_Prop/TTS_Prop_Imit/A787_001107_cleaned.txt_utterance_annotation_TEST_daylong_estimates.txt',
                                                            transcript=transcriptA1, feature_dict = select_feat, raw_tts = False)

A2_perr_err = make_df_select_feature('Results/Select_Features/TTS_Prop/TTS_Prop_Imit/A787_001109_cleaned.txt_utterance_annotation_TEST_daylong_estimates.txt',
                                                            transcript=transcriptA2, feature_dict = select_feat,raw_tts = False)

A3_perr_err = make_df_select_feature('Results/Select_Features/TTS_Prop/TTS_Prop_Imit/A787_001111_cleaned.txt_utterance_annotation_TEST_daylong_estimates.txt',
                                                            transcript=transcriptA3, feature_dict = select_feat,raw_tts = False)

B1_perr_err = make_df_select_feature('Results/Select_Features/TTS_Prop/TTS_Prop_Imit/B895_010002_cleaned.txt_utterance_annotation_TEST_daylong_estimates.txt',
                                                            transcript=transcriptB1, feature_dict = select_feat,raw_tts = False)

B2_perr_err = make_df_select_feature('Results/Select_Features/TTS_Prop/TTS_Prop_Imit/B895_010004_cleaned.txt_utterance_annotation_TEST_daylong_estimates.txt',
                                                            transcript=transcriptB2, feature_dict = select_feat,raw_tts = False)


perr_err_data_all = [A1_perr_err, A2_perr_err, A3_perr_err, B1_perr_err, B2_perr_err]

In [None]:
make_perr_csv(perr_data=perr_err_data_all, write = False, 
              filepath = "CSVS/SelectFeatures/Prop_TTS/percenterror_imit.csv")

Unnamed: 0,0.05,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,Transcript
0,5.607,21.869,2.508,3.869,13.689,16.328,1.164,11.213,23.902,19.607,A1
1,13.738,13.738,61.410,39.459,25.869,13.738,28.639,21.082,4.607,4.262,A1
2,51.262,38.115,19.836,30.951,17.803,15.508,13.344,5.607,11.197,6.967,A1
3,2.508,9.672,8.607,9.672,9.672,3.328,3.590,16.066,4.607,8.328,A1
4,10.639,5.607,24.852,20.115,16.787,16.180,2.230,7.934,21.295,2.066,A1
...,...,...,...,...,...,...,...,...,...,...,...
495,69.264,23.066,11.516,30.758,28.835,13.824,14.121,4.418,28.835,20.538,B2
496,53.879,23.099,26.912,12.802,11.560,16.901,5.110,15.407,8.681,3.396,B2
497,84.659,38.451,34.648,2.538,15.363,7.670,7.670,17.604,23.066,3.440,B2
498,69.220,30.791,19.253,20.538,23.099,3.055,37.209,14.308,34.648,16.264,B2


### Questions

In [None]:
select_feat = {"utterance_annotation": "?"}
# read in files
A1_perr_err = make_df_select_feature('Results/Select_Features/TTS_Prop/TTS_Prop_Questions/A787_001107_cleaned.txt_utterance_annotation_TEST_daylong_estimates.txt',
                                                            transcript=transcriptA1, feature_dict = select_feat,raw_tts = False)

A2_perr_err = make_df_select_feature('Results/Select_Features/TTS_Prop/TTS_Prop_Questions/A787_001109_cleaned.txt_utterance_annotation_TEST_daylong_estimates.txt',
                                                            transcript=transcriptA2, feature_dict = select_feat,raw_tts = False)

A3_perr_err = make_df_select_feature('Results/Select_Features/TTS_Prop/TTS_Prop_Questions/A787_001111_cleaned.txt_utterance_annotation_TEST_daylong_estimates.txt',
                                                            transcript=transcriptA3, feature_dict = select_feat,raw_tts = False)

B1_perr_err = make_df_select_feature('Results/Select_Features/TTS_Prop/TTS_Prop_Questions/B895_010002_cleaned.txt_utterance_annotation_TEST_daylong_estimates.txt',
                                                            transcript=transcriptB1, feature_dict = select_feat,raw_tts = False)

B2_perr_err = make_df_select_feature('Results/Select_Features/TTS_Prop/TTS_Prop_Questions/B895_010004_cleaned.txt_utterance_annotation_TEST_daylong_estimates.txt',
                                                            transcript=transcriptB2, feature_dict = select_feat,raw_tts = False)


perr_err_data_all = [A1_perr_err, A2_perr_err, A3_perr_err, B1_perr_err, B2_perr_err]

In [None]:
make_perr_csv(perr_data=perr_err_data_all, write = False, 
              filepath = "CSVS/SelectFeatures/Prop_TTS/percenterror_questions.csv")

Unnamed: 0,0.05,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,Transcript
0,19.585,7.464,7.599,4.951,9.080,3.533,6.432,3.309,3.458,7.793,A1
1,1.962,0.077,4.233,9.978,11.572,3.533,0.347,5.502,6.522,6.267,A1
2,5.733,4.233,13.121,4.951,10.090,4.933,4.322,10.042,5.579,4.890,A1
3,17.429,0.462,1.020,14.018,5.242,5.363,8.541,12.389,7.161,8.452,A1
4,8.273,0.885,2.212,4.591,10.764,7.034,5.893,8.003,7.801,2.855,A1
...,...,...,...,...,...,...,...,...,...,...,...
495,10.199,12.633,18.340,8.066,7.686,3.348,11.745,7.523,2.739,14.830,B2
496,20.853,1.216,8.827,9.080,5.823,11.414,9.818,14.480,10.349,8.574,B2
497,20.242,0.305,3.880,6.543,9.398,16.133,5.149,7.197,12.252,11.617,B2
498,14.765,8.066,6.543,16.945,1.216,5.630,2.865,7.414,0.551,7.051,B2


### CDS

In [None]:
select_feat = {"xds":"T"}
# read in files
A1_perr_err = make_df_select_feature('Results/Select_Features/TTS_Prop/TTS_Prop_CDS/A787_001107_cleaned.txt_xds_TEST_daylong_estimates.txt',
                                                            transcript=transcriptA1, feature_dict = select_feat, raw_tts = False)

A2_perr_err = make_df_select_feature('Results/Select_Features/TTS_Prop/TTS_Prop_CDS/A787_001109_cleaned.txt_xds_TEST_daylong_estimates.txt',
                                                            transcript=transcriptA2, feature_dict = select_feat,raw_tts = False)

A3_perr_err = make_df_select_feature('Results/Select_Features/TTS_Prop/TTS_Prop_CDS/A787_001111_cleaned.txt_xds_TEST_daylong_estimates.txt',
                                                            transcript=transcriptA3, feature_dict = select_feat,raw_tts = False)

B1_perr_err = make_df_select_feature('Results/Select_Features/TTS_Prop/TTS_Prop_CDS/B895_010002_cleaned.txt_xds_TEST_daylong_estimates.txt',       
                                                            transcript=transcriptB1, feature_dict = select_feat,raw_tts = False)

B2_perr_err = make_df_select_feature('Results/Select_Features/TTS_Prop/TTS_Prop_CDS/B895_010004_cleaned.txt_xds_TEST_daylong_estimates.txt',
                                                            transcript=transcriptB2, feature_dict = select_feat,raw_tts = False)

C_perr_err = make_df_select_feature('Results/Select_Features/TTS_Prop/TTS_Prop_CDS/BN32_clean.txt_xds_TEST_daylong_estimates.txt',
                                                          transcript=transcriptC, feature_dict = {"xds":"id"},raw_tts = False)

perr_err_data_all = [A1_perr_err, A2_perr_err, A3_perr_err, B1_perr_err, B2_perr_err, C_perr_err]

In [None]:
make_perr_csv(perr_data=perr_err_data_all, write = False, 
              filepath = "CSVS/SelectFeatures/Prop_TTS/percenterror_cds.csv")

Unnamed: 0,0.05,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,Transcript
0,4.515,12.395,3.996,9.276,2.475,6.961,9.079,3.908,1.676,5.180,A1
1,3.613,12.846,3.247,11.662,1.809,7.006,5.013,0.043,2.263,0.363,A1
2,21.414,3.951,5.096,5.887,15.507,3.715,8.205,5.932,8.217,3.320,A1
3,13.421,7.390,2.193,5.379,1.908,0.828,0.145,10.591,3.130,2.500,A1
4,3.951,0.107,11.211,9.746,3.165,0.378,1.763,4.673,0.957,1.540,A1
...,...,...,...,...,...,...,...,...,...,...,...
595,18.082,2.154,8.967,8.196,1.507,2.423,5.016,0.729,5.581,2.992,C
596,0.140,0.232,4.305,1.849,6.633,0.116,7.762,4.567,0.541,5.291,C
597,3.436,13.779,14.734,7.921,6.685,3.802,7.311,4.063,1.433,4.443,C
598,2.154,20.187,10.615,0.568,3.527,0.305,2.398,1.631,1.839,5.053,C


### Speaker: FA1

In [None]:
select_feat = {"speaker":"FA1"}
# read in files
A1_perr_err = make_df_select_feature('Results/Select_Features/TTS_Prop/TTS_Prop_Sp_FA1/A787_001107_cleaned.txt_speaker_TEST_daylong_estimates.txt',
                                                            transcript=transcriptA1, feature_dict = select_feat, raw_tts = False)

A2_perr_err = make_df_select_feature('Results/Select_Features/TTS_Prop/TTS_Prop_Sp_FA1/A787_001109_cleaned.txt_speaker_TEST_daylong_estimates.txt',
                                                            transcript=transcriptA2, feature_dict = select_feat,raw_tts = False)

A3_perr_err = make_df_select_feature('Results/Select_Features/TTS_Prop/TTS_Prop_Sp_FA1/A787_001111_cleaned.txt_speaker_TEST_daylong_estimates.txt',
                                                            transcript=transcriptA3, feature_dict = select_feat,raw_tts = False)

B1_perr_err = make_df_select_feature('Results/Select_Features/TTS_Prop/TTS_Prop_Sp_FA1/B895_010002_cleaned.txt_speaker_TEST_daylong_estimates.txt',       
                                                           transcript=transcriptB1, feature_dict = {"speaker": "MA1"},raw_tts = False)

B2_perr_err = make_df_select_feature('Results/Select_Features/TTS_Prop/TTS_Prop_Sp_FA1/B895_010004_cleaned.txt_speaker_TEST_daylong_estimates.txt',
                                                            transcript=transcriptB2, feature_dict = select_feat,raw_tts = False)

C_perr_err = make_df_select_feature('Results/Select_Features/TTS_Prop/TTS_Prop_Sp_FA1/BN32_clean.txt_speaker_TEST_daylong_estimates.txt',
                                                          transcript=transcriptC, feature_dict = select_feat,raw_tts = False)

perr_err_data_all = [A1_perr_err, A2_perr_err, A3_perr_err,  B1_perr_err,B2_perr_err, C_perr_err]

In [None]:
transcript_labels = ["A1", "A2", "A3", "B1","B2", 'C']
df = make_perr_csv(perr_data=perr_err_data_all, write = True, transcript_labels=transcript_labels,
              filepath = "CSVS/SelectFeatures/Prop_TTS/percenterror_sp_FA1.csv")
df

Unnamed: 0,0.05,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,Transcript
0,0.297,3.960,4.856,6.099,8.805,9.431,4.680,5.729,2.304,4.692,A1
1,2.112,7.493,0.891,0.430,2.755,1.237,3.902,4.290,4.235,6.826,A1
2,1.172,18.870,7.212,8.505,5.564,8.839,8.660,2.645,3.155,7.835,A1
3,6.618,13.316,3.214,5.400,3.252,0.053,0.178,7.037,9.929,5.912,A1
4,2.458,0.384,5.526,14.029,2.534,0.487,1.774,7.343,9.991,4.478,A1
...,...,...,...,...,...,...,...,...,...,...,...
595,10.491,8.267,5.434,0.211,3.012,1.545,2.400,2.172,3.768,1.385,C
596,4.138,14.577,1.936,6.685,3.502,0.945,5.169,3.335,2.309,0.278,C
597,2.507,6.550,3.537,3.480,1.206,0.506,3.580,3.826,2.080,1.761,C
598,3.452,0.541,0.983,5.569,4.493,5.452,3.974,4.318,0.304,4.785,C


### XDS: C

In [None]:
select_feat = {"xds":"C"}
# read in files
A1_perr_err = make_df_select_feature('Results/Select_Features/TTS_Prop/TTS_Prop_xds_C/A787_001107_cleaned.txt_xds_TEST_daylong_estimates.txt',
                                                            transcript=transcriptA1, feature_dict = select_feat, raw_tts = False)

A2_perr_err = make_df_select_feature('Results/Select_Features/TTS_Prop/TTS_Prop_xds_C/A787_001109_cleaned.txt_xds_TEST_daylong_estimates.txt',
                                                            transcript=transcriptA2, feature_dict = select_feat,raw_tts = False)

A3_perr_err = make_df_select_feature('Results/Select_Features/TTS_Prop/TTS_Prop_xds_C/A787_001111_cleaned.txt_xds_TEST_daylong_estimates.txt',
                                                            transcript=transcriptA3, feature_dict = select_feat,raw_tts = False)


B2_perr_err = make_df_select_feature('Results/Select_Features/TTS_Prop/TTS_Prop_xds_C/B895_010004_cleaned.txt_xds_TEST_daylong_estimates.txt',
                                                            transcript=transcriptB2, feature_dict = select_feat,raw_tts = False)


perr_err_data_all = [A1_perr_err, A2_perr_err, A3_perr_err,  B2_perr_err]

In [None]:
transcript_labels = ["A1", "A2", "A3", "B2"]
df = make_perr_csv(perr_data=perr_err_data_all, write = True, transcript_labels=transcript_labels,
              filepath = "CSVS/SelectFeatures/Prop_TTS/percenterror_xds_C.csv")
df

Unnamed: 0,0.05,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,Transcript
0,7.661,7.243,2.149,5.126,12.755,0.559,0.036,0.538,1.308,1.430,A1
1,15.924,0.439,7.396,6.641,0.756,3.459,1.052,7.583,6.431,1.613,A1
2,7.123,9.696,2.072,4.549,9.864,3.501,0.817,11.687,7.862,2.313,A1
3,21.762,4.895,2.494,5.510,4.861,5.184,3.743,0.494,1.630,7.260,A1
4,15.924,14.579,5.356,6.103,1.957,5.668,6.289,0.637,4.516,2.906,A1
...,...,...,...,...,...,...,...,...,...,...,...
395,13.370,23.671,7.115,19.182,4.741,2.232,0.088,17.017,7.407,9.360,B2
396,67.249,25.998,15.194,1.222,2.005,8.263,6.507,6.034,23.859,10.221,B2
397,35.457,0.941,1.304,10.190,0.672,14.523,13.230,11.125,5.478,8.631,B2
398,27.963,9.722,4.250,14.119,10.213,3.074,9.687,13.850,4.905,5.762,B2


### Utterance Annotation: !

In [None]:
select_feat = {"utterance_annotation":"!"}
# read in files
A1_perr_err = make_df_select_feature('Results/Select_Features/TTS_Prop/TTS_Prop_Exclamation/A787_001107_cleaned.txt_utterance_annotation_TEST_daylong_estimates.txt',
                                                            transcript=transcriptA1, feature_dict = select_feat, raw_tts = False)

A2_perr_err = make_df_select_feature('Results/Select_Features/TTS_Prop/TTS_Prop_Exclamation/A787_001109_cleaned.txt_utterance_annotation_TEST_daylong_estimates.txt',
                                                            transcript=transcriptA2, feature_dict = select_feat,raw_tts = False)

A3_perr_err = make_df_select_feature('Results/Select_Features/TTS_Prop/TTS_Prop_Exclamation/A787_001111_cleaned.txt_utterance_annotation_TEST_daylong_estimates.txt',
                                                            transcript=transcriptA3, feature_dict = select_feat,raw_tts = False)

B1_perr_err = make_df_select_feature('Results/Select_Features/TTS_Prop/TTS_Prop_Exclamation/B895_010002_cleaned.txt_utterance_annotation_TEST_daylong_estimates.txt',       
                                                            transcript=transcriptB1, feature_dict = select_feat,raw_tts = False)


perr_err_data_all = [A1_perr_err, A2_perr_err, A3_perr_err,  B1_perr_err]

In [None]:
transcript_labels = ["A1", "A2", "A3", "B1"]
df = make_perr_csv(perr_data=perr_err_data_all, write = True, transcript_labels=transcript_labels,
              filepath = "CSVS/SelectFeatures/Prop_TTS/percenterror_exclamation.csv")
df

Unnamed: 0,0.05,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,Transcript
0,2.738,7.914,7.221,2.512,9.652,2.079,5.368,6.923,18.742,14.142,A1
1,17.559,24.125,12.084,4.364,9.421,5.970,2.665,1.299,5.426,0.009,A1
2,1.893,9.768,10.843,5.445,10.114,8.378,14.708,2.953,7.163,4.931,A1
3,36.168,0.505,2.357,2.275,17.178,5.877,1.893,12.877,6.525,3.798,A1
4,4.591,14.316,7.221,5.753,11.736,9.490,5.830,7.716,8.436,6.834,A1
...,...,...,...,...,...,...,...,...,...,...,...
395,22.971,6.465,18.294,7.840,0.069,2.889,9.468,1.100,0.499,1.329,B1
396,11.967,14.168,16.231,9.445,2.510,23.109,2.682,5.718,5.261,2.171,B1
397,54.058,4.539,1.651,9.812,7.806,9.078,2.453,9.649,0.963,7.367,B1
398,3.164,14.168,13.136,10.270,6.293,1.926,3.737,3.556,3.765,4.402,B1


### XDS: A


In [None]:
select_feat = {"xds":"A"}
# read in files
A1_perr_err = make_df_select_feature('Results/Select_Features/TTS_Prop/TTS_Prop_xds_A/A787_001107_cleaned.txt_xds_TEST_daylong_estimates.txt',
                                                            transcript=transcriptA1, feature_dict = select_feat, raw_tts = False)

A2_perr_err = make_df_select_feature('Results/Select_Features/TTS_Prop/TTS_Prop_xds_A/A787_001109_cleaned.txt_xds_TEST_daylong_estimates.txt',
                                                            transcript=transcriptA2, feature_dict = select_feat,raw_tts = False)

A3_perr_err = make_df_select_feature('Results/Select_Features/TTS_Prop/TTS_Prop_xds_A/A787_001111_cleaned.txt_xds_TEST_daylong_estimates.txt',
                                                            transcript=transcriptA3, feature_dict = select_feat,raw_tts = False)

B1_perr_err = make_df_select_feature('Results/Select_Features/TTS_Prop/TTS_Prop_xds_A/B895_010002_cleaned.txt_xds_TEST_daylong_estimates.txt',       
                                                            transcript=transcriptB1, feature_dict = select_feat,raw_tts = False)

B2_perr_err = make_df_select_feature('Results/Select_Features/TTS_Prop/TTS_Prop_xds_A/B895_010004_cleaned.txt_xds_TEST_daylong_estimates.txt',
                                                            transcript=transcriptB2, feature_dict = select_feat,raw_tts = False)

select_feat = {"xds":"od"}
C_perr_err = make_df_select_feature('Results/Select_Features/TTS_Prop/TTS_Prop_xds_A/BN32_clean.txt_xds_TEST_daylong_estimates.txt',
                                                          transcript=transcriptC, feature_dict = select_feat,raw_tts = False)

perr_err_data_all = [A1_perr_err, A2_perr_err, A3_perr_err,  B1_perr_err,B2_perr_err,C_perr_err]

In [None]:
df = make_perr_csv(perr_data=perr_err_data_all, write = True, 
              filepath = "CSVS/SelectFeatures/Prop_TTS/percenterror_xds_A.csv")
df

Unnamed: 0,0.05,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,Transcript
0,10.724,19.387,23.870,28.145,3.711,9.241,2.868,6.099,3.025,12.706,A1
1,7.655,16.126,11.931,18.460,5.890,3.689,15.215,7.544,2.785,2.030,A1
2,18.044,1.422,1.910,13.010,5.962,9.385,9.150,11.695,6.549,8.082,A1
3,10.819,4.826,6.552,2.398,10.553,10.814,4.451,5.290,3.552,2.329,A1
4,1.997,7.496,1.047,21.464,1.530,6.729,0.032,4.715,11.655,10.490,A1
...,...,...,...,...,...,...,...,...,...,...,...
595,4.569,16.025,6.590,1.546,0.321,5.764,2.245,5.990,6.355,1.741,C
596,7.032,10.361,1.860,10.213,3.450,3.453,0.736,1.482,6.124,2.651,C
597,0.198,11.777,0.495,1.700,4.158,2.092,0.921,6.219,3.850,3.383,C
598,10.884,1.152,11.450,1.183,0.841,2.326,4.262,4.103,0.406,3.660,C


### Speaker: CHI

In [None]:
select_feat = {"speaker":"CHI"}
# read in files
A1_perr_err = make_df_select_feature('Results/Select_Features/TTS_Prop/TTS_Prop_Sp_CHI/A787_001107_cleaned.txt_speaker_TEST_daylong_estimates.txt',
                                                            transcript=transcriptA1, feature_dict = select_feat, raw_tts = False)

A2_perr_err = make_df_select_feature('Results/Select_Features/TTS_Prop/TTS_Prop_Sp_CHI/A787_001109_cleaned.txt_speaker_TEST_daylong_estimates.txt',
                                                            transcript=transcriptA2, feature_dict = select_feat,raw_tts = False)

A3_perr_err = make_df_select_feature('Results/Select_Features/TTS_Prop/TTS_Prop_Sp_CHI/A787_001111_cleaned.txt_speaker_TEST_daylong_estimates.txt',
                                                            transcript=transcriptA3, feature_dict = select_feat,raw_tts = False)

B1_perr_err = make_df_select_feature('Results/Select_Features/TTS_Prop/TTS_Prop_Sp_CHI/B895_010002_cleaned.txt_speaker_TEST_daylong_estimates.txt',       
                                                            transcript=transcriptB1, feature_dict = select_feat,raw_tts = False)

B2_perr_err = make_df_select_feature('Results/Select_Features/TTS_Prop/TTS_Prop_Sp_CHI/B895_010004_cleaned.txt_speaker_TEST_daylong_estimates.txt',
                                                            transcript=transcriptB2, feature_dict = select_feat,raw_tts = False)


perr_err_data_all = [A1_perr_err, A2_perr_err, A3_perr_err,  B1_perr_err,B2_perr_err]

In [None]:
labels = ["A1", "A2", "A3", "B1", "B2"]
df = make_perr_csv(perr_data=perr_err_data_all, write = True, 
              filepath = "CSVS/SelectFeatures/Prop_TTS/percenterror_sp_CHI.csv")
df

Unnamed: 0,0.05,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,Transcript
0,27.304,13.475,2.297,13.411,15.890,9.269,13.570,7.710,6.963,12.317,A1
1,10.963,18.400,12.811,10.443,12.054,7.602,9.180,12.148,8.407,9.770,A1
2,9.118,9.307,12.148,6.654,10.775,8.284,10.065,8.252,4.878,11.327,A1
3,5.707,9.448,10.538,15.558,10.159,5.556,13.664,8.901,8.359,6.570,A1
4,17.453,23.845,11.296,6.781,13.333,8.587,12.243,10.281,15.298,12.128,A1
...,...,...,...,...,...,...,...,...,...,...,...
495,3.143,9.996,3.143,1.016,2.017,4.515,0.891,4.323,3.708,6.868,B2
496,4.365,0.891,12.061,6.743,2.863,2.167,0.453,3.185,0.986,4.240,B2
497,17.129,3.989,13.187,0.891,16.378,5.095,9.495,7.636,0.564,5.992,B2
498,12.904,5.116,10.935,3.769,3.802,2.863,2.080,11.498,8.213,0.360,B2


### Speaker: FC1

In [None]:
select_feat = {"speaker":"FC1"}
# read in files
A1_perr_err = make_df_select_feature('Results/Select_Features/TTS_Prop/TTS_Prop_Sp_FC1/A787_001107_cleaned.txt_speaker_TEST_daylong_estimates.txt',
                                                            transcript=transcriptA1, feature_dict = select_feat, raw_tts = False)

C_perr_err = make_df_select_feature('Results/Select_Features/TTS_Prop/TTS_Prop_Sp_FC1/BN32_clean.txt_speaker_TEST_daylong_estimates.txt',
                                                          transcript=transcriptC, feature_dict = select_feat,raw_tts = False)

perr_err_data_all = [A1_perr_err, C_perr_err]

In [None]:
labels = ["A1", "C"]
df = make_perr_csv(perr_data=perr_err_data_all, write = True, transcript_labels=labels,
              filepath = "CSVS/SelectFeatures/Prop_TTS/percenterror_sp_FC1.csv")
df

Unnamed: 0,0.05,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,Transcript
0,3.501,12.814,1.659,3.048,3.100,0.324,2.053,10.906,4.586,0.057,A1
1,52.970,14.480,10.490,4.465,5.430,6.588,0.738,3.617,5.907,3.773,A1
2,2.799,8.604,1.395,4.144,7.206,2.676,1.293,5.183,1.176,0.431,A1
3,5.972,2.639,9.349,1.483,13.390,5.079,8.822,1.962,5.847,4.004,A1
4,22.797,9.218,10.386,3.851,9.108,3.904,0.476,0.130,3.533,2.741,A1
...,...,...,...,...,...,...,...,...,...,...,...
195,31.880,10.999,0.430,5.431,2.207,1.090,1.584,1.485,14.003,3.257,C
196,1.090,4.607,15.377,1.841,1.475,9.794,1.694,9.241,9.048,4.485,C
197,25.708,14.058,8.911,5.724,0.339,1.530,1.364,0.085,4.579,3.086,C
198,15.615,9.021,15.377,8.655,7.135,1.767,0.101,5.423,5.641,1.578,C


### Utterance annotation: < >

In [None]:
select_feat = {"utterance_annotation":"<"}
B1_perr_err = make_df_select_feature('Results/Select_Features/TTS_Prop/TTS_Prop_UttAnnotation/B895_010002_cleaned.txt_utterance_annotation_TEST_daylong_estimates.txt',       
                                                            transcript=transcriptB1, feature_dict = select_feat,raw_tts = False)

B2_perr_err = make_df_select_feature('Results/Select_Features/TTS_Prop/TTS_Prop_UttAnnotation/B895_010004_cleaned.txt_utterance_annotation_TEST_daylong_estimates.txt',
                                                            transcript=transcriptB2, feature_dict = select_feat,raw_tts = False)

In [None]:
labels = ["B1", "B2"]
df = make_perr_csv(perr_data=perr_err_data_all, write = True, transcript_labels=labels,
              filepath = "Results/CSVS/SelectFeatures/Prop_TTS/percenterror_uttannotation.csv")
df

Unnamed: 0,0.05,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,Transcript
0,3.501,12.814,1.659,3.048,3.100,0.324,2.053,10.906,4.586,0.057,B1
1,52.970,14.480,10.490,4.465,5.430,6.588,0.738,3.617,5.907,3.773,B1
2,2.799,8.604,1.395,4.144,7.206,2.676,1.293,5.183,1.176,0.431,B1
3,5.972,2.639,9.349,1.483,13.390,5.079,8.822,1.962,5.847,4.004,B1
4,22.797,9.218,10.386,3.851,9.108,3.904,0.476,0.130,3.533,2.741,B1
...,...,...,...,...,...,...,...,...,...,...,...
195,31.880,10.999,0.430,5.431,2.207,1.090,1.584,1.485,14.003,3.257,B2
196,1.090,4.607,15.377,1.841,1.475,9.794,1.694,9.241,9.048,4.485,B2
197,25.708,14.058,8.911,5.724,0.339,1.530,1.364,0.085,4.579,3.086,B2
198,15.615,9.021,15.377,8.655,7.135,1.767,0.101,5.423,5.641,1.578,B2
