In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import statistics
from scipy.stats import variation
from scipy.stats import mannwhitneyu

In [None]:
# Load ATAS multiple file analysis output files (Outputs from - Streamlined_ATAS_AWS.ipynb/ Streamlined_ATAS_AWNS.ipynb)
# df_1 is read from the AWNS summary file and df_2 from the AWS summary file.
# Every '.../' prefix below is a placeholder: replace it with the real folder.
df_1 = pd.read_csv('.../Stat_csv_files/AWNS_All_files_together_50_ms_1_win_50_150_csv.csv') # Change path as required
df_2 = pd.read_csv('.../Stat_csv_files/AWS_All_files_together_50_ms_1_win_50_150_csv.csv') # Change path as required

# SSI details for the AWS group; only consumed by the optional SSI merge cell.
ssi_scores = pd.read_csv('.../Stat_csv_files/AWS_SSI_details.csv') # Change path as required; use if applicable


# CSV files with participant details, one file per group
AWNS_par = pd.read_csv('.../Stat_csv_files/AWNS_details.csv') # Change path as required
AWS_par = pd.read_csv('.../Stat_csv_files/AWS_details.csv') # Change path as required

In [None]:
# Folder containing the per-recording '<name>_f.csv' files read in the pause loop.
# Keep the trailing '/' : file names are appended to this string by plain
# concatenation, not by a path-joining helper.
individual_csv_files_path =  '../ATAS_Multiple_Files_Analysis/Individual_OutputCSV_Files/' # Change path as required

In [None]:
# Columns to discard from both group tables before they are combined.
# This list must be defined (it was previously commented out, which made the
# drop calls below fail with NameError on a fresh kernel). Because the drops use
# errors='ignore', listing a column that is absent is harmless.
columns_to_remove = ['Long Pauses', 'Short Pauses']  # dropped only if present
df_1_cleaned = df_1.drop(columns=columns_to_remove, errors='ignore')
df_2_cleaned = df_2.drop(columns=columns_to_remove, errors='ignore')

# Stack the df_1 rows above the df_2 rows and renumber the index 0..n-1.
df_all = pd.concat([df_1_cleaned, df_2_cleaned], axis=0, ignore_index=True)

In [None]:
# Execute the helper notebook inside this notebook's namespace (-i), so the names
# it defines become available to the cells below.
# NOTE(review): presumably this is where merge_dataframes_on_filename,
# process_pause_durations, calculate_speech_rate and merge_ssi_scores come from;
# none of them is defined in this notebook - confirm.
# The path is a placeholder and must be edited before running.
%run -i "..../Stat_csv_files/Long_short_pause_compute_functions.ipynb"  #Change path as required

In [None]:
# Merge participant details into the combined AWNS + AWS table.
# The helper functions called in this cell are not defined in this notebook;
# they are expected to be in scope from the helper notebook run earlier.
df_all_1 = merge_dataframes_on_filename(df_all, AWNS_par, AWS_par)

# Calculate long and short pause metrics

pause_threshold = 0.15  # threshold (in seconds) separating long from short pauses
# pause event >= pause_threshold -> long pause
# pause event <  pause_threshold -> short pause

for i, row in df_all_1.iterrows():
    # The first column is taken to be the recording's file name (e.g. 'C1_DS.wav').
    # Use .iloc for positional access: row[0] relies on the deprecated behaviour
    # of treating an integer key as a position on a label-indexed Series.
    filename = row.iloc[0]
    # '<name>.wav' -> '<name>_f.csv' inside the individual-output folder.
    csv_filename_1 = filename.split('.wav')[0] + '_f.csv'
    csv_filename = individual_csv_files_path + csv_filename_1
    # The helper receives the table and the row label i.
    # NOTE(review): presumably it writes the pause metrics for row i back into
    # df_all_1 in place, since its return value is not used - confirm.
    process_pause_durations(csv_filename, df_all_1, i, pause_threshold)

# Calculate the speech rate (return value unused; presumably updates df_all_1 in place)
calculate_speech_rate(df_all_1)


In [None]:
# Optional cell: remove it if no SSI scores are available for this data set.

# Merge SSI scores into the combined table.
# Note: this overwrites df_all_1 with the merged result, so running the cell a
# second time feeds the already-merged table back into merge_ssi_scores.
# NOTE(review): the effect of a repeated merge depends on the helper - verify,
# or re-run from the previous cell.
df_all_1 = merge_ssi_scores(df_all_1, ssi_scores) # Remove if not applicable

In [None]:
# Export the final combined table to CSV.
# The '.../' prefix is a placeholder; an existing file at this path is overwritten.
output_csv_path = '.../Stat_csv_files/AWNS_AWS_all_details.csv' # Specify your output path
df_all_1.to_csv(output_csv_path, index=False)  # index=False: do not write the row index as a column

In [None]:
# Lift pandas' display limits so wide/long tables are shown in full:
#   display.max_columns -> show every column
#   display.width       -> no fixed width, so columns are not wrapped
#   display.max_rows    -> show every row (optional; drop it for large tables)
for _option in ('display.max_columns', 'display.width', 'display.max_rows'):
    pd.set_option(_option, None)

In [None]:
# df_all_1

In [None]:
# Display the column labels of the final table; each column is described in the
# comment cell further down.
df_all_1.columns

Index(['File_Name', 'Group', 'Age', 'Sex', 'Total_words_expected',
       'Words_missing_at_end', 'Final_word_count', 'Speech Time Threshold_ms',
       'Pause Time Threshold_ms', 'Percent_Pause', 'Percent_Speech',
       'Total_Duration_Unclipped_s', 'Total_Duration_Clipped_s',
       'Speech_Duration_s', 'Pause_Duration_s', 'Speech_Events',
       'Pause_Events', 'Mean Speech_s', 'Std Dev Speech', 'CV Speech',
       'Mean Pause_s', 'Std Dev Pause', 'CV Pause', 'All_events_durations',
       'long_p_durations', 'short_p_durations', 'long_p_count',
       'short_p_count', 'long_p_durations_mean', 'short_p_durations_mean',
       'long_p_durations_cv', 'short_p_durations_cv', 'Event_type',
       'Speech_Rate', 'ID', 'No_of_stuttered_syllables',
       'No_of_total_syllables', 'Percent_syllables_stuttered', 'Score'],
      dtype='object')

In [None]:
# Description of these column parameters:
# 'File_Name' - name of the file e.g. 'C1_DS.wav'
# 'Group' - Category e.g. AWS and AWNS, CWS and CWNS
# 'Age' - Age - unit as you input 
# 'Sex' - as you input e.g. F or M
# 'Total_words_expected' - total number of words in the passage to be read
# 'Words_missing_at_end' - words missing at the end of the passage, in case fewer words than expected were read
# 'Final_word_count' - total number of words read present in the acoustic file
# 'Speech Time Threshold_ms' - minimum speech event selection threshold (temporal)
# 'Pause Time Threshold_ms' - minimum pause event selection threshold (temporal)
# 'Percent_Pause' - percent of (total duration of all pause events in file/ total duration of file)
# 'Percent_Speech' - percent of (total duration of all speech events in file/ total duration of file)
# 'Total_Duration_Unclipped_s' - Total original file duration
# 'Total_Duration_Clipped_s' - Total final file duration (for analysis) - based on the start and stop time input from the csv file
# 'Speech_Duration_s' - total duration of all speech events in file
# 'Pause_Duration_s' - total duration of all pause events in file
# 'Speech_Events'- speech events total count
# 'Pause_Events' - pause events total counts
# 'Mean Speech_s' - mean duration metric of all the speech event durations
# 'Std Dev Speech' - standard deviation metric of all the speech event durations
# 'CV Speech' - coefficient of variation (CV) of all the speech event durations
# 'Mean Pause_s' - mean duration metric of all the pause event durations
# 'Std Dev Pause' - standard deviation metric of all the pause event durations
# 'CV Pause' - coefficient of variation (CV) of all the pause event durations
# 'All_events_durations' - all the individual event durations
# 'long_p_durations' - all the individual event (long pause) durations
# 'short_p_durations'- all the individual event (short pause) durations
# 'long_p_count' - long pause events total counts
# 'short_p_count' - short pause events total counts
# 'long_p_durations_mean' - mean duration metric of all the long pause event durations
# 'short_p_durations_mean' - mean duration metric of all the short pause event durations
# 'long_p_durations_cv' - coefficient of variation (CV) of all the long pause event durations
# 'short_p_durations_cv' - coefficient of variation (CV) of all the short pause event durations
# 'Speech_Rate'- speech rate (total words read/ total file time)
# 'ID' - redundant - file name present for the files for which %SS scores and other clinical assessment related scores are available
# 'No_of_stuttered_syllables' - no of stuttered syllables in the speech file
# 'No_of_total_syllables' - no of total syllables in the speech file
# 'Percent_syllables_stuttered' - (No_of_stuttered_syllables/No_of_total_syllables)
# 'Score' - SSI-4 score