In [None]:
def merge_dataframes_on_filename(df_all, AWNS_par, AWS_par):
    """
    Attach participant data from AWNS_par or AWS_par to a copy of df_all.

    Each 'File_Name' in df_all has its last dot-suffix stripped and the result
    is looked up in the 'ID' column of AWNS_par first and of AWS_par second.
    The non-'ID' values of the first matching row are copied into that row of
    the result. Columns that do not exist yet are inserted directly after
    'File_Name', in the order of the matching table, and start out as pd.NA.
    Rows without a match in either table are left untouched.

    Parameters:
    df_all (DataFrame): Main dataframe with a 'File_Name' column (e.g. 'x.wav').
    AWNS_par (DataFrame): Lookup table with an 'ID' column holding names without extension.
    AWS_par (DataFrame): Second lookup table with the same kind of 'ID' column.

    Returns:
    DataFrame: A copy of df_all extended with the matched values; df_all itself is not changed.
    """
    merged = df_all.copy()

    for row_label, record in merged.iterrows():
        # Everything before the last '.' is the key used in the 'ID' columns
        stem = record['File_Name'].rsplit('.', 1)[0]

        awns_hit = AWNS_par[AWNS_par['ID'] == stem]
        aws_hit = AWS_par[AWS_par['ID'] == stem]

        # AWNS_par takes precedence when the ID is present in both tables
        if not awns_hit.empty:
            source = awns_hit
        elif not aws_hit.empty:
            source = aws_hit
        else:
            continue

        # New columns go right after 'File_Name', keeping the source order
        insert_at = merged.columns.get_loc('File_Name') + 1
        for name in source.columns:
            if name == 'ID':
                continue
            if name not in merged.columns:
                merged.insert(insert_at, name, pd.NA)
                insert_at += 1
            # Only the first matching row of the lookup table is used
            merged.loc[row_label, name] = source[name].values[0]

    return merged

# Define the function to process pauses and update the DataFrame
def process_pause_durations(file_need, df, i, pause_threshold):
    """
    Summarise the pause events of one event table and store them in row i of df.

    The table read from file_need must provide a 'Labels' and a 'Time_diff'
    column. Events labelled 'p' are pauses: those with
    Time_diff >= pause_threshold are long pauses, the remaining ones are short
    pauses. df is modified in place and nothing is returned.

    Parameters:
    file_need: Path or file-like object that pd.read_csv can read.
    df (DataFrame): Frame receiving the results; missing result columns are created (filled with None).
    i: Index label of the row in df to update.
    pause_threshold: Boundary between short and long pauses, in the units of 'Time_diff'.

    Columns written for row i:
    All_events_durations: Comma-separated 'Time_diff' of every event, in file order.
    Event_type: Comma-separated code per event (2 = long pause, 1 = short pause, 0 = anything else).
    long_p_durations / short_p_durations: Comma-separated durations of each pause kind.
    long_p_count / short_p_count: Number of pauses of each kind.
    long_p_durations_mean / short_p_durations_mean: Mean duration, 0 when there are none.
    long_p_durations_cv / short_p_durations_cv: Result of variation() on the durations, 0 when there are none.
    """
    # Create every result column up front so they all share the same None
    # placeholder; 'Event_type' is last to keep the established column order.
    result_columns = (
        'All_events_durations',
        'long_p_durations',
        'short_p_durations',
        'long_p_count',
        'short_p_count',
        'long_p_durations_mean',
        'short_p_durations_mean',
        'long_p_durations_cv',
        'short_p_durations_cv',
        'Event_type',
    )
    for column in result_columns:
        if column not in df.columns:
            df[column] = None

    # Load the corresponding CSV file
    ddf = pd.read_csv(file_need)

    # Classify the events once; the masks are reused for durations and codes
    is_pause = ddf['Labels'] == 'p'
    is_long = is_pause & (ddf['Time_diff'] >= pause_threshold)
    is_short = is_pause & (ddf['Time_diff'] < pause_threshold)

    long_p_dur = ddf.loc[is_long, 'Time_diff'].tolist()
    short_p_dur = ddf.loc[is_short, 'Time_diff'].tolist()

    # Mean and CV are undefined for an empty list, so fall back to 0
    long_p_mean = statistics.mean(long_p_dur) if long_p_dur else 0
    long_p_cv = variation(long_p_dur) if long_p_dur else 0
    short_p_mean = statistics.mean(short_p_dur) if short_p_dur else 0
    short_p_cv = variation(short_p_dur) if short_p_dur else 0

    all_events_dur = ddf['Time_diff'].values

    # One code per event: 2 = long pause, 1 = short pause, 0 = everything else
    pause_type = np.where(is_long, 2, np.where(is_short, 1, 0))

    # Sequences are stored as comma-separated strings so each fits in one cell
    df.at[i, 'All_events_durations'] = ','.join(map(str, all_events_dur))
    df.at[i, 'long_p_durations'] = ','.join(map(str, long_p_dur))
    df.at[i, 'short_p_durations'] = ','.join(map(str, short_p_dur))
    df.at[i, 'Event_type'] = ','.join(map(str, pause_type))
    df.at[i, 'long_p_count'] = len(long_p_dur)
    df.at[i, 'short_p_count'] = len(short_p_dur)
    df.at[i, 'long_p_durations_mean'] = long_p_mean
    df.at[i, 'short_p_durations_mean'] = short_p_mean
    df.at[i, 'long_p_durations_cv'] = long_p_cv
    df.at[i, 'short_p_durations_cv'] = short_p_cv
    
def calculate_speech_rate(df_all_1):
    """
    Add a 'Speech_Rate' column to df_all_1 in place.

    The rate is 'Final_word_count' divided by 'Total_Duration_Clipped_s',
    multiplied by 60 (words per minute, assuming the duration column is in
    seconds as its name suggests). Nothing is returned.
    """
    word_counts = df_all_1["Final_word_count"]
    durations = df_all_1["Total_Duration_Clipped_s"]
    words_per_unit_time = word_counts / durations
    df_all_1["Speech_Rate"] = words_per_unit_time * 60


def merge_ssi_scores(df_all_1, ssi_scores):
    """
    Merge SSI scores for AWS participants into df_all_1.
    ID in SSI corresponds to File_Name in df_all_1 but without the .wav extension.

    The merge is a left join, so every row of df_all_1 is kept. Score columns
    are blanked (pd.NA) for rows whose 'Group' is 'AWNS', even if ssi_scores
    holds an entry for them. The 'ID' column of ssi_scores is part of the result.

    Parameters:
    df_all_1 (DataFrame): Frame with 'File_Name' and 'Group' columns; it is not modified.
    ssi_scores (DataFrame): Frame with an 'ID' column plus the score columns.

    Returns:
    DataFrame: A new frame with the columns of df_all_1 followed by those of ssi_scores.
    """
    # Build the join key on a copy: assigning it to df_all_1 directly would
    # leave a stray 'File_ID' column behind in the caller's frame.
    keyed = df_all_1.assign(
        File_ID=df_all_1['File_Name'].str.replace('.wav', '', regex=False)
    )

    # Merge the SSI scores based on File_ID and ID in SSI scores
    merged = pd.merge(keyed, ssi_scores, left_on='File_ID', right_on='ID', how='left')

    # Replace SSI scores with NaN for AWNS participants
    score_columns = ssi_scores.columns.difference(['ID'])
    merged.loc[merged['Group'] == 'AWNS', score_columns] = pd.NA

    # The join key is only a helper and is not part of the result
    return merged.drop(columns=['File_ID'])
