In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
import statistics
from scipy.stats import variation
from scipy.stats import mannwhitneyu
from scipy.stats import mannwhitneyu, shapiro
import matplotlib.pyplot as plt

In [None]:
# Participant-level table: participant details, SSI score details, speech rate,
# and the long/short pause metrics.
PARTICIPANT_CSV = '..../Stat_csv_files/AWNS_AWS_all_details.csv'  # Change path as required
par_details = pd.read_csv(PARTICIPANT_CSV)

In [3]:
# Show the available column names; the analyses below select metrics by these names.
par_details.columns

Index(['File_Name', 'Group', 'Age', 'Sex', 'Total_words_expected',
       'Words_missing_at_end', 'Final_word_count', 'Speech Time Threshold_ms',
       'Pause Time Threshold_ms', 'Percent_Pause', 'Percent_Speech',
       'Total_Duration_Unclipped_s', 'Total_Duration_Clipped_s',
       'Speech_Duration_s', 'Pause_Duration_s', 'Speech_Events',
       'Pause_Events', 'Mean Speech_s', 'Std Dev Speech', 'CV Speech',
       'Mean Pause_s', 'Std Dev Pause', 'CV Pause', 'All_events_durations',
       'long_p_durations', 'short_p_durations', 'long_p_count',
       'short_p_count', 'long_p_durations_mean', 'short_p_durations_mean',
       'long_p_durations_cv', 'short_p_durations_cv', 'Event_type',
       'Speech_Rate', 'ID', 'No_of_stuttered_syllables',
       'No_of_total_syllables', 'Percent_syllables_stuttered', 'Score'],
      dtype='object')

In [4]:
def compare_groups_statistical_analysis(df, column_name, group_col, plot_title, y_label, 
                                        unit_conversion_factor=1, plot_figure=1, alternative='greater', 
                                        group_order='AWNS-AWS'):
    """
    Compare the AWS and AWNS groups on one metric with a Mann-Whitney U test.

    Prints the group means, a Shapiro-Wilk normality test per group, the
    Mann-Whitney U statistic with its p-value, and a rank-biserial effect size.
    Optionally draws a box plot (AWNS left, AWS right) annotated with
    significance stars.

    Missing values (NaN) in ``column_name`` are dropped before any statistic is
    computed, so the means, the normality tests and the U test all use the same
    observations.

    Parameters:
    df (DataFrame): The DataFrame containing participant data.
    column_name (str): The metric column to be compared.
    group_col (str): The column indicating the group ('AWS' or 'AWNS').
    plot_title (str): Title for the plot.
    y_label (str): Label for the y-axis.
    unit_conversion_factor (float, optional): Factor the metric is multiplied by
        (e.g. 1000 to convert s to ms). Default is 1 (no conversion).
    plot_figure (int, optional): 1 to plot the figure, 0 to skip plotting. Default is 1 (plot).
    alternative (str, optional): 'less', 'greater', or 'two-sided' (case-insensitive).
        The alternative hypothesis is about the FIRST group of ``group_order``
        relative to the second.
    group_order (str, optional): 'AWNS-AWS' (default) passes AWNS as the first
        sample to the test; 'AWS-AWNS' passes AWS first.

    Returns:
    None

    Raises:
    ValueError: If ``alternative`` or ``group_order`` is not one of the accepted
        values, or if either group has no non-missing values.

    Notes:
    The effect size is computed from the U statistic of the first sample as
    ``2U/(n1*n2) - 1`` for 'greater' and as ``1 - 2U/(n1*n2)`` for 'less' and
    'two-sided', i.e. the sign is flipped for the latter two.
    """
    # Validate options up front so an unusual spelling cannot leave the effect
    # size undefined further down.
    alternative = str(alternative).lower()
    if alternative not in ('greater', 'less', 'two-sided'):
        raise ValueError("Invalid alternative. Use 'less', 'greater', or 'two-sided'.")
    if group_order not in ('AWNS-AWS', 'AWS-AWNS'):
        raise ValueError("Invalid group_order. Use 'AWNS-AWS' or 'AWS-AWNS'.")

    # Split data into AWS and AWNS groups, dropping missing values so that every
    # statistic below is based on the same observations.
    AWS_data = df.loc[df[group_col] == 'AWS', column_name].dropna() * unit_conversion_factor
    AWNS_data = df.loc[df[group_col] == 'AWNS', column_name].dropna() * unit_conversion_factor

    if AWS_data.empty or AWNS_data.empty:
        raise ValueError(
            f"Both groups need at least one non-missing value in '{column_name}' "
            f"(AWS: {len(AWS_data)}, AWNS: {len(AWNS_data)})."
        )

    # Compute the mean for each group
    AWS_mean = AWS_data.mean()
    AWNS_mean = AWNS_data.mean()

    print(f"Mean for AWS: {AWS_mean}")
    print(f"Mean for AWNS: {AWNS_mean}")

    # Perform Shapiro-Wilk test for normality
    AWS_shapiro = shapiro(AWS_data)
    AWNS_shapiro = shapiro(AWNS_data)
    print(f"Shapiro-Wilk Test for AWS: {AWS_shapiro}")
    print(f"Shapiro-Wilk Test for AWNS: {AWNS_shapiro}")

    # Perform Mann-Whitney U test; group_order decides which group is sample 1
    if group_order == 'AWNS-AWS':
        first_sample, second_sample = AWNS_data, AWS_data
    else:
        first_sample, second_sample = AWS_data, AWNS_data
    U, p_value = mannwhitneyu(first_sample, second_sample, alternative=alternative)
    n1, n2 = len(first_sample), len(second_sample)

    print(f"Mann-Whitney U Test: U-statistic = {U}, p-value = {p_value}")

    # Rank-biserial correlation from U; sign is flipped for 'less'/'two-sided'
    u_fraction = (2 * U) / (n1 * n2)
    if alternative == 'greater':
        r = u_fraction - 1
    else:
        r = 1 - u_fraction

    print(f"Effect Size (Rank-Biserial Correlation): {r:.3f}")

    # Plot the results if requested
    if plot_figure == 1:
        fig = plt.figure(figsize=(5, 4))
        ax = fig.add_axes([0, 0, 1, 1])
        bp = ax.boxplot([AWNS_data, AWS_data], patch_artist=True)

        # Set colors for the box plots (AWNS first, AWS second)
        colors = ['yellowgreen', 'cornflowerblue']
        for patch, color in zip(bp['boxes'], colors):
            patch.set_facecolor(color)
        for median in bp['medians']:
            median.set(color='black', linewidth=3)

        # Customize the plot
        ax.set_title(plot_title, fontsize=20)
        ax.set_xticks([1, 2])
        ax.set_xticklabels(['AWNS', 'AWS'], fontsize=20)
        ax.tick_params(axis='y', labelsize=15)
        ax.set_ylabel(y_label, fontsize=20)

        # Significance stars, centred between the two boxes
        if p_value < 0.001:
            sig_symbol = '***'
        elif p_value < 0.01:
            sig_symbol = '**'
        elif p_value < 0.05:
            sig_symbol = '*'
        else:
            sig_symbol = ''

        # Place the stars 17% of the y-range below the top of the axes
        bottom, top = ax.get_ylim()
        text_height = top - 0.17 * (top - bottom)
        ax.text(1.5, text_height, sig_symbol, ha='center', va='bottom', color='k', fontsize=25)
        plt.show()


In [1]:
# Speech rate: one-sided test that AWNS > AWS (default group order 'AWNS-AWS').
# The y-axis is labelled as a rate, not a time, since the metric is 'Speech_Rate'.
compare_groups_statistical_analysis(
    par_details,
    'Speech_Rate',
    'Group',
    'Speech Rate',
    'Speech Rate (wpm)',
    unit_conversion_factor=1,
    plot_figure=1,
    alternative='greater',
)


In [2]:
# Total pause time: one-sided test that AWS > AWNS (AWS passed as the first sample).
compare_groups_statistical_analysis(
    df=par_details,
    column_name='Pause_Duration_s',
    group_col='Group',
    plot_title='Total Pause Time',
    y_label='Time (s)',
    unit_conversion_factor=1,
    plot_figure=1,
    alternative='greater',
    group_order='AWS-AWNS',
)


In [3]:
# Pause count: one-sided test that AWNS < AWS (default group order); no figure.
compare_groups_statistical_analysis(
    df=par_details,
    column_name='Pause_Events',
    group_col='Group',
    plot_title='Pause Count',
    y_label='No. of Pauses',
    unit_conversion_factor=1,
    plot_figure=0,
    alternative='less',
)

In [4]:
# Mean pause duration, scaled by 1000 (s -> ms): one-sided test that AWS > AWNS.
compare_groups_statistical_analysis(
    df=par_details,
    column_name='Mean Pause_s',
    group_col='Group',
    plot_title='Mean Pause Duration',
    y_label='Time (ms)',
    unit_conversion_factor=1000,
    plot_figure=1,
    alternative='greater',
    group_order='AWS-AWNS',
)

In [5]:
# Mean vocal duration, scaled by 1000 (s -> ms): one-sided test that AWNS > AWS; no figure.
compare_groups_statistical_analysis(
    df=par_details,
    column_name='Mean Speech_s',
    group_col='Group',
    plot_title='Mean Vocal Duration',
    y_label='Time (ms)',
    unit_conversion_factor=1000,
    plot_figure=0,
    alternative='greater',
)


In [6]:
# Pause duration variability (CV): one-sided test that AWNS < AWS; no figure.
compare_groups_statistical_analysis(
    df=par_details,
    column_name='CV Pause',
    group_col='Group',
    plot_title='Pause Duration Variability',
    y_label='Coefficient of Variation',
    unit_conversion_factor=1,
    plot_figure=0,
    alternative='less',
)


In [7]:
# Vocal duration variability (CV): one-sided test that AWNS < AWS; no figure.
compare_groups_statistical_analysis(
    df=par_details,
    column_name='CV Speech',
    group_col='Group',
    plot_title='Vocal Duration Variability',
    y_label='Coefficient of Variation',
    unit_conversion_factor=1,
    plot_figure=0,
    alternative='less',
)


## Long and Short Pause Stats

In [8]:
# Long pause count: two-sided test between AWNS and AWS.
compare_groups_statistical_analysis(
    df=par_details,
    column_name='long_p_count',
    group_col='Group',
    plot_title='Long Pause Count',
    y_label='No. of Pauses',
    unit_conversion_factor=1,
    plot_figure=1,
    alternative='two-sided',
)


In [9]:
# Short pause count: one-sided test that AWNS < AWS; no figure.
compare_groups_statistical_analysis(
    df=par_details,
    column_name='short_p_count',
    group_col='Group',
    plot_title='Short Pause Count',
    y_label='No. of Pauses',
    unit_conversion_factor=1,
    plot_figure=0,
    alternative='less',
)


In [10]:
# Mean long-pause duration, scaled by 1000 (s -> ms): two-sided test; no figure.
compare_groups_statistical_analysis(
    df=par_details,
    column_name='long_p_durations_mean',
    group_col='Group',
    plot_title='Mean Long Pauses Duration',
    y_label='Time (ms)',
    unit_conversion_factor=1000,
    plot_figure=0,
    alternative='two-sided',
)


In [11]:
# Mean short-pause duration, scaled by 1000 (s -> ms): one-sided test that AWNS < AWS.
compare_groups_statistical_analysis(
    df=par_details,
    column_name='short_p_durations_mean',
    group_col='Group',
    plot_title='Mean Short Pauses Duration',
    y_label='Time (ms)',
    unit_conversion_factor=1000,
    plot_figure=1,
    alternative='less',
)


In [12]:
# Long-pause duration variability (CV): one-sided test that AWNS < AWS.
compare_groups_statistical_analysis(
    df=par_details,
    column_name='long_p_durations_cv',
    group_col='Group',
    plot_title='Long Pauses Duration Variability',
    y_label='Coefficient of Variability',
    unit_conversion_factor=1,
    plot_figure=1,
    alternative='less',
)


In [13]:
# Short-pause duration variability (CV): one-sided test that AWNS < AWS; no figure.
compare_groups_statistical_analysis(
    df=par_details,
    column_name='short_p_durations_cv',
    group_col='Group',
    plot_title='Short Pauses Duration Variability',
    y_label='Coefficient of Variability',
    unit_conversion_factor=1,
    plot_figure=0,
    alternative='less',
)


In [18]:
# Benjamini-Hochberg procedure to control the false discovery rate (FDR) across the multiple group comparisons

In [39]:
import numpy as np

def benjamini_hochberg(p_values, fdr=0.1):
    """
    Perform the Benjamini-Hochberg step-up procedure for controlling the false discovery rate.

    The p-values are sorted in ascending order and the i-th smallest (1-based) is
    compared with ``fdr * i / n``. If k is the largest rank whose p-value is at or
    below its threshold, the k smallest p-values are declared significant; if no
    rank qualifies, nothing is declared significant.

    Parameters:
    - p_values (list or np.array): 1-D collection of p-values to correct.
    - fdr (float): Desired false discovery rate.

    Returns:
    - np.array: Boolean array, in the same order as ``p_values``, representing
      whether each test is significant. Empty input yields an empty array.
    """
    p_values = np.asarray(p_values, dtype=float)
    n = p_values.size
    if n == 0:
        return np.zeros(0, dtype=bool)

    order = np.argsort(p_values)
    thresholds = fdr * np.arange(1, n + 1) / n
    passes = p_values[order] <= thresholds

    # Step-up rule: everything up to the LAST passing rank is rejected. When no
    # rank passes, nothing is rejected (the smallest p-value must not be flagged).
    reject_sorted = np.zeros(n, dtype=bool)
    if passes.any():
        last_passing_rank = np.flatnonzero(passes).max()
        reject_sorted[:last_passing_rank + 1] = True

    # Undo the sort so the result lines up with the caller's p-value order.
    reject = np.empty(n, dtype=bool)
    reject[order] = reject_sorted
    return reject


In [None]:
# Hard-coded p-values, one per entry (13 in total) -- presumably copied from the
# Mann-Whitney outputs of the group comparisons above; verify after any re-run.
p_values = [
    0.0026442424197612104,
    0.03871779925087618,
    0.0640899161502946,
    0.041569327595409156,
    0.35213730660438886,
    0.5065832298410297,
    0.003943189481594282,
    0.030410340420505187,
    0.48023395335819274,
    0.6322436309019092,
    0.022923590191909884,
    0.12742047790214783,
    0.7711412827862769,
]
is_significant = benjamini_hochberg(p_values, fdr=0.1)

print(f"P-values: {p_values}")
print(f"Significant at FDR=0.1: {is_significant}")

In [None]:
## Correlations with %SS (percent syllables stuttered)

In [None]:
# %SS: percent syllables stuttered, AWS participants only
aws_stuttered_percent = par_details.loc[
    par_details['Group'] == 'AWS',
    'Percent_syllables_stuttered',
]

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import pearsonr
from sklearn.linear_model import LinearRegression

def analyze_and_plot_correlation(x, y, x_label="X-axis", y_label="Y-axis", lp1=0.05, lp2=0.95):
    """
    Correlate two variables, fit a straight line, and show both on a scatter plot.

    Prints the Pearson correlation coefficient and its p-value, fits a
    LinearRegression of y on x, and draws a scatter plot with the fitted line and
    an "r / p-value" text label.

    Parameters:
    x (array-like): Values for the horizontal axis (the regression predictor).
    y (array-like): Values for the vertical axis (the regression response).
    x_label (str, optional): Label for the x-axis.
    y_label (str, optional): Label for the y-axis.
    lp1 (float, optional): Horizontal position of the text label, in axes
        coordinates (0 = left edge, 1 = right edge).
    lp2 (float, optional): Vertical position of the text label, in axes
        coordinates (0 = bottom, 1 = top).

    Returns:
    dict: Keys "correlation_coefficient", "p_value" and "regression_model"
        (the fitted LinearRegression instance).
    """
    # Pearson correlation between the two variables
    corr_coeff, p_value = pearsonr(x, y)
    print("Correlation coefficient:", corr_coeff)
    print("p-value:", p_value)

    # scikit-learn expects a 2-D predictor matrix, hence the single-column reshape
    predictor_matrix = np.array(x).reshape(-1, 1)
    model = LinearRegression().fit(predictor_matrix, y)

    # Scatter of the raw observations
    plt.figure(figsize=(10, 5))
    plt.scatter(x, y, alpha=0.7)

    # Fitted line, evaluated at the sorted distinct x values
    line_x = np.unique(x)
    line_y = model.predict(line_x.reshape(-1, 1))
    plt.plot(line_x, line_y, color='black')

    # Axis labels and tick sizes
    plt.xlabel(x_label, fontsize=20)
    plt.ylabel(y_label, fontsize=20)
    plt.xticks(fontsize=15)
    plt.yticks(fontsize=15)

    # p-value text: "< .001" for very small values, otherwise three decimals
    # with the leading zero stripped
    if p_value < 0.001:
        p_text = '< .001'
    else:
        p_text = f'{p_value:.3f}'.lstrip('0')

    # transAxes makes (lp1, lp2) fractions of the axes rather than data values
    plt.text(lp1, lp2, f"r: {corr_coeff:.2f}, p-value: {p_text}",
             fontsize=15, ha='left', va='top', fontweight='bold', style='italic',
             transform=plt.gca().transAxes)

    plt.show()

    results = {
        "correlation_coefficient": corr_coeff,
        "p_value": p_value,
        "regression_model": model,
    }
    return results


In [None]:
# Total pause time vs. %SS within the AWS group
total_pause_aws = par_details.loc[par_details['Group'] == 'AWS', 'Pause_Duration_s']
result1 = analyze_and_plot_correlation(
    total_pause_aws,
    aws_stuttered_percent,
    x_label="Total Pause Time (sec)",
    y_label="%Syllables Stuttered",
)


In [None]:
# Mean vocal duration vs. %SS within the AWS group.
# 'Mean Speech_s' is in seconds; scale by 1000 so the plotted values match the
# "(ms)" axis label. Pearson's r and its p-value are unchanged by this rescaling.
mean_vocal_duration_aws = par_details[par_details['Group'] == 'AWS']['Mean Speech_s'] * 1000
result2 = analyze_and_plot_correlation(mean_vocal_duration_aws, aws_stuttered_percent, 
                                      x_label="Mean Vocal Duration (ms)", 
                                      y_label="%Syllables Stuttered",lp1 = 0.58,lp2 = 0.95)


In [None]:
# Speech rate vs. %SS within the AWS group (text label moved to the right half of the axes)
speech_rate_aws = par_details.loc[par_details['Group'] == 'AWS', 'Speech_Rate']
result3 = analyze_and_plot_correlation(
    speech_rate_aws,
    aws_stuttered_percent,
    x_label="Speech Rate (wpm)",
    y_label="%Syllables Stuttered",
    lp1=0.58,
    lp2=0.95,
)


In [None]:
# Long pause count vs. %SS within the AWS group
long_p_count_aws = par_details.loc[par_details['Group'] == 'AWS', 'long_p_count']
result4 = analyze_and_plot_correlation(
    long_p_count_aws,
    aws_stuttered_percent,
    x_label="Long Pause Count",
    y_label="%Syllables Stuttered",
)


In [None]:
# Short pause count vs. %SS within the AWS group
short_p_count_aws = par_details.loc[par_details['Group'] == 'AWS', 'short_p_count']
result5 = analyze_and_plot_correlation(
    short_p_count_aws,
    aws_stuttered_percent,
    x_label="Short Pause Count",
    y_label="%Syllables Stuttered",
)


In [None]:
## Combined Plot

In [None]:
# Long and short pause counts for the AWNS group
awns_rows = par_details[par_details['Group'] == 'AWNS']
long_p_count_awns = awns_rows['long_p_count']
short_p_count_awns = awns_rows['short_p_count']

In [None]:
# Convert the four pause-count Series to NumPy arrays (positional indexing from here on)
long_p_count_aws, short_p_count_aws, long_p_count_awns, short_p_count_awns = (
    np.array(counts)
    for counts in (long_p_count_aws, short_p_count_aws, long_p_count_awns, short_p_count_awns)
)

In [None]:
# Hand-tuned annotation layout, one entry per AWS participant.
# x_p / y_p are the data coordinates each arrow points at -- presumably the AWS
# short/long pause counts in row order; verify against the data after any change.
x_p = [16, 24, 21, 19, 16, 33, 30, 24, 83, 59, 16, 32, 50, 22, 31, 21, 27]
y_p = [32, 47, 35, 24, 32, 56, 70, 65, 169, 113, 25, 32, 99, 29, 55, 21, 37]

# Label offsets from the anchor point, in points (textcoords='offset points').
xy_1 = [10,10,10,10,10,50,10,10,10,10,-20,10,10,-1,10,10,10]
xy_2 = [80,50,50,300,50,70,50,50,10,50,130,50,50,240,50,10,50]

# Annotation text: %SS of each AWS participant, rounded to two decimals.
y_names = round(aws_stuttered_percent, 2)

# The layout lists are indexed in parallel with y_names; fail early with a clear
# message instead of an IndexError halfway through drawing.
if not (len(y_names) == len(x_p) == len(y_p) == len(xy_1) == len(xy_2)):
    raise ValueError(
        "Annotation layout lists (x_p, y_p, xy_1, xy_2) must have one entry per AWS participant."
    )

plt.figure(figsize=(10,10))
plt.scatter(short_p_count_awns,long_p_count_awns,color='yellowgreen', s=100,label='AWNS')
plt.scatter(short_p_count_aws,long_p_count_aws,color= 'cornflowerblue', s=100,label='AWS')
plt.legend(fontsize=20)

for i, txt in enumerate(y_names):
    # Label the point with its %SS value and draw an arrow from the label to the point
    plt.annotate(txt, (x_p[i], y_p[i]),
                xytext=(xy_1[i],xy_2[i]),
                fontsize=15,textcoords='offset points',
                ha='center',
                arrowprops=dict(arrowstyle='->', connectionstyle='arc3', color='black', lw=1,  shrinkA=0, shrinkB=0, relpos=(0,0)))

plt.title('Long and Short Pauses (% Stuttered Syllables)',fontsize=20)
plt.xlabel('Short Pause Count',fontsize=20)
plt.ylabel('Long Pause Count',fontsize=20)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)

plt.show()