In [7]:
import pandas as pd
import os

# Check the current working directory
print("Current Working Directory:", os.getcwd())

# Update these file paths based on the location of your CSV files
audio_files_path = "csv/combined_ratings_for_both_sets.csv"  # Replace with full path if not in current directory
combined_praat_path = "csv/combined_praat.csv"         # Replace with full path if not in current directory

# Read the CSV files
audio_files_df = pd.read_csv(audio_files_path)
combined_praat_df = pd.read_csv(combined_praat_path)

# Drop the "Fluency" column from combined_praat_df
if "Fluency" in combined_praat_df.columns:
    combined_praat_df = combined_praat_df.drop(columns=["Fluency"])

# Merge the two dataframes on the "name" column
merged_df = pd.merge(audio_files_df, combined_praat_df, on="name", how="inner")

merged_df


Current Working Directory: c:\Users\zrack\fluency_classification


Unnamed: 0,name,Fluency,Fluency2,nsyll,npause,dur(s),phonationtime(s),speechrate(nsyll/dur),articulation_rate(nsyll/phonationtime),ASD(speakingtime/nsyll),nrFP,tFP(s)
0,audio_33,A2,C1,181,68,167.11,93.72,1.08,1.93,0.518,55,17.529
1,audio_33,A2,C1,110,0,44.00,44.00,2.50,2.50,0.400,10,2.297
2,audio_92,B1,B1,221,11,67.11,58.87,3.29,3.75,0.266,89,17.050
3,audio_92,B1,B1,27,2,11.00,7.16,2.45,3.77,0.265,4,0.875
4,audio_100,B1,A2,76,13,34.00,19.89,2.24,3.82,0.262,22,5.078
...,...,...,...,...,...,...,...,...,...,...,...,...
184,audio_2133,A1,A2,51,12,38.33,15.48,1.33,3.30,0.303,18,4.123
185,audio_2144,B1,C1,29,4,9.85,6.91,2.94,4.20,0.238,6,1.075
186,audio_2172,A2,B2,67,12,23.96,17.01,2.80,3.94,0.254,15,2.875
187,audio_2081,A2,B2,116,16,42.36,29.36,2.74,3.95,0.253,21,5.009


In [8]:
import pandas as pd

# Load the dataset
file_path = 'audio/Human Labeled Proficiency'
# data = pd.read_csv(file_path)

data=merged_df
# Take a random sample of 10% of the lines
sampled_data = data

# Specify the relevant columns
columns = [
    "speechrate(nsyll/dur)", "articulation_rate(nsyll/phonationtime)", 
    "npause", "dur(s)", "ASD(speakingtime/nsyll)", "nrFP"
]

# Map column names to labels for the formatted output
column_labels = {
    "speechrate(nsyll/dur)": "Speech Rate",
    "articulation_rate(nsyll/phonationtime)": "Articulation Rate",
    "npause": "Number of Pauses",
    "dur(s)": "Duration",
    "ASD(speakingtime/nsyll)": "ASD (speaking time/nsyll)",
    "nrFP": "Number of Filled Pauses"
}

# Format the strings for each sampled row
sampled_strings = sampled_data[columns].apply(
    lambda row: '\n'.join([f"{column_labels[col]}: {row[col]}" for col in columns]), axis=1
).tolist()


In [9]:
# Define the function to sample based on fluency levels
def sample_balanced_groups(data, fluency_col_1, fluency_col_2, groups):
    """
    Samples balanced groups from the dataset based on fluency levels.

    Args:
        data (pd.DataFrame): The dataset to sample from.
        fluency_col_1 (str): The name of the first fluency column.
        fluency_col_2 (str): The name of the second fluency column.
        groups (dict): A dictionary defining the fluency groups to sample from.

    Returns:
        pd.DataFrame: A DataFrame containing balanced samples from each group.
    """
    # Filter rows for each fluency group
    sampled_groups = []
    for group_name, group_values in groups.items():
        group_data = data[
            (data[fluency_col_1].isin(group_values)) | (data[fluency_col_2].isin(group_values))
        ]
        sampled_groups.append(group_data)

    # Find the minimum group size
    min_group_size = min(len(group) for group in sampled_groups)

    # Sample the same number of rows from each group
    balanced_samples = pd.concat([group.sample(min_group_size, random_state=42) for group in sampled_groups])
    return balanced_samples.reset_index(drop=True)

# Define fluency groups
fluency_groups = {
    "A1/A2": ["A1", "A2"],
    "B1/B2": ["B1", "B2"],
    "C1/C2": ["C1", "C2"]
}

# Apply the function to sample balanced groups
balanced_sampled_data = sample_balanced_groups(
    data=data,
    fluency_col_1="Fluency",
    fluency_col_2="Fluency2",
    groups=fluency_groups
)

print(balanced_sampled_data)


           name Fluency Fluency2  nsyll  npause  dur(s)  phonationtime(s)  \
0     audio_925      A1       B2     35       1   13.05              8.74   
1    audio_1309      A2       C1    103      29   44.91             24.31   
2    audio_1195      A1       B1     28       7   11.83              6.30   
3    audio_1801      A1       B2     46      17   33.60             12.61   
4    audio_1509      B1       A2     21       2    9.67              5.17   
..          ...     ...      ...    ...     ...     ...               ...   
169  audio_1502      A2       C1     95      15   40.54             25.20   
170   audio_629      B2       C1     19       5    8.60              5.83   
171  audio_1066      B2       C1     67       7   22.83             16.66   
172  audio_1811      B2       C2     47       6   15.21             11.04   
173  audio_1243      C1       C1    295      53  106.84             73.84   

     speechrate(nsyll/dur)  articulation_rate(nsyll/phonationtime)  \
0    

In [10]:
from openai import OpenAI

# Set OpenAI API key
openai = OpenAI(api_key="")


In [11]:
# Add Fluency and Fluency2 to the sampled dataframe
columns_with_fluency = columns + ["Fluency", "Fluency2"]

In [12]:
from openai import OpenAI
from pydantic import BaseModel
from scipy.stats import kendalltau

# Define the structured output model for responses
class FluencyJudgment(BaseModel):
    logic: str
    speech_rate: str
    articulation_rate: str
    number_of_pauses: str
    asd: str
    number_of_filled_pauses: str
    fluency_level: str

# Fluency mapping for CEFR levels
fluency_mapping = {
    "A1": "Beginner",
    "A2": "Beginner",
    "B1": "Intermediate",
    "B2": "Intermediate",
    "C1": "Advanced",
    "C2": "Advanced"
}

# Numeric mapping for fluency levels
fluency_numeric_mapping = {"beginner": 0, "intermediate": 1, "advanced": 2}
numeric_to_fluency = {0: "beginner", 1: "intermediate", 2: "advanced"}

# Ensure all fluency judgments and actual ratings are lowercase
ai_fluency_judgments = []
calculated_fluency_levels = []
actual_fluency_1 = []
actual_fluency_2 = []
category_analysis = []

# Function to calculate final fluency level based on category averages
def calculate_final_fluency(categories):
    numeric_values = [fluency_numeric_mapping[category.lower()] for category in categories]
    average_value = round(sum(numeric_values) / len(numeric_values))
    return numeric_to_fluency[average_value]

# Keep only rows where Fluency and Fluency2 agree after mapping
filtered_data = balanced_sampled_data[
    balanced_sampled_data.apply(
        lambda row: fluency_mapping.get(row['Fluency'], "unknown").lower() == fluency_mapping.get(row['Fluency2'], "unknown").lower(),
        axis=1
    )
]
print(filtered_data)

           name Fluency Fluency2  nsyll  npause  dur(s)  phonationtime(s)  \
11   audio_1958      A1       A2     11       2   20.96              3.02   
12   audio_1331      A1       A2     13       1   13.66              3.68   
13   audio_1918      A1       A1     19       4   10.06              5.34   
17   audio_1957      A1       A2     27       7   31.55              8.42   
18    audio_725      A1       A2     86      20   38.31             24.96   
25   audio_1675      A1       A2     53      19   38.08             16.38   
26    audio_728      A1       A2     49       4   17.93             12.51   
48   audio_1674      A1       A2     30      12   25.19              9.01   
51   audio_1668      A1       A2     33      10   22.22              9.76   
53   audio_1238      A1       A2     99      21   44.94             27.24   
57   audio_1702      A1       A2     10       3   47.30              2.75   
61    audio_788      B1       B2     53       9   22.46             15.68   

In [13]:
import random
import pandas as pd
from sklearn.model_selection import train_test_split

# Perform a train/test split
train_data, test_data = train_test_split(filtered_data, test_size=0.7, random_state=42)

# Prepare "examples" list from the training data for the LLM
examples = []
for _, row in train_data.iterrows():
    # Normalize "number_of_pauses" and "number_of_filled_pauses" by "duration"
    duration = row["dur(s)"]  # Assuming the duration column is named "dur(s)"
    normalized_number_of_pauses = row["npause"] / duration if duration else 0
    normalized_number_of_filled_pauses = row["nrFP"] / duration if duration else 0

    # Create the input string
    line = '\n'.join([
        f"speech_rate: {row['speechrate(nsyll/dur)']:.3f}",
        f"articulation_rate: {row['articulation_rate(nsyll/phonationtime)']:.3f}",
        f"number_of_pauses/duration: {normalized_number_of_pauses:.3f}",
        f"asd: {row['ASD(speakingtime/nsyll)']:.3f}",
        f"number_of_filled_pauses/duration: {normalized_number_of_filled_pauses:.3f}"
    ])
    fluency1_mapped = fluency_mapping.get(row['Fluency'], "unknown").lower()
    fluency2_mapped = fluency_mapping.get(row['Fluency2'], "unknown").lower()
    fluency_actual = {
        "Fluency_1": fluency1_mapped,
        "Fluency_2": fluency2_mapped
    }
    # Append the example to the list
    examples.append({
        "input": line,
        "fluency_1": fluency1_mapped,
        "fluency_2": fluency2_mapped
    })

# Debug print to check the examples
print("Examples for LLM:", examples[:5])  # Show the first 5 examples as a sample

# Prepare the test data for evaluation
sample_data = test_data  # The test data will be used for further processing

# Remaining code for processing the test data
# Variables to track matches and tau calculations
matches_fluency_1 = 0
matches_fluency_2 = 0
calculated_fluency_levels = []
category_analysis = []
ai_fluency_judgments = []
actual_fluency_1 = []
actual_fluency_2 = []
category_numeric_values = {name: [] for name in ["speech_rate", "articulation_rate", "number_of_pauses", "asd", "number_of_filled_pauses"]}

# Process each row in the sampled dataset (test data)
for _, row in sample_data[columns_with_fluency].iterrows():
    # Format the input string
    line = '\n'.join([
        f"{column_labels[col]}: {row[col]}" 
        for col in columns if col != 'dur(s)'
    ])
    fluency1_mapped = fluency_mapping.get(row['Fluency'], "unknown").lower()
    fluency2_mapped = fluency_mapping.get(row['Fluency2'], "unknown").lower()
    fluency_actual = {
        "Fluency_1": fluency1_mapped,
        "Fluency_2": fluency2_mapped
    }
    print(line)


Examples for LLM: [{'input': 'speech_rate: 1.810\narticulation_rate: 3.850\nnumber_of_pauses/duration: 0.319\nasd: 0.260\nnumber_of_filled_pauses/duration: 0.745', 'fluency_1': 'intermediate', 'fluency_2': 'intermediate'}, {'input': 'speech_rate: 3.300\narticulation_rate: 4.250\nnumber_of_pauses/duration: 0.089\nasd: 0.235\nnumber_of_filled_pauses/duration: 0.089', 'fluency_1': 'advanced', 'fluency_2': 'advanced'}, {'input': 'speech_rate: 2.730\narticulation_rate: 3.920\nnumber_of_pauses/duration: 0.223\nasd: 0.255\nnumber_of_filled_pauses/duration: 0.725', 'fluency_1': 'beginner', 'fluency_2': 'beginner'}, {'input': 'speech_rate: 2.210\narticulation_rate: 3.920\nnumber_of_pauses/duration: 0.426\nasd: 0.255\nnumber_of_filled_pauses/duration: 0.256', 'fluency_1': 'intermediate', 'fluency_2': 'intermediate'}, {'input': 'speech_rate: 2.760\narticulation_rate: 4.000\nnumber_of_pauses/duration: 0.496\nasd: 0.250\nnumber_of_filled_pauses/duration: 0.328', 'fluency_1': 'advanced', 'fluency_2'

In [14]:
prompt = f"""

    You are an impartial evaluator tasked with assessing the English fluency level of English learners speaking in conversational English. You will be provided with features describing their speech fluency, pausing, and pronunciation.

    ### Context:
    English fluency is assessed across five key features: speech rate, articulation rate, number of pauses, average syllable duration (ASD), and number of filled pauses. Each feature is categorized as beginner, intermediate, or advanced based on specific criteria.

    ### Scoring Criteria:

    - **Speech Rate (nsyll/dur):**
    - **Beginner**: Very slow, frequent long pauses, unnatural rhythm.
    - **Intermediate**: Moderate, some pauses, mostly natural rhythm but occasionally disrupted.
    - **Advanced**: Consistently natural rhythm, smooth, and steady.

    - **Articulation Rate (nsyll/phonationtime):**
    - **Beginner**: Low, with frequent disruptions and interruptions.
    - **Intermediate**: Moderate, with occasional disruptions but mostly smooth.
    - **Advanced**: High, fluid articulation with minimal interruptions.

    - **Average Syllable Duration (ASD) (speakingtime/nsyll):**
    - **Beginner**: High, long syllable durations, indicating unnatural rhythm.
    - **Intermediate**: Moderate, occasional unevenness in timing.
    - **Advanced**: Low, smooth, natural timing for syllables.

    ### Evaluation Process:
    1. **Analyze the Features**:
    - Assign each feature a level: beginner, intermediate, or advanced based on the rubric.
    2. **Determine the Overall Fluency Level**:
    - If most categories are beginner, classify as "Beginner."
    - If most categories are intermediate, classify as "Intermediate."
    - If most categories are advanced, classify as "Advanced."
    - In case of a tie, prioritize "speech rate" and "articulation rate."

    ### Examples for Reference:
    {examples}


    Output Format:

    Produce a JSON object with the following structure:
        "logic": your logic in arriving at your answer,
        "speech_rate": level,
        "articulation_rate": level,
        "asd": level,
        "fluency_level": "overall_level"
        
"""

In [15]:
examples

[{'input': 'speech_rate: 1.810\narticulation_rate: 3.850\nnumber_of_pauses/duration: 0.319\nasd: 0.260\nnumber_of_filled_pauses/duration: 0.745',
  'fluency_1': 'intermediate',
  'fluency_2': 'intermediate'},
 {'input': 'speech_rate: 3.300\narticulation_rate: 4.250\nnumber_of_pauses/duration: 0.089\nasd: 0.235\nnumber_of_filled_pauses/duration: 0.089',
  'fluency_1': 'advanced',
  'fluency_2': 'advanced'},
 {'input': 'speech_rate: 2.730\narticulation_rate: 3.920\nnumber_of_pauses/duration: 0.223\nasd: 0.255\nnumber_of_filled_pauses/duration: 0.725',
  'fluency_1': 'beginner',
  'fluency_2': 'beginner'},
 {'input': 'speech_rate: 2.210\narticulation_rate: 3.920\nnumber_of_pauses/duration: 0.426\nasd: 0.255\nnumber_of_filled_pauses/duration: 0.256',
  'fluency_1': 'intermediate',
  'fluency_2': 'intermediate'},
 {'input': 'speech_rate: 2.760\narticulation_rate: 4.000\nnumber_of_pauses/duration: 0.496\nasd: 0.250\nnumber_of_filled_pauses/duration: 0.328',
  'fluency_1': 'advanced',
  'flue

In [139]:
import random
from scipy.stats import kendalltau

# Check the size of filtered_data
if len(test_data) <= 20:
    sample_data = test_data  # Use all rows if there are 20 or fewer
else:
    sample_data = test_data.sample(n=20, random_state=42)  # Random sample of 20 rows

# Variables to track matches and tau calculations
matches_fluency_1 = 0
matches_fluency_2 = 0
calculated_fluency_levels = []
category_analysis = []
ai_fluency_judgments = []
actual_fluency_1 = []
actual_fluency_2 = []
category_numeric_values = {name: [] for name in [
    "speech_rate", 
    "articulation_rate", 
    "asd"
]}

# Process each row in the sampled dataset
for _, row in sample_data.iterrows():
    duration = row['dur(s)']  # Get the duration of the audio

    # Normalize only selected categories by duration
    normalized_categories = {
        "speech_rate": row["speechrate(nsyll/dur)"],  # Not normalized
        "articulation_rate": row["articulation_rate(nsyll/phonationtime)"],  # Not normalized
        "asd": row["ASD(speakingtime/nsyll)"]  # Not normalized
    }

    # Add normalized values to category_numeric_values
    for category_name, category_value in normalized_categories.items():
        category_numeric_values[category_name].append(category_value)

    # Create the input string for the LLM
    line = '\n'.join([
        f"{key}: {value:.3f}"  # Format values to 3 decimal places
        for key, value in normalized_categories.items()
    ])

    fluency1_mapped = fluency_mapping.get(row['Fluency'], "unknown").lower()
    fluency2_mapped = fluency_mapping.get(row['Fluency2'], "unknown").lower()
    fluency_actual = {
        "Fluency_1": fluency1_mapped,
        "Fluency_2": fluency2_mapped
    }

    # Use OpenAI's Structured Outputs feature
    response = openai.beta.chat.completions.parse(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": f"{prompt}"},
            {"role": "user", "content": f"{line}"}
        ],
        response_format=FluencyJudgment,
        temperature=0
    )

    response = response.choices[0].message.parsed
    ai_judgment = response.fluency_level.lower()
    logic_value = response.logic  # Extract logic for printing later
    ai_fluency_judgments.append(ai_judgment)
    actual_fluency_1.append(fluency_actual["Fluency_1"])
    actual_fluency_2.append(fluency_actual["Fluency_2"])

    # Analyze if the AI fluency level aligns with categories
    correct_categories = [
        response.speech_rate,
        response.articulation_rate,
        normalized_categories["asd"],
        response.fluency_level
    ]

    category_analysis.append({
        "AI Fluency": ai_judgment,
        "Categories": correct_categories,
        "Logic": logic_value,  # Add logic for later analysis
        "Final Match": ai_judgment == fluency_actual["Fluency_1"] or ai_judgment == fluency_actual["Fluency_2"]
    })

    # Calculate fluency level based on categorical fluency values only
    calculated_fluency = calculate_final_fluency([response.fluency_level])
    calculated_fluency_levels.append(calculated_fluency)

# Convert fluency levels to numeric values for Kendall's tau
ai_fluency_numeric = [fluency_numeric_mapping.get(level, -1) for level in ai_fluency_judgments]
calculated_fluency_numeric = [fluency_numeric_mapping.get(level, -1) for level in calculated_fluency_levels]
fluency_1_numeric = [fluency_numeric_mapping.get(level, -1) for level in actual_fluency_1]
fluency_2_numeric = [fluency_numeric_mapping.get(level, -1) for level in actual_fluency_2]

# Calculate Kendall's Tau
tau_fluency_1 = (
    kendalltau(calculated_fluency_numeric, fluency_1_numeric)[0]
    if len(set(calculated_fluency_numeric)) > 1 and len(set(fluency_1_numeric)) > 1
    else "Undefined (Insufficient Variance)"
)

tau_fluency_2 = (
    kendalltau(calculated_fluency_numeric, fluency_2_numeric)[0]
    if len(set(calculated_fluency_numeric)) > 1 and len(set(fluency_2_numeric)) > 1
    else "Undefined (Insufficient Variance)"
)

# Calculate Kendall's Tau for each category
category_taus = {}
for category_name, category_values in category_numeric_values.items():
    if len(set(category_values)) > 1 and len(set(fluency_1_numeric)) > 1:
        category_taus[category_name] = kendalltau(category_values, fluency_1_numeric)[0]
    else:
        category_taus[category_name] = "Undefined (Insufficient Variance)"


# Print detailed analysis for all samples
category_names = ["speech_rate", "articulation_rate", "asd"]
print("----- Detailed Analysis -----")
for i, (analysis, calc_fluency) in enumerate(zip(category_analysis, calculated_fluency_levels)):
    ai_fluency = analysis['AI Fluency']
    categories = analysis['Categories']
    logic = analysis.get('Logic', 'Not Provided')  # Logic value if available
    fluency_1 = actual_fluency_1[i]
    fluency_2 = actual_fluency_2[i]
    final_match = analysis['Final Match']

    print(f"Sample {i + 1}:")
    print(f"AI Fluency: {ai_fluency}")
    print(f"Calculated Fluency: {calc_fluency}")
    print("Categories:")
    for name, value in zip(category_names, categories[:-1]):  # Skip the redundant fluency_level
        if isinstance(value, (int, float)):  # Check if value is numeric
            print(f"  {name}: {value:.3f}")
        else:  # Handle string values
            print(f"  {name}: {value}")
    print(f"Logic: {logic}")  # Print logic value
    print(f"Fluency 1 (Actual): {fluency_1}")
    print(f"Fluency 2 (Actual): {fluency_2}")
    print(f"Final Match with Actual Fluency: {final_match}")
    print("-" * 50)

# Print final statistics
print("----- Final Statistical Summary -----")
print(f"Matches with Fluency 1: {sum(ai == human for ai, human in zip(ai_fluency_judgments, actual_fluency_1))}")
print(f"Matches with Fluency 2: {sum(ai == human for ai, human in zip(ai_fluency_judgments, actual_fluency_2))}")
print(f"Matches with Calculated Fluency: {sum(calc == human for calc, human in zip(calculated_fluency_levels, actual_fluency_1 + actual_fluency_2))}")
print(f"Kendall's Tau with Fluency 1: {tau_fluency_1}")
print(f"Kendall's Tau with Fluency 2: {tau_fluency_2}")
# Print category-wise Kendall's Tau
print("Category-wise Kendall's Tau:")
for category_name, tau_value in category_taus.items():
    print(f"  {category_name}: {tau_value}")
print(f"Total Correct Final Matches with Actual Fluency: {sum(1 for a in category_analysis if a['Final Match'])} / {len(category_analysis)}")
print("----- End of Analysis -----")


----- Detailed Analysis -----
Sample 1:
AI Fluency: intermediate
Calculated Fluency: intermediate
Categories:
  speech_rate: intermediate
  articulation_rate: intermediate
  asd: 0.296
Logic: The speech rate of 2.520 indicates a moderate pace, which aligns with the intermediate level. The articulation rate of 3.380 is also moderate, suggesting some smoothness but with occasional disruptions, placing it in the intermediate category as well. The average syllable duration (ASD) of 0.296 is moderate, indicating some unevenness in timing, which again fits the intermediate classification. Since all three features are categorized as intermediate, the overall fluency level is classified as intermediate.
Fluency 1 (Actual): intermediate
Fluency 2 (Actual): intermediate
Final Match with Actual Fluency: True
--------------------------------------------------
Sample 2:
AI Fluency: beginner
Calculated Fluency: beginner
Categories:
  speech_rate: beginner
  articulation_rate: intermediate
  asd: 0.2

In [16]:
import pandas as pd

# Load the dataset
file_path = 'audio/totrain.csv'
data = pd.read_csv(file_path)

# Drop the 'Fluency' column but keep everything else
columns_to_exclude = ["Fluency"]
filtered_data = data.drop(columns=columns_to_exclude)

# Prepare a list to store rows with GPT-assigned fluency labels
output_data = []

# Iterate through each row and assign new fluency labels
for _, row in filtered_data.iterrows():
    # Normalize selected categories for input to the model
    normalized_categories = {
        "speech_rate": row["speechrate(nsyll/dur)"],
        "articulation_rate": row["articulation_rate(nsyll/phonationtime)"],
        "asd": row["ASD(speakingtime/nsyll)"]
    }

    # Create the input string for the LLM
    line = '\n'.join([
        f"{key}: {value:.3f}"  # Format values to 3 decimal places
        for key, value in normalized_categories.items()
    ])

    # Send input to the AI model
    response = openai.beta.chat.completions.parse(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": f"{prompt}"},
            {"role": "user", "content": f"{line}"}
        ],
        response_format=FluencyJudgment,
        temperature=0
    )

    # Extract GPT-assigned fluency level
    gpt_fluency = response.choices[0].message.parsed.fluency_level

    # Append the new row with all original data and GPT fluency
    new_row = row.to_dict()
    new_row["GPT_Fluency"] = gpt_fluency
    output_data.append(new_row)

# Convert the list of rows into a new DataFrame
output_df = pd.DataFrame(output_data)

# Save the new DataFrame to a CSV file
output_csv_path = 'audio/gpt_judged.csv'
output_df.to_csv(output_csv_path, index=False)

print(f"New fluency labels assigned and saved to {output_csv_path}.")


New fluency labels assigned and saved to audio/gpt_judged.csv.


In [20]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.metrics import classification_report, accuracy_score, f1_score
import numpy as np

# Load the GPT-labeled dataset
file_path = 'audio/gpt_judged.csv'
data = pd.read_csv(file_path)

# Convert all labels in 'GPT_Fluency' to lowercase for consistency
data["GPT_Fluency"] = data["GPT_Fluency"].str.lower()

# Extract features and target
features = ["speechrate(nsyll/dur)", "articulation_rate(nsyll/phonationtime)", "ASD(speakingtime/nsyll)"]
target = "GPT_Fluency"

# Prepare the dataset
X = data[features]
y = data[target]

# Encode target labels (beginner, intermediate, advanced) into numeric values
label_mapping = {"beginner": 0, "intermediate": 1, "advanced": 2}
y = y.map(label_mapping)

# Perform 5-fold cross-validation with accuracy and F1 scoring
clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Perform cross-validation for accuracy and F1 score
cv_results = cross_validate(clf, X, y, cv=5, scoring=['accuracy', 'f1_macro'], return_train_score=True)

# Print cross-validation results
print("Cross-Validation Results:")
print(f"CV Accuracy Scores: {cv_results['test_accuracy']}")
print(f"Mean CV Accuracy: {cv_results['test_accuracy'].mean():.4f}")
print(f"Standard Deviation of CV Accuracy: {cv_results['test_accuracy'].std():.4f}")
print(f"CV F1 Macro Scores: {cv_results['test_f1_macro']}")
print(f"Mean CV F1 Macro: {cv_results['test_f1_macro'].mean():.4f}")
print(f"Standard Deviation of CV F1 Macro: {cv_results['test_f1_macro'].std():.4f}")

# Split the data into training and testing sets for further analysis
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Train the Random Forest Classifier on the training set
clf.fit(X_train, y_train)

# Evaluate the model on the test set
y_pred = clf.predict(X_test)

# Calculate F1 score on test set
f1 = f1_score(y_test, y_pred, average='macro')

print("\nTest Set Results:")
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=label_mapping.keys()))
print(f"Test Set Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Test Set F1 Macro Score: {f1:.4f}")

# Feature importance
print("\nFeature Importances:")
for feature, importance in zip(features, clf.feature_importances_):
    print(f"{feature}: {importance:.4f}")


Cross-Validation Results:
CV Accuracy Scores: [0.9        0.98333333 0.95       0.88333333 0.9       ]
Mean CV Accuracy: 0.9233
Standard Deviation of CV Accuracy: 0.0374
CV F1 Macro Scores: [0.90144078 0.98580327 0.95101423 0.88568026 0.88333333]
Mean CV F1 Macro: 0.9215
Standard Deviation of CV F1 Macro: 0.0404

Test Set Results:
Classification Report:
              precision    recall  f1-score   support

    beginner       1.00      0.91      0.95        11
intermediate       0.94      0.97      0.95        30
    advanced       0.95      0.95      0.95        19

    accuracy                           0.95        60
   macro avg       0.96      0.94      0.95        60
weighted avg       0.95      0.95      0.95        60

Test Set Accuracy: 0.9500
Test Set F1 Macro Score: 0.9502

Feature Importances:
speechrate(nsyll/dur): 0.4149
articulation_rate(nsyll/phonationtime): 0.3106
ASD(speakingtime/nsyll): 0.2745


In [21]:
import pandas as pd

# Load the dataset to make predictions
inference_file_path = 'audio/totrain.csv'  # Path to the dataset for inference
inference_data = pd.read_csv(inference_file_path)

# Extract features for inference
features = ["speechrate(nsyll/dur)", "articulation_rate(nsyll/phonationtime)", "ASD(speakingtime/nsyll)"]
X_inference = inference_data[features]

# Check if all required features are present
missing_features = [feature for feature in features if feature not in inference_data.columns]
if missing_features:
    raise ValueError(f"Missing features in dataset: {missing_features}")

# Use the trained Random Forest model to make predictions
# Note: Ensure the model has been trained before running this step
predicted_labels = clf.predict(X_inference)

# Map numeric predictions back to their respective labels
label_mapping = {0: "beginner", 1: "intermediate", 2: "advanced"}
predicted_fluency = [label_mapping[label] for label in predicted_labels]

# Add predictions to the dataset
inference_data['Predicted_Fluency'] = predicted_fluency

# Save the results to a new CSV file
output_file_path = 'inference_results.csv'
inference_data.to_csv(output_file_path, index=False)

print(f"Inference completed. Results saved to {output_file_path}")


Inference completed. Results saved to inference_results.csv


In [None]:
from datasets import load_dataset

# Load the dataset
dataset = load_dataset("sylviali/EDEN_ASR_Data")

import random

# Filter for English audio samples longer than 8 seconds
filtered_data = [sample for sample in dataset if sample['language'] == 'en' and sample['duration'] > 8]

# Randomly sample 20 entries
random.seed(42)  # For reproducibility
sampled_data = random.sample(filtered_data, 20)

# Assuming 'model' is your trained Random Forest model
predictions = clf.predict(extracted_features)


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.svm import SVC
from xgboost import XGBClassifier

# Load the GPT-labeled dataset
file_path = 'audio/gpt_judged.csv'
data = pd.read_csv(file_path)

# Convert all labels in 'GPT_Fluency' to lowercase for consistency
data["GPT_Fluency"] = data["GPT_Fluency"].str.lower()

# Extract features and target
features = ["speechrate(nsyll/dur)", "articulation_rate(nsyll/phonationtime)", "ASD(speakingtime/nsyll)"]
target = "GPT_Fluency"

# Prepare the dataset
X = data[features]
y = data[target]

# Encode target labels (beginner, intermediate, advanced) into numeric values
label_mapping = {"beginner": 0, "intermediate": 1, "advanced": 2}
y = y.map(label_mapping)

# Define models to train
models = {
    "SVM": SVC(kernel='linear', random_state=42, probability=True),
    "XGBoost": XGBClassifier(n_estimators=100, use_label_encoder=False, eval_metric='logloss', random_state=42)
}

# Iterate through models
for model_name, model in models.items():
    print(f"\nTraining and Evaluating {model_name} Model")

    # Perform 5-fold cross-validation
    cv_results = cross_validate(model, X, y, cv=5, scoring=['accuracy', 'f1_macro'], return_train_score=True)

    # Print cross-validation results
    print("Cross-Validation Results:")
    print(f"CV Accuracy Scores: {cv_results['test_accuracy']}")
    print(f"Mean CV Accuracy: {cv_results['test_accuracy'].mean():.4f}")
    print(f"Standard Deviation of CV Accuracy: {cv_results['test_accuracy'].std():.4f}")
    print(f"CV F1 Macro Scores: {cv_results['test_f1_macro']}")
    print(f"Mean CV F1 Macro: {cv_results['test_f1_macro'].mean():.4f}")
    print(f"Standard Deviation of CV F1 Macro: {cv_results['test_f1_macro'].std():.4f}")

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Train the model on the training set
    model.fit(X_train, y_train)

    # Evaluate the model on the test set
    y_pred = model.predict(X_test)

    # Calculate F1 score on test set
    f1 = f1_score(y_test, y_pred, average='macro')

    print("\nTest Set Results:")
    print("Classification Report:")
    print(classification_report(y_test, y_pred, target_names=label_mapping.keys()))
    print(f"Test Set Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(f"Test Set F1 Macro Score: {f1:.4f}")

    # Feature importance (for XGBoost only)
    if model_name == "XGBoost":
        print("\nFeature Importances:")
        for feature, importance in zip(features, model.feature_importances_):
            print(f"{feature}: {importance:.4f}")



Training and Evaluating SVM Model
Cross-Validation Results:
CV Accuracy Scores: [0.86666667 0.93333333 0.93333333 0.91666667 0.85      ]
Mean CV Accuracy: 0.9000
Standard Deviation of CV Accuracy: 0.0350
CV F1 Macro Scores: [0.86583179 0.93559458 0.92967307 0.92171686 0.80299071]
Mean CV F1 Macro: 0.8912
Standard Deviation of CV F1 Macro: 0.0506

Test Set Results:
Classification Report:
              precision    recall  f1-score   support

    beginner       1.00      0.73      0.84        11
intermediate       0.91      0.97      0.94        30
    advanced       0.95      1.00      0.97        19

    accuracy                           0.93        60
   macro avg       0.95      0.90      0.92        60
weighted avg       0.94      0.93      0.93        60

Test Set Accuracy: 0.9333
Test Set F1 Macro Score: 0.9173

Training and Evaluating XGBoost Model


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Cross-Validation Results:
CV Accuracy Scores: [0.88333333 0.96666667 0.98333333 0.88333333 0.91666667]
Mean CV Accuracy: 0.9267
Standard Deviation of CV Accuracy: 0.0416
CV F1 Macro Scores: [0.88401471 0.97183908 0.98580327 0.8794996  0.90337935]
Mean CV F1 Macro: 0.9249
Standard Deviation of CV F1 Macro: 0.0450

Test Set Results:
Classification Report:
              precision    recall  f1-score   support

    beginner       0.91      0.91      0.91        11
intermediate       0.97      0.93      0.95        30
    advanced       0.95      1.00      0.97        19

    accuracy                           0.95        60
   macro avg       0.94      0.95      0.94        60
weighted avg       0.95      0.95      0.95        60

Test Set Accuracy: 0.9500
Test Set F1 Macro Score: 0.9442

Feature Importances:
speechrate(nsyll/dur): 0.2694
articulation_rate(nsyll/phonationtime): 0.4380
ASD(speakingtime/nsyll): 0.2927


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



: 