In [40]:
import os
import glob
import numpy as np
import pandas as pd
import opensmile
import textgrid
from pydub import AudioSegment

def extract_base_filename(filename):
    """Extract the base filename without the extension and "_interview_1" suffix"""
    base = os.path.basename(filename)
    base = os.path.splitext(base)[0]  # Remove extension
    
    # Remove "_interview_1" suffix
    if "_interview_1" in base:
        base = base.replace("_interview_1", "")
    
    return base

def find_matching_textgrid(wav_filename, textgrid_files):
    """Find the matching TextGrid file for a wav file"""
    base_wav = extract_base_filename(wav_filename)
    
    for tg_file in textgrid_files:
        base_tg = extract_base_filename(tg_file)
        if base_wav == base_tg:
            return tg_file
    
    return None

def extract_ius_from_textgrid(textgrid_path):
    """Extract Intonational Units from a TextGrid file"""
    try:
        tg = textgrid.TextGrid.fromFile(textgrid_path)
        
        # Find the tier containing IUs
        iu_tier = None
        for tier in tg:
            # Check if this tier contains IU annotations
            # You might need to adjust this based on your specific TextGrid structure
            if "IU" in tier.name:
                iu_tier = tier
                break
        
        # If no IU tier was found, try to find any tier that might contain IUs
        if iu_tier is None:
            for tier in tg:
                # Look for any tier that might contain text annotations
                if len(tier) > 0:
                    iu_tier = tier
                    break
        
        if iu_tier is None:
            print(f"Warning: No suitable tier found in {textgrid_path}")
            return []
        
        ius = []
        for interval in iu_tier:
            # Only consider intervals with text (non-empty)
            if interval.mark.strip():
                ius.append({
                    'start': interval.minTime,
                    'end': interval.maxTime,
                    'text': interval.mark
                })
        
        return ius
    
    except Exception as e:
        print(f"Error processing TextGrid file {textgrid_path}: {e}")
        return []

def extract_audio_segment(wav_path, start_time, end_time, temp_path="temp_segment.wav"):
    """Extract a segment from a wav file and save it to a temporary file"""
    try:
        audio = AudioSegment.from_wav(wav_path)
        
        # Convert times from seconds to milliseconds
        start_ms = int(start_time * 1000)
        end_ms = int(end_time * 1000)
        
        # Extract segment
        segment = audio[start_ms:end_ms]
        
        # Export to temporary file
        segment.export(temp_path, format="wav")
        
        return temp_path
    
    except Exception as e:
        print(f"Error extracting audio segment from {wav_path}: {e}")
        return None

def process_file_with_opensmile(wav_path, textgrid_path, output_dir):
    """Process a wav file and its TextGrid with OpenSmile"""
    print(f"Processing {wav_path} with {textgrid_path}")
    
    # Extract IUs from TextGrid
    ius = extract_ius_from_textgrid(textgrid_path)
    
    if not ius:
        print(f"No IUs found in {textgrid_path}")
        return
    
    # Initialize OpenSmile
    # Using ComParE_2016 feature set as an example - you can change this based on your needs
    smile = opensmile.Smile(
        feature_set=opensmile.FeatureSet.ComParE_2016,
        feature_level=opensmile.FeatureLevel.Functionals,
    )
    
    # Create base filename for output and for the filename column
    base_filename = extract_base_filename(wav_path)
    
    # Create dataframe to store all IU features
    all_features = []
    
    # Process each IU
    for i, iu in enumerate(ius):
        try:
            # Extract audio segment for this IU
            temp_path = extract_audio_segment(wav_path, iu['start'], iu['end'])
            
            if temp_path:
                # Extract features using OpenSmile
                features = smile.process_file(temp_path)
                
                # Add metadata
                features['filename'] = base_filename  # Add the filename to each row
                features['iu_index'] = i
                features['iu_start'] = iu['start']
                features['iu_end'] = iu['end']
                features['iu_text'] = iu['text']
                
                all_features.append(features)
                
                # Remove temporary file
                os.remove(temp_path)
        
        except Exception as e:
            print(f"Error processing IU {i} in {wav_path}: {e}")
    
    if all_features:
        # Combine all features into a single dataframe
        result_df = pd.concat(all_features, ignore_index=True)
        
        # Create output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)
        
        # Save features to CSV
        output_path = os.path.join(output_dir, f"{base_filename}_features.csv")
        result_df.to_csv(output_path, index=False)
        
        print(f"Saved features for {len(all_features)} IUs to {output_path}")
        return result_df
    else:
        print(f"No features extracted for {wav_path}")
        return None

def main():
    # Set up paths
    wav_dir = "data/wav"
    textgrid_dir = "data/textgrid"
    output_dir = "output_features"
    
    # Get all wav and TextGrid files
    wav_files = glob.glob(os.path.join(wav_dir, "*.wav"))
    textgrid_files = glob.glob(os.path.join(textgrid_dir, "*.TextGrid"))
    
    print(f"Found {len(wav_files)} WAV files and {len(textgrid_files)} TextGrid files")
    
    # List to store all dataframes for potential combined output
    all_dfs = []
    
    # Process each wav file
    for wav_file in wav_files:
        # Find matching TextGrid file
        textgrid_file = find_matching_textgrid(wav_file, textgrid_files)
        
        if textgrid_file:
            df = process_file_with_opensmile(wav_file, textgrid_file, output_dir)
            if df is not None:
                all_dfs.append(df)
        else:
            print(f"No matching TextGrid file found for {wav_file}")
    
    # Optionally combine all features into a single file
    if all_dfs:
        combined_df = pd.concat(all_dfs, ignore_index=True)
        combined_output_path = os.path.join(output_dir, "all_features_combined.csv")
        combined_df.to_csv(combined_output_path, index=False)
        print(f"Saved combined features for all files to {combined_output_path}")

if __name__ == "__main__":
    main()


Found 30 WAV files and 34 TextGrid files
Processing data/wav/VF20B_English_I2_20181203_interview_1.wav with data/textgrid/VF20B_English_I2_20181203.TextGrid


  result_df = pd.concat(all_features, ignore_index=True)


Saved features for 603 IUs to output_features/VF20B_English_I2_20181203_features.csv
No matching TextGrid file found for data/wav/VF19B_Cantonese_I2_20190213_interview_1.wav
No matching TextGrid file found for data/wav/VF23B_Cantonese_I2_20190121_interview_1.wav
Processing data/wav/VF19A_English_I1_20181114_interview_1.wav with data/textgrid/VF19A_English_I1_20181114.TextGrid
Saved features for 676 IUs to output_features/VF19A_English_I1_20181114_features.csv
Processing data/wav/VF22A_English_I2_20181206_interview_1.wav with data/textgrid/VF22A_English_I2_20181206.TextGrid
Saved features for 793 IUs to output_features/VF22A_English_I2_20181206_features.csv
No matching TextGrid file found for data/wav/VF21D_Cantonese_I2_20190306_interview_1.wav
No matching TextGrid file found for data/wav/VF22A_Cantonese_I1_20181206_interview_1.wav
Processing data/wav/VF21A_English_I1_20190130_interview_1.wav with data/textgrid/VF21A_English_I1_20190130.TextGrid


  result_df = pd.concat(all_features, ignore_index=True)


Saved features for 728 IUs to output_features/VF21A_English_I1_20190130_features.csv
No matching TextGrid file found for data/wav/VF20B_Cantonese_I1_20181203_interview_1.wav
No matching TextGrid file found for data/wav/VF19D_Cantonese_I1_20190308_interview_1.wav
No matching TextGrid file found for data/wav/VF21B_Cantonese_I1_20190204_interview_1.wav
No matching TextGrid file found for data/wav/VF27A_Cantonese_I2_20181120_interview_1.wav
No matching TextGrid file found for data/wav/VF19A_Cantonese_I2_20181114_interview_1.wav
No matching TextGrid file found for data/wav/VF20A_Cantonese_I1_20181119_interview_1.wav
No matching TextGrid file found for data/wav/VF26A_Cantonese_I1_20190303_interview_1.wav
Processing data/wav/VF23B_English_I1_20190121_interview_1.wav with data/textgrid/VF23B_English_I1_20190121.TextGrid


  result_df = pd.concat(all_features, ignore_index=True)


Saved features for 656 IUs to output_features/VF23B_English_I1_20190121_features.csv
Processing data/wav/VF19B_English_I1_20190213_interview_1.wav with data/textgrid/VF19B_English_I1_20190213.TextGrid
Saved features for 786 IUs to output_features/VF19B_English_I1_20190213_features.csv
Processing data/wav/VF19D_English_I2_20190308_interview_1.wav with data/textgrid/VF19D_English_I2_20190308.TextGrid


  result_df = pd.concat(all_features, ignore_index=True)


Saved features for 905 IUs to output_features/VF19D_English_I2_20190308_features.csv
Processing data/wav/VF21C_English_I2_20190211_interview_1.wav with data/textgrid/VF21C_English_I2_20190211.TextGrid
Saved features for 715 IUs to output_features/VF21C_English_I2_20190211_features.csv
No matching TextGrid file found for data/wav/VF19C_Cantonese_I2_20190224_interview_1.wav
Processing data/wav/VF27A_English_I1_20181120_interview_1.wav with data/textgrid/VF27A_English_I1_20181120.TextGrid
Saved features for 668 IUs to output_features/VF27A_English_I1_20181120_features.csv
No matching TextGrid file found for data/wav/VF21C_Cantonese_I1_20190211_interview_1.wav
Processing data/wav/VF19C_English_I1_20190224_interview_1.wav with data/textgrid/VF19C_English_I1_20190224.TextGrid


  result_df = pd.concat(all_features, ignore_index=True)


Saved features for 709 IUs to output_features/VF19C_English_I1_20190224_features.csv
Processing data/wav/VF26A_English_I2_20190303_interview_1.wav with data/textgrid/VF26A_English_I2_20190303.TextGrid
Saved features for 634 IUs to output_features/VF26A_English_I2_20190303_features.csv
Processing data/wav/VF23C_English_I2_20190128_interview_1.wav with data/textgrid/VF23C_English_I2_20190128.TextGrid


  result_df = pd.concat(all_features, ignore_index=True)


Saved features for 660 IUs to output_features/VF23C_English_I2_20190128_features.csv
No matching TextGrid file found for data/wav/VF23C_Cantonese_I1_20190128_interview_1.wav
Processing data/wav/VF20A_English_I2_20181119_interview_1.wav with data/textgrid/VF20A_English_I2_20181119.TextGrid


  result_df = pd.concat(all_features, ignore_index=True)


Saved features for 722 IUs to output_features/VF20A_English_I2_20181119_features.csv
No matching TextGrid file found for data/wav/VF21A_Cantonese_I2_20190130_interview_1.wav
Processing data/wav/VF21D_English_I1_20190306_interview_1.wav with data/textgrid/VF21D_English_I1_20190306.TextGrid


  result_df = pd.concat(all_features, ignore_index=True)


Saved features for 505 IUs to output_features/VF21D_English_I1_20190306_features.csv
Processing data/wav/VF21B_English_I2_20190204_interview_1.wav with data/textgrid/VF21B_English_I2_20190204.TextGrid


  result_df = pd.concat(all_features, ignore_index=True)


Saved features for 494 IUs to output_features/VF21B_English_I2_20190204_features.csv
Saved combined features for all files to output_features/all_features_combined.csv


In [42]:
import pandas as pd
import os

# Path to your output file
output_file = "output_features/all_features_combined.csv"

# Check if the file exists
if os.path.exists(output_file):
    # Read the CSV file
    features_df = pd.read_csv(output_file)
    
    # Display basic information about the dataframe
    print(f"Shape of the dataframe: {features_df.shape}")
    
    # Display the first few rows
    print("\nFirst 5 rows:")
    display(features_df.head())
    
    # Display column names
    print("\nColumn names:")
    print(features_df.columns.tolist())
    
    # Display descriptive statistics for numerical columns
    print("\nDescriptive statistics:")
    display(features_df.describe())
    
    # If you want to see data for a specific IU
    print("\nData for first IU:")
    display(features_df[features_df['iu_index'] == 0])
else:
    print(f"File not found: {output_file}")

Shape of the dataframe: (10254, 6378)

First 5 rows:


Unnamed: 0,audspec_lengthL1norm_sma_range,audspec_lengthL1norm_sma_maxPos,audspec_lengthL1norm_sma_minPos,audspec_lengthL1norm_sma_quartile1,audspec_lengthL1norm_sma_quartile2,audspec_lengthL1norm_sma_quartile3,audspec_lengthL1norm_sma_iqr1-2,audspec_lengthL1norm_sma_iqr2-3,audspec_lengthL1norm_sma_iqr1-3,audspec_lengthL1norm_sma_percentile1.0,...,mfcc_sma_de[14]_minRangeRel,mfcc_sma_de[14]_meanRisingSlope,mfcc_sma_de[14]_stddevRisingSlope,mfcc_sma_de[14]_meanFallingSlope,mfcc_sma_de[14]_stddevFallingSlope,filename,iu_index,iu_start,iu_end,iu_text
0,0.523388,0.654867,0.504425,0.106059,0.281874,0.408843,0.175815,0.126968,0.302784,0.090699,...,0.511749,72.081406,32.93193,74.554245,31.685667,VF20B_English_I2_20181203,0,9.28314,10.48314,i'm born in hong
1,0.447705,0.351351,0.027027,0.248553,0.347046,0.518599,0.098493,0.171554,0.270047,0.126326,...,0.41845,88.78105,36.390686,64.31477,54.586514,VF20B_English_I2_20181203,1,10.48314,10.92314,kong
2,0.51259,0.035088,0.671053,0.095562,0.114631,0.281715,0.019069,0.167084,0.186153,0.087001,...,0.514859,69.93348,27.784573,68.39632,35.42568,VF20B_English_I2_20181203,2,11.54314,13.89314,and i basically um
3,0.912861,0.084249,0.538462,0.245829,0.328931,0.520171,0.083102,0.191239,0.274342,0.109087,...,0.572198,98.96552,55.807526,99.25561,47.531578,VF20B_English_I2_20181203,3,14.22314,17.02314,spent my first eighteen years of my life in ho...
4,0.595579,0.415865,0.182692,0.11738,0.254393,0.388325,0.137013,0.133931,0.270944,0.091044,...,0.67223,77.2293,38.531116,67.34517,33.552578,VF20B_English_I2_20181203,4,18.07214,22.30214,and then i uh came to canada just for um ubc



Column names:
['audspec_lengthL1norm_sma_range', 'audspec_lengthL1norm_sma_maxPos', 'audspec_lengthL1norm_sma_minPos', 'audspec_lengthL1norm_sma_quartile1', 'audspec_lengthL1norm_sma_quartile2', 'audspec_lengthL1norm_sma_quartile3', 'audspec_lengthL1norm_sma_iqr1-2', 'audspec_lengthL1norm_sma_iqr2-3', 'audspec_lengthL1norm_sma_iqr1-3', 'audspec_lengthL1norm_sma_percentile1.0', 'audspec_lengthL1norm_sma_percentile99.0', 'audspec_lengthL1norm_sma_pctlrange0-1', 'audspec_lengthL1norm_sma_stddev', 'audspec_lengthL1norm_sma_skewness', 'audspec_lengthL1norm_sma_kurtosis', 'audspec_lengthL1norm_sma_meanSegLen', 'audspec_lengthL1norm_sma_maxSegLen', 'audspec_lengthL1norm_sma_minSegLen', 'audspec_lengthL1norm_sma_segLenStddev', 'audspec_lengthL1norm_sma_upleveltime25', 'audspec_lengthL1norm_sma_upleveltime50', 'audspec_lengthL1norm_sma_upleveltime75', 'audspec_lengthL1norm_sma_upleveltime90', 'audspec_lengthL1norm_sma_risetime', 'audspec_lengthL1norm_sma_leftctime', 'audspec_lengthL1norm_sma_l

Unnamed: 0,audspec_lengthL1norm_sma_range,audspec_lengthL1norm_sma_maxPos,audspec_lengthL1norm_sma_minPos,audspec_lengthL1norm_sma_quartile1,audspec_lengthL1norm_sma_quartile2,audspec_lengthL1norm_sma_quartile3,audspec_lengthL1norm_sma_iqr1-2,audspec_lengthL1norm_sma_iqr2-3,audspec_lengthL1norm_sma_iqr1-3,audspec_lengthL1norm_sma_percentile1.0,...,mfcc_sma_de[14]_peakMeanMeanDist,mfcc_sma_de[14]_peakMeanRel,mfcc_sma_de[14]_minRangeRel,mfcc_sma_de[14]_meanRisingSlope,mfcc_sma_de[14]_stddevRisingSlope,mfcc_sma_de[14]_meanFallingSlope,mfcc_sma_de[14]_stddevFallingSlope,iu_index,iu_start,iu_end
count,10237.0,10237.0,10237.0,10237.0,10237.0,10237.0,10237.0,10237.0,10237.0,10237.0,...,10237.0,10237.0,10237.0,10237.0,10237.0,10237.0,10237.0,10254.0,10254.0,10254.0
mean,1.143384,0.422748,0.304915,0.290724,0.510185,0.765156,0.219461,0.254971,0.474432,0.101181,...,2.140473,-2.852712,0.449584,92.450828,41.409687,80.663194,44.826476,348.856641,805.80834,807.214359
std,0.68548,0.294186,0.315102,0.222418,0.334763,0.441106,0.188164,0.19873,0.325785,0.120582,...,0.881748,15.01435,0.184897,31.736506,21.057525,36.935473,21.814229,209.911251,454.448278,454.418043
min,0.0,0.0,0.0,0.001034,0.029718,0.030568,0.0,0.0,0.0,0.001034,...,-3.423907,-20.0,0.0,-67.69166,0.0,-334.92386,0.0,0.0,3.96632,4.22632
25%,0.676355,0.160221,0.0,0.15199,0.29802,0.474656,0.10486,0.12601,0.263758,0.06129,...,1.665545,-17.107994,0.37978,75.645905,29.34556,65.144104,33.058002,170.0,414.617037,416.316047
50%,1.012138,0.373494,0.215686,0.238874,0.433336,0.665829,0.172569,0.207033,0.400237,0.074259,...,2.069646,-6.594078,0.485983,91.16285,41.96102,83.854774,43.9194,341.0,807.6387,808.76083
75%,1.471065,0.666667,0.555556,0.360845,0.616993,0.935301,0.275112,0.328601,0.603825,0.097738,...,2.547116,14.425669,0.569231,108.18604,53.06935,100.17921,55.24068,514.0,1191.493315,1193.250157
max,6.706102,0.998462,0.997245,3.533777,3.817446,4.111929,2.362222,2.276336,3.552917,3.272013,...,11.601358,20.0,0.893552,340.93546,231.42694,318.10312,270.47122,904.0,1832.47139,1834.50075



Data for first IU:


Unnamed: 0,audspec_lengthL1norm_sma_range,audspec_lengthL1norm_sma_maxPos,audspec_lengthL1norm_sma_minPos,audspec_lengthL1norm_sma_quartile1,audspec_lengthL1norm_sma_quartile2,audspec_lengthL1norm_sma_quartile3,audspec_lengthL1norm_sma_iqr1-2,audspec_lengthL1norm_sma_iqr2-3,audspec_lengthL1norm_sma_iqr1-3,audspec_lengthL1norm_sma_percentile1.0,...,mfcc_sma_de[14]_minRangeRel,mfcc_sma_de[14]_meanRisingSlope,mfcc_sma_de[14]_stddevRisingSlope,mfcc_sma_de[14]_meanFallingSlope,mfcc_sma_de[14]_stddevFallingSlope,filename,iu_index,iu_start,iu_end,iu_text
0,0.523388,0.654867,0.504425,0.106059,0.281874,0.408843,0.175815,0.126968,0.302784,0.090699,...,0.511749,72.081406,32.93193,74.554245,31.685667,VF20B_English_I2_20181203,0,9.28314,10.48314,i'm born in hong
603,0.552892,0.605634,0.507042,0.13112,0.25458,0.402233,0.12346,0.147653,0.271113,0.053189,...,0.512881,93.98474,56.653587,65.43952,43.070736,VF19A_English_I1_20181114,0,12.12539,13.61539,i was born in vancouver
1279,1.070949,0.541667,0.0,0.282995,1.053799,1.121724,0.770804,0.067925,0.838729,0.09793,...,0.264905,44.208706,16.10371,24.350397,40.83968,VF22A_English_I2_20181206,0,13.43861,13.74861,um
2072,1.027482,0.196078,0.539216,0.229068,0.346431,0.89875,0.117363,0.55232,0.669682,0.111271,...,0.583918,65.44371,27.509354,67.42971,36.2442,VF21A_English_I1_20190130,0,10.20169,11.29169,so um
2800,1.305511,0.916667,0.0,0.115186,0.226287,0.966716,0.111101,0.740429,0.851529,0.091265,...,0.0,172.13202,0.0,-39.934025,86.28457,VF23B_English_I1_20190121,0,12.77234,12.96234,yeah
3456,0.366689,0.4,0.8,0.170355,0.279348,0.40355,0.108993,0.124202,0.233195,0.088707,...,0.0,163.89369,0.0,-94.62935,139.81934,VF19B_English_I1_20190213,0,9.06479,9.28479,okay
4242,0.199141,0.466667,0.0,0.100383,0.172411,0.205814,0.072029,0.033403,0.105431,0.034398,...,0.0,109.13168,0.0,19.49836,77.28618,VF19D_English_I2_20190308,0,8.14217,8.36217,mhmm
5147,1.758291,0.096386,0.012048,0.371939,0.475803,0.804107,0.103864,0.328304,0.432168,0.082664,...,0.817092,71.028915,37.120556,70.19013,56.56223,VF21C_English_I2_20190211,0,10.37489,11.27489,um
5862,0.343914,0.157895,0.421053,0.226603,0.283171,0.345779,0.056568,0.062608,0.119177,0.130888,...,0.0,200.17337,75.90952,148.62727,0.0,VF27A_English_I1_20181120,0,3.96632,4.22632,okay
6530,0.520144,0.611111,0.333333,0.132965,0.309773,0.481834,0.176808,0.172061,0.348869,0.100051,...,0.393365,102.67868,32.786255,88.216415,56.592186,VF19C_English_I1_20190224,0,4.00743,4.43743,mmm
