# Notebook to generate word onsets from the sentences

First, download all the TextGrids. This uses the dataframe that holds the text of every sentence, together with the audio file linked to each sentence.

To get a TextGrid, you need to provide both the sentence text and the audio file.

## Setting up the necessary files for the API calls

In [None]:
# Get all the sentences
from pathlib import Path
import mne
import pandas as pd
import numpy as np

# Root folder of the dataset; every other path in this notebook is built from it.
data_path = Path('/home/co/data/MindSentences/le_240431/241210')

# CSV file with one row per sentence.
dataset_file = data_path.joinpath('final_dataset.csv')

# Sentence table used by all following cells.
df = pd.read_csv(dataset_file)

In [None]:
# First: move every .wav file of the audio folder into a sibling folder called
# audio_mp3, giving it an .mp3 extension. `Path.rename` only renames the file;
# its content is not converted.

def as_mp3_name(filename):
    """Return `filename` with a trailing '.wav' extension swapped for '.mp3'.

    Only the extension is touched: a '.wav' occurring elsewhere in the name is
    kept, and names that do not end in '.wav' are returned unchanged.
    """
    if filename.endswith('.wav'):
        return filename[:-len('.wav')] + '.mp3'
    return filename


audio_path = data_path / 'audio'
audio_mp3_path = data_path / 'audio_mp3'
audio_mp3_path.mkdir(exist_ok=True)

for audio_file in audio_path.glob('*.wav'):
    audio_file.rename(audio_mp3_path / as_mp3_name(audio_file.name))

# Second: create a new column in the dataframe with the path to the audio file.
# The same helper is used as for the move above, so the column always points at
# the name the file was actually given.
df['audio_path'] = df['audio_filename'].apply(
    lambda filename: str(audio_mp3_path / as_mp3_name(filename))
)
df

Unnamed: 0,tense,dataset,theme,num_words,structure,audio_filename,sentence_id,numerosity,sentence,audio_path
0,future,naturalistic,emotion,7,simple,nat_00282.wav,nat_00282,plural,The kids will find joy in games,/home/co/data/MindSentences/le_240431/241210/a...
1,future,naturalistic,relationship,11,preposition,nat_02008.wav,nat_02008,singular,Will your cousin not be at the museum with us ...,/home/co/data/MindSentences/le_240431/241210/a...
2,present,naturalistic,food,5,simple,nat_01713.wav,nat_01713,singular,The bread isn't baked yet,/home/co/data/MindSentences/le_240431/241210/a...
3,present,naturalistic,humanity,9,preposition,nat_01666.wav,nat_01666,singular,Study the painting with a guide from the gallery,/home/co/data/MindSentences/le_240431/241210/a...
4,past,naturalistic,health,7,independent,nat_02066.wav,nat_02066,singular,I wasn't aware there was therapy available,/home/co/data/MindSentences/le_240431/241210/a...
...,...,...,...,...,...,...,...,...,...,...
3275,past,controlled,transport,10,standard_object_c,ctrl_01195.wav,ctrl_01195,singular,The enthusiastic child that everyone observed ...,/home/co/data/MindSentences/le_240431/241210/a...
3276,past,controlled,transport,13,nested,ctrl_01196.wav,ctrl_01196,singular,The child who told a story that entertained th...,/home/co/data/MindSentences/le_240431/241210/a...
3277,past,controlled,transport,14,nested_a,ctrl_01197.wav,ctrl_01197,singular,The child who narrated a story that entertaine...,/home/co/data/MindSentences/le_240431/241210/a...
3278,past,controlled,transport,14,nested_b,ctrl_01198.wav,ctrl_01198,singular,The eager child who told a story that amused t...,/home/co/data/MindSentences/le_240431/241210/a...


## Code to download all the textgrids

Run this code beforehand to fetch all the TextGrids directly through the WebMAUS API, instead of requesting them by hand.

In [None]:
import requests
from pathlib import Path
from pydub import AudioSegment
import tempfile
import os
import re

# For each sentence, send it to MAUS and get the alignment for each word. 
# The final goal is to have a dataframe with for each sentence: its word starts and durations

import xml.etree.ElementTree as ET

def download_textgrid(download_link):
    """Fetch the TextGrid behind `download_link` and return it as text.

    Parameters
    ----------
    download_link : str
        URL of the result file.

    Returns
    -------
    str
        The response body decoded by `requests` (``response.text``).

    Raises
    ------
    Exception
        Any error raised by the request, including HTTP error statuses
        (via ``raise_for_status``) and timeouts. The error is printed and then
        re-raised unchanged so the caller can record it.
    """
    try:
        # Same 30 s limit as the upload request, so a stalled download cannot
        # block the whole batch indefinitely.
        response = requests.get(download_link, timeout=30)
        response.raise_for_status()
        return response.text
    except Exception as e:
        print(f"Error downloading TextGrid: {str(e)}")
        raise

def parse_textgrid(textgrid_content):
    """Extract every interval found in a TextGrid text.

    Intervals are matched across the whole text, whichever tier they belong to,
    and empty labels are kept.

    NOTE: a later cell of this notebook defines another `parse_textgrid` that
    returns a DataFrame; whichever cell ran last decides which one is in use.

    Parameters
    ----------
    textgrid_content : str
        TextGrid text with newline-separated ``xmin``/``xmax``/``text`` lines.

    Returns
    -------
    list of dict
        One ``{'start': float, 'end': float, 'text': str}`` per interval, in
        order of appearance.
    """
    interval_pattern = r'intervals \[\d+\]:\n\s*xmin = ([0-9.]+)\n\s*xmax = ([0-9.]+)\n\s*text = "(.*?)"'
    return [
        {'start': float(xmin), 'end': float(xmax), 'text': label}
        for xmin, xmax, label in re.findall(interval_pattern, textgrid_content, re.DOTALL)
    ]

def test_webmaus_call(original_path, text):
    """Send one audio file and its transcript to the WebMAUS Basic service.

    Parameters
    ----------
    original_path : str or Path
        Audio file uploaded as the ``SIGNAL`` form field.
    text : str
        Sentence text uploaded as the ``TEXT`` form field.

    Returns
    -------
    str
        The ``downloadLink`` of the XML reply when there is one. Otherwise the
        decoded response body is returned as-is (fallback kept from the
        original implementation).

    Raises
    ------
    Exception
        If the XML reply reports ``success`` as ``false``; the message carries
        the full reply. Errors from opening the audio file, from the request
        or from XML parsing propagate unchanged.
    """
    url = "https://clarin.phonetik.uni-muenchen.de/BASWebServices/services/runMAUSBasic"

    # The replace call previously here swapped an apostrophe for the identical
    # character and therefore did nothing. Presumably it was meant to normalise
    # typographic apostrophes, which is what is done now - TODO confirm.
    cleaned_text = text.replace("\u2019", "'").replace("\u2018", "'")

    params = {
        'LANGUAGE': 'eng-US',
        'OUTFORMAT': 'TextGrid'
    }

    # The `with` block closes the audio file even when the request fails.
    with open(original_path, 'rb') as signal_file:
        files = {
            'SIGNAL': signal_file,
            # Upload the transcript from memory under the same file name as
            # before. This avoids a shared 'temp.txt' in the working directory
            # and makes the encoding explicit instead of locale-dependent.
            'TEXT': ('temp.txt', cleaned_text.encode('utf-8')),
        }
        response = requests.post(url, files=files, data=params, timeout=30)

    if response.content.startswith(b'<'):
        root = ET.fromstring(response.content)
        # findtext returns None for a missing element instead of raising.
        if root.findtext('success') == 'false':
            error_msg = response.content.decode()
            raise Exception(f"WebMAUS processing failed: {error_msg}")

        download_link = root.findtext('downloadLink')
        if download_link:
            return download_link

    return response.content.decode()

# Process each sentence in the dataframe: request the alignment, download the
# resulting TextGrid, parse it, and keep a copy on disk.
results = []

# Test:
df_test = df.head(2)
for index, row in df.iterrows():
# for index, row in df_test.iterrows():
    # Loop-local names are kept distinct from `audio_path`, which an earlier
    # cell uses for the audio folder, so this loop does not overwrite it.
    sentence_audio = Path(row['audio_path'])
    sentence_text = row['sentence']

    try:
        result_link = test_webmaus_call(sentence_audio, sentence_text)
        if result_link:
            textgrid_content = download_textgrid(result_link)
            intervals = parse_textgrid(textgrid_content)
            results.append({
                'sentence_id': row['sentence_id'],
                'intervals': intervals
            })
            # Save the textgrid content to a txt file. UTF-8 is explicit because
            # the cleaning step reads these files back as UTF-8.
            with open(f"textgrid_{row['sentence_id']}.txt", "w", encoding="utf-8") as f:
                f.write(textgrid_content)
    except Exception as e:
        # One failing sentence must not stop the batch: report it and go on.
        print(f"Error processing sentence {row['sentence_id']}: {str(e)}")
        # Save the error message to a txt file
        with open(f"error_{row['sentence_id']}.txt", "w", encoding="utf-8") as f:
            f.write(str(e))

# Create a new dataframe with the results
df_intervals = pd.DataFrame(results)
df_intervals

Unnamed: 0,sentence_id,intervals
0,nat_00282,"[{'start': 0.0, 'end': 0.05, 'text': ''}, {'st..."
1,nat_02008,"[{'start': 0.0, 'end': 0.07, 'text': ''}, {'st..."
2,nat_01713,"[{'start': 0.0, 'end': 0.03, 'text': ''}, {'st..."
3,nat_01666,"[{'start': 0.0, 'end': 0.14, 'text': ''}, {'st..."
4,nat_02066,"[{'start': 0.0, 'end': 0.06, 'text': ''}, {'st..."
...,...,...
3275,ctrl_01195,"[{'start': 0.0, 'end': 0.05, 'text': ''}, {'st..."
3276,ctrl_01196,"[{'start': 0.0, 'end': 0.03, 'text': ''}, {'st..."
3277,ctrl_01197,"[{'start': 0.0, 'end': 0.05, 'text': ''}, {'st..."
3278,ctrl_01198,"[{'start': 0.0, 'end': 0.05, 'text': ''}, {'st..."


## Clean the textgrids

In [None]:
import os
import shutil

# Create necessary directories
os.makedirs("original_textgrids", exist_ok=True)
os.makedirs("cleaned_textgrids", exist_ok=True)


def clean_ort_mau_tier(content):
    """Build a TextGrid text that keeps only the word intervals of the ORT-MAU tier.

    An interval is kept when its label is non-empty and does not start with
    '<'. Kept intervals retain their original lines, so their index numbers
    are not renumbered. The header lines found before the first 'item [' line
    are copied unchanged.

    Parameters
    ----------
    content : str
        Full text of a TextGrid file.

    Returns
    -------
    str or None
        The cleaned text, or None when `content` has no tier named "ORT-MAU".
    """
    # One chunk per 'item [' marker; the first chunk holding the tier name wins.
    tier_chunks = content.split('item [')[1:]
    ort_mau = next((chunk for chunk in tier_chunks if '"ORT-MAU"' in chunk), None)
    if not ort_mau:
        return None

    # Header = everything before the first line that mentions 'item ['.
    header_lines = []
    for line in content.split('\n'):
        if 'item [' in line:
            break
        header_lines.append(line)

    cleaned_intervals = []
    kept_count = 0
    current_interval = None
    for line in ort_mau.split('\n'):
        if 'intervals [' in line and not line.endswith('size'):
            current_interval = [line]
        elif current_interval is not None:
            current_interval.append(line)
            if 'text =' in line:
                # maxsplit=1 keeps labels that themselves contain '='.
                label = line.split('=', 1)[1].strip().strip('"')
                if label and not label.startswith('<'):
                    cleaned_intervals.extend(current_interval)
                    # Counted directly rather than derived from the number of
                    # collected lines.
                    kept_count += 1
                current_interval = None

    # Create new content
    new_content = '\n'.join(header_lines) + '\n'
    new_content += 'item []:\n    item [1]:\n'
    new_content += '        class = "IntervalTier"\n'
    new_content += '        name = "ORT-MAU"\n'
    # First xmin/xmax of the file are reused as the bounds of the tier.
    new_content += f'        xmin = {content.split("xmin =")[1].split()[0]}\n'
    new_content += f'        xmax = {content.split("xmax =")[1].split()[0]}\n'
    new_content += f'        intervals: size = {kept_count}\n'
    new_content += '\n'.join('        ' + line for line in cleaned_intervals)
    return new_content


# Move the files written by the download step (textgrid_*.txt and error_*.txt)
# to the original_textgrids folder. Other .txt files that happen to be in the
# working directory are left where they are.
for filename in os.listdir():
    if filename.endswith('.txt') and filename.startswith(('textgrid_', 'error_')):
        shutil.move(filename, os.path.join("original_textgrids", filename))

# Process each file in the original_textgrids folder
for filename in os.listdir("original_textgrids"):
    if not filename.endswith('.txt'):
        continue

    with open(os.path.join("original_textgrids", filename), 'r', encoding='utf-8') as file:
        content = file.read()

    new_content = clean_ort_mau_tier(content)
    # Files without an ORT-MAU tier (e.g. saved error messages) produce nothing.
    if new_content is not None:
        # Write the cleaned content to a new file
        with open(os.path.join("cleaned_textgrids", filename), 'w', encoding='utf-8') as file:
            file.write(new_content)

print("Processing complete!")

Processing complete!


In [None]:
def parse_textgrid(content):
    """Read the intervals of the first tier (``item [1]``) into a DataFrame.

    Only lines between ``item [1]:`` and ``item [2]:`` (or the end of the text
    when there is no second tier) are considered. Empty labels are kept.

    NOTE: this definition replaces the earlier `parse_textgrid` of this
    notebook, which returns a list of dicts instead of a DataFrame.

    Parameters
    ----------
    content : str
        Full text of a TextGrid file.

    Returns
    -------
    pandas.DataFrame
        Columns ``start``, ``end`` and ``text``, one row per interval.
    """
    words = []
    in_item_1 = False
    current_interval = None

    for raw_line in content.split('\n'):
        line = raw_line.strip()

        # Start capturing when we hit item [1]
        if 'item [1]:' in line:
            in_item_1 = True
            continue

        # Stop capturing when we hit item [2]
        if 'item [2]:' in line:
            break

        if not in_item_1:
            continue

        if 'intervals [' in line:
            current_interval = {}
        elif current_interval is None:
            # The tier's own "xmin ="/"xmax =" lines come before the first
            # interval; they must be skipped, not stored in a missing interval.
            continue
        elif 'xmin =' in line:
            current_interval['start'] = float(line.split('=')[1].strip())
        elif 'xmax =' in line:
            current_interval['end'] = float(line.split('=')[1].strip())
        elif 'text =' in line:
            # maxsplit=1 keeps labels that themselves contain '='.
            current_interval['text'] = line.split('=', 1)[1].strip().strip('"')
            words.append(current_interval)
            current_interval = None

    return pd.DataFrame(words, columns=['start', 'end', 'text'])



## Transforming the new df

In [39]:
import pandas as pd
import glob
from pathlib import Path
import re
import os

def parse_textgrid_for_words(textgrid_content):
    """Collect the labelled intervals of a TextGrid text as word records.

    Parameters
    ----------
    textgrid_content : str
        TextGrid text; any whitespace may separate the ``xmin``/``xmax``/``text``
        fields of an interval.

    Returns
    -------
    list of dict
        One ``{'start': float, 'end': float, 'text': str}`` per interval whose
        label is not blank, in order of appearance. Labels are stripped of
        surrounding whitespace.
    """
    word_pattern = r'intervals \[\d+\]:\s*xmin = ([0-9.]+)\s*xmax = ([0-9.]+)\s*text = "(.*?)"'
    words = []

    for xmin, xmax, raw_label in re.findall(word_pattern, textgrid_content, re.DOTALL):
        label = raw_label.strip()
        if not label:
            # Blank labels carry no word; leave them out.
            continue
        words.append({'start': float(xmin), 'end': float(xmax), 'text': label})

    return words

def create_combined_dataframe(original_df, textgrid_directory):
    """Build one table holding a row per sentence followed by a row per word.

    For each row of `original_df`, a 'Sentence' record is created, then the
    file ``cleaned_textgrids/textgrid_<sentence_id>.txt`` under
    `textgrid_directory` is parsed (when it exists) into 'Word' records. The
    sentence starts at 0 and ends where its last word ends; its end stays None
    when no word was found.

    Parameters
    ----------
    original_df : pandas.DataFrame
        Needs the columns ``sentence``, ``audio_path`` and ``sentence_id``. Its
        index values become ``sequence_id``.
    textgrid_directory : str or Path
        Folder that contains the ``cleaned_textgrids`` sub-folder.

    Returns
    -------
    pandas.DataFrame
        Columns ``type``, ``sentence``, ``text``, ``start``, ``end``,
        ``sequence_id``, ``audio_path``, ``sentence_id``, sorted by
        ``sequence_id`` then ``start``.
    """
    cleaned_dir = Path(textgrid_directory) / 'cleaned_textgrids'
    records = []

    for seq_id, sentence_row in original_df.iterrows():
        sentence_text = sentence_row['sentence']
        audio_file = sentence_row['audio_path']
        sentence_key = sentence_row['sentence_id']

        # The sentence record is kept in a variable so its end time can be
        # filled in once the words are known.
        sentence_record = {
            'type': 'Sentence',
            'sentence': sentence_text,
            'text': sentence_text,
            'start': 0,
            'end': None,
            'sequence_id': seq_id,
            'audio_path': audio_file,
            'sentence_id': sentence_key
        }
        records.append(sentence_record)

        grid_file = cleaned_dir / f"textgrid_{sentence_key}.txt"
        if not grid_file.exists():
            continue

        with open(grid_file, 'r', encoding='utf-8') as handle:
            grid_text = handle.read()

        words = parse_textgrid_for_words(grid_text)

        records.extend(
            {
                'type': 'Word',
                'sentence': sentence_text,
                'text': word['text'],
                'start': word['start'],
                'end': word['end'],
                'sequence_id': seq_id,
                'audio_path': audio_file,
                'sentence_id': sentence_key
            }
            for word in words
        )

        if words:
            sentence_record['end'] = words[-1]['end']

    combined = pd.DataFrame(records)
    return combined.sort_values(['sequence_id', 'start'])

# Folder that contains the `cleaned_textgrids` sub-folder
textgrid_directory = "."  # Replace with actual directory path if different

# Create the new DataFrame: one row per sentence followed by one row per word
new_df = create_combined_dataframe(df, textgrid_directory)

# Save the DataFrame to a CSV file
new_df.to_csv('combined_data.csv', index=False)

# Display the first few rows. A bare last expression gives the notebook's table
# rendering instead of the wrapped plain text produced by print().
new_df.head(10)

       type                                           sentence  \
0  Sentence                    The kids will find joy in games   
1      Word                    The kids will find joy in games   
2      Word                    The kids will find joy in games   
3      Word                    The kids will find joy in games   
4      Word                    The kids will find joy in games   
5      Word                    The kids will find joy in games   
6      Word                    The kids will find joy in games   
7      Word                    The kids will find joy in games   
8  Sentence  Will your cousin not be at the museum with us ...   
9      Word  Will your cousin not be at the museum with us ...   

                                                text  start    end  \
0                    The kids will find joy in games  0.000  1.910   
1                                                The  0.050  0.110   
2                                               kids  0.110  0.

In [42]:
# Number of distinct sequence ids in the combined table, shown as a 1-tuple shape.
new_df.sequence_id.unique().shape

(3280,)

In [43]:
# Show the combined sentence/word table.
new_df

Unnamed: 0,type,sentence,text,start,end,sequence_id,audio_path,sentence_id
0,Sentence,The kids will find joy in games,The kids will find joy in games,0.000,1.910,0,/home/co/data/MindSentences/le_240431/241210/a...,nat_00282
1,Word,The kids will find joy in games,The,0.050,0.110,0,/home/co/data/MindSentences/le_240431/241210/a...,nat_00282
2,Word,The kids will find joy in games,kids,0.110,0.455,0,/home/co/data/MindSentences/le_240431/241210/a...,nat_00282
3,Word,The kids will find joy in games,will,0.455,0.560,0,/home/co/data/MindSentences/le_240431/241210/a...,nat_00282
4,Word,The kids will find joy in games,find,0.560,0.810,0,/home/co/data/MindSentences/le_240431/241210/a...,nat_00282
...,...,...,...,...,...,...,...,...
31041,Word,The eager child who told a story that amused t...,passengers,2.780,3.510,3279,/home/co/data/MindSentences/le_240431/241210/a...,ctrl_01199
31042,Word,The eager child who told a story that amused t...,quickly,3.720,4.100,3279,/home/co/data/MindSentences/le_240431/241210/a...,ctrl_01199
31043,Word,The eager child who told a story that amused t...,boarded,4.100,4.490,3279,/home/co/data/MindSentences/le_240431/241210/a...,ctrl_01199
31044,Word,The eager child who told a story that amused t...,the,4.490,4.550,3279,/home/co/data/MindSentences/le_240431/241210/a...,ctrl_01199
