In [13]:
import pandas as pd
import re

In [14]:
def process_files(txt_path, csv_path):
    # Read the files
    with open(txt_path, 'r') as f:
        txt_lines = f.readlines()
    
    timing_df = pd.read_csv(csv_path)
    
    # Initialize results list
    results = []
    last_index = 0  # Track the last matched index to ensure sequential matching
    
    # Process lines
    for i in range(len(txt_lines)-1):  # Stop at second-to-last line to avoid index error
        current_line = txt_lines[i].strip()
        next_line = txt_lines[i+1].strip()
        
        # Find all content brackets in current line
        content_matches = re.finditer(r'\[(.*?)\]', current_line)
        # Find all instruction brackets in next line
        instruction_matches = re.finditer(r'\[(.*?):(.*?)\]', next_line)
        
        # Convert matches to lists so we can reuse them
        content_brackets = [m.group(1) for m in content_matches]
        instruction_brackets = [m.group(0) for m in instruction_matches]
        
        # Skip if either list is empty
        if not content_brackets or not instruction_brackets:
            continue
            
        # Process each non-empty content bracket
        for content in content_brackets:
            # Skip empty brackets or brackets with only whitespace
            if not content.strip():
                continue
                
            # Find first and last words of content (excluding brackets)
            words = content.strip().split()
            if not words:  # Skip if no words found
                continue
                
            first_word = words[0]
            last_word = words[-1].rstrip('.')
            
            # Process each instruction for this content
            for instruction in instruction_brackets:
                # Find timing in CSV
                try:
                    # Find the first occurrence of first_word *after* the last matched index
                    start_rows = timing_df.iloc[last_index:][timing_df['Word'].str.contains(first_word, na=False, regex=False)]
                    if not start_rows.empty:
                        start_index = start_rows.index[0]
                        start_time = start_rows['Start_time'].iloc[0]
                        last_index = start_index + 1  # Update last matched index
                    else:
                        continue
                    
                    # Find the first occurrence of last_word *after* the start_index
                    end_rows = timing_df.iloc[start_index:][timing_df['Word'].str.contains(last_word, na=False, regex=False)]
                    if not end_rows.empty:
                        end_time = end_rows['End_time'].iloc[0]
                    else:
                        continue
                    
                    # Add to results
                    results.append({
                        'bracketed_text': instruction,
                        'start_time': start_time,
                        'end_time': end_time,
                        'content_reference': f"[{content}]"  # Add content for reference
                    })
                except (IndexError, KeyError) as e:
                    print(f"Warning: Could not find timing for '{first_word}' or '{last_word}' - Error: {e}")
    
    # Create output DataFrame
    output_df = pd.DataFrame(results)
    if not output_df.empty:
        output_df.index = output_df.index + 1  # Start index from 1
    return output_df

In [15]:
# Example usage
if __name__ == "__main__":
    output_df = process_files('boat-gest-coded.txt', 'Boat-Design-Matrix.csv')
    output_df.to_csv('output.csv')

  end_rows = timing_df.iloc[start_index:][timing_df['Word'].str.contains(last_word, na=False, regex=False)]
  start_rows = timing_df.iloc[last_index:][timing_df['Word'].str.contains(first_word, na=False, regex=False)]
