# Preprocess and Parse Transcripts into Sentences

Transcripts of House of Representatives hearings were scraped from https://www.govinfo.gov/app/collection/chrg/ for the 112th through 118th Congresses.

The scraped txt transcripts are saved in `input_path`. Each txt file contains the entire transcript of a hearing. The task now is to extract each sentence together with who said it. This is only needed for members of the House of Representatives, not for witnesses or other speakers.

In [None]:
from google.colab import drive
import pandas as pd
# Mount Google Drive at /content/drive; force_remount=True re-mounts even if
# a mount already exists in this Colab session.
drive.mount('/content/drive', force_remount=True)
# Do not truncate long cell values (e.g. dialogue text) when displaying DataFrames.
pd.set_option('display.max_colwidth', None)
# Directory on Drive holding the scraped transcript .txt files.
input_path = '/content/drive/MyDrive/ANLP Project/Final_Version/Data/Transcript_Scraped/'
# Base data directory on Drive; presumably where parsed results are written
# (not used in the visible part of the notebook -- confirm against later cells).
output_path = '/content/drive/MyDrive/ANLP Project/Final_Version/Data/'

Mounted at /content/drive


In [None]:
# imports
import re
import os

Parse all transcripts to get the dialogue blocks and their speakers. Each transcript has a slightly different format, so several txt files need to be checked manually to account for all formats and edge cases. The function outputs the columns `first_name`, `last_name`, `state`, and `dialogue`.

In [None]:
"""
House of Representatives Hearing Transcript Parser

This script parses House hearing transcripts in .txt format to extract dialogue
from House Representatives only (excluding witnesses and staff).

Output: DataFrame with columns: first_name, last_name, state, dialogue
"""

# Full state names accepted in a member listing ("NAME, Texas").
_STATE_NAMES = (
    'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado',
    'Connecticut', 'Delaware', 'Florida', 'Georgia', 'Hawaii', 'Idaho',
    'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana',
    'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota',
    'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada',
    'New Hampshire', 'New Jersey', 'New Mexico', 'New York',
    'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon',
    'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota',
    'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington',
    'West Virginia', 'Wisconsin', 'Wyoming',
)

# Postal abbreviations accepted in a member listing ("NAME, TX").
_STATE_ABBREVIATIONS = (
    'AL', 'AK', 'AZ', 'AR', 'CA', 'CO',
    'CT', 'DE', 'FL', 'GA', 'HI', 'ID',
    'IL', 'IN', 'IA', 'KS', 'KY', 'LA',
    'ME', 'MD', 'MA', 'MI', 'MN',
    'MS', 'MO', 'MT', 'NE', 'NV',
    'NH', 'NJ', 'NM', 'NY',
    'NC', 'ND', 'OH', 'OK', 'OR',
    'PA', 'RI', 'SC', 'SD',
    'TN', 'TX', 'UT', 'VT', 'VA', 'WA',
    'WV', 'WI', 'WY',
)

# States are tried longest-first (ties broken alphabetically).  This makes the
# result independent of set/hash ordering, lets "West Virginia" win over
# "Virginia", and tries every full name before any two-letter abbreviation.
_STATE_CANDIDATES = tuple(
    sorted(_STATE_NAMES + _STATE_ABBREVIATIONS, key=lambda s: (-len(s), s))
)

# Cheap test for "does this text contain something shaped like 'NAME, State'".
_MEMBER_LINE_RE = re.compile(
    r'[A-Z][A-Za-z\s\.\']+,\s*(?:' + '|'.join(_STATE_CANDIDATES) + r')'
)

# A whole line containing any of these is never a member entry.
_LINE_SKIP_KEYWORDS = (
    'staff director', 'minority staff director', 'chief counsel',
    'pursuant to', 'subcommittee staff director',
)

# A column ("part") containing any of these is never a member entry.
# NOTE(review): 'ranking member' and 'vice chairman' cause entries written as
# "NAME, State, Ranking Member" on one line to be skipped entirely -- kept as in
# the original, but confirm this is intended.
_PART_SKIP_KEYWORDS = (
    'staff director', 'minority staff', 'chief counsel', 'pursuant to',
    'clause', 'congress', 'session', 'government', 'graphic', 'available',
    'vacant', 'emeritus', 'ranking member', 'vice chairman',
    'http:', 'www.', 'printed for', 'phone:', 'fax:',
    'mail:', 'internet:', 'bookstore', 'for sale by',
)

# Tokens dropped from a name before picking first/last name.
_NAME_NOISE_TOKENS = ('HON.', 'HON', 'THE')


def _build_state_patterns(state):
    """Compile the regexes used to recognise one state inside a member entry."""
    escaped = re.escape(state)
    return {
        # The state as a whole word anywhere in the entry.  The word boundaries
        # stop "IN" matching "Indiana" or "CO" matching "Connecticut".
        'anywhere': re.compile(rf'\b{escaped}\b', re.IGNORECASE),
        # "NAME, State" / "NAME, State, Position"
        'comma': re.compile(rf'^(.+?),\s*{escaped}\b', re.IGNORECASE),
        # "NAME State" (less common)
        'space': re.compile(rf'^(.+?)\s+{escaped}(?:\s|$|,)', re.IGNORECASE),
        # "NAME State Chairman" / "NAME State Chairwoman"
        'space_chair': re.compile(rf'{escaped}\s+Chair(?:man|woman)', re.IGNORECASE),
    }


_STATE_PATTERNS = tuple(
    (state, _build_state_patterns(state)) for state in _STATE_CANDIDATES
)

# "..., Chairman" / "..., Chairwoman" at the end of a comma-format entry.
_COMMA_CHAIR_RE = re.compile(r',\s*Chair(?:man|woman)(?:\s|$)', re.IGNORECASE)


def _extract_member_list_text(check_text):
    """Return the leading part of check_text that still looks like a member list.

    A line counts as a member line when it contains a state name/abbreviation
    (case-sensitive substring test) and at least three letters.  The list is
    considered finished after three substantive (> 10 chars) non-member lines.
    """
    lines = check_text.split('\n')
    end_line_idx = 0
    consecutive_non_member_lines = 0

    for idx, line in enumerate(lines):
        stripped = line.strip()
        has_state = any(state in line for state in _STATE_CANDIDATES)
        has_letters = sum(1 for c in stripped if c.isalpha()) >= 3

        if has_state and has_letters:
            consecutive_non_member_lines = 0
            end_line_idx = idx + 1
            continue

        # Blank/short lines do not count towards leaving the list.
        if len(stripped) > 10:
            consecutive_non_member_lines += 1
        if consecutive_non_member_lines >= 3:
            break

    return '\n'.join(lines[:end_line_idx])


def _split_member_name(raw_name):
    """Return (first_name, last_name) from a raw name, or None if unusable.

    Trailing parentheticals such as "(ex officio)" and generational suffixes
    (Jr., Sr., II, III, IV) are removed, as are "Hon."/"The" tokens.
    """
    name = re.sub(r'\s*\([^)]*\)\s*$', '', raw_name.strip())
    name = re.sub(r',?\s*(Jr\.|Sr\.|III|II|IV)\.?\s*$', '', name)
    tokens = [t for t in name.split() if t.upper() not in _NAME_NOISE_TOKENS]
    if len(tokens) < 2:
        return None
    return tokens[0], tokens[-1]


def _parse_member_entry(part):
    """Parse one column of a member line.

    Returns (first_name, last_name, state, is_chair), or None when the text
    does not look like "NAME, State[, Position]" or "NAME State [Position]".
    """
    for state, patterns in _STATE_PATTERNS:
        state_match = patterns['anywhere'].search(part)
        if not state_match:
            continue

        # Require some actual name text before the state.
        name_portion = part[:state_match.start()].strip().rstrip(',')
        if sum(1 for c in name_portion if c.isalpha()) < 3:
            continue

        # Method 1: comma format.
        comma_match = patterns['comma'].search(part)
        if comma_match:
            names = _split_member_name(comma_match.group(1))
            if names:
                is_chair = bool(_COMMA_CHAIR_RE.search(part))
                return names[0], names[1], state, is_chair

        # Method 2: space format.
        space_match = patterns['space'].search(part)
        if space_match:
            names = _split_member_name(space_match.group(1))
            if names:
                is_chair = bool(patterns['space_chair'].search(part))
                return names[0], names[1], state, is_chair

    return None


def _register_member(members, first_name, last_name, state):
    """Insert a member into members, disambiguating shared last names.

    Returns (member_key, promoted_key):
      * member_key is the key under which this member is now stored;
      * promoted_key is the (LAST_NAME, STATE) key an existing bare-last-name
        entry was moved to because of this insertion, or None.
    """
    name_key = last_name.upper()
    state_key = state.upper()
    tuple_key = (name_key, state_key)
    info = {
        'first_name': first_name.title(),
        'last_name': last_name.title(),
        'state': state.title(),
    }

    if name_key in members:
        existing = members[name_key]
        existing_state = existing['state'].upper()
        if existing_state == state_key:
            # Same member listed again (e.g. full committee, then a
            # subcommittee): keep the simple key so earlier references to it
            # stay valid.
            members[name_key] = info
            return name_key, None

        # A different member shares this last name: both become tuple keys.
        del members[name_key]
        promoted_key = (name_key, existing_state)
        members[promoted_key] = existing
        members[tuple_key] = info
        return tuple_key, promoted_key

    if any(isinstance(k, tuple) and k[0] == name_key for k in members):
        # This last name is already disambiguated by state; never re-create
        # the bare key, which would silently shadow the ambiguity.
        members[tuple_key] = info
        return tuple_key, None

    members[name_key] = info
    return name_key, None


def parse_committee_members(text):
    """
    Extract committee members with their names and states.

    Every "COMMITTEE ON ..." / "HOUSE COMMITTEE ON ..." / "Subcommittee on ..."
    header is inspected; the up-to-5000 characters after it are treated as a
    member list when they contain "NAME, State" shaped text and the detected
    list is at least 500 characters long (shorter hits are title pages).

    Args:
        text: Full transcript text.

    Returns:
        (members, chair_last_name) where
          * members maps a key to {'first_name', 'last_name', 'state'}.  The
            key is the upper-case last name, or a (LAST_NAME, STATE) tuple
            (both upper-case) when two members with different states share a
            last name.
          * chair_last_name is the members key of the member marked
            "Chairman"/"Chairwoman" (the last one seen wins), falling back to
            the first member parsed; None when no member was found.
    """
    members = {}
    chair_last_name = None

    # Find all "COMMITTEE ON" or "HOUSE COMMITTEE ON" or "Subcommittee on" headers
    committee_header_pattern = r'(COMMITTEE ON|HOUSE COMMITTEE ON|Subcommittee on) ([^\n]+)\n'

    for header_match in re.finditer(committee_header_pattern, text, re.IGNORECASE):
        start_pos = header_match.end()
        check_text = text[start_pos:start_pos + 5000]

        if not _MEMBER_LINE_RE.search(check_text):
            continue

        # committee_text is local to this header, so a block rejected here can
        # never be parsed by a later iteration.
        committee_text = _extract_member_list_text(check_text)
        if len(committee_text) < 500:
            continue

        first_member_added_in_this_block = False  # Track chair fallback per block

        for raw_line in committee_text.split('\n'):
            line = raw_line.strip()
            if not line or len(line) < 10:
                continue

            # Skip all-caps titles that are too long
            if line.isupper() and len(line) > 40:
                continue

            line_lower = line.lower()
            if any(keyword in line_lower for keyword in _LINE_SKIP_KEYWORDS):
                continue

            # Handle two-column format by splitting on multiple spaces
            for part in re.split(r'\s{2,}', line):
                part = part.strip()
                if len(part) < 5:
                    continue

                part_lower = part.lower()
                # A bare position word is not a member.
                if part_lower in ('chairman', 'chairwoman', 'member'):
                    continue
                if any(keyword in part_lower for keyword in _PART_SKIP_KEYWORDS):
                    continue

                parsed = _parse_member_entry(part)
                if parsed is None:
                    continue
                first_name, last_name, state, is_chair = parsed

                member_key, promoted_key = _register_member(
                    members, first_name, last_name, state
                )

                # If the chair's bare key was just promoted to a tuple key,
                # follow it so chair_last_name never points at a missing key.
                if promoted_key is not None and chair_last_name == promoted_key[0]:
                    chair_last_name = promoted_key

                if is_chair:
                    chair_last_name = member_key
                elif not first_member_added_in_this_block and chair_last_name is None:
                    # First member as potential chair (fallback)
                    chair_last_name = member_key
                    first_member_added_in_this_block = True

    return members, chair_last_name

def find_hearing_start(text):
    """
    Find the character offset at which the hearing dialogue starts.

    If a "C O N T E N T S" header is present, the search begins after the table
    of contents (at the earliest recognised end-of-TOC marker, or directly
    after the header when none is found).  From there the first speaker tag at
    the start of a line ("Mr. Name.", "Ms. Name.", "Mrs. Name.",
    "The Chairman.", "Chairwoman Name.", ...) marks the start of the dialogue.

    Names may contain hyphens and apostrophes (Ros-Lehtinen, O'Brien).  A tag
    followed by another period is rejected so that dotted TOC leader lines such
    as "Mr. Smith........ 127" are not mistaken for a speaker.

    Args:
        text: Full transcript text.

    Returns:
        Offset into text of the first speaker tag (including its leading
        newline, if any); when no tag is found, the offset at which the
        search started.
    """
    # First, find and skip the table of contents section
    toc_match = re.search(r'C\s+O\s+N\s+T\s+E\s+N\s+T\s+S', text, re.IGNORECASE)

    if toc_match:
        toc_end_patterns = [
            # Standard hearing header block: a dashed rule followed by the chamber name
            r'(?:^|\n)-{5,}\s*\n+(?:House of Representatives|Senate)',
            r'(?:^|\n)House of Representatives,\s*\n.*?Committee',
            r'(?:^|\n)OPENING STATEMENT OF HON\.',
        ]

        search_start = toc_match.end()
        after_toc = text[search_start:]  # slice once instead of once per pattern
        earliest_toc_end = len(text)

        for pattern in toc_end_patterns:
            match = re.search(pattern, after_toc, re.MULTILINE | re.DOTALL)
            if match and (search_start + match.start()) < earliest_toc_end:
                earliest_toc_end = search_start + match.start()

        # Start the speaker search after the TOC
        search_start_pos = earliest_toc_end if earliest_toc_end < len(text) else search_start
    else:
        search_start_pos = 0

    # Speaker tags.  [\w'\-]+ accepts hyphenated / apostrophe names, matching
    # the name syntax used when the dialogue itself is parsed; (?!\.) rejects
    # TOC leader dots.
    start_patterns = [
        r'(?:^|\n)(?:Mr|Ms|Mrs)\.\s*[\w\'\-]+\.(?!\.)',
        r'(?:^|\n)(?:The\s+)?(?:Chairman|Chairwoman)\.(?!\.)',
        r'(?:^|\n)(?:Chairman|Chairwoman)\s+[\w\'\-]+\.(?!\.)',
    ]

    remaining = text[search_start_pos:]
    earliest_pos = len(text)
    for pattern in start_patterns:
        match = re.search(pattern, remaining, re.MULTILINE)
        if match and (search_start_pos + match.start()) < earliest_pos:
            earliest_pos = search_start_pos + match.start()

    if earliest_pos < len(text):
        return earliest_pos

    return search_start_pos

def find_hearing_end(text):
    """
    Find the character offset at which the hearing ends (at adjournment).

    This prevents APPENDIX sections and other post-hearing content from being
    parsed as dialogue.  Recognised adjournment forms:
      * "... committee will stand adjourned." / "subcommittee stands adjourned."
      * "[Whereupon, at X:XX a.m./p.m., the committee was adjourned.]"
        -- the bracketed note may be wrapped across several lines.

    Args:
        text: Full transcript text.

    Returns:
        Offset just past the earliest adjournment marker, or len(text) if no
        marker is found.
    """
    adjournment_patterns = [
        r'(?:committee|subcommittee)\s+(?:will\s+)?(?:stand|stands)\s+adjourned\.',
        # [^\]] (unlike '.') also matches newlines, so a line-wrapped note is
        # found, while the match still cannot run past the closing bracket of
        # an unrelated earlier "[Whereupon ... recess ...]" note.
        r'\[Whereupon[^\]]*?adjourned\.\s*\]',
    ]

    earliest_end = len(text)

    for pattern in adjournment_patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match and match.end() < earliest_end:
            earliest_end = match.end()

    return earliest_end

def extract_dialogue(text, committee_members, chair_last_name=None):
    """
    Extract the spoken turns of committee members from a hearing transcript.

    Only the text between the start of the dialogue (find_hearing_start) and
    the adjournment (find_hearing_end) is parsed, so appendices are excluded.
    A turn starts at a speaker tag at the beginning of a line and runs to the
    next speaker tag, or to the first bracketed editorial note / section
    heading, whichever comes first.  Turns by speakers that cannot be resolved
    to exactly one committee member are skipped.

    Args:
        text: Full transcript text.
        committee_members: Mapping from member key to a dict with
            'first_name', 'last_name' and 'state'.  Keys are an upper-case
            last name, or a (LAST_NAME, STATE) tuple for members sharing a
            last name.
        chair_last_name: Key in committee_members of the presiding member,
            used to attribute "The Chairman." / "The Chairwoman." turns.  It is
            overridden by an "OPENING STATEMENT OF HON. ..." heading found just
            before the dialogue, and inferred from a "Chairman NAME." tag when
            None.

    Returns:
        A list of dicts with keys 'first_name', 'last_name', 'state' and
        'dialogue', in transcript order.  'dialogue' may be an empty string.
    """
    dialogue_list = []

    # Bounds of the actual hearing within the transcript
    hearing_start = find_hearing_start(text)
    hearing_end = find_hearing_end(text)

    # An "OPENING STATEMENT OF HON. NAME" right before the hearing start
    # indicates who is actually chairing this hearing (important for
    # subcommittee hearings).
    pre_start_text = text[max(0, hearing_start-300):hearing_start]
    # Pattern handles: McKINNEY, O'Brien, De-La-Cruz, etc.
    opening_statement_match = re.search(r'OPENING STATEMENT OF HON\.\s+([\w\.\s\'\'-]+),', pre_start_text, re.IGNORECASE)
    if opening_statement_match:
        name_from_opening = opening_statement_match.group(1).strip()
        name_parts = name_from_opening.split()
        if len(name_parts) >= 2:
            potential_chair = name_parts[-1].upper()
            if potential_chair in committee_members:
                chair_last_name = potential_chair
            else:
                # Check tuple keys
                for key in committee_members:
                    if isinstance(key, tuple) and key[0] == potential_chair:
                        chair_last_name = key
                        break

    # Only use text from hearing_start to hearing_end
    text = text[hearing_start:hearing_end]

    # Speaker tags, anchored at the start of a line:
    #   - "The Chairman." / "The Chairwoman."            (group 1 = "The ")
    #   - "Chairman NAME." / "Chairwoman NAME."          (group 2 = NAME)
    #   - "Mr./Ms./Miss./Mrs./Dr. NAME [of STATE]."      (group 3 = NAME, group 4 = STATE)
    # \s* allows both "Mr.Name" and "Mr. Name"; [\w'\-]+ covers names such as
    # O'Brien and Ros-Lehtinen.
    speaker_pattern = r'(?:^|\n)(?:(The\s+)?(?:Chairman|Chairwoman)(?:\s+([\w\'\-]+))?|(?:Mr\.|Ms\.|Miss\.|Mrs\.|Dr\.)\s*([\w\'\-]+)(?:\s+of\s+([\w\s]+))?)\.'

    matches = list(re.finditer(speaker_pattern, text, re.MULTILINE | re.IGNORECASE))

    # First pass: identify the chair if we see a "Chairman NAME." tag
    if chair_last_name is None:
        for match in matches:
            chairman_name = match.group(2)
            if chairman_name:
                potential_chair = chairman_name.strip().upper()
                if potential_chair in committee_members:
                    chair_last_name = potential_chair
                    break
                else:
                    for key in committee_members:
                        if isinstance(key, tuple) and key[0] == potential_chair:
                            chair_last_name = key
                            break
                    if chair_last_name:
                        break

    # Section headings ("STATEMENT OF ...", "Prepared Statement of ...",
    # "TESTIMONY OF ...") always begin with a capital letter, while the same
    # words inside running speech ("the statement of the witness") do not.
    # Only the first letter is therefore matched case-sensitively; (?i:...)
    # makes the remainder case-insensitive.  \b keeps words such as
    # "understatement" from matching, and \s+ tolerates line wraps.
    heading_pattern = re.compile(
        r'\b(?:P(?i:repared\s+statement)|O(?i:pening\s+statement)'
        r'|S(?i:tatement)|T(?i:estimony))\s+(?i:of)\b'
    )

    # Anything that ends a member's turn before the next speaker tag.
    section_break_patterns = [
        # Bracketed editorial note, e.g. "[The statement of ... follows:]".
        # [^\]]* also crosses newlines, so line-wrapped notes are caught.
        re.compile(r'\[[^\]]*\]'),
        # Turn that begins directly with an (unclosed) bracket.
        re.compile(r'^\[.*'),
        heading_pattern,
    ]

    for i, match in enumerate(matches):
        # Group 1: "The" prefix (optional)
        # Group 2: Name after Chairman/Chairwoman (optional)
        # Group 3: Name after Mr./Ms./Mrs./Dr.
        # Group 4: State after "of" (optional)
        has_the = match.group(1) is not None
        chairman_name = match.group(2)
        standard_name = match.group(3)
        state_qualifier = match.group(4)

        # Determine the speaker's key in committee_members
        if standard_name:
            # Tag like "Mr. Murphy." or "Ms. O'Brien." or "Mr. SCOTT of Georgia."
            last_name = standard_name.strip().upper()

            if state_qualifier:
                # Use the state for disambiguation
                state_normalized = state_qualifier.strip().upper()
                tuple_key = (last_name, state_normalized)
                if tuple_key in committee_members:
                    last_name = tuple_key
                elif last_name in committee_members:
                    # Fall back to the simple key if the tuple is not found
                    pass
                else:
                    # Can't find this member, skip
                    continue
            else:
                # No state qualifier - check simple key, then tuple keys
                if last_name not in committee_members:
                    found_keys = [k for k in committee_members if isinstance(k, tuple) and k[0] == last_name]
                    if len(found_keys) == 1:
                        # Only one member with this last name, use it
                        last_name = found_keys[0]
                    elif len(found_keys) > 1:
                        # Several members share the name and no state was
                        # given: cannot disambiguate, skip this turn
                        continue
                    else:
                        # Not a committee member
                        continue

        elif chairman_name:
            # Tag like "Chairman Murphy."
            last_name = chairman_name.strip().upper()

            if last_name not in committee_members:
                found_keys = [k for k in committee_members if isinstance(k, tuple) and k[0] == last_name]
                if len(found_keys) == 1:
                    last_name = found_keys[0]
                elif len(found_keys) > 1:
                    # Multiple members with same last name
                    continue
                else:
                    # Not a committee member
                    continue

        elif has_the:
            # Tag like "The Chairman." or "The Chairwoman.": use the identified chair
            if chair_last_name:
                last_name = chair_last_name
            else:
                # Try to infer the chair from an "OPENING STATEMENT OF" heading
                # in the 500 characters before this tag
                pre_text = text[:match.start()]
                opening_match = re.search(r'OPENING STATEMENT OF HON\.\s+([\w\.\s\'\'-]+),', pre_text[-500:], re.IGNORECASE)
                if opening_match:
                    name_from_opening = opening_match.group(1).strip()
                    name_parts = name_from_opening.split()
                    if len(name_parts) >= 2:
                        potential_chair = name_parts[-1].upper()
                        if potential_chair in committee_members:
                            chair_last_name = potential_chair
                            last_name = chair_last_name
                        else:
                            # Check tuple keys
                            found_keys = [k for k in committee_members if isinstance(k, tuple) and k[0] == potential_chair]
                            if len(found_keys) == 1:
                                chair_last_name = found_keys[0]
                                last_name = chair_last_name
                            else:
                                continue
                    else:
                        # Still can't determine, skip
                        continue
                else:
                    # Still can't determine, skip
                    continue
        else:
            # e.g. a bare "Chairman." without "The": no name to resolve
            continue

        # Final guard: e.g. a chair key that is not in committee_members
        if last_name not in committee_members:
            continue

        member_info = committee_members[last_name]

        # The turn runs from the end of this tag to the next speaker tag ...
        start_pos = match.end()
        end_pos = matches[i + 1].start() if i + 1 < len(matches) else len(text)

        # ... unless a bracketed note or section heading comes first.
        dialogue_section = text[start_pos:end_pos]

        earliest_break = len(dialogue_section)
        for pattern in section_break_patterns:
            break_match = pattern.search(dialogue_section)
            if break_match and break_match.start() < earliest_break:
                earliest_break = break_match.start()

        if earliest_break < len(dialogue_section):
            end_pos = start_pos + earliest_break

        # After this cut the turn contains no closed bracket pair and no
        # heading, so no further bracket/heading stripping is needed here.
        dialogue = text[start_pos:end_pos]

        # Drop any remaining multi-line blocks of mostly upper-case text
        # (titles and captions that are not preceded by a heading keyword).
        lines = dialogue.split('\n')
        cleaned_lines = []
        j = 0
        while j < len(lines):
            line = lines[j]

            # Check if this starts an all-caps block
            if len(line.strip()) > 20:
                # Calculate uppercase ratio
                letters = [c for c in line if c.isalpha()]
                if letters:
                    uppercase_ratio = sum(1 for c in letters if c.isupper()) / len(letters)

                    # If this line is >70% uppercase, check if it's part of a block
                    if uppercase_ratio > 0.7:
                        # Look ahead (at most 5 lines counted) for further
                        # mostly-upper-case lines; a blank line ends the block
                        block_size = 1
                        k = j + 1
                        while k < len(lines) and block_size < 5:
                            next_line = lines[k].strip()
                            if not next_line:
                                break
                            next_letters = [c for c in next_line if c.isalpha()]
                            if next_letters and len(next_line) > 15:
                                next_ratio = sum(1 for c in next_letters if c.isupper()) / len(next_letters)
                                if next_ratio > 0.7:
                                    block_size += 1
                                    k += 1
                                else:
                                    break
                            else:
                                # Short / letterless line inside the block: step over it
                                k += 1

                        # If we found a multi-line all-caps block, skip it
                        if block_size >= 2:
                            j = k
                            continue

            cleaned_lines.append(line)
            j += 1

        dialogue = '\n'.join(cleaned_lines)

        # Remove page numbers (lines consisting only of digits)
        dialogue = re.sub(r'\n\s*\d+\s*\n', ' ', dialogue)
        dialogue = re.sub(r'^\s*\d+\s*$', '', dialogue, flags=re.MULTILINE)

        # Collapse all whitespace, including newlines, to single spaces
        dialogue = re.sub(r'\n+', ' ', dialogue)
        dialogue = re.sub(r'\s+', ' ', dialogue)
        dialogue = dialogue.strip()

        # Final cleanup: cut at a heading that only became contiguous after
        # the line removals above
        trailing_heading = heading_pattern.search(dialogue)
        if trailing_heading:
            dialogue = dialogue[:trailing_heading.start()].strip()

        dialogue_list.append({
            'first_name': member_info['first_name'],
            'last_name': member_info['last_name'],
            'state': member_info['state'],
            'dialogue': dialogue
        })

    return dialogue_list


def process_hearing_transcript(file_path):
    """
    Main function to process a single hearing transcript file.

    Reads the transcript, parses the committee member list, extracts the
    members' dialogue blocks, and returns them as a DataFrame.

    Parameters:
    -----------
    file_path : str
        Path to a hearing transcript in .txt format.

    Returns:
    --------
    pandas.DataFrame
        One row per dialogue block with the columns produced by
        extract_dialogue (first_name, last_name, state, dialogue). Two-letter
        state abbreviations are expanded to full state names, and rows whose
        dialogue contains no letters or digits are dropped. If no dialogue
        was extracted, the DataFrame is empty.
    """
    # errors='ignore' silently drops bytes that are not valid UTF-8
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
        text = f.read()

    # Parse committee members
    committee_members, chair_last_name = parse_committee_members(text)

    # Extract dialogue
    dialogue_data = extract_dialogue(text, committee_members, chair_last_name)

    # Create DataFrame
    df = pd.DataFrame(dialogue_data)

    # Update abbrev states to full state name
    abbr_to_full = {
        'AL': 'Alabama', 'AK': 'Alaska', 'AZ': 'Arizona', 'AR': 'Arkansas',
        'CA': 'California', 'CO': 'Colorado', 'CT': 'Connecticut', 'DE': 'Delaware',
        'FL': 'Florida', 'GA': 'Georgia', 'HI': 'Hawaii', 'ID': 'Idaho',
        'IL': 'Illinois', 'IN': 'Indiana', 'IA': 'Iowa', 'KS': 'Kansas',
        'KY': 'Kentucky', 'LA': 'Louisiana', 'ME': 'Maine', 'MD': 'Maryland',
        'MA': 'Massachusetts', 'MI': 'Michigan', 'MN': 'Minnesota',
        'MS': 'Mississippi', 'MO': 'Missouri', 'MT': 'Montana', 'NE': 'Nebraska',
        'NV': 'Nevada', 'NH': 'New Hampshire', 'NJ': 'New Jersey',
        'NM': 'New Mexico', 'NY': 'New York', 'NC': 'North Carolina',
        'ND': 'North Dakota', 'OH': 'Ohio', 'OK': 'Oklahoma', 'OR': 'Oregon',
        'PA': 'Pennsylvania', 'RI': 'Rhode Island', 'SC': 'South Carolina',
        'SD': 'South Dakota', 'TN': 'Tennessee', 'TX': 'Texas', 'UT': 'Utah',
        'VT': 'Vermont', 'VA': 'Virginia', 'WA': 'Washington',
        'WV': 'West Virginia', 'WI': 'Wisconsin', 'WY': 'Wyoming'
    }

    if not df.empty:
        # Values that are not a known abbreviation map to NaN and are then
        # restored from the original column by fillna.
        df['state'] = df['state'].str.upper().map(abbr_to_full).fillna(df['state'])

        # Remove rows whose dialogue is empty or punctuation/whitespace only.
        is_blank = df['dialogue'].fillna('').str.fullmatch(r'\s*[^A-Za-z0-9]*\s*')
        # .copy() hands the caller an independent frame, so adding columns to
        # the result does not raise SettingWithCopyWarning.
        df = df[~is_blank].copy()

    return df


Get dialogue for all txt files.

In [None]:
# Get dialogue for all txt files
dfs = []          # one DataFrame of dialogue blocks per processed transcript
errors = []       # DataFrames of transcripts that produced too few rows
error_links = []  # paths of those transcripts, for manual inspection
total = 0         # NOTE(review): counts directories visited by os.walk, not transcripts - confirm intent

for dirpath, dirnames, filenames in os.walk(input_path):
    total += 1
    # Sorting in place makes os.walk descend into subdirectories in a fixed order
    dirnames.sort()
    for filename in sorted(filenames):
        # Only .txt files whose name does not end in "(<digits>).txt"
        # (presumably duplicate downloads - verify against the scraped data)
        if filename.endswith('.txt') and not re.search(r'\(\d+\)\.txt$', filename):
            file_path = os.path.join(dirpath, filename)
            df = process_hearing_transcript(file_path)
            # The congress number is the prefix of the parent directory name
            # (e.g. "112_txt"); the committee is the directory holding the file.
            df['congress'] = int(dirpath.split('/')[-2].split('_')[0])
            df['committee'] = dirpath.split('/')[-1]
            df['file'] = filename
            dfs.append(df)

            # check that the files that parsed fewer than 30 dialogue rows have
            # incorrect data in source (not parsing issue).
            # len(df) counts rows; df.size would count rows * columns.
            n_rows = len(df)
            if n_rows < 30:
                print(file_path, '     ', n_rows)
                error_links.append(file_path)
                errors.append(df)

/content/drive/MyDrive/Courses/AU 25/INFO 256/ANLP Project/Data/Transcript_Scraped/Old_Scraped/112_txt/Committee_on_Appropriations/hearing_1.txt       0
/content/drive/MyDrive/Courses/AU 25/INFO 256/ANLP Project/Data/Transcript_Scraped/Old_Scraped/112_txt/Committee_on_Appropriations/hearing_10.txt       0
/content/drive/MyDrive/Courses/AU 25/INFO 256/ANLP Project/Data/Transcript_Scraped/Old_Scraped/112_txt/Committee_on_Appropriations/hearing_11.txt       0
/content/drive/MyDrive/Courses/AU 25/INFO 256/ANLP Project/Data/Transcript_Scraped/Old_Scraped/112_txt/Committee_on_Appropriations/hearing_12.txt       0
/content/drive/MyDrive/Courses/AU 25/INFO 256/ANLP Project/Data/Transcript_Scraped/Old_Scraped/112_txt/Committee_on_Appropriations/hearing_13.txt       0
/content/drive/MyDrive/Courses/AU 25/INFO 256/ANLP Project/Data/Transcript_Scraped/Old_Scraped/112_txt/Committee_on_Appropriations/hearing_14.txt       0
/content/drive/MyDrive/Courses/AU 25/INFO 256/ANLP Project/Data/Transcript_Sc

In [None]:
# Stack the per-file frames; ignore_index=True gives the combined frame a
# fresh 0..n-1 index instead of repeating each file's own row labels.
transcript = pd.concat(dfs, ignore_index=True)

At this point, each row has a chunk of dialogue. We want each row to have one sentence.

In [None]:
# (abbreviation, placeholder) pairs. The trailing period of each abbreviation
# would otherwise be mistaken for a sentence end, so it is swapped for a
# placeholder before splitting and swapped back afterwards. Keeping both
# directions in one table guarantees that protect and restore stay in sync.
_PROTECTED_ABBREVIATIONS = (
    ('Mr.', 'MR<DOT>'),
    ('Mrs.', 'MRS<DOT>'),
    ('Ms.', 'MS<DOT>'),
    ('Dr.', 'DR<DOT>'),
    ('Prof.', 'PROF<DOT>'),
    ('Sr.', 'SR<DOT>'),
    ('Jr.', 'JR<DOT>'),
    ('St.', 'ST<DOT>'),
    ('vs.', 'VS<DOT>'),
    ('etc.', 'ETC<DOT>'),
    ('i.e.', 'IE<DOT>'),
    ('e.g.', 'EG<DOT>'),
)

# Compiled once: (pattern matching the abbreviation, abbreviation, placeholder)
_ABBREVIATION_RULES = tuple(
    (re.compile(r'\b' + re.escape(abbreviation)), abbreviation, placeholder)
    for abbreviation, placeholder in _PROTECTED_ABBREVIATIONS
)

# A sentence boundary is whitespace that follows '.', '!' or '?' and precedes
# an uppercase ASCII letter.
_SENTENCE_BOUNDARY = re.compile(r'(?<=[.!?])\s+(?=[A-Z])')


def _split_into_sentences(text):
    """Return the non-empty, stripped sentences of text ([] if not a string)."""
    if not isinstance(text, str):
        # Missing dialogue (None / NaN) contributes no sentences.
        return []

    # Step 1: protect abbreviations so their period is not a split point
    protected = text
    for pattern, _abbreviation, placeholder in _ABBREVIATION_RULES:
        protected = pattern.sub(placeholder, protected)

    sentences = []
    # Step 2: split on sentence boundaries
    for piece in _SENTENCE_BOUNDARY.split(protected):
        # Step 3: restore the abbreviations
        for _pattern, abbreviation, placeholder in _ABBREVIATION_RULES:
            piece = piece.replace(placeholder, abbreviation)
        piece = piece.strip()
        if piece:
            sentences.append(piece)
    return sentences


def split_dialogue_to_sentences(df):
    """
    Split the sentences in each dialogue cell to its own row.
    All other column values remain the same for each sentence.

    Parameters:
    -----------
    df : pandas.DataFrame
        DataFrame with a 'dialogue' column containing text with multiple
        sentences. The input frame is not modified. Rows whose dialogue is
        missing (not a string) or contains no sentence are dropped.

    Returns:
    --------
    pandas.DataFrame
        New DataFrame where each sentence from dialogue is a separate row,
        in the original row order, with the same columns as df and a fresh
        0..n-1 index.
    """
    sentences_per_row = [_split_into_sentences(text) for text in df['dialogue']]

    # Position of the source row, repeated once per sentence it produced.
    # Positions (not labels) are used so duplicate index labels are harmless.
    row_positions = [
        position
        for position, sentences in enumerate(sentences_per_row)
        for _ in sentences
    ]

    if not row_positions:
        # Nothing to expand: return an empty frame that keeps the columns.
        return df.iloc[0:0].reset_index(drop=True)

    # One take replaces copying the row once per sentence.
    result_df = df.iloc[row_positions].reset_index(drop=True)
    result_df['dialogue'] = [
        sentence for sentences in sentences_per_row for sentence in sentences
    ]

    return result_df

In [None]:
# Expand every dialogue block into one row per sentence
transcript_sentences = split_dialogue_to_sentences(transcript)

In [None]:
# (rows, columns) of the sentence-level DataFrame
transcript_sentences.shape

(5910897, 7)

In [None]:
# Write the sentence-level data without the row index. os.path.join yields the
# same path whether or not output_path ends with a separator.
transcript_sentences.to_csv(os.path.join(output_path, 'transcripts_sentences.csv'), index=False)