In [3]:
import re
import textwrap

def _format_utterance(speaker, fragments, wrap_length=None):
    """Render one speaker turn as ``"Speaker: text"``, optionally wrapped.

    Args:
        speaker: Speaker label exactly as captured from the transcript.
        fragments: Caption fragments belonging to this turn, in order.
        wrap_length: Maximum line width passed to ``textwrap.fill``; falsy
            values (``None``/``0``) disable wrapping.

    Returns:
        The formatted turn as a single string (multi-line when wrapped).
    """
    utterance = speaker + ": " + " ".join(fragments).strip()
    # Promote a standalone lowercase "i" to "I".
    utterance = re.sub(r'\bi\b', 'I', utterance)
    # Collapse runs of spaces/tabs BEFORE wrapping so the width is measured on
    # the final text rather than on text that still contains double spaces.
    utterance = re.sub(r'[ \t]+', ' ', utterance)
    if wrap_length:
        utterance = textwrap.fill(utterance, wrap_length)
    return utterance


def clean_webvtt_transcription(text, wrap_length=None):
    """Convert a WebVTT transcript into one ``"Speaker: text"`` line per turn.

    Cue numbers, timestamp lines (anything containing ``-->``), blank lines and
    the leading ``WEBVTT`` header are discarded. Consecutive cues from the same
    speaker are merged into a single turn, and caption lines without a speaker
    prefix are appended to the current turn. Text that appears before the first
    recognised speaker is dropped because there is nobody to attribute it to.

    A speaker prefix is recognised by the pattern ``(\\w+ [\\w\\s]+): ``, i.e. the
    label must consist of at least two whitespace-separated words.

    Args:
        text: Raw WebVTT content. LF, CRLF and CR line endings are accepted,
            and the ``WEBVTT`` header is optional.
        wrap_length: If truthy, each turn is wrapped to this many columns with
            ``textwrap.fill``; otherwise each turn stays on one line.

    Returns:
        The cleaned transcript, turns separated by ``'\\n'`` with no trailing
        newline. An empty string is returned when no speaker line is found.
    """
    turns = []  # list of (speaker, [fragments]) in order of appearance
    awaiting_header = True

    # splitlines() handles \n, \r\n and \r, so cue numbers such as "1\r" are
    # still recognised and no stray carriage returns end up in the output.
    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not line:
            continue

        if awaiting_header:
            # Only the first non-empty line can be the header; skip it when it
            # is one instead of blindly discarding whatever comes first.
            awaiting_header = False
            if line.lstrip('\ufeff').startswith("WEBVTT"):
                continue

        # Skip cue identifiers (bare integers) and timestamp lines.
        if "-->" in line or re.fullmatch(r'\d+', line):
            continue

        # Check if the line starts with a speaker name
        match = re.match(r'(\w+ [\w\s]+): (.*)', raw_line)
        if match:
            speaker, content = match.groups()
            if turns and turns[-1][0] == speaker:
                # Same speaker as the previous cue: extend the current turn.
                turns[-1][1].append(content)
            else:
                # New turn. Upper-case only the first character;
                # str.capitalize() would lower-case the rest ("OK" -> "Ok").
                turns.append((speaker, [content[:1].upper() + content[1:]]))
        elif turns:
            # Continuation line without a speaker prefix.
            turns[-1][1].append(line)

    return "\n".join(
        _format_utterance(speaker, fragments, wrap_length)
        for speaker, fragments in turns
    )

# Example usage
# Sample WebVTT excerpt used to demonstrate the cleaner
input_text = """
WEBVTT

1
00:00:04.410 --> 00:00:04.890
Eric Rosenbaum: Cool.

2
00:00:06.330 --> 00:00:07.470
And

3
00:00:11.969 --> 00:00:12.809
Timothy Sackton: OK, so

4
00:00:14.130 --> 00:00:17.609
Timothy Sackton: We begin tonight out the sun is setting over ravens

5
00:00:19.980 --> 00:00:38.760
Timothy Sackton: You earlier this afternoon, like five sessions ago you left your horses in gear in the narrow River Valley and made your way by secret passages and waiting rivers through the mountains to the abandoned Durban outpost.

6
00:00:40.410 --> 00:00:48.840
Timothy Sackton: Near that done, Mary stronghold of ravens holes there after encountering in a car and her wild or the identified Kim.
"""

# Maximum number of characters per output line
wrap_length = 120
cleaned = clean_webvtt_transcription(text=input_text, wrap_length=wrap_length)
print(cleaned)


Eric Rosenbaum: Cool.  And
Timothy Sackton: Ok, so  We begin tonight out the sun is setting over ravens You earlier this afternoon, like five
sessions ago you left your horses in gear in the narrow River Valley and made your way by secret passages and waiting
rivers through the mountains to the abandoned Durban outpost. Near that done, Mary stronghold of ravens holes there
after encountering in a car and her wild or the identified Kim.


In [4]:
import json

# One "display name;character alias" pair per line
text = """
David;Kenzo or Wellby
David (Wellby);Wellby
David (he/him);Kenzo or Wellby
David Kong;Kenzo
David Kong (he/him) Organizer, MIT Media Lab;Kenzo
David Schwartz;Wellby
Eric "Seeker" Rosenbaum;Seeker
Eric Rosenbaum;Seeker
John Leeker;Drikod
John Leeker (he/him/his);Drikod
Kate;Riswynn
Kate Sackton;Riswynn
Library and Archives;Drikod
Mike;Dewalth
Mike (Delwath);Dewalth
Mike Sackton;Dewalth
Rev. David;Wellby
Rev. David (he/him);Wellby
Rev. David Schwartz (he/him);Wellby
Riswynn;Riswynn
Schwartz;Wellby
Seeker;Seeker
Tim Sackton;DM
Timothy Sackton;DM
UU Church of Boulder;Wellby
ericrosenbaum;Seeker
kate;Riswynn
"""

# Build the name -> alias lookup, trimming whitespace around both fields.
# Each row must contain exactly one ';' or the unpacking raises ValueError.
mapping_dict = {}
for line in text.strip().split('\n'):
    original, alias = (field.strip() for field in line.split(';'))
    mapping_dict[original] = alias

# Serialise the lookup as pretty-printed JSON and show it
json_data = json.dumps(mapping_dict, indent=4)
print(json_data)

{
    "David": "Kenzo or Wellby",
    "David (Wellby)": "Wellby",
    "David (he/him)": "Kenzo or Wellby",
    "David Kong": "Kenzo",
    "David Kong (he/him) Organizer, MIT Media Lab": "Kenzo",
    "David Schwartz": "Wellby",
    "Eric \"Seeker\" Rosenbaum": "Seeker",
    "Eric Rosenbaum": "Seeker",
    "John Leeker": "Drikod",
    "John Leeker (he/him/his)": "Drikod",
    "Kate": "Riswynn",
    "Kate Sackton": "Riswynn",
    "Library and Archives": "Drikod",
    "Mike": "Dewalth",
    "Mike (Delwath)": "Dewalth",
    "Mike Sackton": "Dewalth",
    "Rev. David": "Wellby",
    "Rev. David (he/him)": "Wellby",
    "Rev. David Schwartz (he/him)": "Wellby",
    "Riswynn": "Riswynn",
    "Schwartz": "Wellby",
    "Seeker": "Seeker",
    "Tim Sackton": "DM",
    "Timothy Sackton": "DM",
    "UU Church of Boulder": "Wellby",
    "ericrosenbaum": "Seeker",
    "kate": "Riswynn"
}
