In [None]:
import re

# Function to clean up the transcription
def clean_transcription(transcription):
    """Strip one leading word (and its optional period) from a transcription.

    The pattern is anchored at the start of the string and matches a run of
    ASCII letters, an optional ".", and one whitespace character. That match
    is removed at most once and the result is stripped of surrounding
    whitespace.

    NOTE(review): because the period is optional, a leading word that is *not*
    followed by a period is removed as well -- confirm this is intended.

    Args:
        transcription: The raw transcription text.

    Returns:
        str: The text without the leading word, whitespace-trimmed.
    """
    # Pattern to match a leading (possibly incomplete) word at the very start
    pattern = r"^[a-zA-Z]+\.?\s"
    # `count` is passed by keyword: positional `count` is deprecated in Python 3.13
    cleaned_transcription = re.sub(pattern, "", transcription, count=1)
    return cleaned_transcription.strip()

# Example transcriptions
transcriptions = [
    "nations. Only with truth comes healing and justice, and another step toward forming a more perfect union.",
    "example. This is another sentence that needs to be cleaned.",
    # The comma after this entry is required: without it Python concatenates
    # this literal with the next one into a single list element.
    "incorrect. Here is yet another one.",
    "exacerbating the global food crisis that hit developing nations in Africa especially hard. Instead, the United States",
]

# Run every example through clean_transcription, keeping the input order
cleaned_transcriptions = list(map(clean_transcription, transcriptions))

# Show each example next to its cleaned form
for original, cleaned in zip(transcriptions, cleaned_transcriptions):
    print(f"Original: {original}\nCleaned: {cleaned}\n")


Original: nations. Only with truth comes healing and justice, and another step toward forming a more perfect union.
Cleaned: Only with truth comes healing and justice, and another step toward forming a more perfect union.

Original: example. This is another sentence that needs to be cleaned.
Cleaned: This is another sentence that needs to be cleaned.

Original: incorrect. Here is yet another one.exacerbating the global food crisis that hit developing nations in Africa especially hard. Instead, the United States
Cleaned: Here is yet another one.exacerbating the global food crisis that hit developing nations in Africa especially hard. Instead, the United States



In [None]:
pip install pandas openpyxl spacy



In [None]:
# Download the small English spaCy pipeline; later cells load it with
# spacy.load("en_core_web_sm"). The command's own output recommends restarting
# the runtime afterwards so the new package can be loaded.
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m45.3 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import spacy

# Load spaCy's "en_core_web_sm" pipeline; clean_transcription() below calls
# `nlp` on each transcription and iterates over the resulting `doc.sents`.
nlp = spacy.load("en_core_web_sm")

# Function to determine if a sentence is a fragment
def is_fragment(sentence):
    """Heuristically decide whether ``sentence`` is a fragment.

    A sentence counts as a fragment when ``len(sentence)`` is below 5 or when
    its last element is not punctuation (its ``is_punct`` attribute is falsy).

    Args:
        sentence: Sequence whose last item exposes ``is_punct`` -- presumably a
            spaCy span, since the caller passes items of ``doc.sents``.

    Returns:
        bool: True for a fragment, False otherwise.
    """
    min_length = 5
    # Too short: a fragment regardless of how it ends
    if len(sentence) < min_length:
        return True
    # Long enough: a fragment only if it does not end in punctuation
    return not sentence[-1].is_punct

# Function to clean transcription by removing fragments
def clean_transcription(transcription):
    """Return ``transcription`` with fragment sentences removed.

    The text is run through ``nlp``; each sentence from ``doc.sents`` is
    checked with ``is_fragment`` and the sentences that pass are stripped of
    surrounding whitespace and re-joined with single spaces.

    Args:
        transcription: The raw transcription text.

    Returns:
        str: The kept sentences joined by " " (empty if none are kept).
    """
    kept = []
    for sent in nlp(transcription).sents:
        # Skip anything the heuristic flags as a fragment
        if is_fragment(sent):
            continue
        kept.append(sent.text.strip())

    return " ".join(kept)

# Normalise the text column in place: missing cells become "" and every entry
# is cast to str before it is handed to clean_transcription.
# NOTE(review): `df` and `files` are not created in this cell or in any cell
# above it; in this notebook they are first assigned in the upload/read cells
# further down, so this cell appears to rely on out-of-order execution --
# confirm it survives Restart & Run All.
correct_column_name = "transcription"  # column holding the raw text; change if the sheet uses another name
df[correct_column_name] = df[correct_column_name].fillna('').astype(str)

# Run clean_transcription on every row and store the result in a new column
df['Cleaned_Transcription'] = df[correct_column_name].apply(clean_transcription)

# Write the whole frame (without the index) to a new Excel file
output_file_path = "cleaned_transcriptions11.xlsx"
df.to_excel(output_file_path, index=False)

# Hand the saved file to files.download
files.download(output_file_path)

print("Transcriptions cleaned and saved to:", output_file_path)



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Transcriptions cleaned and saved to: cleaned_transcriptions11.xlsx


In [None]:
from google.colab import files

# Open the upload dialog; the result is used below as a mapping keyed by filename
uploaded = files.upload()

# Fail with a clear message if nothing was uploaded, instead of the bare
# StopIteration that next(iter(...)) raises on an empty result
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and choose an Excel file.")

# Name of the first uploaded file
file_path = next(iter(uploaded))


Saving Joebiden-Fulltranscription.xlsx to Joebiden-Fulltranscription (3).xlsx


In [None]:
import pandas as pd

# Read the workbook that was just uploaded. `file_path` is set by the upload
# cell; using it instead of a hard-coded "/content/..." path matters because a
# re-upload is saved under a new name (the upload output shows "... (3).xlsx"),
# so the fixed path would keep reading an older copy.
df = pd.read_excel(file_path)

# Display the column names
print(df.columns)


Index(['file', 'transcription'], dtype='object')


In [None]:
import spacy

# Load spaCy's "en_core_web_sm" pipeline; clean_transcription() below calls
# `nlp` on each transcription and iterates over the resulting `doc.sents`.
# NOTE(review): an earlier cell already assigns `nlp` from the same model, so
# this reload looks redundant -- confirm before removing.
nlp = spacy.load("en_core_web_sm")

# Function to determine if a sentence is a fragment
def is_fragment(sentence):
    """Heuristically decide whether the string ``sentence`` is a fragment.

    Rules, in order:
      * shorter than 5 characters -> fragment, unless it is exactly "Oh!" or "No!";
      * otherwise -> fragment when the last character is not one of ".", "!", "?".

    Args:
        sentence: Sentence text (the caller passes an already stripped string).

    Returns:
        bool: True for a fragment, False otherwise.
    """
    min_length = 5
    allowed_short = ("Oh!", "No!")  # short exclamations that are kept

    if len(sentence) < min_length:
        # Short text is a fragment unless explicitly allowed
        return sentence not in allowed_short

    # Long enough: require sentence-final punctuation
    return sentence[-1] not in ".!?"

# Improved function to clean transcription by removing fragments
def clean_transcription(transcription):
    """Return ``transcription`` with fragments and very short sentences removed.

    The text is run through ``nlp``; each sentence from ``doc.sents`` is
    stripped and kept only if ``is_fragment`` rejects it as a fragment AND it
    either has more than two whitespace-separated words or is exactly "Oh!" or
    "No!". Kept sentences are re-joined with single spaces.

    Args:
        transcription: The raw transcription text.

    Returns:
        str: The kept sentences joined by " " (empty if none are kept).
    """
    short_exclamations = ["Oh!", "No!"]

    # Stripped text of every sentence the pipeline finds, in order
    candidates = (sent.text.strip() for sent in nlp(transcription).sents)

    kept = [
        text
        for text in candidates
        if not is_fragment(text)
        and (len(text.split()) > 2 or text in short_exclamations)
    ]

    return " ".join(kept)

# Normalise the text column in place: missing cells become "" and every entry
# is cast to str before it is handed to clean_transcription.
# NOTE(review): this cell repeats the normalise/clean/save/download steps of an
# earlier cell with a different output filename; since the column is overwritten
# in place, re-running is only harmless because fillna/astype(str) are repeatable.
correct_column_name = "transcription"  # column holding the raw text; change if the sheet uses another name
df[correct_column_name] = df[correct_column_name].fillna('').astype(str)

# Run clean_transcription on every row and store the result in a new column
df['Cleaned_Transcription'] = df[correct_column_name].apply(clean_transcription)

# Write the whole frame (without the index) to a new Excel file
output_file_path = "cleaned_transcriptions_fi.xlsx"
df.to_excel(output_file_path, index=False)

# Hand the saved file to files.download
files.download(output_file_path)

print("Transcriptions cleaned and saved to:", output_file_path)



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Transcriptions cleaned and saved to: cleaned_transcriptions_fi.xlsx
