In [1]:
import json

In [3]:
# Flatten a transcript JSON file into one plain-text file.
sourceFile = 'outputs/transcripts/sjV7NNwm1GU.json'

# Read explicitly as UTF-8 rather than relying on the platform's locale
# default encoding.
with open(sourceFile, 'r', encoding='utf-8') as f:
    data = json.load(f)

# The loaded JSON is iterated as a sequence of mappings, each of which must
# provide a 'text' entry; the pieces are joined with single spaces.
text_content = ' '.join(item['text'] for item in data)

# Write as UTF-8 so the file can be read back with encoding='utf-8'
# regardless of the platform default.
with open('output.txt', 'w', encoding='utf-8') as f:
    f.write(text_content)

In [10]:
import httpx
import logging
from pathlib import Path
from typing import Optional

# Configure logging: INFO level and above, each record prefixed with a
# timestamp and its level name.
# NOTE: logging.basicConfig does nothing if the root logger already has
# handlers configured (unless force=True is passed).
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Module-level logger used by every function below.
logger = logging.getLogger(__name__)

# URL that fix_punctuation() sends its POST request to.
OLLAMA_ENDPOINT = "http://localhost:11434/api/generate"
# Total number of request attempts: it is used as range(MAX_RETRIES), so a
# value of 1 means a single try with no retry afterwards.
MAX_RETRIES = 1
# Timeout passed to httpx.post for each request.
TIMEOUT = 1800  # Seconds (30 minutes)

def count_words(text: str) -> int:
    """Return how many whitespace-separated tokens ``text`` contains.

    Tokenisation is delegated to ``str.split()`` with no arguments, so runs
    of any whitespace act as a single separator and leading/trailing
    whitespace is ignored.
    """
    tokens = text.split()
    return sum(1 for _ in tokens)

def fix_punctuation(text: str) -> Optional[str]:
    """Correct punctuation in technical transcripts while preserving scientific terminology.

    Embeds ``text`` in an editing prompt, POSTs it to ``OLLAMA_ENDPOINT`` as a
    non-streaming request and returns the stripped ``"response"`` field of
    the JSON reply.

    Args:
        text: The raw transcript to be corrected.

    Returns:
        The corrected text, or ``None`` if all ``MAX_RETRIES`` attempts
        failed (HTTP error status, transport error, or a reply that is not
        the expected JSON shape).
    """
    prompt = f"""
    ### TASK
    Act as a technical editor specializing in scientific transcripts. Correct punctuation while preserving:
    1. Original vocabulary and terminology
    2. Speaker's verbal cadence
    3. Technical content structure

    ### RULES
    - Convert verbal pauses to appropriate punctuation
    - Use semicolons for complex technical lists
    - Preserve ALL proper nouns/acronyms (MIT, Nobel, etc.)
    - Maintain informal contractions ("don't", "we're")
    - Fix comma splices and run-on sentences
    - Add missing question marks for rhetorical questions

    ### INPUT
    {text}

    ### RESPONSE FORMAT
    Return the corrected text only.
    """

    # The request body is identical for every attempt, so build it once.
    payload = {
        "model": "llama3.2:latest",
        "prompt": prompt,
        "stream": False,
        "options": {
            "temperature": 0.1,
            "num_ctx": 8192,
        },
    }

    for attempt in range(MAX_RETRIES):
        try:
            response = httpx.post(
                OLLAMA_ENDPOINT,
                json=payload,
                headers={"Content-Type": "application/json"},
                timeout=TIMEOUT,
            )
            response.raise_for_status()
            return response.json()["response"].strip()

        except httpx.HTTPStatusError as e:
            logger.error(f"HTTP error occurred: {e}")
        except httpx.RequestError as e:
            logger.error(f"Request error occurred: {e}")
        except (KeyError, TypeError, ValueError) as e:
            # KeyError: no "response" field. TypeError: the payload is not a
            # mapping. ValueError: the body is not valid JSON
            # (json.JSONDecodeError is a ValueError subclass). Handling all
            # three here keeps a malformed reply inside the retry loop.
            logger.error(f"Unexpected response format: {e}")

        # Only announce a retry when another attempt will actually follow.
        if attempt + 1 < MAX_RETRIES:
            logger.warning(f"Attempt {attempt + 1} failed. Retrying...")
        else:
            logger.warning(f"Attempt {attempt + 1} failed.")

    logger.error("All correction attempts failed.")
    return None

def process_transcript(input_file: str, output_file: str) -> bool:
    """Process the transcript file and save the corrected version.

    Reads ``input_file`` as UTF-8, sends its content through
    ``fix_punctuation``, writes the result to ``output_file`` as UTF-8 and
    logs the word counts before and after so that unexpected content changes
    are easy to spot.

    Args:
        input_file: Path of the raw transcript text file.
        output_file: Path the corrected transcript is written to.

    Returns:
        ``True`` when the corrected transcript was written; ``False`` when
        the input file is missing, the correction failed, or any unexpected
        exception occurred (the exception is logged, not re-raised).
    """
    try:
        input_path = Path(input_file)
        output_path = Path(output_file)

        if not input_path.exists():
            logger.error(f"Input file not found: {input_file}")
            return False

        # Read and count original words
        transcript = input_path.read_text(encoding='utf-8')
        original_count = count_words(transcript)
        logger.info(f"Original word count: {original_count}")

        # Process correction
        corrected_transcript = fix_punctuation(transcript)

        if corrected_transcript is None:
            logger.error("Failed to correct punctuation.")
            return False

        # Count corrected words and compare
        corrected_count = count_words(corrected_transcript)
        diff = corrected_count - original_count

        output_path.write_text(corrected_transcript, encoding='utf-8')
        logger.info(f"Corrected word count: {corrected_count}")
        # Report the difference once, always with an explicit sign; the
        # previous format repeated the same number twice ("-5 (-5)").
        logger.info(f"Word count difference: {diff:+d}")

        return True

    except Exception as e:
        # Top-level boundary for this script: log with traceback and report
        # failure through the return value instead of raising.
        logger.exception(f"An unexpected error occurred: {e}")
        return False

if __name__ == "__main__":
    # Script entry point: correct the flattened transcript and report a
    # failure through the log if the pipeline did not complete.
    input_file, output_file = "output.txt", "corrected_transcript.txt"
    success = process_transcript(input_file, output_file)
    if success:
        pass
    else:
        logger.error("Script execution failed.")


INFO:__main__:Original word count: 3600
INFO:httpx:HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
INFO:__main__:Corrected word count: 3595
INFO:__main__:Word count difference: -5 (-5)
