## Import Libraries

In [1]:
import whisper
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm import tqdm

## Check FFmpeg Installation

Whisper requires ffmpeg to process audio files. Let's verify it's installed.

In [2]:
import subprocess
import shutil

# Look up the ffmpeg executable on PATH (Whisper needs it to process audio files).
ffmpeg_path = shutil.which("ffmpeg")

if ffmpeg_path:
    print(f"‚úÖ FFmpeg found at: {ffmpeg_path}")
    # Query the version of the exact binary that was located above, rather than
    # re-resolving the bare name "ffmpeg". The probe is best-effort: a failure
    # to launch, a timeout, or empty output only downgrades the message.
    try:
        result = subprocess.run(
            [ffmpeg_path, "-version"],
            capture_output=True,
            text=True,
            errors="replace",  # never fail on undecodable bytes in the banner
            timeout=10,        # do not let a stuck binary hang the notebook
        )
        version_line = result.stdout.split('\n')[0].strip()
        if version_line:
            print(f"   {version_line}")
        else:
            print("   (Could not check version)")
    except (OSError, subprocess.SubprocessError):
        # Narrow on purpose: a bare except would also swallow KeyboardInterrupt.
        print("   (Could not check version)")
else:
    print("‚ùå FFmpeg NOT found!")
    print("\n‚ö†Ô∏è  Please install ffmpeg:")
    print("   Option 1: winget install \"FFmpeg (Essentials Build)\"")
    print("   Option 2: Download from https://ffmpeg.org/download.html")
    print("   Option 3: Install via Chocolatey: choco install ffmpeg")
    print("\n   After installation, restart this notebook kernel.")

‚úÖ FFmpeg found at: C:\Users\tomer\AppData\Local\Microsoft\WinGet\Packages\Gyan.FFmpeg.Essentials_Microsoft.Winget.Source_8wekyb3d8bbwe\ffmpeg-8.0.1-essentials_build/bin\ffmpeg.EXE
   ffmpeg version 8.0.1-essentials_build-www.gyan.dev Copyright (c) 2000-2025 the FFmpeg developers


## Load Whisper Model

Using the `base` model for a good balance between speed and accuracy.

In [3]:
# Whisper checkpoint to load. "base" is the speed/accuracy trade-off chosen for
# this notebook; change it here to try a different checkpoint.
WHISPER_MODEL_NAME = "base"

print("Loading Whisper model...")
model = whisper.load_model(WHISPER_MODEL_NAME)
print("Model loaded successfully!")

Loading Whisper model...
Model loaded successfully!


## Transcribe Audio Files

Process all audio files from `good_answers` and `bad_answers` folders, then save transcripts to separate output folders.

In [4]:
def transcribe_folder(input_folder, output_folder, prefix, language="he", whisper_model=None):
    """
    Transcribe all audio files in input_folder and save as text files in output_folder.

    Audio files are processed in sorted order. Each transcript is written to
    ``{prefix}_{NNN}.txt`` where NNN is the 1-based position of the audio file
    in that order; a file that fails still consumes its number.

    Args:
        input_folder: Path to folder containing audio files
        output_folder: Path to save transcript text files (created if missing)
        prefix: Prefix for output files (e.g., 'good' or 'bad')
        language: Language code forwarded to ``transcribe`` (default: 'he')
        whisper_model: Object exposing ``transcribe(path, language=...)`` that
            returns a mapping with a "text" entry. When None, the notebook-level
            ``model`` variable is used.

    Returns:
        A pandas DataFrame with one row per audio file and the columns
        ``original_file``, ``transcript_file`` and ``text_length``; rows for
        files that failed have ``transcript_file == 'ERROR'`` and
        ``text_length == 0``. Returns None when the folder holds no audio files.

    Raises:
        FileNotFoundError: If input_folder does not exist.
        NotADirectoryError: If input_folder exists but is not a directory.
        NameError: If whisper_model is None and no global ``model`` is defined.
    """
    input_path = Path(input_folder)
    output_path = Path(output_folder)

    # Validate the input before creating anything, so a mistyped input path
    # does not leave an empty output directory behind.
    if not input_path.exists():
        raise FileNotFoundError(f"Input folder not found: {input_path}")
    if not input_path.is_dir():
        raise NotADirectoryError(f"Input path is not a directory: {input_path}")

    # Create output directory if it doesn't exist
    output_path.mkdir(parents=True, exist_ok=True)

    # Get all audio files (common audio extensions)
    audio_extensions = {'.mp3', '.wav', '.m4a', '.flac', '.ogg', '.opus', '.webm'}
    audio_files = sorted(
        f for f in input_path.iterdir()
        if f.is_file() and f.suffix.lower() in audio_extensions
    )

    if not audio_files:
        print(f"‚ö†Ô∏è  No audio files found in {input_folder}")
        return None

    # Resolve the model once, outside the per-file try block: a missing model
    # is a setup error and should fail fast instead of being reported as one
    # "ERROR" row per file.
    asr_model = whisper_model if whisper_model is not None else model

    print(f"\nüìÅ Processing {len(audio_files)} files from {input_folder}...")

    results = []

    for idx, audio_file in enumerate(tqdm(audio_files, desc=f"Transcribing {prefix}")):
        try:
            # Transcribe audio
            result = asr_model.transcribe(str(audio_file), language=language)
            transcript = result["text"]

            # Save transcript
            output_file = output_path / f"{prefix}_{idx+1:03d}.txt"
            output_file.write_text(transcript, encoding='utf-8')

            results.append({
                'original_file': audio_file.name,
                'transcript_file': output_file.name,
                'text_length': len(transcript)
            })

        except Exception as e:
            # Best-effort by design: one bad recording must not abort the batch.
            print(f"‚ùå Error processing {audio_file.name}: {e}")
            results.append({
                'original_file': audio_file.name,
                'transcript_file': 'ERROR',
                'text_length': 0
            })

    print(f"‚úÖ Completed {prefix} transcriptions!")
    return pd.DataFrame(results)

### Process Good Answers

In [5]:
# Run the transcription pass over the good-answer recordings
good_df = transcribe_folder(
    prefix="good",
    input_folder="../data/good_answers",
    output_folder="../data/good_answer_transcripts",
)

# transcribe_folder returns None when the folder holds no audio files,
# so the per-file table is only rendered when there is one.
if good_df is not None:
    display(good_df)


üìÅ Processing 33 files from ../data/good_answers...


Transcribing good: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 33/33 [08:06<00:00, 14.75s/it]

‚úÖ Completed good transcriptions!





Unnamed: 0,original_file,transcript_file,text_length
0,good_001.m4a,good_001.txt,494
1,good_002.m4a,good_002.txt,1613
2,good_003.m4a,good_003.txt,575
3,good_004.m4a,good_004.txt,713
4,good_005.m4a,good_005.txt,454
5,good_006.m4a,good_006.txt,1228
6,good_007.m4a,good_007.txt,381
7,good_008.m4a,good_008.txt,434
8,good_009.m4a,good_009.txt,570
9,good_010.m4a,good_010.txt,344


### Process Bad Answers

In [6]:
# Run the transcription pass over the bad-answer recordings
bad_df = transcribe_folder(
    prefix="bad",
    input_folder="../data/bad_answers",
    output_folder="../data/bad_answer_transcripts",
)

# transcribe_folder returns None when the folder holds no audio files,
# so the per-file table is only rendered when there is one.
if bad_df is not None:
    display(bad_df)


üìÅ Processing 13 files from ../data/bad_answers...


Transcribing bad: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 13/13 [00:53<00:00,  4.15s/it]

‚úÖ Completed bad transcriptions!





Unnamed: 0,original_file,transcript_file,text_length
0,bad_001.m4a,bad_001.txt,122
1,bad_002.m4a,bad_002.txt,95
2,bad_003.m4a,bad_003.txt,178
3,bad_004.m4a,bad_004.txt,158
4,bad_005.m4a,bad_005.txt,129
5,bad_006.m4a,bad_006.txt,782
6,bad_007.m4a,bad_007.txt,244
7,bad_008.m4a,bad_008.txt,125
8,bad_009.m4a,bad_009.txt,107
9,bad_010.m4a,bad_010.txt,177


## Summary Statistics

In [7]:
# Print a plain-text report for both transcription runs.
print("=" * 50)
print("TRANSCRIPTION SUMMARY")
print("=" * 50)

# One pass per result frame instead of two copy-pasted sections.
for heading, summary_df in (
    ("\n‚úÖ Good Answers:", good_df),
    ("\n‚ùå Bad Answers:", bad_df),
):
    # A frame is None when its folder contained no audio files.
    if summary_df is None:
        continue

    # Failed files are recorded with transcript_file == 'ERROR' and a length of
    # 0; keep them out of the statistics so they do not skew the average.
    succeeded = summary_df[summary_df["transcript_file"] != "ERROR"]
    failed_count = len(summary_df) - len(succeeded)

    print(heading)
    print(f"   Files processed: {len(succeeded)}")
    if failed_count:
        print(f"   Files failed: {failed_count}")
    if not succeeded.empty:
        print(f"   Avg text length: {succeeded['text_length'].mean():.0f} chars")
        print(f"   Total text: {succeeded['text_length'].sum():,} chars")

print("\n" + "=" * 50)

TRANSCRIPTION SUMMARY

‚úÖ Good Answers:
   Files processed: 33
   Avg text length: 624 chars
   Total text: 20,597 chars

‚ùå Bad Answers:
   Files processed: 13
   Avg text length: 209 chars
   Total text: 2,722 chars

