68 changes: 43 additions & 25 deletions examkit/asr/whisper_runner.py
@@ -21,17 +21,27 @@ def transcribe_audio(
logger: logging.Logger = None
) -> List[Dict[str, Any]]:
"""
Transcribe audio file using faster-whisper.

Args:
audio_path: Path to audio file (WAV recommended).
model_size: Whisper model size (tiny, base, small, medium, large).
language: Language code (en, es, fr, etc.).
vad: Enable Voice Activity Detection.
logger: Logger instance.

Transcribe an audio file into timestamped segments using faster-whisper.

Transcribes the given audio file with the specified Whisper model and returns a list of segment dictionaries containing start/end timestamps and cleaned text.

Parameters:
audio_path (Path): Path to the audio file.
model_size (str): Whisper model size to load (e.g., "tiny", "base", "small", "medium", "large").
language (str): Language code hint for transcription (e.g., "en", "es", "fr").
vad (bool): Whether to enable voice activity detection to filter non-speech.
logger (logging.Logger | None): Optional logger for informational messages.

Returns:
List of transcription segments.
List[Dict[str, Any]]: A list of segments where each segment dictionary contains:
- "source": "asr"
- "type": "whisper"
- "start": start time in seconds
- "end": end time in seconds
- "text": transcribed text (stripped of surrounding whitespace)

Raises:
ImportError: If faster-whisper is not available.
"""
if not WHISPER_AVAILABLE:
raise ImportError("faster-whisper not available. Install with: pip install faster-whisper")
@@ -75,16 +85,21 @@ def transcribe_with_timestamps(
logger: logging.Logger = None
) -> Dict[str, Any]:
"""
Transcribe audio with detailed timestamp information.

Args:
audio_path: Path to audio file.
model_size: Whisper model size.
language: Language code.
logger: Logger instance.

Transcribe an audio file and return timestamped segments and summary metadata.

Parameters:
audio_path (Path): Path to the input audio file.
model_size (str): Whisper model size identifier (e.g., "small").
language (str): ISO language code to use for transcription.
logger (logging.Logger | None): Optional logger for informational messages.

Returns:
Dictionary with transcription and metadata.
result (dict): Dictionary containing:
- audio_file (str): String path of the input audio file.
- model (str): Model size used.
- language (str): Language code used.
- segments (List[dict]): List of segment dictionaries each with keys `source`, `type`, `start`, `end`, and `text`.
- total_duration (float): End time of the last segment in seconds, or 0.0 if no segments.
- total_segments (int): Number of segments.
"""
segments = transcribe_audio(audio_path, model_size, language, True, logger)

@@ -102,11 +117,14 @@ def transcribe_with_timestamps(

def export_to_vtt(segments: List[Dict[str, Any]], output_path: Path) -> None:
"""
Export transcription segments to VTT format.

Args:
segments: List of transcription segments.
output_path: Path for output VTT file.
Write transcription segments to a WebVTT file at the given path.

Each segment must be a mapping containing keys "start" (seconds, number), "end" (seconds, number)
and "text" (string). The function creates or overwrites the file at output_path and writes
a valid WEBVTT document where each segment is numbered and formatted as a time range with text.
Parameters:
segments (List[Dict[str, Any]]): Ordered transcription segments with "start", "end", and "text".
output_path (Path): Filesystem path to write the .vtt file; existing file will be overwritten.
"""
from examkit.utils.timecode import seconds_to_timecode

@@ -120,4 +138,4 @@ def export_to_vtt(segments: List[Dict[str, Any]], output_path: Path) -> None:

f.write(f"{i}\n")
f.write(f"{start} --> {end}\n")
f.write(f"{text}\n\n")
f.write(f"{text}\n\n")
32 changes: 19 additions & 13 deletions examkit/cli.py
@@ -44,10 +44,15 @@ def ingest(
log_level: str = typer.Option("INFO", "--log-level", "-l", help="Logging level")
) -> None:
"""
Ingest and preprocess input files (video, transcript, slides, exam).

Validates inputs, extracts audio with ffmpeg, normalizes transcripts,
parses slides and exam papers, and saves processed data to cache.
Run the ingestion pipeline to preprocess input files and populate the cache.

Loads the manifest, invokes the ingestion pipeline to process videos, transcripts,
slides, and exam files, and writes processed artifacts to the specified cache
directory while printing status to the console. On failure the function logs the
error and exits the process with code 1.

Raises:
typer.Exit: Exits with code 1 when ingestion fails.
"""
logger = setup_logging(level=log_level, log_file=Path("logs/ingest.log"))
logger.info("Starting ingestion pipeline")
@@ -99,10 +104,9 @@ def build(
log_level: str = typer.Option("INFO", "--log-level", "-l", help="Logging level")
) -> None:
"""
Build exam-ready PDF from processed inputs.

Runs the full pipeline: embeddings → topic mapping → RAG synthesis
with Ollama → diagrams → templating → Typst/Pandoc rendering.
Build an exam-ready PDF for a session using the provided configuration and write outputs to the specified path.

Prints the generated PDF, citations, coverage, and notes paths to the console. Exits with code 1 on error.
"""
logger = setup_logging(level=log_level, log_file=Path("logs/build.log"))
logger.info(f"Starting build pipeline for session: {session_id}")
@@ -193,10 +197,12 @@ def cache(
)
) -> None:
"""
Manage cache directory.

Actions:
clear - Remove all cached files safely
Manage the local cache directory for the CLI.

When `action` is "clear", delete the cache directory if it exists and recreate it; if the directory does not exist, print a warning. For any other `action`, print an error listing available actions and exit with a non-zero status.

Parameters:
action (str): Action to perform. Supported value: "clear".
"""
if action == "clear":
cache_dir = Path("cache")
@@ -219,4 +225,4 @@ def main() -> None:


if __name__ == "__main__":
main()
main()
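
Assuming the module exposes the Typer application as `app` (that name is not visible in this diff) and commands keep their function names, the `cache clear` behaviour described above could be exercised like this:

from typer.testing import CliRunner

from examkit.cli import app  # assumed attribute name for the Typer application

runner = CliRunner()
result = runner.invoke(app, ["cache", "clear"])  # deletes and recreates the cache directory
print(result.exit_code)
print(result.output)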
22 changes: 11 additions & 11 deletions examkit/config.py
@@ -81,24 +81,24 @@ class ExamKitConfig(BaseModel):
@classmethod
def from_yaml(cls, path: Path) -> "ExamKitConfig":
"""
Load configuration from a YAML file.

Args:
path: Path to the YAML configuration file.

Create an ExamKitConfig from a YAML file.
Parameters:
path (Path): Filesystem path to a YAML configuration file.
Returns:
ExamKitConfig instance.
ExamKitConfig: Configuration instance populated from the file's contents.
"""
with open(path, "r") as f:
data = yaml.safe_load(f)
return cls(**data)

def to_yaml(self, path: Path) -> None:
"""
Save configuration to a YAML file.

Args:
path: Path to save the YAML configuration file.
Write the current configuration to the given filesystem path as YAML.
Parameters:
path (Path): Filesystem path where the YAML file will be written.
"""
with open(path, "w") as f:
yaml.dump(self.model_dump(), f, default_flow_style=False, sort_keys=False)
yaml.dump(self.model_dump(), f, default_flow_style=False, sort_keys=False)
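
A round-trip sketch for the two methods above; both file paths are invented:

from pathlib import Path

from examkit.config import ExamKitConfig

cfg = ExamKitConfig.from_yaml(Path("configs/default.yml"))  # load settings from YAML
cfg.to_yaml(Path("cache/config_snapshot.yml"))              # write the same settings back out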
52 changes: 32 additions & 20 deletions examkit/ingestion/exam_parser.py
@@ -12,13 +12,15 @@

def extract_marks(text: str) -> int:
"""
Extract marks from text using common patterns.

Args:
text: Text containing marks information.

Extract the numeric marks present in a text line using common bracketed patterns.

Recognized patterns include forms like "[5 marks]", "(5 marks)", "[5]", and "(5)" (case-insensitive). The first matching numeric value is returned.

Parameters:
text (str): Input text that may contain marks.

Returns:
Number of marks (0 if not found).
int: Number of marks found, or 0 if no marks are detected.
"""
# Common patterns: [5 marks], (5 marks), [5], (5)
patterns = [
Expand All @@ -38,13 +40,24 @@ def extract_marks(text: str) -> int:

def parse_exam_structure(text: str) -> List[Dict[str, Any]]:
"""
Parse exam structure from text.

Args:
text: Exam paper text content.

Extract a structured list of questions and their parts from raw exam text.
Parameters:
text (str): Full textual content of an exam paper (may contain multiple lines).
Returns:
List of question dictionaries.
List[Dict[str, Any]]: A list of question dictionaries. Each question dictionary includes the keys:
- `source`: origin identifier (e.g., "exam")
- `section`: section letter if detected (e.g., "A") or None
- `question_id`: string identifier (e.g., "Q1")
- `question_number`: integer question number
- `text`: concatenated text of the question
- `parts`: list of part dictionaries
- `marks`: numeric marks extracted for the question
Each part dictionary includes:
- `part_id`: identifier for the part (e.g., "a", "i")
- `text`: concatenated text of the part
- `marks`: numeric marks extracted for the part
"""
questions = []
lines = text.split('\n')
@@ -114,14 +127,13 @@ def parse_exam_structure(text: str) -> List[Dict[str, Any]]:

def parse_exam(path: Path, logger: logging.Logger) -> List[Dict[str, Any]]:
"""
Parse exam paper PDF.

Args:
path: Path to exam PDF file.
logger: Logger instance.

Parse an exam PDF and return its extracted question structure.

Parameters:
path (Path): Filesystem path to the exam PDF.

Returns:
List of question dictionaries.
List[Dict[str, Any]]: A list of question dictionaries. Each dictionary includes keys such as `source`, `section`, `question_id`, `question_number`, `text`, `parts` (a list of part dictionaries with `part_id`, `text`, and `marks`), and `marks`.
"""
logger.info(f"Parsing exam paper: {path}")

@@ -138,4 +150,4 @@ def parse_exam(path: Path, logger: logging.Logger) -> List[Dict[str, Any]]:
questions = parse_exam_structure(full_text)

logger.info(f"Parsed {len(questions)} questions from exam paper")
return questions
return questions
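
A small illustration of the two parsers above; the sample exam text is invented, so whether it trips every question/part heuristic is an assumption:

from examkit.ingestion.exam_parser import extract_marks, parse_exam_structure

# The bracketed patterns named in the docstring.
print(extract_marks("Explain the halting problem. [5 marks]"))  # expected: 5
print(extract_marks("No marks mentioned here."))                # expected: 0

sample_text = "1. Define entropy. [3 marks]\n(a) Give a worked example. (2 marks)"
for question in parse_exam_structure(sample_text):
    print(question["question_id"], question["marks"], len(question["parts"]))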
54 changes: 31 additions & 23 deletions examkit/ingestion/ingest.py
@@ -14,13 +14,16 @@

def validate_manifest(manifest: Dict[str, Any]) -> bool:
"""
Validate manifest structure and file existence.

Args:
manifest: Manifest dictionary.

Validate that a manifest contains required fields and that its 'inputs' value is a dictionary.
Parameters:
manifest (Dict[str, Any]): Manifest data expected to include at least the keys `"session_id"` and `"inputs"`.
Returns:
True if valid, raises ValueError otherwise.
bool: `True` if the manifest contains the required keys and `'inputs'` is a dictionary.

Raises:
ValueError: If a required key is missing or if `manifest["inputs"]` is not a dictionary.
"""
required_keys = ["session_id", "inputs"]
for key in required_keys:
@@ -36,15 +39,17 @@ def validate_manifest(manifest: Dict[str, Any]) -> bool:

def extract_audio_from_video(video_path: Path, output_path: Path, logger: logging.Logger) -> Path:
"""
Extract audio from video file using ffmpeg.

Args:
video_path: Path to input video file.
output_path: Path for output WAV file.
logger: Logger instance.

Extract audio from a video file and save it as a 16 kHz mono PCM WAV.

Parameters:
video_path (Path): Path to the input video file.
output_path (Path): Destination path for the extracted WAV file; the function will create the parent directory if needed.

Returns:
Path to extracted audio file.
Path: Path to the extracted audio file.

Raises:
ffmpeg.Error: If FFmpeg fails during extraction.
"""
logger.info(f"Extracting audio from {video_path}")

@@ -76,15 +81,18 @@ def ingest_pipeline(
logger: logging.Logger
) -> Dict[str, Any]:
"""
Run the complete ingestion pipeline.

Args:
manifest: Manifest describing input files.
cache_dir: Directory for cached/processed files.
logger: Logger instance.

Run the ingestion pipeline for a session and produce processed outputs in the cache directory.
Parameters:
manifest (Dict[str, Any]): Manifest containing at least "session_id" and an "inputs" mapping of optional keys: "video", "transcript", "slides", "exam".
cache_dir (Path): Directory where processed files and the normalized manifest will be written.
logger (logging.Logger): Logger used for informational and warning messages.
Returns:
Dictionary with paths to processed files.
result (Dict[str, Any]): Dictionary with:
- "session_id" (str): The manifest's session identifier.
- "processed_files" (Dict[str, str]): Mapping of output types ("audio", "transcript", "slides", "exam") to their file paths in the cache for inputs that were present and processed.
- "normalized_manifest" (str): Path to the written normalized manifest JSON in the cache.
"""
from examkit.ingestion.transcript_normalizer import normalize_transcript
from examkit.ingestion.slides_parser import parse_slides
@@ -159,4 +167,4 @@ def ingest_pipeline(
result["normalized_manifest"] = str(normalized_manifest_path)

logger.info("Ingestion pipeline complete")
return result
return result
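
Finally, a sketch of the manifest shape these functions expect, built only from the keys named above; all file paths and the plain-string value format under "inputs" are assumptions:

import logging
from pathlib import Path

from examkit.ingestion.ingest import ingest_pipeline, validate_manifest

manifest = {
    "session_id": "demo-session",
    "inputs": {
        "video": "inputs/lecture.mp4",        # all four keys are optional per the docstring
        "transcript": "inputs/lecture.srt",
        "slides": "inputs/slides.pdf",
        "exam": "inputs/past_paper.pdf",
    },
}

validate_manifest(manifest)  # raises ValueError if a required key is missing

logger = logging.getLogger("examkit.demo")
result = ingest_pipeline(manifest, Path("cache"), logger)
print(result["processed_files"])
print(result["normalized_manifest"])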