68 changes: 43 additions & 25 deletions examkit/asr/whisper_runner.py
@@ -21,17 +21,27 @@ def transcribe_audio(
logger: logging.Logger = None
) -> List[Dict[str, Any]]:
"""
Transcribe audio file using faster-whisper.

Args:
audio_path: Path to audio file (WAV recommended).
model_size: Whisper model size (tiny, base, small, medium, large).
language: Language code (en, es, fr, etc.).
vad: Enable Voice Activity Detection.
logger: Logger instance.

Transcribe an audio file into timestamped segments using faster-whisper.

Transcribes the given audio file with the specified Whisper model and returns a list of segment dictionaries containing start/end timestamps and cleaned text.

Parameters:
audio_path (Path): Path to the audio file.
model_size (str): Whisper model size to load (e.g., "tiny", "base", "small", "medium", "large").
language (str): Language code hint for transcription (e.g., "en", "es", "fr").
vad (bool): Whether to enable voice activity detection to filter non-speech.
logger (logging.Logger | None): Optional logger for informational messages.

Returns:
List of transcription segments.
List[Dict[str, Any]]: A list of segments where each segment dictionary contains:
- "source": "asr"
- "type": "whisper"
- "start": start time in seconds
- "end": end time in seconds
- "text": transcribed text (stripped of surrounding whitespace)

Raises:
ImportError: If faster-whisper is not available.
"""
if not WHISPER_AVAILABLE:
raise ImportError("faster-whisper not available. Install with: pip install faster-whisper")
@@ -75,16 +85,21 @@ def transcribe_with_timestamps(
logger: logging.Logger = None
) -> Dict[str, Any]:
"""
Transcribe audio with detailed timestamp information.

Args:
audio_path: Path to audio file.
model_size: Whisper model size.
language: Language code.
logger: Logger instance.

Transcribe an audio file and return timestamped segments and summary metadata.

Parameters:
audio_path (Path): Path to the input audio file.
model_size (str): Whisper model size identifier (e.g., "small").
language (str): ISO language code to use for transcription.
logger (logging.Logger | None): Optional logger for informational messages.

Returns:
Dictionary with transcription and metadata.
result (dict): Dictionary containing:
- audio_file (str): String path of the input audio file.
- model (str): Model size used.
- language (str): Language code used.
- segments (List[dict]): List of segment dictionaries each with keys `source`, `type`, `start`, `end`, and `text`.
- total_duration (float): End time of the last segment in seconds, or 0.0 if no segments.
- total_segments (int): Number of segments.
"""
segments = transcribe_audio(audio_path, model_size, language, True, logger)

@@ -102,11 +117,14 @@ def transcribe_with_timestamps(

def export_to_vtt(segments: List[Dict[str, Any]], output_path: Path) -> None:
"""
Export transcription segments to VTT format.

Args:
segments: List of transcription segments.
output_path: Path for output VTT file.
Write transcription segments to a WebVTT file at the given path.

Each segment must be a mapping containing keys "start" (seconds, number), "end" (seconds, number)
and "text" (string). The function creates or overwrites the file at output_path and writes
a valid WEBVTT document where each segment is numbered and formatted as a time range with text.
Parameters:
segments (List[Dict[str, Any]]): Ordered transcription segments with "start", "end", and "text".
output_path (Path): Filesystem path to write the .vtt file; existing file will be overwritten.
"""
from examkit.utils.timecode import seconds_to_timecode

@@ -120,4 +138,4 @@ def export_to_vtt(segments: List[Dict[str, Any]], output_path: Path) -> None:

f.write(f"{i}\n")
f.write(f"{start} --> {end}\n")
f.write(f"{text}\n\n")
f.write(f"{text}\n\n")
32 changes: 19 additions & 13 deletions examkit/cli.py
@@ -44,10 +44,15 @@ def ingest(
log_level: str = typer.Option("INFO", "--log-level", "-l", help="Logging level")
) -> None:
"""
Ingest and preprocess input files (video, transcript, slides, exam).

Validates inputs, extracts audio with ffmpeg, normalizes transcripts,
parses slides and exam papers, and saves processed data to cache.
Run the ingestion pipeline to preprocess input files and populate the cache.

Loads the manifest, invokes the ingestion pipeline to process videos, transcripts,
slides, and exam files, and writes processed artifacts to the specified cache
directory while printing status to the console. On failure the function logs the
error and exits the process with code 1.

Raises:
typer.Exit: Exits with code 1 when ingestion fails.
"""
logger = setup_logging(level=log_level, log_file=Path("logs/ingest.log"))
logger.info("Starting ingestion pipeline")
@@ -99,10 +104,9 @@ def build(
log_level: str = typer.Option("INFO", "--log-level", "-l", help="Logging level")
) -> None:
"""
Build exam-ready PDF from processed inputs.

Runs the full pipeline: embeddings → topic mapping → RAG synthesis
with Ollama → diagrams → templating → Typst/Pandoc rendering.
Build an exam-ready PDF for a session using the provided configuration and write outputs to the specified path.

Prints the generated PDF, citations, coverage, and notes paths to the console. Exits with code 1 on error.
"""
logger = setup_logging(level=log_level, log_file=Path("logs/build.log"))
logger.info(f"Starting build pipeline for session: {session_id}")
@@ -193,10 +197,12 @@ def cache(
)
) -> None:
"""
Manage cache directory.

Actions:
clear - Remove all cached files safely
Manage the local cache directory for the CLI.

When `action` is "clear", delete the cache directory if it exists and recreate it; if the directory does not exist, print a warning. For any other `action`, print an error listing available actions and exit with a non-zero status.

Parameters:
action (str): Action to perform. Supported value: "clear".
"""
if action == "clear":
cache_dir = Path("cache")
@@ -219,4 +225,4 @@ def main() -> None:


if __name__ == "__main__":
main()
main()
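
Assuming the module exposes the Typer application as `app` (that name is not visible in this diff) and commands keep their function names, the `cache clear` behaviour described above could be exercised like this:

from typer.testing import CliRunner

from examkit.cli import app  # assumed attribute name for the Typer application

runner = CliRunner()
result = runner.invoke(app, ["cache", "clear"])  # deletes and recreates the cache directory
print(result.exit_code)
print(result.output)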
22 changes: 11 additions & 11 deletions examkit/config.py
@@ -81,24 +81,24 @@ class ExamKitConfig(BaseModel):
@classmethod
def from_yaml(cls, path: Path) -> "ExamKitConfig":
"""
Load configuration from a YAML file.

Args:
path: Path to the YAML configuration file.

Create an ExamKitConfig from a YAML file.
Parameters:
path (Path): Filesystem path to a YAML configuration file.
Returns:
ExamKitConfig instance.
ExamKitConfig: Configuration instance populated from the file's contents.
"""
with open(path, "r") as f:
data = yaml.safe_load(f)
return cls(**data)

def to_yaml(self, path: Path) -> None:
"""
Save configuration to a YAML file.

Args:
path: Path to save the YAML configuration file.
Write the current configuration to the given filesystem path as YAML.
Parameters:
path (Path): Filesystem path where the YAML file will be written.
"""
with open(path, "w") as f:
yaml.dump(self.model_dump(), f, default_flow_style=False, sort_keys=False)
yaml.dump(self.model_dump(), f, default_flow_style=False, sort_keys=False)
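
A round-trip sketch for the two methods above; both file paths are invented:

from pathlib import Path

from examkit.config import ExamKitConfig

cfg = ExamKitConfig.from_yaml(Path("configs/default.yml"))  # load settings from YAML
cfg.to_yaml(Path("cache/config_snapshot.yml"))              # write the same settings back out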
52 changes: 32 additions & 20 deletions examkit/ingestion/exam_parser.py
@@ -12,13 +12,15 @@

def extract_marks(text: str) -> int:
"""
Extract marks from text using common patterns.

Args:
text: Text containing marks information.

Extract the numeric marks present in a text line using common bracketed patterns.

Recognized patterns include forms like "[5 marks]", "(5 marks)", "[5]", and "(5)" (case-insensitive). The first matching numeric value is returned.

Parameters:
text (str): Input text that may contain marks.

Returns:
Number of marks (0 if not found).
int: Number of marks found, or 0 if no marks are detected.
"""
# Common patterns: [5 marks], (5 marks), [5], (5)
patterns = [
Expand All @@ -38,13 +40,24 @@ def extract_marks(text: str) -> int:

def parse_exam_structure(text: str) -> List[Dict[str, Any]]:
"""
Parse exam structure from text.

Args:
text: Exam paper text content.

Extract a structured list of questions and their parts from raw exam text.
Parameters:
text (str): Full textual content of an exam paper (may contain multiple lines).
Returns:
List of question dictionaries.
List[Dict[str, Any]]: A list of question dictionaries. Each question dictionary includes the keys:
- `source`: origin identifier (e.g., "exam")
- `section`: section letter if detected (e.g., "A") or None
- `question_id`: string identifier (e.g., "Q1")
- `question_number`: integer question number
- `text`: concatenated text of the question
- `parts`: list of part dictionaries
- `marks`: numeric marks extracted for the question
Each part dictionary includes:
- `part_id`: identifier for the part (e.g., "a", "i")
- `text`: concatenated text of the part
- `marks`: numeric marks extracted for the part
"""
questions = []
lines = text.split('\n')
@@ -114,14 +127,13 @@ def parse_exam_structure(text: str) -> List[Dict[str, Any]]:

def parse_exam(path: Path, logger: logging.Logger) -> List[Dict[str, Any]]:
"""
Parse exam paper PDF.

Args:
path: Path to exam PDF file.
logger: Logger instance.

Parse an exam PDF and return its extracted question structure.

Parameters:
path (Path): Filesystem path to the exam PDF.

Returns:
List of question dictionaries.
List[Dict[str, Any]]: A list of question dictionaries. Each dictionary includes keys such as `source`, `section`, `question_id`, `question_number`, `text`, `parts` (a list of part dictionaries with `part_id`, `text`, and `marks`), and `marks`.
"""
logger.info(f"Parsing exam paper: {path}")

@@ -138,4 +150,4 @@ def parse_exam(path: Path, logger: logging.Logger) -> List[Dict[str, Any]]:
questions = parse_exam_structure(full_text)

logger.info(f"Parsed {len(questions)} questions from exam paper")
return questions
return questions
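
A small illustration of the two parsers above; the sample exam text is invented, so whether it trips every question/part heuristic is an assumption:

from examkit.ingestion.exam_parser import extract_marks, parse_exam_structure

# The bracketed patterns named in the docstring.
print(extract_marks("Explain the halting problem. [5 marks]"))  # expected: 5
print(extract_marks("No marks mentioned here."))                # expected: 0

sample_text = "1. Define entropy. [3 marks]\n(a) Give a worked example. (2 marks)"
for question in parse_exam_structure(sample_text):
    print(question["question_id"], question["marks"], len(question["parts"]))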
54 changes: 31 additions & 23 deletions examkit/ingestion/ingest.py
@@ -14,13 +14,16 @@

def validate_manifest(manifest: Dict[str, Any]) -> bool:
"""
Validate manifest structure and file existence.

Args:
manifest: Manifest dictionary.

Validate that a manifest contains required fields and that its 'inputs' value is a dictionary.
Parameters:
manifest (Dict[str, Any]): Manifest data expected to include at least the keys `"session_id"` and `"inputs"`.
Returns:
True if valid, raises ValueError otherwise.
bool: `True` if the manifest contains the required keys and `'inputs'` is a dictionary.

Raises:
ValueError: If a required key is missing or if `manifest["inputs"]` is not a dictionary.
"""
required_keys = ["session_id", "inputs"]
for key in required_keys:
@@ -36,15 +39,17 @@ def validate_manifest(manifest: Dict[str, Any]) -> bool:

def extract_audio_from_video(video_path: Path, output_path: Path, logger: logging.Logger) -> Path:
"""
Extract audio from video file using ffmpeg.

Args:
video_path: Path to input video file.
output_path: Path for output WAV file.
logger: Logger instance.

Extract audio from a video file and save it as a 16 kHz mono PCM WAV.

Parameters:
video_path (Path): Path to the input video file.
output_path (Path): Destination path for the extracted WAV file; the function will create the parent directory if needed.

Returns:
Path to extracted audio file.
Path: Path to the extracted audio file.

Raises:
ffmpeg.Error: If FFmpeg fails during extraction.
"""
logger.info(f"Extracting audio from {video_path}")

@@ -76,15 +81,18 @@ def ingest_pipeline(
logger: logging.Logger
) -> Dict[str, Any]:
"""
Run the complete ingestion pipeline.

Args:
manifest: Manifest describing input files.
cache_dir: Directory for cached/processed files.
logger: Logger instance.

Run the ingestion pipeline for a session and produce processed outputs in the cache directory.
Parameters:
manifest (Dict[str, Any]): Manifest containing at least "session_id" and an "inputs" mapping of optional keys: "video", "transcript", "slides", "exam".
cache_dir (Path): Directory where processed files and the normalized manifest will be written.
logger (logging.Logger): Logger used for informational and warning messages.
Returns:
Dictionary with paths to processed files.
result (Dict[str, Any]): Dictionary with:
- "session_id" (str): The manifest's session identifier.
- "processed_files" (Dict[str, str]): Mapping of output types ("audio", "transcript", "slides", "exam") to their file paths in the cache for inputs that were present and processed.
- "normalized_manifest" (str): Path to the written normalized manifest JSON in the cache.
"""
from examkit.ingestion.transcript_normalizer import normalize_transcript
from examkit.ingestion.slides_parser import parse_slides
@@ -159,4 +167,4 @@ def ingest_pipeline(
result["normalized_manifest"] = str(normalized_manifest_path)

logger.info("Ingestion pipeline complete")
return result
return result
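
Finally, a sketch of the manifest shape these functions expect, built only from the keys named above; all file paths and the plain-string value format under "inputs" are assumptions:

import logging
from pathlib import Path

from examkit.ingestion.ingest import ingest_pipeline, validate_manifest

manifest = {
    "session_id": "demo-session",
    "inputs": {
        "video": "inputs/lecture.mp4",        # all four keys are optional per the docstring
        "transcript": "inputs/lecture.srt",
        "slides": "inputs/slides.pdf",
        "exam": "inputs/past_paper.pdf",
    },
}

validate_manifest(manifest)  # raises ValueError if a required key is missing

logger = logging.getLogger("examkit.demo")
result = ingest_pipeline(manifest, Path("cache"), logger)
print(result["processed_files"])
print(result["normalized_manifest"])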