Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion crates/vectorless-compiler/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,16 @@ use vectorless_utils::fingerprint::{Fingerprint, Fingerprinter};
use std::path::PathBuf;

/// Source format selection for document processing.
///
/// Chooses how an input document's format is determined: auto-detected,
/// forced to one of the built-in formats, or delegated to a custom parser.
///
/// This type is intentionally not `Copy`: the [`SourceFormat::Custom`]
/// variant owns a `String`, so values must be cloned explicitly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum SourceFormat {
    /// Auto-detect format from file extension.
    Auto,
    /// Force Markdown format.
    Markdown,
    /// Force PDF format.
    Pdf,
    /// Custom format resolved via [`ParserRegistry`](crate::parse::ParserRegistry).
    // NOTE(review): the payload is presumably the registered parser name — confirm
    // against the code that resolves this variant.
    Custom(String),
}

impl Default for SourceFormat {
Expand Down
3 changes: 3 additions & 0 deletions crates/vectorless-compiler/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,9 @@ pub mod summary;
// Re-export main types from pipeline
pub use pipeline::{CompileMetrics, CompileResult, CompilerInput, PipelineExecutor};

// Re-export parser plugin types
pub use parse::{Parser, ParserRegistry};

// Re-export config types
pub use config::{PipelineOptions, SourceFormat, ThinningConfig};
pub use vectorless_document::ReasoningIndexConfig;
Expand Down
43 changes: 43 additions & 0 deletions crates/vectorless-compiler/src/parse/markdown/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,46 @@ mod frontmatter;
mod parser;

pub use parser::MarkdownParser;

use crate::parse::{ParseResult, Parser};
use std::path::Path;
use vectorless_error::Result;

/// [`Parser`] trait adapter for [`MarkdownParser`].
///
/// Wraps a [`MarkdownParser`] so it can be used wherever a [`Parser`]
/// implementation is expected.
pub struct MarkdownParserAdapter {
    /// The wrapped Markdown parser that performs the actual parsing.
    inner: MarkdownParser,
}

impl MarkdownParserAdapter {
    /// Create a new Markdown parser adapter.
    pub fn new() -> Self {
        Self {
            inner: MarkdownParser::new(),
        }
    }
}

impl Default for MarkdownParserAdapter {
    /// Equivalent to [`MarkdownParserAdapter::new`].
    fn default() -> Self {
        Self::new()
    }
}

#[async_trait::async_trait]
impl Parser for MarkdownParserAdapter {
fn name(&self) -> &str {
"markdown"
}

fn extensions(&self) -> &[&str] {
&["md", "markdown"]
}

async fn parse_content(&self, content: &str) -> Result<ParseResult> {
self.inner.parse(content).await
}

async fn parse_file(&self, path: &Path) -> Result<ParseResult> {
self.inner.parse_file(path).await
}

async fn parse_bytes(&self, data: &[u8]) -> Result<ParseResult> {
let content = std::str::from_utf8(data)
.map_err(|e| vectorless_error::Error::Parse(format!("Invalid UTF-8: {}", e)))?;
self.inner.parse(content).await
}
}
156 changes: 154 additions & 2 deletions crates/vectorless-compiler/src/parse/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,26 @@

//! Document parsing for the compile pipeline.
//!
//! Supports Markdown and PDF formats out of the box. Custom parsers can be
//! added via the [`Parser`] trait and [`ParserRegistry`].
//!
//! # Adding a custom parser
//!
//! ```rust,ignore
//! use vectorless_compiler::parse::{Parser, ParseResult, ParserRegistry};
//!
//! struct MyParser;
//!
//! #[async_trait::async_trait]
//! impl Parser for MyParser {
//! fn name(&self) -> &str { "my-format" }
//! fn extensions(&self) -> &[&str] { &["foo", "bar"] }
//! async fn parse_content(&self, content: &str) -> Result<ParseResult> { ... }
//! async fn parse_file(&self, path: &Path) -> Result<ParseResult> { ... }
//! }
//!
//! let registry = ParserRegistry::default_parsers(None).with(MyParser);
//! ```

pub mod markdown;
pub mod pdf;
Expand All @@ -14,12 +32,134 @@ pub mod types;
// Re-export core types at module level
pub use types::{DocumentFormat, DocumentMeta, ParseResult, RawNode};

use std::collections::HashMap;
use std::path::Path;

use crate::parse::markdown::MarkdownParser;
use vectorless_error::Result;
use vectorless_llm::LlmClient;

// ---------------------------------------------------------------------------
// Parser trait
// ---------------------------------------------------------------------------

/// Trait for document format parsers.
///
/// Implement this to add support for a new document format.
/// Register via [`ParserRegistry::register`] or [`ParserRegistry::with`].
#[async_trait::async_trait]
pub trait Parser: Send + Sync {
/// Parser name (e.g., "markdown", "pdf", "code").
fn name(&self) -> &str;

/// File extensions this parser handles, without dot (e.g., `["py", "rs"]`).
fn extensions(&self) -> &[&str] {
&[]
}

/// Parse string content into raw nodes.
async fn parse_content(&self, content: &str) -> Result<ParseResult>;

/// Parse a file into raw nodes.
async fn parse_file(&self, path: &Path) -> Result<ParseResult>;

/// Parse binary data into raw nodes.
async fn parse_bytes(&self, data: &[u8]) -> Result<ParseResult> {
let _ = data;
Err(vectorless_error::Error::Parse(
"Binary parsing not supported by this parser".into(),
))
}
}

// ---------------------------------------------------------------------------
// ParserRegistry
// ---------------------------------------------------------------------------

/// Registry of document format parsers.
///
/// Maps parser names and file extensions to [`Parser`] implementations.
/// Built-in parsers for Markdown and PDF are provided by [`ParserRegistry::default_parsers`].
pub struct ParserRegistry {
    /// Registered parsers, keyed by the name each one reports via [`Parser::name`].
    parsers: HashMap<String, Box<dyn Parser>>,
    /// Lowercased file extension -> name of the parser registered for it.
    extension_map: HashMap<String, String>,
}

impl ParserRegistry {
    /// Create an empty registry.
    pub fn new() -> Self {
        Self {
            parsers: HashMap::new(),
            extension_map: HashMap::new(),
        }
    }

    /// Register a parser. Extensions declared by the parser are auto-indexed.
    ///
    /// Extensions are stored lowercased. Registering a parser under a name
    /// that is already present replaces the earlier parser, and an extension
    /// that is already mapped is re-pointed at the newly registered parser.
    pub fn register(&mut self, parser: impl Parser + 'static) {
        let name = parser.name().to_string();
        for ext in parser.extensions() {
            self.extension_map.insert(ext.to_lowercase(), name.clone());
        }
        self.parsers.insert(name, Box::new(parser));
    }

    /// Builder-style registration: registers `parser` and returns the registry.
    #[must_use = "`with` returns the updated registry; dropping it discards the registration"]
    pub fn with(mut self, parser: impl Parser + 'static) -> Self {
        self.register(parser);
        self
    }

    /// Get a parser by name, or `None` if no parser is registered under `name`.
    pub fn get(&self, name: &str) -> Option<&dyn Parser> {
        self.parsers.get(name).map(|p| p.as_ref())
    }

    /// Get a parser by file extension (no dot).
    ///
    /// The lookup is case-insensitive: `ext` is lowercased before it is
    /// matched against the registered extensions.
    pub fn get_by_extension(&self, ext: &str) -> Option<&dyn Parser> {
        self.extension_map
            .get(&ext.to_lowercase())
            .and_then(|name| self.parsers.get(name))
            .map(|p| p.as_ref())
    }

    /// Default registry with built-in Markdown + PDF parsers.
    ///
    /// `llm_client` is forwarded to the PDF parser adapter.
    pub fn default_parsers(llm_client: Option<LlmClient>) -> Self {
        let mut registry = Self::new();
        registry.register(markdown::MarkdownParserAdapter::new());
        registry.register(pdf::PdfParserAdapter::new(llm_client));
        registry
    }

    /// List all registered parser names, sorted alphabetically.
    pub fn parser_names(&self) -> Vec<&str> {
        let mut names: Vec<&str> = self.parsers.keys().map(String::as_str).collect();
        // HashMap iteration order is unspecified; sort so callers get stable output.
        names.sort_unstable();
        names
    }

    /// List all supported file extensions (lowercase, no dot), sorted alphabetically.
    pub fn supported_extensions(&self) -> Vec<&str> {
        let mut extensions: Vec<&str> = self.extension_map.keys().map(String::as_str).collect();
        // HashMap iteration order is unspecified; sort so callers get stable output.
        extensions.sort_unstable();
        extensions
    }
}

impl Default for ParserRegistry {
fn default() -> Self {
Self::default_parsers(None)
}
}

impl std::fmt::Debug for ParserRegistry {
    /// Format the registry as its parser names and extension mapping.
    ///
    /// The boxed parsers themselves are not required to implement `Debug`,
    /// so only their names are shown. Both fields are rendered in sorted
    /// order because `HashMap` iteration order is unspecified and would
    /// otherwise make the output differ between runs.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut parser_names: Vec<&String> = self.parsers.keys().collect();
        parser_names.sort_unstable();

        // A BTreeMap view prints in the same `{key: value}` shape as the HashMap,
        // but ordered by extension.
        let extensions: std::collections::BTreeMap<&String, &String> =
            self.extension_map.iter().collect();

        f.debug_struct("ParserRegistry")
            .field("parsers", &parser_names)
            .field("extensions", &extensions)
            .finish()
    }
}

// ---------------------------------------------------------------------------
// Legacy free functions (backward compat — delegate to default registry)
// ---------------------------------------------------------------------------

/// Parse a string content document.
pub async fn parse_content(
content: &str,
Expand All @@ -34,6 +174,10 @@ pub async fn parse_content(
DocumentFormat::Pdf => Err(vectorless_error::Error::Parse(
"PDF requires bytes, not string content".to_string(),
)),
_ => Err(vectorless_error::Error::Parse(format!(
"Unsupported format for content parsing: {:?}",
format
))),
}
}

Expand All @@ -55,6 +199,10 @@ pub async fn parse_file(
};
parser.parse_file(path).await
}
_ => Err(vectorless_error::Error::Parse(format!(
"Unsupported format for file parsing: {:?}",
format
))),
}
}

Expand All @@ -79,6 +227,10 @@ pub async fn parse_bytes(
};
parser.parse_bytes_async(bytes, None).await
}
_ => Err(vectorless_error::Error::Parse(format!(
"Unsupported format for bytes parsing: {:?}",
format
))),
}
}

Expand Down
46 changes: 46 additions & 0 deletions crates/vectorless-compiler/src/parse/pdf/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,49 @@ mod types;

pub use parser::PdfParser;
pub use types::PdfPage;

use crate::parse::{ParseResult, Parser};
use std::path::Path;
use vectorless_error::Result;
use vectorless_llm::LlmClient;

/// Adapter exposing [`PdfParser`] through the [`Parser`] trait.
pub struct PdfParserAdapter {
    /// The wrapped PDF parser that performs the actual parsing.
    inner: PdfParser,
}

impl PdfParserAdapter {
    /// Create a PDF parser adapter, optionally with LLM support.
    ///
    /// With `Some(client)` the wrapped parser is built via
    /// `PdfParser::with_llm_client`; with `None` it is built via
    /// `PdfParser::new`.
    pub fn new(llm_client: Option<LlmClient>) -> Self {
        let inner = if let Some(client) = llm_client {
            PdfParser::with_llm_client(client)
        } else {
            PdfParser::new()
        };
        Self { inner }
    }
}

#[async_trait::async_trait]
impl Parser for PdfParserAdapter {
fn name(&self) -> &str {
"pdf"
}

fn extensions(&self) -> &[&str] {
&["pdf"]
}

async fn parse_content(&self, _content: &str) -> Result<ParseResult> {
Err(vectorless_error::Error::Parse(
"PDF requires bytes, not string content".into(),
))
}

async fn parse_file(&self, path: &Path) -> Result<ParseResult> {
self.inner.parse_file(path).await
}

async fn parse_bytes(&self, data: &[u8]) -> Result<ParseResult> {
self.inner.parse_bytes_async(data, None).await
}
}
Loading
Loading