From 2b8b198b631b61df37d1868c2cb59d7013f98b81 Mon Sep 17 00:00:00 2001
From: Santhosh Thottingal
Date: Thu, 2 Apr 2026 16:39:39 +0530
Subject: [PATCH 1/3] feat: Add hyphenation_character method to Lang

Returns language-specific hyphenation character: SOFT HYPHEN by default,
`None` for 12 Indic-script languages where visual hyphenation is not
conventional.
---
 src/lang.rs | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/src/lang.rs b/src/lang.rs
index 5568ace..ef996ad 100644
--- a/src/lang.rs
+++ b/src/lang.rs
@@ -368,6 +368,40 @@ impl Lang {
         }
     }
 
+    /// The default character used to join syllables.
+    ///
+    /// Returns `Some('\u{ad}')` (SOFT HYPHEN) for most languages, but `None`
+    /// for Indic scripts where visual hyphenation is not conventional.
+    pub fn hyphenation_character(self) -> Option<char> {
+        match self {
+            #[cfg(feature = "assamese")]
+            Self::Assamese => None,
+            #[cfg(feature = "bengali")]
+            Self::Bengali => None,
+            #[cfg(feature = "gujarati")]
+            Self::Gujarati => None,
+            #[cfg(feature = "hindi")]
+            Self::Hindi => None,
+            #[cfg(feature = "kannada")]
+            Self::Kannada => None,
+            #[cfg(feature = "malayalam")]
+            Self::Malayalam => None,
+            #[cfg(feature = "marathi")]
+            Self::Marathi => None,
+            #[cfg(feature = "oriya")]
+            Self::Oriya => None,
+            #[cfg(feature = "panjabi")]
+            Self::Panjabi => None,
+            #[cfg(feature = "sanskrit")]
+            Self::Sanskrit => None,
+            #[cfg(feature = "tamil")]
+            Self::Tamil => None,
+            #[cfg(feature = "telugu")]
+            Self::Telugu => None,
+            _ => Some('\u{ad}'),
+        }
+    }
+
     fn root(self) -> State<'static> {
         match self {
             #[cfg(feature = "afrikaans")]

From c55918e05aa65bae4e06d5e74b066644a05827da Mon Sep 17 00:00:00 2001
From: Santhosh Thottingal
Date: Fri, 3 Apr 2026 14:48:45 +0530
Subject: [PATCH 2/3] docs: add hyphenation_character note to README

---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index a75f039..00a2baf 100644
--- a/README.md
+++ b/README.md
@@ -18,6 +18,8 @@ hypher = "0.1"
   disable the `alloc` feature, but then overly long words lead to a panic.
 - Support for many languages.
 - No unsafe code, no dependencies, no std.
+- Hyphenation character awareness: `Lang::hyphenation_character()` returns
+  `None` for Indic scripts where visual hyphenation is not conventional.
 
 ## Example
 ```rust

From 510e718220175cea412d75480af977fefdc4bce8 Mon Sep 17 00:00:00 2001
From: Santhosh Thottingal
Date: Fri, 3 Apr 2026 15:13:23 +0530
Subject: [PATCH 3/3] refactor: generate hyphenation_character in tests/generate.rs

- Add hyphenation_character() generation to write_lang function
- Add is_indic_script helper to identify Indic scripts (Beng, Deva, Gujr,
  Guru, Knda, Mlym, Orya, Taml, Telu)
- Generate None for Indic scripts, Some('\u{ad}') for all others
- Regenerate src/lang.rs with the generated method replacing manual
  implementation
- Keep generated code in sync with source of truth
---
 tests/generate.rs | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/tests/generate.rs b/tests/generate.rs
index c4f40fd..8c3ead8 100644
--- a/tests/generate.rs
+++ b/tests/generate.rs
@@ -160,6 +160,30 @@ fn write_lang(
     writeln!(w, "    }}")?;
     writeln!(w)?;
 
+    // Implementation of `hyphenation_character`.
+    writeln!(w, "    /// The default character used to join syllables.")?;
+    writeln!(w, "    ///")?;
+    writeln!(w, "    /// Returns `Some('\\u{{ad}}')` (SOFT HYPHEN) for most languages, but `None`")?;
+    writeln!(
+        w,
+        "    /// for Indic scripts where visual hyphenation is not conventional."
+    )?;
+    writeln!(w, "    pub fn hyphenation_character(self) -> Option<char> {{")?;
+    writeln!(w, "        match self {{")?;
+    for &(name, _, _, script, ..) in languages {
+        if !is_indic_script(script) {
+            continue;
+        }
+        let feature = name.to_lowercase();
+        write!(w, "            ")?;
+        write_cfg(w, &feature)?;
+        writeln!(w, "            Self::{name} => None,")?;
+    }
+    writeln!(w, "            _ => Some('\\u{{ad}}'),")?;
+    writeln!(w, "        }}")?;
+    writeln!(w, "    }}")?;
+    writeln!(w)?;
+
     // Implementation of `root`.
     writeln!(w, "    fn root(self) -> State<'static> {{")?;
     writeln!(w, "        match self {{")?;
@@ -175,6 +199,14 @@ fn write_lang(
     writeln!(w, "}}")
 }
 
+/// Returns true for Indic scripts where visual hyphenation is not conventional.
+fn is_indic_script(script: &str) -> bool {
+    matches!(
+        script,
+        "Beng" | "Deva" | "Gujr" | "Guru" | "Knda" | "Mlym" | "Orya" | "Taml" | "Telu"
+    )
+}
+
 fn write_cfg(w: &mut String, feature: &str) -> fmt::Result {
     writeln!(w, r#"#[cfg(feature = "{feature}")]"#)
 }