Skip to content

Commit

Permalink
Bump byteorder + hashbrown + whatlang (dropped some languages)
Browse files (browse the repository at this point in the history)
  • Loading branch information
valeriansaliou committed Nov 3, 2021
1 parent 36daec2 commit 940d3c3
Show file tree
Hide file tree
Showing 27 changed files with 45 additions and 585 deletions.
27 changes: 19 additions & 8 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 3 additions & 3 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,10 @@ fst-levenshtein = "0.2"
fst-regex = "0.2"
regex-syntax = "0.6"
twox-hash = "1.5"
byteorder = "1.3"
hashbrown = "0.9"
byteorder = "1.4"
hashbrown = "0.11"
linked_hash_set = "0.1"
whatlang = "0.11"
whatlang = "0.12"
regex = "1.4"

[target.'cfg(unix)'.dependencies]
Expand Down
5 changes: 0 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,6 @@ Sonic supports a wide range of languages in its lexing system. If a language is
* 🇫🇷 French
* 🇩🇪 German
* 🇬🇷 Greek
* 🇳🇬 Hausa
* 🇮🇱 Hebrew
* 🇮🇳 Hindi
* 🇭🇺 Hungarian
Expand All @@ -219,7 +218,6 @@ Sonic supports a wide range of languages in its lexing system. If a language is
* 🇮🇳 Kannada
* 🇰🇭 Khmer
* 🇰🇷 Korean
* 🏳 Kurdish
* 🏳 Latin
* 🇱🇻 Latvian
* 🇱🇹 Lithuanian
Expand All @@ -232,18 +230,15 @@ Sonic supports a wide range of languages in its lexing system. If a language is
* 🇷🇺 Russian
* 🇸🇰 Slovak
* 🇸🇮 Slovene
* 🇸🇴 Somali
* 🇪🇸 Spanish
* 🇸🇪 Swedish
* 🇵🇭 Tagalog
* 🇮🇳 Tamil
* 🇹🇭 Thai
* 🇹🇷 Turkish
* 🇺🇦 Ukrainian
* 🇵🇰 Urdu
* 🇻🇳 Vietnamese
* 🇮🇱 Yiddish
* 🇳🇬 Yoruba
* 🇿🇦 Zulu

## How fast & lightweight is it?
Expand Down
102 changes: 19 additions & 83 deletions src/lexer/stopwords.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,30 +33,28 @@ lazy_static! {
static ref STOPWORDS_HIN: HashSet<&'static str> = make(hin::STOPWORDS_HIN);
static ref STOPWORDS_JPN: HashSet<&'static str> = make(jpn::STOPWORDS_JPN);
static ref STOPWORDS_HEB: HashSet<&'static str> = make(heb::STOPWORDS_HEB);
static ref STOPWORDS_YDD: HashSet<&'static str> = make(ydd::STOPWORDS_YDD);
static ref STOPWORDS_YID: HashSet<&'static str> = make(yid::STOPWORDS_YID);
static ref STOPWORDS_POL: HashSet<&'static str> = make(pol::STOPWORDS_POL);
static ref STOPWORDS_AMH: HashSet<&'static str> = make(amh::STOPWORDS_AMH);
static ref STOPWORDS_TIR: HashSet<&'static str> = make(tir::STOPWORDS_TIR);
static ref STOPWORDS_JAV: HashSet<&'static str> = make(jav::STOPWORDS_JAV);
}

// Recursion group #3 (10 items)
lazy_static! {
static ref STOPWORDS_JAV: HashSet<&'static str> = make(jav::STOPWORDS_JAV);
static ref STOPWORDS_KOR: HashSet<&'static str> = make(kor::STOPWORDS_KOR);
static ref STOPWORDS_NOB: HashSet<&'static str> = make(nob::STOPWORDS_NOB);
static ref STOPWORDS_NNO: HashSet<&'static str> = make(nno::STOPWORDS_NNO);
static ref STOPWORDS_DAN: HashSet<&'static str> = make(dan::STOPWORDS_DAN);
static ref STOPWORDS_SWE: HashSet<&'static str> = make(swe::STOPWORDS_SWE);
static ref STOPWORDS_FIN: HashSet<&'static str> = make(fin::STOPWORDS_FIN);
static ref STOPWORDS_TUR: HashSet<&'static str> = make(tur::STOPWORDS_TUR);
static ref STOPWORDS_NLD: HashSet<&'static str> = make(nld::STOPWORDS_NLD);
static ref STOPWORDS_HUN: HashSet<&'static str> = make(hun::STOPWORDS_HUN);
static ref STOPWORDS_CES: HashSet<&'static str> = make(ces::STOPWORDS_CES);
static ref STOPWORDS_ELL: HashSet<&'static str> = make(ell::STOPWORDS_ELL);
}

// Recursion group #4 (10 items)
lazy_static! {
static ref STOPWORDS_CES: HashSet<&'static str> = make(ces::STOPWORDS_CES);
static ref STOPWORDS_ELL: HashSet<&'static str> = make(ell::STOPWORDS_ELL);
static ref STOPWORDS_BUL: HashSet<&'static str> = make(bul::STOPWORDS_BUL);
static ref STOPWORDS_BEL: HashSet<&'static str> = make(bel::STOPWORDS_BEL);
static ref STOPWORDS_MAR: HashSet<&'static str> = make(mar::STOPWORDS_MAR);
Expand All @@ -65,12 +63,12 @@ lazy_static! {
static ref STOPWORDS_SLV: HashSet<&'static str> = make(slv::STOPWORDS_SLV);
static ref STOPWORDS_HRV: HashSet<&'static str> = make(hrv::STOPWORDS_HRV);
static ref STOPWORDS_SRP: HashSet<&'static str> = make(srp::STOPWORDS_SRP);
static ref STOPWORDS_MKD: HashSet<&'static str> = make(mkd::STOPWORDS_MKD);
static ref STOPWORDS_LIT: HashSet<&'static str> = make(lit::STOPWORDS_LIT);
}

// Recursion group #5 (10 items)
lazy_static! {
static ref STOPWORDS_MKD: HashSet<&'static str> = make(mkd::STOPWORDS_MKD);
static ref STOPWORDS_LIT: HashSet<&'static str> = make(lit::STOPWORDS_LIT);
static ref STOPWORDS_LAV: HashSet<&'static str> = make(lav::STOPWORDS_LAV);
static ref STOPWORDS_EST: HashSet<&'static str> = make(est::STOPWORDS_EST);
static ref STOPWORDS_TAM: HashSet<&'static str> = make(tam::STOPWORDS_TAM);
Expand All @@ -79,55 +77,29 @@ lazy_static! {
static ref STOPWORDS_THA: HashSet<&'static str> = make(tha::STOPWORDS_THA);
static ref STOPWORDS_GUJ: HashSet<&'static str> = make(guj::STOPWORDS_GUJ);
static ref STOPWORDS_UZB: HashSet<&'static str> = make(uzb::STOPWORDS_UZB);
static ref STOPWORDS_PAN: HashSet<&'static str> = make(pan::STOPWORDS_PAN);
static ref STOPWORDS_AZE: HashSet<&'static str> = make(aze::STOPWORDS_AZE);
}

// Recursion group #6 (10 items)
lazy_static! {
static ref STOPWORDS_PAN: HashSet<&'static str> = make(pan::STOPWORDS_PAN);
static ref STOPWORDS_AZJ: HashSet<&'static str> = make(azj::STOPWORDS_AZJ);
static ref STOPWORDS_IND: HashSet<&'static str> = make(ind::STOPWORDS_IND);
static ref STOPWORDS_TEL: HashSet<&'static str> = make(tel::STOPWORDS_TEL);
static ref STOPWORDS_PES: HashSet<&'static str> = make(pes::STOPWORDS_PES);
static ref STOPWORDS_MAL: HashSet<&'static str> = make(mal::STOPWORDS_MAL);
static ref STOPWORDS_HAU: HashSet<&'static str> = make(hau::STOPWORDS_HAU);
static ref STOPWORDS_ORI: HashSet<&'static str> = make(ori::STOPWORDS_ORI);
static ref STOPWORDS_MYA: HashSet<&'static str> = make(mya::STOPWORDS_MYA);
static ref STOPWORDS_BHO: HashSet<&'static str> = make(bho::STOPWORDS_BHO);
}

// Recursion group #7 (10 items)
lazy_static! {
static ref STOPWORDS_TGL: HashSet<&'static str> = make(tgl::STOPWORDS_TGL);
static ref STOPWORDS_YOR: HashSet<&'static str> = make(yor::STOPWORDS_YOR);
static ref STOPWORDS_MAI: HashSet<&'static str> = make(mai::STOPWORDS_MAI);
static ref STOPWORDS_ORM: HashSet<&'static str> = make(orm::STOPWORDS_ORM);
static ref STOPWORDS_IBO: HashSet<&'static str> = make(ibo::STOPWORDS_IBO);
static ref STOPWORDS_CEB: HashSet<&'static str> = make(ceb::STOPWORDS_CEB);
static ref STOPWORDS_KUR: HashSet<&'static str> = make(kur::STOPWORDS_KUR);
static ref STOPWORDS_MLG: HashSet<&'static str> = make(mlg::STOPWORDS_MLG);
static ref STOPWORDS_SKR: HashSet<&'static str> = make(skr::STOPWORDS_SKR);
static ref STOPWORDS_NEP: HashSet<&'static str> = make(nep::STOPWORDS_NEP);
}

// Recursion group #8 (10 items)
lazy_static! {
static ref STOPWORDS_SIN: HashSet<&'static str> = make(sin::STOPWORDS_SIN);
static ref STOPWORDS_KHM: HashSet<&'static str> = make(khm::STOPWORDS_KHM);
static ref STOPWORDS_TUK: HashSet<&'static str> = make(tuk::STOPWORDS_TUK);
static ref STOPWORDS_SOM: HashSet<&'static str> = make(som::STOPWORDS_SOM);
static ref STOPWORDS_NYA: HashSet<&'static str> = make(nya::STOPWORDS_NYA);
static ref STOPWORDS_AKA: HashSet<&'static str> = make(aka::STOPWORDS_AKA);
static ref STOPWORDS_ZUL: HashSet<&'static str> = make(zul::STOPWORDS_ZUL);
static ref STOPWORDS_KIN: HashSet<&'static str> = make(kin::STOPWORDS_KIN);
static ref STOPWORDS_HAT: HashSet<&'static str> = make(hat::STOPWORDS_HAT);
static ref STOPWORDS_ILO: HashSet<&'static str> = make(ilo::STOPWORDS_ILO);
}

// Recursion group #9 (7 items)
// Recursion group #7 (7 items)
lazy_static! {
static ref STOPWORDS_RUN: HashSet<&'static str> = make(run::STOPWORDS_RUN);
static ref STOPWORDS_AKA: HashSet<&'static str> = make(aka::STOPWORDS_AKA);
static ref STOPWORDS_ZUL: HashSet<&'static str> = make(zul::STOPWORDS_ZUL);
static ref STOPWORDS_SNA: HashSet<&'static str> = make(sna::STOPWORDS_SNA);
static ref STOPWORDS_UIG: HashSet<&'static str> = make(uig::STOPWORDS_UIG);
static ref STOPWORDS_AFR: HashSet<&'static str> = make(afr::STOPWORDS_AFR);
static ref STOPWORDS_LAT: HashSet<&'static str> = make(lat::STOPWORDS_LAT);
static ref STOPWORDS_SLK: HashSet<&'static str> = make(slk::STOPWORDS_SLK);
Expand Down Expand Up @@ -223,14 +195,12 @@ impl LexerStopWord {
Lang::Hin => &*STOPWORDS_HIN,
Lang::Jpn => &*STOPWORDS_JPN,
Lang::Heb => &*STOPWORDS_HEB,
Lang::Ydd => &*STOPWORDS_YDD,
Lang::Yid => &*STOPWORDS_YID,
Lang::Pol => &*STOPWORDS_POL,
Lang::Amh => &*STOPWORDS_AMH,
Lang::Tir => &*STOPWORDS_TIR,
Lang::Jav => &*STOPWORDS_JAV,
Lang::Kor => &*STOPWORDS_KOR,
Lang::Nob => &*STOPWORDS_NOB,
Lang::Nno => &*STOPWORDS_NNO,
Lang::Dan => &*STOPWORDS_DAN,
Lang::Swe => &*STOPWORDS_SWE,
Lang::Fin => &*STOPWORDS_FIN,
Expand Down Expand Up @@ -258,38 +228,20 @@ impl LexerStopWord {
Lang::Guj => &*STOPWORDS_GUJ,
Lang::Uzb => &*STOPWORDS_UZB,
Lang::Pan => &*STOPWORDS_PAN,
Lang::Azj => &*STOPWORDS_AZJ,
Lang::Aze => &*STOPWORDS_AZE,
Lang::Ind => &*STOPWORDS_IND,
Lang::Tel => &*STOPWORDS_TEL,
Lang::Pes => &*STOPWORDS_PES,
Lang::Mal => &*STOPWORDS_MAL,
Lang::Hau => &*STOPWORDS_HAU,
Lang::Ori => &*STOPWORDS_ORI,
Lang::Mya => &*STOPWORDS_MYA,
Lang::Bho => &*STOPWORDS_BHO,
Lang::Tgl => &*STOPWORDS_TGL,
Lang::Yor => &*STOPWORDS_YOR,
Lang::Mai => &*STOPWORDS_MAI,
Lang::Orm => &*STOPWORDS_ORM,
Lang::Ibo => &*STOPWORDS_IBO,
Lang::Ceb => &*STOPWORDS_CEB,
Lang::Kur => &*STOPWORDS_KUR,
Lang::Mlg => &*STOPWORDS_MLG,
Lang::Skr => &*STOPWORDS_SKR,
Lang::Nep => &*STOPWORDS_NEP,
Lang::Sin => &*STOPWORDS_SIN,
Lang::Khm => &*STOPWORDS_KHM,
Lang::Tuk => &*STOPWORDS_TUK,
Lang::Som => &*STOPWORDS_SOM,
Lang::Nya => &*STOPWORDS_NYA,
Lang::Aka => &*STOPWORDS_AKA,
Lang::Zul => &*STOPWORDS_ZUL,
Lang::Kin => &*STOPWORDS_KIN,
Lang::Hat => &*STOPWORDS_HAT,
Lang::Ilo => &*STOPWORDS_ILO,
Lang::Run => &*STOPWORDS_RUN,
Lang::Sna => &*STOPWORDS_SNA,
Lang::Uig => &*STOPWORDS_UIG,
Lang::Afr => &*STOPWORDS_AFR,
Lang::Lat => &*STOPWORDS_LAT,
Lang::Slk => &*STOPWORDS_SLK,
Expand All @@ -311,38 +263,22 @@ impl LexerStopWord {
Lang::Ita,
Lang::Tur,
Lang::Pol,
Lang::Orm,
Lang::Ron,
Lang::Hau,
Lang::Hrv,
Lang::Nld,
Lang::Kur,
Lang::Yor,
Lang::Uzb,
Lang::Ibo,
Lang::Ceb,
Lang::Tgl,
Lang::Hun,
Lang::Azj,
Lang::Aze,
Lang::Ces,
Lang::Mlg,
Lang::Nya,
Lang::Kin,
Lang::Zul,
Lang::Swe,
Lang::Som,
Lang::Ilo,
Lang::Uig,
Lang::Hat,
Lang::Aka,
Lang::Sna,
Lang::Afr,
Lang::Fin,
Lang::Run,
Lang::Tuk,
Lang::Dan,
Lang::Nob,
Lang::Nno,
Lang::Lit,
Lang::Slv,
Lang::Epo,
Expand All @@ -356,16 +292,16 @@ impl LexerStopWord {
Lang::Rus,
Lang::Ukr,
Lang::Srp,
Lang::Azj,
Lang::Aze,
Lang::Bel,
Lang::Bul,
Lang::Tuk,
Lang::Mkd,
],
Script::Arabic => &[Lang::Ara, Lang::Urd, Lang::Skr, Lang::Uig, Lang::Pes],
Script::Devanagari => &[Lang::Hin, Lang::Mar, Lang::Mai, Lang::Bho, Lang::Nep],
Script::Ethiopic => &[Lang::Amh, Lang::Tir],
Script::Hebrew => &[Lang::Heb, Lang::Ydd],
Script::Arabic => &[Lang::Ara, Lang::Urd, Lang::Pes],
Script::Devanagari => &[Lang::Hin, Lang::Mar, Lang::Nep],
Script::Ethiopic => &[Lang::Amh],
Script::Hebrew => &[Lang::Heb, Lang::Yid],
Script::Mandarin => &[Lang::Cmn],
Script::Bengali => &[Lang::Ben],
Script::Hangul => &[Lang::Kor],
Expand Down
2 changes: 1 addition & 1 deletion src/stopwords/azj.rs → src/stopwords/aze.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
// Copyright: 2019, Valerian Saliou <valerian@valeriansaliou.name>
// License: Mozilla Public License v2.0 (MPL v2.0)

pub static STOPWORDS_AZJ: &[&'static str] = &[
pub static STOPWORDS_AZE: &[&'static str] = &[
"a",
"ad",
"altı",
Expand Down
8 changes: 0 additions & 8 deletions src/stopwords/bho.rs

This file was deleted.

8 changes: 0 additions & 8 deletions src/stopwords/ceb.rs

This file was deleted.

8 changes: 0 additions & 8 deletions src/stopwords/hat.rs

This file was deleted.

11 changes: 0 additions & 11 deletions src/stopwords/hau.rs

This file was deleted.

0 comments on commit 940d3c3

Please sign in to comment.