Permalink
Fetching contributors…
Cannot retrieve contributors at this time
executable file 698 lines (670 sloc) 18.9 KB
#!/usr/bin/perl -w
# Copyright (C) 2002-2003 Nadav Har'El and Dan Kenigsberg
use Carp;
use strict;
use Getopt::Long;
use IO::File;
my $VERSION="0.5";
# process command line options:
# -v: verbose (shows derivation of accepted words)
# -c: correct (suggests corrections)
# -s: severity (sorts wrong words in order of number of appearances).
# -n: notes (gives longer notes explaining selected spelling errors).
# -a: for use in pipe (a la ispell's -a).
# -i: use slave ispell
# Set the defaults for the options
my %opts;
# Allow bundling of the short options, e.g., -nc
Getopt::Long::Configure ("bundling");
sub help(){
print "hspell - Hebrew spellchecker\n".
"Usage: $0 [-acinsv] [file...]\n\n".
"See hspell(1) manual for a description of hspell and its ".
"options.\n";
}
#if(!getopts('vcsnaid:Bm', \%opts)){
if(!GetOptions('correct|c' => \$opts{c},
'notes|n' => \$opts{n},
'severity|s' => \$opts{s},
'verbose|v' => \$opts{v},
'ispell-slave|i' => \$opts{i},
'a' => \$opts{a},
'd=s' => \$opts{d},
'B' => \$opts{B},
'm' => \$opts{m},
'version|V' => sub {print "Hspell $VERSION\nWritten by Nadav ".
"Har'El and Dan Kenigsberg\n"; exit(0)},
'help|h' => sub {help(); exit(0)},
)){
&help();
exit(1);
}
if (defined($opts{d}) || defined($opts{B}) || defined($opts{m})) {
print STDERR "Warning: ispell options -d, -B and -m ".
"are ignored by hspell.\n";
}
my $verbose=defined($opts{v});
my $correct=defined($opts{c});
my $severity=defined($opts{s});
my $shownotes=defined($opts{n});
my $interpipe=defined($opts{a}) || ($0 =~ m{-i});
my $slave=defined($opts{i}) || ($0 =~ m{-i});
my $strict_smichut=0;
my @dictionaries=("out.nouns","out.nouns-shemp","out.verbs","milot","extrawords","biza-verbs", "biza-nouns");
#my @dictionaries=("zcat wordlist.wgz|wunzip|");
my @likelyerror_dictionaries=("likelyerrors");
my @spellinghints_dictionaries=();
@spellinghints_dictionaries=("spellinghints") if $shownotes;
# If it exists, use ~/.hspell_words. This is sort of an ad-hoc feature,
# because words in that list do not automatically get inflected.
my @optional_dictionaries=($ENV{HOME}."/.hspell_words","hspell_words");
push @dictionaries, grep {-f $_} @optional_dictionaries;
my $dict;
my %dictionary;
my %likelyerrors;
# read dictionaries
foreach $dict (@dictionaries) {
my $F = new IO::File;
my $save=""; # used for verbose mode
$F->open($dict) or croak "Couldn't open dictionary file $dict";
$save="xxx" if $dict eq "extrawords"; # ad-hoc, sign for file without stems
# The speed of the following loop has a great effect on startup time,
# so we want the inner loop to be as quick as possible! When the
# various if's were inside the inner loop start up time took (with
# around 100,000 words) about 5.5 seconds. With the tight loop, it
# takes 3.4 seconds.
# This can be further droped to 2.5 seconds if we could remove the
# s/-$//o command! (e.g., if we're sure the dictionary files doesn't
# contain those useless (when !$strict_smichut) smichut characters).
if(!$verbose && !$strict_smichut){
while(<$F>){
if(/^[à-úA-Z]/o){
chomp;
# s/-$//o;
# changed to ignore extra descriptions in
# extrawords
s/[- ].*$//o;
$dictionary{$_}=1;
}
}
} else {
while(<$F>){
chomp;
if(/^[-#]/o){
# ignore comments, and ---- seperators
$save=""; # used for verbose mode
} else {
s/-//o if(!$strict_smichut);
if($verbose){
# tell the user where the word was found...
if($save eq ""){
$save=$_;
$save =~ s/^L/ì/o;
}
my $s;
if($save eq "xxx"){
$s=$dict;
} else {
$s="$dict:$save";
}
if(exists($dictionary{$_})){
# ignore double matches
next if($dictionary{$_} =~ m/$s(,|$)/);
$dictionary{$_}=$dictionary{$_}.", ".$s;
} else {
$dictionary{$_}=$s;
}
} else {
$dictionary{$_}=1;
}
}
}
}
}
# If we add the empty word to the dictionaries valid prefixes with no
# word after them get accepted. This is useful for when a valid prefix
# (ä, ëù, etc.) get followed by a number or a non-Hebrew word (usually
# separated by a makaf).
$dictionary{""}=1;
foreach $dict (@likelyerror_dictionaries) {
my $F = new IO::File;
$F->open($dict) or croak "Couldn't open dictionary file $dict";
while(<$F>){
chomp;
if(/^\s*#|^\s*$|^-*$/o){
# ignore comments, white lines and ---- seperators.
} else {
$likelyerrors{$_}=1; # TODO: maybe in the future use values
}
}
}
my %spellinghints;
foreach $dict (@spellinghints_dictionaries) {
my $F = new IO::File;
$F->open($dict) or croak "Couldn't open spelling hints file $dict";
my $desc="";
my $lastdesc;
while(<$F>){
chomp;
if(/^\s*#|^\s*$|^-*$/o){
# ignore comments, white lines and ---- seperators.
} elsif(/^\+/o){
# + lines add to the description
$desc=$desc." ".substr($_,1)."\n";
} else {
# word lines use that description
if($desc ne ""){
$lastdesc=$desc;
$desc="";
}
$spellinghints{$_}=$lastdesc;
}
}
}
my %wrongwords;
my %warnwords;
my @prefixes = (
"",
"î","ù","ä","å","ë","ì","á",
"ùî", "äî", "åî", "ìî",
"îù", "åù", "ëù", "ìù",
"îä", "ùä", "åä",
"îë", "ùë", "äë", "åë", "ìë", "áë",
"ùì", "äì", "åì",
"ùá", "äá", "åá", "ëá",
"îá", # ex: âãåì îáòáø, îáùðú ...
"ìëù", "åìëù", # ex: ìëùúâãì, ...
"åùî","åîù","åëù","åìù","åîä","åùä","åîë",
"åùë","åìë","åáë","åùì","åäì","åùá","åëá","ùîä",
"ëùä","ëùî","ëùì","ëùá",
"ëùîä", "åùîä", "åëùî", "åëùì", "åëùá","åëùä",
"åëùîä",
"ùëù","ùëùä","ùëùî","ùëùá","ùëùì","ùëùîä",
);
sub check_word {
my $word = shift;
# ignore empty words
return 1 if $word =~ m/^[-'" ]*$/o;
my ($prefix,$plen);
foreach $prefix (@prefixes){
$plen=length($prefix);
if((substr($word,0,length($prefix)) eq $prefix)){
# ad-hoc trick: eat up " if necessary, to recognize
# stuff like á"ùèéç", ä"öååàø", etc.
if(length($word) > length($prefix) &&
substr($word,length($prefix),1) eq '"'){
$plen++;
}
# The first UGLY if() here is the academia's ktiv male
# rule of doubling a vav (not yud!) starting a word,
# unless it's already next to a vav.
# The "elsif" check below is the normal case.
if($prefix ne "" &&
substr($word,$plen,1) eq 'å' &&
substr($prefix,-1,1) ne 'å'){
if(substr($word,$plen+1,1) eq 'å'){
if(substr($word,$plen+2,1) ne 'å' &&
exists ($dictionary{substr($word,$plen+1)})){
if($verbose){
print "found $word: prefix '$prefix' doubled 'å' stem $dictionary{substr($word,$plen+1)}\n";
}
if(exists($likelyerrors{substr($word,$plen+1)})){
return 2+$plen+1;
} else {
return 1;
}
} elsif(exists ($dictionary{substr($word,$plen)})){
if($verbose){
print "found $word: prefix '$prefix' (nondoubled 'å') stem $dictionary{substr($word,$plen)}\n";
}
if(exists($likelyerrors{substr($word,$plen)})){
return 2+$plen+1;
} else {
return 1;
}
}
}
# the normal check for word minus the prefix:
} elsif(exists ($dictionary{substr($word,$plen)})){
if($verbose){
print "found $word: prefix '$prefix' stem $dictionary{substr($word,$plen)}\n";
}
if(exists($likelyerrors{substr($word,$plen)})){
return 2+$plen;
} else {
return 1;
}
# adding gimatria check here slows things down, and
# worse: adds a lot of weird "corrections" because
# trycorrect calls check_word with an extra " before
# the last letter, to check for acronyms....
# } elsif($word=~/['"]/o && &is_canonic_gimatria($word)){
# if($verbose){
# print "found $word: canonic gimatria\n";
# }
# return 1;
# ad-interim trick to check for bachlam+maqor: if a
# word Làáãëí in the dictionary (as generated by 'woo')
# we allow any of bachlam to replace the L
} elsif(($prefix =~ m/[áëìî]$/o) &&
exists ($dictionary{"L".substr($word,$plen)})){
if($verbose){
print "found $word: prefix '$prefix' L-form of stem $dictionary{'L'.substr($word,$plen)}\n";
}
if(exists($likelyerrors{substr($word,$plen-1)})){
return 2+$plen-1;
} else {
return 1;
}
# ad-interim trick to check for b+maqor: if a
# word Bùáúå in the dictionary (as generated by 'woo')
# we allow most á prefixes to replace the B
} elsif(($prefix =~ m/^å?ù?á$/o) &&
exists ($dictionary{"B".substr($word,$plen)})){
if($verbose){
print "found $word: prefix '$prefix' B-form of stem $dictionary{'B'.substr($word,$plen)}\n";
}
if(exists($likelyerrors{substr($word,$plen-1)})){
return 2+$plen-1;
} else {
return 1;
}
}
}
}
return 0;
}
# ad-hoc attempt to find corrections for word
sub trycorrect {
my $word = shift;
my @results;
my $i;
# try to add a missing em kri'a - yud or vav
for($i=1;$i<length($word);$i++){
my $w=$word;
substr($w,$i,1)='é'.substr($w,$i,1);
if(check_word($w)==1){
push @results,$w if not grep(m/$w/,@results);
}
$w=$word;
substr($w,$i,1)='å'.substr($w,$i,1);
if(check_word($w)==1){
push @results,$w if not grep(m/$w/,@results);
}
}
# try to remove an em kri'a - yud or vav
for($i=0;$i<length($word);$i++){
my $w=$word;
if(substr($w,$i,1) eq 'é' || substr($w,$i,1) eq 'å'){
substr($w,$i,1)='';
if(length($w)>0 && check_word($w)==1){
push @results,$w if not grep(m/$w/,@results);
}
}
}
# try to add or remove an aleph (is that useful?)
for($i=1;$i<length($word);$i++){
my $w=$word;
substr($w,$i,1)='à'.substr($w,$i,1);
if(check_word($w)==1){
push @results,$w if not grep(m/$w/,@results);
}
}
for($i=0;$i<length($word);$i++){
my $w=$word;
if(substr($w,$i,1) eq 'à'){
substr($w,$i,1)='';
if(length($w)>0 && check_word($w)==1){
push @results,$w if not grep(m/$w/,@results);
}
}
}
# try to replace similarly sounding (for certain people) letters:
# àòä äâ çë èú ñö ñù ë÷ áåå áô
for($i=0;$i<length($word);$i++){
my $w;
if(substr($word,$i,1) eq 'à' || substr($word,$i,1) eq 'ò' ||
substr($word,$i,1) eq 'â'){
$w=$word; substr($w,$i,1)='ä';
if(check_word($w)==1){
push @results,$w if not grep(m/$w/,@results);
}
}
if(substr($word,$i,1) eq 'à' || substr($word,$i,1) eq 'ä'){
$w=$word; substr($w,$i,1)='ò';
if(check_word($w)==1){
push @results,$w if not grep(m/$w/,@results);
}
}
if(substr($word,$i,1) eq 'ò' || substr($word,$i,1) eq 'ä'){
$w=$word; substr($w,$i,1)='à';
if(check_word($w)==1){
push @results,$w if not grep(m/$w/,@results);
}
}
if(substr($word,$i,1) eq 'ä'){
$w=$word; substr($w,$i,1)='â';
if(check_word($w)==1){
push @results,$w if not grep(m/$w/,@results);
}
}
if(substr($word,$i,1) eq 'ë'){
$w=$word; substr($w,$i,1)='ç';
if(check_word($w)==1){
push @results,$w if not grep(m/$w/,@results);
}
}
if(substr($word,$i,1) eq 'ç'){
$w=$word; substr($w,$i,1)='ë';
if(check_word($w)==1){
push @results,$w if not grep(m/$w/,@results);
}
}
if(substr($word,$i,1) eq 'è'){
$w=$word; substr($w,$i,1)='ú';
if(check_word($w)==1){
push @results,$w if not grep(m/$w/,@results);
}
}
if(substr($word,$i,1) eq 'ú'){
$w=$word; substr($w,$i,1)='è';
if(check_word($w)==1){
push @results,$w if not grep(m/$w/,@results);
}
}
if(substr($word,$i,1) eq 'ë'){
$w=$word; substr($w,$i,1)='÷';
if(check_word($w)==1){
push @results,$w if not grep(m/$w/,@results);
}
}
if(substr($word,$i,1) eq '÷'){
$w=$word; substr($w,$i,1)='ë';
if(check_word($w)==1){
push @results,$w if not grep(m/$w/,@results);
}
}
if(substr($word,$i,1) eq 'á'){
$w=$word; substr($w,$i,1)='ô';
if(check_word($w)==1){
push @results,$w if not grep(m/$w/,@results);
}
}
if(substr($word,$i,1) eq 'á'){
$w=$word; substr($w,$i,1)='åå';
if(check_word($w)==1){
push @results,$w if not grep(m/$w/,@results);
}
}
if(substr($word,$i,1) eq 'ô' || substr($word,$i,1) eq 'å'){
$w=$word; substr($w,$i,1)='á';
if(check_word($w)==1){
push @results,$w if not grep(m/$w/,@results);
}
}
if(substr($word,$i,2) eq 'åå'){
$w=$word; substr($w,$i,2)='á';
if(check_word($w)==1){
push @results,$w if not grep(m/$w/,@results);
}
}
if(substr($word,$i,1) eq 'ö' || substr($word,$i,1) eq 'ù'){
$w=$word; substr($w,$i,1)='ñ';
if(check_word($w)==1){
push @results,$w if not grep(m/$w/,@results);
}
}
if(substr($word,$i,1) eq 'ñ'){
$w=$word; substr($w,$i,1)='ö';
if(check_word($w)==1){
push @results,$w if not grep(m/$w/,@results);
}
}
if(substr($word,$i,1) eq 'ñ'){
$w=$word; substr($w,$i,1)='ù';
if(check_word($w)==1){
push @results,$w if not grep(m/$w/,@results);
}
}
}
# try to replace a non-final letter at the end of the word by its
# final form and vice versa (useful check for abbreviations):
if(substr($word,-1,1) =~ /[êïíõôëðîöô]/){
my $w=substr($word,0,-1);
my $e=substr($word,-1,1);
$e =~ tr/êïíõôëðîöô/ëðîöôêïíõô/;
$w=$w.$e;
if(check_word($w)==1){
push @results,$w if not grep(m/$w/,@results);
}
}
# try to make the word into an acronym (add " before last character)
if(length($word)>=2){
my $w=substr($word,0,-1);
my $e=substr($word,-1,1);
$w=$w.'"'.$e;
if(check_word($w)==1){
push @results,$w if not grep(m/$w/,@results);
}
}
# try to make the word into an abbreviation (add ' at the end)
my $w=$word."'";
if(check_word($w)==1){
push @results,$w if not grep(m/$w/,@results);
}
# # try to remove any letter
# for($i=0;$i<length($word);$i++){
# my $w=$word;
# substr($w,$i,1)='';
# if(check_word($w)==1){
# push @results,$w if not grep(m/$w/,@results);
# }
# }
# # try to add any letter (warning: very slow, maybe should be an option)
# for($i=0;$i<length($word);$i++){
# my $letter;
# for($letter=ord('à'); $letter<ord('ú'); $letter++){
# my $w=$word;
# substr($w,$i,1)=chr($letter).substr($w,$i,1);
# if(check_word($w)==1){
# push @results,$w if not grep(m/$w/,@results);
# }
# }
# }
return join(", ",@results);
}
### A function for checking for valid gimatria:
sub is_canonic_gimatria {
my $s = shift;
return &int2gim(&gim2int($s)) eq $s;
}
sub gim2int {
my $gim = shift;
my $n = 0;
my %gim2int = ('à'=>1,'á'=>2,'â'=>3,'ã'=>4,'ä'=>5,'å'=>6,'æ'=>7,'ç'=>8,'è'=>9,
'é'=>10,'ë'=>20,'ê'=>20,'ì'=>30,'î'=>40,'í'=>40,'ð'=>50,'ï'=>50,
'ñ'=>60,'ò'=>70,'ô'=>80,'ó'=>80,'ö'=>90,'õ'=>90,'÷'=>100,'ø'=>200,
'ù'=>300,'ú'=>400,'"'=>0);
my ($chnk, $c);
foreach $chnk (split "'", $gim) {
$n *= 1000;
foreach $c (split //, $chnk) {
$n += $gim2int{$c};
}
}
return $n;
}
sub int2gim {
my $n = shift;
my $gim = "";
return undef if $n <= 0;
my $tmp = &_aux_ig($n);
return $gim.$tmp."'" if $tmp =~ m/(^|').$/;
$tmp =~ s/([^'])$/\"$1/o;
$tmp =~ s/ë$/ê/o;
$tmp =~ s/î$/í/o;
$tmp =~ s/ð$/ï/o;
$tmp =~ s/ô$/ó/o;
$tmp =~ s/ö$/õ/o;
return $gim.$tmp;
}
sub _aux_ig {
my $n = shift;
my ($gim, $val) = ("", 0);
my %int2gim = (1=>'à',2=>'á',3=>'â',4=>'ã',5=>'ä',6=>'å',7=>'æ',8=>'ç',
9=>'è',10=>'é',15=>'èå',16=>'èæ',17=>'éæ',18=>'éç',19=>'éè',20=>'ë',
30=>'ì',40=>'î',50=>'ð',60=>'ñ',70=>'ò', 80=>'ô',90=>'ö',100=>'÷',
200=>'ø',300=>'ù',400=>'ú');
my @vals = sort { $b <=> $a } keys %int2gim;
if ($n >= 1000) {
$gim = &_aux_ig(($n - $n%1000)/1000)."'";
$n = $n % 1000;
}
foreach $val (@vals) {
while ($n >= $val) {
$gim .= $int2gim{$val};
$n -= $val;
}
}
return $gim;
}
###########
# spell-check the input files
if ($interpipe) {
if ($slave) {
# TODO: pass -d -B and -m to ispell
my $args="-a";
open(ISPELL,"|ispell $args");
autoflush ISPELL 1;
} else {
print "@(#) International Ispell Version 3.1.20 (but really Hspell $VERSION)\n";
}
}
$| = 1 if $interpipe;
my ($res, $line, $offset);
while(<>){
#if ($interpipe && m/^[#!~^%-+&*]/) { #ispell command lines
if ($interpipe && m/^[#!~@%\-+&*]/) { #ispell command lines
print ISPELL if $slave;
next;
}
chomp;
$line = $_;
$offset = -1;
# convert a literal "&#1470;" (HTML makaf) into -
s/&#1470;/-/go;
my @array;
if($strict_smichut){
@array=split(/[^à-ú'"-]+|(-)/o);
} else {
@array=split(/[^à-ú'"]+/o);
}
my ($word, $word1, $word2);
while (@array){
if($strict_smichut){
$word1=shift(@array);
$word2=shift(@array); # contains a - or nothing
if(defined($word2)){
$word=$word1.$word2;
} else {
$word=$word1;
}
} else {
$word=shift(@array);
}
$offset=index($line,$word,$offset+1) if $interpipe;
# convert two single quotes ('') into one double quote (").
# For TeX junkies.
$word =~ s/''/"/go;
# remove quotes from end or beginning of the word (we do
# leave, however, single quotes in the middle of the word -
# used to signify "j" sound in Hebrew, for example, and double
# quotes used to signify acronyms. A single quote at the end
# of the word is used to signify an abbreviate - or can be
# an actual quote (there is no difference in ASCII...), so we
# must check both possibilities.
$word =~ s/^['"]//o;
$word =~ s/"$//o;
$res=check_word($word);
if($res!=1 && $word =~ /['"]/o){
# maybe it's not a word, but rather gimatria?
if(is_canonic_gimatria($word)){
if($verbose){
print "found $word: canonic gimatria\n";
}
$res=1;
}
}
if($res!=1 && $word =~ /'$/o){
# try again, without the quote...
$word =~ s/'$//o;
$res=check_word($word);
}
if($res==0){
#$wrongwords{$word}=1
$wrongwords{$word}++;
print "? $word 0 $offset: ".trycorrect($word)."\n" if $interpipe;
} elsif($res>1){
$warnwords{substr($word,$res-2)}=1;
}
}
if ($interpipe) {
if ($slave) {
s/[à-ú]/ /og;
print ISPELL "$_\n";
next;
} else {
print "\n";
}
}
}
exit 0 if $interpipe;
my $word;
# list wrong words.
if(%wrongwords){
if($correct){
print "ùâéàåú ëúéá ùðîöàå, åúé÷åðéäï äîåîìöéí:\n\n";
} else {
#print "wrong words:\n";
print "ùâéàåú ëúéá ùðîöàå:\n\n";
}
my @badwords;
if($severity){
# sort according to severity (repeating incorrect words)
@badwords = sort {$wrongwords{$b} <=> $wrongwords{$a}} keys %wrongwords;
} else {
# alphabetical sorting of the word
@badwords = sort(keys %wrongwords);
}
foreach $word (@badwords){
if($correct){
# TODO: maybe enable printing the number just when
# $severity is on? Or is it useful always?
#print $word." -> ".trycorrect($word)."\n";
print $wrongwords{$word}." ".$word." -> ".trycorrect($word)."\n";
} else {
print $word."\n";
}
if($shownotes && exists($spellinghints{$word})){
print $spellinghints{$word};
}
}
}
if(%warnwords){
if($correct){
print "\nîéìéí ðãéøåú ùäï ùâéàåú ëúéá ðôåöåú:\n\n";
} else {
#print "rare correct words that are common mispellings:\n";
print "\nîéìéí ðãéøåú ùäï ùâéàåú ëúéá ðôåöåú:\n\n";
}
foreach $word (sort(keys %warnwords)){
if($correct){
print $word." -> ".trycorrect($word)."\n";
} else {
print $word."\n";
}
if($shownotes && exists($spellinghints{$word})){
print $spellinghints{$word};
}
}
}