hspell.pl

#!/usr/bin/perl -w
# Copyright (C) 2002-2003 Nadav Har'El and Dan Kenigsberg
use Carp;
use strict;
use Getopt::Long;
use IO::File;

my $VERSION="0.5";

# process command line options:
# -v: verbose (shows derivation of accepted words)
# -c: correct (suggests corrections)
# -s: severity (sorts wrong words in order of number of appearances).
# -n: notes (gives longer notes explaining selected spelling errors).
# -a: for use in pipe (a la ispell's -a).
# -i: use slave ispell

# Set the defaults for the options
my %opts;

# Allow bundling of the short options, e.g., -nc
Getopt::Long::Configure ("bundling");

sub help(){
	print "hspell - Hebrew spellchecker\n".
	      "Usage: $0 [-acinsv] [file...]\n\n".
	      "See hspell(1) manual for a description of hspell and its ".
	      "options.\n";
}

#if(!getopts('vcsnaid:Bm', \%opts)){
if(!GetOptions('correct|c'  => \$opts{c},
               'notes|n'    => \$opts{n},
               'severity|s' => \$opts{s},
               'verbose|v'  => \$opts{v},
               'ispell-slave|i'  => \$opts{i},
               'a'  => \$opts{a},
               'd=s'  => \$opts{d},
               'B'  => \$opts{B},
               'm'  => \$opts{m},
               'version|V'  => sub {print "Hspell $VERSION\nWritten by Nadav ".
	       		      "Har'El and Dan Kenigsberg\n"; exit(0)},
               'help|h'     => sub {help(); exit(0)},
)){
	&help();
	exit(1);
}

if (defined($opts{d}) || defined($opts{B}) || defined($opts{m})) {
	print STDERR "Warning: ispell options -d, -B and -m ".
		     "are ignored by hspell.\n";
}
my $verbose=defined($opts{v});
my $correct=defined($opts{c});
my $severity=defined($opts{s});
my $shownotes=defined($opts{n});
my $interpipe=defined($opts{a}) || ($0 =~ m{-i});
my $slave=defined($opts{i}) || ($0 =~ m{-i});

my $strict_smichut=0;

my @dictionaries=("out.nouns","out.nouns-shemp","out.verbs","milot","extrawords","biza-verbs", "biza-nouns");
#my @dictionaries=("zcat wordlist.wgz|wunzip|");
my @likelyerror_dictionaries=("likelyerrors");
my @spellinghints_dictionaries=();
@spellinghints_dictionaries=("spellinghints") if $shownotes;

# If it exists, use ~/.hspell_words. This is sort of an ad-hoc feature,
# because words in that list do not automatically get inflected.
my @optional_dictionaries=($ENV{HOME}."/.hspell_words","hspell_words");
push @dictionaries, grep {-f $_} @optional_dictionaries;

my $dict;
my %dictionary;
my %likelyerrors;
# read dictionaries
foreach $dict (@dictionaries) {
	my $F = new IO::File;
	my $save=""; # used for verbose mode
	$F->open($dict) or croak "Couldn't open dictionary file $dict";
	$save="xxx" if $dict eq "extrawords"; # ad-hoc, sign for file without stems
	# The speed of the following loop has a great effect on startup time,
	# so we want the inner loop to be as quick as possible! When the
	# various if's were inside the inner loop start up time took (with
	# around 100,000 words) about 5.5 seconds. With the tight loop, it
	# takes 3.4 seconds.
	# This can be further droped to 2.5 seconds if we could remove the
	# s/-$//o command! (e.g., if we're sure the dictionary files doesn't
	# contain those useless (when !$strict_smichut) smichut characters).
	if(!$verbose && !$strict_smichut){
		while(<$F>){
			if(/^[à-úA-Z]/o){
				chomp;
				s/-$//o;
				$dictionary{$_}=1;
			}
		}
 	} else {
		while(<$F>){
			chomp;
			if(/^[-#]/o){
				# ignore comments, and ---- seperators
				$save=""; # used for verbose mode
			} else {
				s/-//o if(!$strict_smichut);
				if($verbose){
					# tell the user where the word was found...
					if($save eq ""){
						$save=$_;
						$save =~ s/^L/ì/o;
					}
					my $s;
					if($save eq "xxx"){
						$s=$dict;
					} else {
						$s="$dict:$save";
					}
					if(exists($dictionary{$_})){
						# ignore double matches
						next if($dictionary{$_} =~ m/$s(,|$)/);
						$dictionary{$_}=$dictionary{$_}.", ".$s;
					} else {
						$dictionary{$_}=$s;
					}
				} else {
					$dictionary{$_}=1;
				}
			}
		}
	}
}

# If we add the empty word to the dictionaries valid prefixes with no
# word after them get accepted. This is useful for when a valid prefix
# (ä, ëù, etc.) get followed by a number or a non-Hebrew word (usually
# separated by a makaf).
$dictionary{""}=1;

foreach $dict (@likelyerror_dictionaries) {
	my $F = new IO::File;
	$F->open($dict) or croak "Couldn't open dictionary file $dict";
	while(<$F>){
		chomp;
		if(/^\s*#|^\s*$|^-*$/o){
			# ignore comments, white lines and ---- seperators.
		} else {
			$likelyerrors{$_}=1; # TODO: maybe in the future use values
		}
	}
}

my %spellinghints;
foreach $dict (@spellinghints_dictionaries) {
	my $F = new IO::File;
	$F->open($dict) or croak "Couldn't open spelling hints file $dict";
	my $desc="";
	my $lastdesc;
	while(<$F>){
		chomp;
		if(/^\s*#|^\s*$|^-*$/o){
			# ignore comments, white lines and ---- seperators.
		} elsif(/^\+/o){
			# + lines add to the description
			$desc=$desc." ".substr($_,1)."\n";
		} else {
			# word lines use that description
			if($desc ne ""){
				$lastdesc=$desc;
				$desc="";
			}
			$spellinghints{$_}=$lastdesc;
		}
	}
}


my %wrongwords;
my %warnwords;

my @prefixes = (
	"",
	"î","ù","ä","å","ë","ì","á",
	"ùî", "äî", "åî", "ìî",
	"îù", "åù", "ëù", "ìù",
	"îä", "ùä", "åä",
	"îë", "ùë", "äë", "åë", "ìë", "áë",
	"ùì", "äì", "åì",
	"ùá", "äá", "åá", "ëá",
	"îá", # ex: âãåì îáòáø, îáùðú ...
	"ìëù", "åìëù", # ex: ìëùúâãì, ...
	"åùî","åîù","åëù","åìù","åîä","åùä","åùä","åîë",
	"åùë","åìë","åáë","åùì","åäì","åùá","åëá","ùîä",
	"ëùä","ëùî","ëùì","ëùá",
	"ëùîä", "åùîä", "åëùî", "åëùì", "åëùá","åëùä",
	"åëùîä",
        "ùëù","ùëùä","ùëùî","ùëùá","ùëùì","ùëùîä",
);

sub check_word {
	my $word = shift;
	# ignore empty words
	return 1 if $word =~ m/^[-'" ]*$/o;
	my ($prefix,$plen);
	foreach $prefix (@prefixes){
		$plen=length($prefix);
		if((substr($word,0,length($prefix)) eq $prefix)){
			# ad-hoc trick: eat up " if necessary, to recognize
			# stuff like á"ùèéç", ä"öååàø", etc.
			if(length($word) > length($prefix) &&
		 	        substr($word,length($prefix),1) eq '"'){
				$plen++;
			}
			# The first UGLY if() here is the academia's ktiv male
			# rule of doubling a vav (not yud!) starting a word,
			# unless it's already next to a vav.
			# The "elsif" check below is the normal case.
			if($prefix ne "" &&
			   substr($word,$plen,1) eq 'å' &&
			   substr($prefix,-1,1) ne 'å'){
				if(substr($word,$plen+1,1) eq 'å'){
					if(substr($word,$plen+2,1) ne 'å' &&
					   exists ($dictionary{substr($word,$plen+1)})){
						if($verbose){
							print "found $word: prefix '$prefix' doubled 'å' stem $dictionary{substr($word,$plen+1)}\n";
						}
						if(exists($likelyerrors{substr($word,$plen+1)})){
							return 2+$plen+1;
						} else {
							return 1;
						}
					} elsif(exists ($dictionary{substr($word,$plen)})){
						if($verbose){
							print "found $word: prefix '$prefix' (nondoubled 'å') stem $dictionary{substr($word,$plen)}\n";
						}
						if(exists($likelyerrors{substr($word,$plen)})){
							return 2+$plen+1;
						} else {
							return 1;
						}
					}
				}
			# the normal check for word minus the prefix:
			} elsif(exists ($dictionary{substr($word,$plen)})){
				if($verbose){
					print "found $word: prefix '$prefix' stem $dictionary{substr($word,$plen)}\n";
				}
				if(exists($likelyerrors{substr($word,$plen)})){
					return 2+$plen;
				} else {
					return 1;
				}
# 			adding gimatria check here slows things down, and
#                       worse: adds a lot of weird "corrections" because
#			trycorrect calls check_word with an extra " before
#                       the last letter, to check for acronyms....
#			} elsif($word=~/['"]/o && &is_canonic_gimatria($word)){
#				if($verbose){
#					print "found $word: canonic gimatria\n";
#				}
#				return 1;
			# ad-interim trick to check for bachlam+maqor: if a
			# word Làáãëí in the dictionary (as generated by 'woo')
			# we allow any of bachlam to replace the L
			} elsif(($prefix =~ m/[áëìî]$/o) &&
				exists ($dictionary{"L".substr($word,$plen)})){
				if($verbose){
					print "found $word: prefix '$prefix' L-form of stem $dictionary{'L'.substr($word,$plen)}\n";
				}
				if(exists($likelyerrors{substr($word,$plen-1)})){
					return 2+$plen-1;
				} else {
					return 1;
				}
			# ad-interim trick to check for b+maqor: if a
			# word Bùáúå in the dictionary (as generated by 'woo')
			# we allow most á prefixes to replace the B
			} elsif(($prefix =~ m/^å?ù?á$/o) &&
				exists ($dictionary{"B".substr($word,$plen)})){
				if($verbose){
					print "found $word: prefix '$prefix' B-form of stem $dictionary{'B'.substr($word,$plen)}\n";
				}
				if(exists($likelyerrors{substr($word,$plen-1)})){
					return 2+$plen-1;
				} else {
					return 1;
				}
			}
		}
	}
	return 0;
}

# ad-hoc attempt to find corrections for word
sub trycorrect {
	my $word = shift;
	my @results;
	my $i;
	# try to add a missing em kri'a - yud or vav
	for($i=1;$i<length($word);$i++){
		my $w=$word;
		substr($w,$i,1)='é'.substr($w,$i,1);
		if(check_word($w)==1){
			push @results,$w if not grep(m/$w/,@results);
		}
		$w=$word;
		substr($w,$i,1)='å'.substr($w,$i,1);
		if(check_word($w)==1){
			push @results,$w if not grep(m/$w/,@results);
		}
	}
	# try to remove an em kri'a - yud or vav
	for($i=0;$i<length($word);$i++){
		my $w=$word;
		if(substr($w,$i,1) eq 'é' || substr($w,$i,1) eq 'å'){
			substr($w,$i,1)='';
			if(length($w)>0 && check_word($w)==1){
				push @results,$w if not grep(m/$w/,@results);
			}
		}
	}
	# try to add or remove an aleph (is that useful?)
	for($i=1;$i<length($word);$i++){
		my $w=$word;
		substr($w,$i,1)='à'.substr($w,$i,1);
		if(check_word($w)==1){
			push @results,$w if not grep(m/$w/,@results);
		}
	}
	for($i=0;$i<length($word);$i++){
		my $w=$word;
		if(substr($w,$i,1) eq 'à'){
			substr($w,$i,1)='';
			if(length($w)>0 && check_word($w)==1){
				push @results,$w if not grep(m/$w/,@results);
			}
		}
	}
	# try to replace similarly sounding (for certain people) letters:
	#    àòä äâ çë èú ñö ñù ë÷ áåå áô
	for($i=0;$i<length($word);$i++){
		my $w;
		if(substr($word,$i,1) eq 'à' || substr($word,$i,1) eq 'ò' ||
		   substr($word,$i,1) eq 'â'){
		   	$w=$word; substr($w,$i,1)='ä';
			if(check_word($w)==1){
				push @results,$w if not grep(m/$w/,@results);
			}
		}
		if(substr($word,$i,1) eq 'à' || substr($word,$i,1) eq 'ä'){
		   	$w=$word; substr($w,$i,1)='ò';
			if(check_word($w)==1){
				push @results,$w if not grep(m/$w/,@results);
			}
		}
		if(substr($word,$i,1) eq 'ò' || substr($word,$i,1) eq 'ä'){
		   	$w=$word; substr($w,$i,1)='à';
			if(check_word($w)==1){
				push @results,$w if not grep(m/$w/,@results);
			}
		}
		if(substr($word,$i,1) eq 'ä'){
		   	$w=$word; substr($w,$i,1)='â';
			if(check_word($w)==1){
				push @results,$w if not grep(m/$w/,@results);
			}
		}
		if(substr($word,$i,1) eq 'ë'){
		   	$w=$word; substr($w,$i,1)='ç';
			if(check_word($w)==1){
				push @results,$w if not grep(m/$w/,@results);
			}
		}
		if(substr($word,$i,1) eq 'ç'){
		   	$w=$word; substr($w,$i,1)='ë';
			if(check_word($w)==1){
				push @results,$w if not grep(m/$w/,@results);
			}
		}
		if(substr($word,$i,1) eq 'è'){
		   	$w=$word; substr($w,$i,1)='ú';
			if(check_word($w)==1){
				push @results,$w if not grep(m/$w/,@results);
			}
		}
		if(substr($word,$i,1) eq 'ú'){
		   	$w=$word; substr($w,$i,1)='è';
			if(check_word($w)==1){
				push @results,$w if not grep(m/$w/,@results);
			}
		}
		if(substr($word,$i,1) eq 'ë'){
		   	$w=$word; substr($w,$i,1)='÷';
			if(check_word($w)==1){
				push @results,$w if not grep(m/$w/,@results);
			}
		}
		if(substr($word,$i,1) eq '÷'){
		   	$w=$word; substr($w,$i,1)='ë';
			if(check_word($w)==1){
				push @results,$w if not grep(m/$w/,@results);
			}
		}
		if(substr($word,$i,1) eq 'á'){
		   	$w=$word; substr($w,$i,1)='ô';
			if(check_word($w)==1){
				push @results,$w if not grep(m/$w/,@results);
			}
		}
		if(substr($word,$i,1) eq 'á'){
		   	$w=$word; substr($w,$i,1)='åå';
			if(check_word($w)==1){
				push @results,$w if not grep(m/$w/,@results);
			}
		}
		if(substr($word,$i,1) eq 'ô' || substr($word,$i,1) eq 'å'){
		   	$w=$word; substr($w,$i,1)='á';
			if(check_word($w)==1){
				push @results,$w if not grep(m/$w/,@results);
			}
		}
		if(substr($word,$i,2) eq 'åå'){
		   	$w=$word; substr($w,$i,2)='á';
			if(check_word($w)==1){
				push @results,$w if not grep(m/$w/,@results);
			}
		}
		if(substr($word,$i,1) eq 'ö' || substr($word,$i,1) eq 'ù'){
		   	$w=$word; substr($w,$i,1)='ñ';
			if(check_word($w)==1){
				push @results,$w if not grep(m/$w/,@results);
			}
		}
		if(substr($word,$i,1) eq 'ñ'){
		   	$w=$word; substr($w,$i,1)='ö';
			if(check_word($w)==1){
				push @results,$w if not grep(m/$w/,@results);
			}
		}
		if(substr($word,$i,1) eq 'ñ'){
		   	$w=$word; substr($w,$i,1)='ù';
			if(check_word($w)==1){
				push @results,$w if not grep(m/$w/,@results);
			}
		}
	}
	# try to replace a non-final letter at the end of the word by its
	# final form and vice versa (useful check for abbreviations):
	if(substr($word,-1,1) =~ /[êïíõôëðîöô]/){
		my $w=substr($word,0,-1);
		my $e=substr($word,-1,1);
		$e =~ tr/êïíõôëðîöô/ëðîöôêïíõô/;
		$w=$w.$e;
		if(check_word($w)==1){
			push @results,$w if not grep(m/$w/,@results);
		}
	}
	# try to make the word into an acronym (add " before last character)
	if(length($word)>=2){
		my $w=substr($word,0,-1);
		my $e=substr($word,-1,1);
		$w=$w.'"'.$e;
		if(check_word($w)==1){
			push @results,$w if not grep(m/$w/,@results);
		}
	}
	# try to make the word into an abbreviation (add ' at the end)
	my $w=$word."'";
	if(check_word($w)==1){
		push @results,$w if not grep(m/$w/,@results);
	}
#	# try to remove any letter
#	for($i=0;$i<length($word);$i++){
#		my $w=$word;
#		substr($w,$i,1)='';
#		if(check_word($w)==1){
#			push @results,$w if not grep(m/$w/,@results);
#		}
#	}
#	# try to add any letter (warning: very slow, maybe should be an option)
#	for($i=0;$i<length($word);$i++){
#		my $letter;
#		for($letter=ord('à'); $letter<ord('ú'); $letter++){
#			my $w=$word;
#			substr($w,$i,1)=chr($letter).substr($w,$i,1);
#			if(check_word($w)==1){
#				push @results,$w if not grep(m/$w/,@results);
#			}
#		}
#	}
	return join(", ",@results);

}

### A function for checking for valid gimatria:
sub is_canonic_gimatria {
  my $s = shift;
  return &int2gim(&gim2int($s)) eq $s;
}
sub gim2int {
  my $gim = shift;
  my $n = 0;
  my %gim2int = ('à'=>1,'á'=>2,'â'=>3,'ã'=>4,'ä'=>5,'å'=>6,'æ'=>7,'ç'=>8,'è'=>9,
       'é'=>10,'ë'=>20,'ê'=>20,'ì'=>30,'î'=>40,'í'=>40,'ð'=>50,'ï'=>50,
       'ñ'=>60,'ò'=>70,'ô'=>80,'ó'=>80,'ö'=>90,'õ'=>90,'÷'=>100,'ø'=>200,
       'ù'=>300,'ú'=>400,'"'=>0);
  my ($chnk, $c);

  foreach $chnk (split "'", $gim) {
    $n *= 1000;
    foreach $c (split //, $chnk) {
      $n += $gim2int{$c};
    }
  }
  return $n;
}
sub int2gim {
  my $n = shift;
  my $gim = "";
  return undef if $n <= 0;
  my $tmp = &_aux_ig($n);
  return $gim.$tmp."'" if $tmp =~ m/(^|').$/;
  $tmp =~ s/([^'])$/\"$1/o;
  $tmp =~ s/ë$/ê/o;
  $tmp =~ s/î$/í/o;
  $tmp =~ s/ð$/ï/o;
  $tmp =~ s/ô$/ó/o;
  $tmp =~ s/ö$/õ/o;
  return $gim.$tmp;
}
sub _aux_ig {
  my $n = shift;
  my ($gim, $val) = ("", 0);
  my %int2gim = (1=>'à',2=>'á',3=>'â',4=>'ã',5=>'ä',6=>'å',7=>'æ',8=>'ç',
      9=>'è',10=>'é',15=>'èå',16=>'èæ',17=>'éæ',18=>'éç',19=>'éè',20=>'ë',
      30=>'ì',40=>'î',50=>'ð',60=>'ñ',70=>'ò', 80=>'ô',90=>'ö',100=>'÷',
      200=>'ø',300=>'ù',400=>'ú');
  my @vals = sort { $b <=> $a } keys %int2gim;

  if ($n >= 1000) {
    $gim = &_aux_ig(($n - $n%1000)/1000)."'";
    $n = $n % 1000;
  }
  foreach $val (@vals) {
    while ($n >= $val) {
      $gim .= $int2gim{$val};
      $n -= $val;
    }
  }
  return $gim;
}
###########


# spell-check the input files
if ($interpipe) {
	if ($slave) {
		# TODO: pass -d -B and -m to ispell
		my $args="-a";
		open(ISPELL,"|ispell $args");
		autoflush ISPELL 1;
	} else {
		print "@(#) International Ispell Version 3.1.20 (but really Hspell $VERSION)\n";
	}
}
$| = 1 if $interpipe;
my ($res, $line, $offset);
while(<>){
	#if ($interpipe && m/^[#!~^%-+&*]/) { #ispell command lines
	if ($interpipe && m/^[#!~@%\-+&*]/) { #ispell command lines
		print ISPELL if $slave;
		next;
	}
	chomp;
	$line = $_;
	$offset = -1;
	# convert a literal "&#1470;" (HTML makaf) into -
	s/&#1470;/-/go;
	my @array;
	if($strict_smichut){
		@array=split(/[^à-ú'"-]+|(-)/o);
	} else {
		@array=split(/[^à-ú'"]+/o);
	}
	my ($word, $word1, $word2);
	while (@array){
		if($strict_smichut){
			$word1=shift(@array);
			$word2=shift(@array); # contains a - or nothing
			if(defined($word2)){
				$word=$word1.$word2;
			} else {
				$word=$word1;
			}
		} else {
			$word=shift(@array);
		}
		$offset=index($line,$word,$offset+1) if $interpipe;
		# convert two single quotes ('') into one double quote (").
		# For TeX junkies.
		$word =~ s/''/"/go;
		# remove quotes from end or beginning of the word (we do
		# leave, however, single quotes in the middle of the word -
		# used to signify "j" sound in Hebrew, for example, and double
		# quotes used to signify acronyms. A single quote at the end
		# of the word is used to signify an abbreviate - or can be
		# an actual quote (there is no difference in ASCII...), so we
		# must check both possibilities.
		$word =~ s/^['"]//o;
		$word =~ s/"$//o;
		$res=check_word($word);
		if($res!=1 && $word =~ /['"]/o){
			# maybe it's not a word, but rather gimatria?
			if(is_canonic_gimatria($word)){
				if($verbose){
					print "found $word: canonic gimatria\n";
				}
				$res=1;
			}
		}
		if($res!=1 && $word =~ /'$/o){
			# try again, without the quote...
			$word =~ s/'$//o;
			$res=check_word($word);
		}
		if($res==0){
			#$wrongwords{$word}=1
			$wrongwords{$word}++;
			print "? $word 0 $offset: ".trycorrect($word)."\n" if $interpipe;
		} elsif($res>1){
			$warnwords{substr($word,$res-2)}=1;
		}
	}
	if ($interpipe) {
		if ($slave) {
			s/[à-ú]/ /og;
			print ISPELL "$_\n";
			next;
		} else {
			print "\n";
		}
        }
}

exit 0 if $interpipe;

my $word;
# list wrong words.
if(%wrongwords){
	if($correct){
		print "ùâéàåú ëúéá ùðîöàå, åúé÷åðéäï äîåîìöéí:\n\n";
	} else {
		#print "wrong words:\n";
		print "ùâéàåú ëúéá ùðîöàå:\n\n";
	}
	my @badwords;
	if($severity){
		# sort according to severity (repeating incorrect words)
		@badwords = sort {$wrongwords{$b} <=> $wrongwords{$a}} keys %wrongwords;
	} else {
		# alphabetical sorting of the word
		@badwords = sort(keys %wrongwords);
	}
	foreach $word (@badwords){
		if($correct){
			# TODO: maybe enable printing the number just when
			# $severity is on? Or is it useful always?
			#print $word."  ->  ".trycorrect($word)."\n";
			print $wrongwords{$word}." ".$word."  ->  ".trycorrect($word)."\n";
		} else {
			print $word."\n";
		}
		if($shownotes && exists($spellinghints{$word})){
			print $spellinghints{$word};
		}
	}
}
if(%warnwords){
	if($correct){
		print "\nîéìéí ðãéøåú ùäï ùâéàåú ëúéá ðôåöåú:\n\n";
	} else {
		#print "rare correct words that are common mispellings:\n";
		print "\nîéìéí ðãéøåú ùäï ùâéàåú ëúéá ðôåöåú:\n\n";
	}
	foreach $word (sort(keys %warnwords)){
		if($correct){
			print $word."  ->  ".trycorrect($word)."\n";
		} else {
			print $word."\n";
		}
		if($shownotes && exists($spellinghints{$word})){
			print $spellinghints{$word};
		}
	}
}