Zend_Validate_Int fails for single-digit integer values in 65 different locales #166

Open
sootsnoot opened this Issue Jul 12, 2013 · 6 comments

Comments

Projects
None yet
3 participants
@sootsnoot

This is also posted at http://stackoverflow.com/questions/17600666/is-zend-validate-int-broken-for-integers-0-9-in-in-some-locales

<?php
// Reproduction driver: exercises checkit() over a few values/locales and
// prints which locales changed the representation of the value.

// Put library on include_path
set_include_path(implode(PATH_SEPARATOR, array(
    realpath('./library'),
    get_include_path()
)));

require 'Zend/Locale/Format.php';
require 'Zend/Validate/Int.php';

// checkit() appends to this list through its own `global` declaration.
// Initialise it before the first report so that an empty result is printed
// as an empty array instead of NULL.
global $mutating_locales;
$mutating_locales = array();

echo "\n*** Showing that function checkit() does what's expected with
 a value that has locale-specific formatting\n";
checkit('1000', 'en_US');
checkit('1000', 'hi_IN');
echo var_export($mutating_locales, true) . "\n\n";

echo "\n*** Showing the problem with small integers in locale hi_IN\n";
for ($i = 0; $i < 101; ++$i) {
    checkit((string)$i, 'hi_IN');
}

echo "\n*** Checking every supported locale for value '1'\n";
// Start a fresh list so the final report only covers this pass.
$mutating_locales = array();
// Only the keys of the locale list are used as locale identifiers.
foreach (array_keys(Zend_Locale::getLocaleList()) as $locale) {
    checkit('1', $locale);
}
echo var_export($mutating_locales, true) . "\n";

/**
 * Print a diagnostic if $value is not a valid Int in locale $locale.
 *
 * The value is first formatted with Zend_Locale_Format::toInteger(); when the
 * formatted string differs from the input, the change is printed and the
 * locale is appended to the global $mutating_locales list. The value is then
 * validated with Zend_Validate_Int; on failure the function reports whether
 * the formatted representation is identical, valid, or also invalid.
 *
 * @param string $value  Value to check.
 * @param string $locale Locale identifier handed to the formatter and validator.
 * @return void
 */
function checkit($value, $locale) {
    global $mutating_locales;

    $formatted = Zend_Locale_Format::toInteger(
                     $value, array('locale' => $locale));
    if ($formatted !== $value) {
        printf("Representation changed for locale %s, old: '%s', new: '%s'\n",
               $locale, $value, $formatted);
        $mutating_locales[] = $locale;
    }

    $validator = new Zend_Validate_Int($locale);
    if ($validator->isValid($value)) {
        return;
    }

    printf("Value '%s' is not a valid Int in locale '%s'\n", $value, $locale);
    if ($formatted === $value) {
        printf("        And the formatted value is identical\n");
    } elseif ($validator->isValid($formatted)) {
        // Previously tested the bareword `fmtvalid` (missing `$`), which never
        // reflected the validator result, so the branch below was unreachable.
        printf("        But the formatted value is valid\n");
    } else {
        printf("        And the formatted value is also invalid\n");
    }
}

$ php validate-int-bug.php

*** Showing that function checkit() does what's expected with
 a value that has locale-specific formatting
Representation changed for locale en_US, old: '1000', new: '1,000'
Representation changed for locale hi_IN, old: '1000', new: '1,000'
array (
  0 => 'en_US',
  1 => 'hi_IN',
)


*** Showing the problem with small integers in locale hi_IN
Value '0' is not a valid Int in locale 'hi_IN'
    And the formatted value is identical
Value '1' is not a valid Int in locale 'hi_IN'
    And the formatted value is identical
Value '2' is not a valid Int in locale 'hi_IN'
    And the formatted value is identical
Value '3' is not a valid Int in locale 'hi_IN'
    And the formatted value is identical
Value '4' is not a valid Int in locale 'hi_IN'
    And the formatted value is identical
Value '5' is not a valid Int in locale 'hi_IN'
    And the formatted value is identical
Value '6' is not a valid Int in locale 'hi_IN'
    And the formatted value is identical
Value '7' is not a valid Int in locale 'hi_IN'
    And the formatted value is identical
Value '8' is not a valid Int in locale 'hi_IN'
    And the formatted value is identical
Value '9' is not a valid Int in locale 'hi_IN'
    And the formatted value is identical

*** Checking every supported locale for value '1'
Value '1' is not a valid Int in locale 'ar_QA'
    And the formatted value is identical
Value '1' is not a valid Int in locale 'ar_SA'
    And the formatted value is identical
Value '1' is not a valid Int in locale 'ar_SY'
    And the formatted value is identical
Value '1' is not a valid Int in locale 'ar_TN'
    And the formatted value is identical
Value '1' is not a valid Int in locale 'ar_YE'
    And the formatted value is identical
Value '1' is not a valid Int in locale 'as_IN'
    And the formatted value is identical
Value '1' is not a valid Int in locale 'as'
    And the formatted value is identical
Value '1' is not a valid Int in locale 'bn_BD'
    And the formatted value is identical
Value '1' is not a valid Int in locale 'bn_IN'
    And the formatted value is identical
Value '1' is not a valid Int in locale 'bn'
    And the formatted value is identical
Value '1' is not a valid Int in locale 'dv_MV'
    And the formatted value is identical
Value '1' is not a valid Int in locale 'dv'
    And the formatted value is identical
Value '1' is not a valid Int in locale 'dz_BT'
    And the formatted value is identical
Value '1' is not a valid Int in locale 'dz'
    And the formatted value is identical
Value '1' is not a valid Int in locale 'en_IN'
    And the formatted value is identical
Value '1' is not a valid Int in locale 'en_PK'
    And the formatted value is identical
Value '1' is not a valid Int in locale 'gu_IN'
    And the formatted value is identical
Value '1' is not a valid Int in locale 'gu'
    And the formatted value is identical
Value '1' is not a valid Int in locale 'hi_IN'
    And the formatted value is identical
Value '1' is not a valid Int in locale 'hi'
    And the formatted value is identical
Value '1' is not a valid Int in locale 'hy_AM'
    And the formatted value is identical
Value '1' is not a valid Int in locale 'hy'
    And the formatted value is identical
Value '1' is not a valid Int in locale 'kn_IN'
    And the formatted value is identical
Value '1' is not a valid Int in locale 'kn'
    And the formatted value is identical
Value '1' is not a valid Int in locale 'kok_IN'
    And the formatted value is identical
Value '1' is not a valid Int in locale 'kok'
    And the formatted value is identical
Value '1' is not a valid Int in locale 'ml_IN'
    And the formatted value is identical
Value '1' is not a valid Int in locale 'ml'
    And the formatted value is identical
Value '1' is not a valid Int in locale 'mr_IN'
    And the formatted value is identical
Value '1' is not a valid Int in locale 'mr'
    And the formatted value is identical
Value '1' is not a valid Int in locale 'or_IN'
    And the formatted value is identical
Value '1' is not a valid Int in locale 'or'
    And the formatted value is identical
Value '1' is not a valid Int in locale 'pa_IN'
    And the formatted value is identical
Value '1' is not a valid Int in locale 'pa_PK'
    And the formatted value is identical
Value '1' is not a valid Int in locale 'pa'
    And the formatted value is identical
Value '1' is not a valid Int in locale 'sa_IN'
    And the formatted value is identical
Value '1' is not a valid Int in locale 'sa'
    And the formatted value is identical
Value '1' is not a valid Int in locale 'bn'
    And the formatted value is identical
Value '1' is not a valid Int in locale 'dv_MV'
    And the formatted value is identical
Value '1' is not a valid Int in locale 'dv'
    And the formatted value is identical
Value '1' is not a valid Int in locale 'dz_BT'
    And the formatted value is identical
Value '1' is not a valid Int in locale 'dz'
    And the formatted value is identical
Value '1' is not a valid Int in locale 'en_IN'
    And the formatted value is identical
Value '1' is not a valid Int in locale 'en_PK'
    And the formatted value is identical
Value '1' is not a valid Int in locale 'gu_IN'
    And the formatted value is identical
Value '1' is not a valid Int in locale 'gu'
    And the formatted value is identical
Value '1' is not a valid Int in locale 'hi_IN'
    And the formatted value is identical
Value '1' is not a valid Int in locale 'hi'
    And the formatted value is identical
Value '1' is not a valid Int in locale 'hy_AM'
    And the formatted value is identical
Value '1' is not a valid Int in locale 'hy'
    And the formatted value is identical
Value '1' is not a valid Int in locale 'kn_IN'
    And the formatted value is identical
Value '1' is not a valid Int in locale 'kn'
    And the formatted value is identical
Value '1' is not a valid Int in locale 'kok_IN'
    And the formatted value is identical
Value '1' is not a valid Int in locale 'kok'
    And the formatted value is identical
Value '1' is not a valid Int in locale 'ml_IN'
    And the formatted value is identical
Value '1' is not a valid Int in locale 'ml'
    And the formatted value is identical
Value '1' is not a valid Int in locale 'mr_IN'
    And the formatted value is identical
Value '1' is not a valid Int in locale 'mr'
    And the formatted value is identical
Value '1' is not a valid Int in locale 'or_IN'
    And the formatted value is identical
Value '1' is not a valid Int in locale 'or'
    And the formatted value is identical
Value '1' is not a valid Int in locale 'pa_IN'
    And the formatted value is identical
Value '1' is not a valid Int in locale 'pa_PK'
    And the formatted value is identical
Value '1' is not a valid Int in locale 'pa'
    And the formatted value is identical
Value '1' is not a valid Int in locale 'sa_IN'
    And the formatted value is identical
Value '1' is not a valid Int in locale 'sa'
    And the formatted value is identical
@siad007

This comment has been minimized.

Show comment Hide comment
@siad007

siad007 Jul 13, 2013

Contributor

For hi_IN, Latin digits are not valid, I guess. Try the locale's own numbering system (I think it uses Devanagari script), ०१२३४५६७८९, and it should work. The same applies to the other locales (e.g. Arabic => ٠١٢٣٤٥٦٧٨٩, and so on).

Contributor

siad007 commented Jul 13, 2013

For hi_IN, Latin digits are not valid, I guess. Try the locale's own numbering system (I think it uses Devanagari script), ०१२३४५६७८९, and it should work. The same applies to the other locales (e.g. Arabic => ٠١٢٣٤٥٦٧٨٩, and so on).

@sootsnoot

This comment has been minimized.

Show comment Hide comment
@sootsnoot

sootsnoot Jul 15, 2013

That's an interesting suggestion, and it's neat that you show the script (I copy-pasted into google translate and indeed the string is recognized as Hindi 0123456789), but I don't think it explains the behavior for the following reasons:

  1. The original test program uses Zend_Locale_Format to format the PHP integer values as strings in the hi_IN locale, and the result is unchanged (i.e. Zend_Locale_Format says that '1' is written '1' in locale hi_IN). So perhaps you could argue that Zend_Locale_Format is broken, not Zend_Validate_Int, but...
  2. The latin digits work just fine in hi_IN for values with more than one digit.
  3. I modified the test program to just this:
<?php
// Minimal check: is the Devanagari digit one accepted by Zend_Validate_Int
// in locale hi_IN?

// Put library on include_path
set_include_path(implode(PATH_SEPARATOR, array(
    realpath('./library'),
    get_include_path()
)));

require 'Zend/Validate/Int.php';
$locale = 'hi_IN';
// DEVANAGARI DIGIT ONE (U+0967), written as its UTF-8 bytes e0 a5 a7 so the
// literal survives copy/paste and terminals that cannot render it.
// (verified with copy-paste to/from google translate: 1 in Hindi script)
$value = "\xe0\xa5\xa7";
$validator = new Zend_Validate_Int($locale);
$valid = $validator->isValid($value);
if (! $valid) {
    printf("Value '%s' is not a valid Int in locale '%s'\n",
               $value,                          $locale);
}

The result of running this test is:

$ php ./validate-hindi-script.php
Value '१' is not a valid Int in locale 'hi_IN'

This is running in a Cygwin bash command prompt window, and I can't figure out a way to get it to display the utf-8 character correctly. However, if I redirect the output to a file, then dump the bytes with od -cx I get:

$ od -cx foo.log
0000000   V   a   l   u   e       ' 340 245 247   '       i   s       n
       6156    756c    2065    e027    a7a5    2027    7369    6e20
0000020   o   t       a       v   a   l   i   d       I   n   t       i
       746f    6120    7620    6c61    6469    4920    746e    6920
0000040   n       l   o   c   a   l   e       '   h   i   _   I   N   '
       206e    6f6c    6163    656c    2720    6968    495f    274e
0000060  \n
       000a

This is little-endian and the x'27' is the single-quote. So the character that Zend_Validate_Int says is not a valid integer is the UTF-8 sequence e0 a5 a7. And if I go to http://www.fileformat.info/info/charset/UTF-8/list.htm?start=2048 I find:

१     DEVANAGARI DIGIT ONE (U+0967)   e0a5a7

So I think that conclusively proves that your theory does not explain the behavior.

That's an interesting suggestion, and it's neat that you show the script (I copy-pasted into google translate and indeed the string is recognized as Hindi 0123456789), but I don't think it explains the behavior for the following reasons:

  1. The original test program uses Zend_Locale_Format to format the PHP integer values as strings in the hi_IN locale, and the result is unchanged (i.e. Zend_Locale_Format says that '1' is written '1' in locale hi_IN). So perhaps you could argue that Zend_Locale_Format is broken, not Zend_Validate_Int, but...
  2. The latin digits work just fine in hi_IN for values with more than one digit.
  3. I modified the test program to just this:
<?php
// Minimal check: is the Devanagari digit one accepted by Zend_Validate_Int
// in locale hi_IN?

// Put library on include_path
set_include_path(implode(PATH_SEPARATOR, array(
    realpath('./library'),
    get_include_path()
)));

require 'Zend/Validate/Int.php';
$locale = 'hi_IN';
// DEVANAGARI DIGIT ONE (U+0967), written as its UTF-8 bytes e0 a5 a7 so the
// literal survives copy/paste and terminals that cannot render it.
// (verified with copy-paste to/from google translate: 1 in Hindi script)
$value = "\xe0\xa5\xa7";
$validator = new Zend_Validate_Int($locale);
$valid = $validator->isValid($value);
if (! $valid) {
    printf("Value '%s' is not a valid Int in locale '%s'\n",
               $value,                          $locale);
}

The result of running this test is:

$ php ./validate-hindi-script.php
Value '१' is not a valid Int in locale 'hi_IN'

This is running in a Cygwin bash command prompt window, and I can't figure out a way to get it to display the utf-8 character correctly. However, if I redirect the output to a file, then dump the bytes with od -cx I get:

$ od -cx foo.log
0000000   V   a   l   u   e       ' 340 245 247   '       i   s       n
       6156    756c    2065    e027    a7a5    2027    7369    6e20
0000020   o   t       a       v   a   l   i   d       I   n   t       i
       746f    6120    7620    6c61    6469    4920    746e    6920
0000040   n       l   o   c   a   l   e       '   h   i   _   I   N   '
       206e    6f6c    6163    656c    2720    6968    495f    274e
0000060  \n
       000a

This is little-endian and the x'27' is the single-quote. So the character that Zend_Validate_Int says is not a valid integer is the UTF-8 sequence e0 a5 a7. And if I go to http://www.fileformat.info/info/charset/UTF-8/list.htm?start=2048 I find:

१     DEVANAGARI DIGIT ONE (U+0967)   e0a5a7

So I think that conclusively proves that your theory does not explain the behavior.

@weierophinney

This comment has been minimized.

Show comment Hide comment
@weierophinney

weierophinney Jul 17, 2013

Owner

@sootsnoot In ZF, filters and validators are intended to work together. This means that if you want to validate that a number is an integer, you will typically use a filter first to strip any non-numeric characters. My question to you, then is: what does Zend_Filter_Digits return with these values? Can Zend_Validate_Int validate them?

Owner

weierophinney commented Jul 17, 2013

@sootsnoot In ZF, filters and validators are intended to work together. This means that if you want to validate that a number is an integer, you will typically use a filter first to strip any non-numeric characters. My question to you, then is: what does Zend_Filter_Digits return with these values? Can Zend_Validate_Int validate them?

@sootsnoot

This comment has been minimized.

Show comment Hide comment
@sootsnoot

sootsnoot Jul 18, 2013

@weierophinney Thanks for looking at this, Matthew! Naturally I'm most interested in the problem as I reported it, where there is no involvement of filtering. Taking locale 'hi_IN' as a specific example (though the other 64 locales appear to behave the same):

  1. The inputs that Zend_Validate_Int rejects as invalid are only the ten single-ASCII-character digits '0', '1', .. '9'.
  2. Each of those ten values is accepted as valid by Zend_Validate_Digits.
  3. Seemingly any sequence of those characters (i.e. a value with more than one digit) is accepted by both Zend_Validate_Int and Zend_Validate_Digits.
  4. Each of the ten multibyte UTF-8 Hindi single digits suggested by @siad007 above is rejected as invalid by both Zend_Validate_Digits and Zend_Validate_Int.

However, I massaged the testcase a bit to include both filter and validate (as you suggested), using Zend_Filter_Input. I gave it both the ascii digit '1' and the Hindi character for one, in both the 'en_US' and 'hi_IN' locales as follows:

<?php
// Runs each candidate value through Zend_Filter_Input (StringTrim + Digits
// filters, Zend_Validate_Int validator) for two locales and prints the verdict.

// Make ./library resolvable by the require statements below.
$includeDirs = array(realpath('./library'), get_include_path());
set_include_path(implode(PATH_SEPARATOR, $includeDirs));

require 'Zend/Validate/Int.php';
require 'Zend/Filter/Input.php';

// NOTE(review): the second candidate is a literal '?'. The original inline
// note says it stands for the digit 1 in Hindi script, so the multibyte
// character was presumably lost when the script was pasted - confirm before
// re-running.
$candidates = array('1', '?');

foreach (array('en_US', 'hi_IN') as $locale) {
    foreach ($candidates as $value) {
        $input = new Zend_Filter_Input(
            array('value' => array('StringTrim', 'Digits')),
            array('value' => new Zend_Validate_Int(array('locale' => $locale)))
        );
        $input->setData(array('value' => $value));

        if ($input->isValid()) {
            printf("Value '%s' *is* a valid Int in locale '%s'\n\n", $value, $locale);
            continue;
        }

        printf("\nValue '%s' is *NOT* a valid Int in locale '%s':\n", $value, $locale);

        // A bash prompt does not render the 3-byte UTF-8 character usefully,
        // so dump the raw bytes of the value in hex as well.
        $hexBytes = '';
        for ($pos = 0, $length = strlen($value); $pos < $length; ++$pos) {
            $hexBytes .= sprintf("%02x ", ord($value[$pos]));
        }
        echo "...the value in hex is '" . $hexBytes . "'\n";

        echo "Messages:\n";
        echo var_export($input->getMessages(), true) . "\n\n";
    }
}

Running this produces the following, which to me looks right for en_US, but wrong for hi_IN:

$ php ./filter-validate-hindi-script.php
Value '1' *is* a valid Int in locale 'en_US'


Value 'a�¡�Ñ�¡�ª' is *NOT* a valid Int in locale 'en_US':
...the value in hex is 'e0 a5 a6 '
Messages:
array (
  'value' =>
  array (
'isEmpty' => 'You must give a non-empty value for field \'value\'',
  ),
)


Value '1' is *NOT* a valid Int in locale 'hi_IN':
...the value in hex is '31 '
Messages:
array (
  'value' =>
  array (
'notInt' => '\'1\' does not appear to be an integer',
  ),
)


Value 'a�¡�Ñ�¡�ª' is *NOT* a valid Int in locale 'hi_IN':
...the value in hex is 'e0 a5 a6 '
Messages:
array (
  'value' =>
  array (
'isEmpty' => 'You must give a non-empty value for field \'value\'',
  ),
)

I'm not very experienced with Zend_Filter_Input, so it's possible I've misused it in some way. While it was clear how to specify a locale to the Zend_Validate component, I didn't see a way to specify a locale to its Zend_Filter component. And the Zend Filter documentation explicitly states for both Digits and Int that there are no options it accepts (while, for example, locale is shown as an option for several other filters). And these filters do not have a setLocale() method like some of the others. However, I blindly experimented, and found that you can specify 'locale' as a key in an options array when creating the filter, or you can store the locale in the 'Zend_Locale' property of Zend_Registry, and these will influence the behavior.

With the filter getting the locale as well as the validator, at first it seems you have hit the nail on the head as you so often do - the Hindi digit for '1' is now accepted as a valid Int in the hi_IN locale as shown:

<?php
// Repro script: passes the locale to the Int filter as well as to the Int
// validator, then reports for each locale/value pair what
// Zend_Locale_Format::isNumber() and Zend_Filter_Input say about the value.

// Put library on include_path
set_include_path(implode(PATH_SEPARATOR, array(
    realpath('./library'),
    get_include_path()
)));

require 'Zend/Validate/Int.php';
require 'Zend/Filter/Input.php';
require 'Zend/Filter/Int.php';
require 'Zend/Filter/LocalizedToNormalized.php';


foreach (array('en_US', 'hi_IN') as $locale) {
    // NOTE(review): the second value is a literal '?' here; per the inline
    // note it stands for a Hindi-script digit that was presumably lost in
    // the paste - confirm before re-running.
    foreach (array('1', '?' /* digit 1 in Hindi script*/ ) as $value) {
        $int_filter = new Zend_Filter_Int(array('locale'=>$locale));
        // $norm_filter is only used by the commented-out $filters line below.
        $norm_filter = new Zend_Filter_LocalizedToNormalized(array('locale'=>$locale));
        $filters = array('value' => array('StringTrim', $int_filter));
        // Alternative filter chain: un-comment to also apply LocalizedToNormalized.
        // $filters = array('value' => array('StringTrim', $int_filter, $norm_filter));
        $validators = array('value' => new Zend_Validate_Int(array('locale'=>$locale)));
        $input = new Zend_Filter_Input($filters, $validators);
        $input->setData(array('value'=>$value));
        // isNumber() is called on the raw, unfiltered value.
        $isnum = Zend_Locale_Format::isNumber($value, array('locale'=>$locale));
        echo "isNumber('$value') == '" . var_export($isnum, true) . "' in locale '$locale'\n";
        if ($input->isvalid()) {
            printf("Value '%s' *is* a valid Int in locale '%s'\n",
                       $value,                          $locale);
            // Relies on PHP's own string-to-number conversion of the raw
            // value, to show whether a "valid" value is usable in arithmetic.
            printf("'%d' should be 43\n", $value + 42);
        }
        else {
            printf("Value '%s' is *NOT* a valid Int in locale '%s':\n",
                       $value,                          $locale);
            // Since just printing to bash command prompt doesn't render
            // the UTF-8 3-byte character usefully, show the bytes in hex...
            printf("...the value in hex is '");
            for ($i = 0; $i < strlen($value); ++$i) {
                printf("%02x ", ord(substr($value, $i, 1)));
            }
            printf("'\n");
            // The format string has no placeholders, so the extra arguments
            // are not printed.
            printf("Messages:\n",
                       $value,                          $locale);
            echo var_export($input->getMessages(), true) . "\n";
        }
        echo "\n";
    }
}
$ php ./filter-validate-hindi-script-locale-filter.php
isNumber('1') == 'true' in locale 'en_US'
Value '1' *is* a valid Int in locale 'en_US'
'43' should be 43

isNumber('a�Ñ�ª') == 'false' in locale 'en_US'
Value 'a�Ñ�ª' *is* a valid Int in locale 'en_US'
'42' should be 43

isNumber('1') == 'false' in locale 'hi_IN'
Value '1' *is* a valid Int in locale 'hi_IN'
'43' should be 43

isNumber('a�Ñ�ª') == 'false' in locale 'hi_IN'
Value 'a�Ñ�ª' *is* a valid Int in locale 'hi_IN'
'42' should be 43

However, the Int filter also caused the Hindi digit to become a valid Int in the en_US locale as well as causing the Latin digit to become a valid Int in the hi_IN locale. Maybe the latter is reasonable, since the Latin digits are built in to the programming languages. But accepting the Hindi digit as a valid Int in the en_US locale sounds like a bug - if I validate an Int in the en_US locale, I should be able to convert it to the corresponding PHP int value by adding 0 to it or handing it to scanf - and that just doesn't work. And furthermore, if I'm in the hi_IN locale and get a valid Int, I should be able to produce a "normalized" representation from it that I can confidently use in arithmetic. It appears that neither one of those things happen, as shown by this final testcase.

If I remove the commenting of the line that includes the LocalizedToNormalized filter as the last element of the array of filters to be applied in the previous testcase, then I get the following output, which I truly can't explain. I don't know exactly what the bug is, and I guess I still could be misusing something.

$ php ./filter-validate-hindi-script-locale-filter.php
isNumber('1') == 'true' in locale 'en_US'
Value '1' *is* a valid Int in locale 'en_US'
'43' should be 43

isNumber('a�Ñ�ª') == 'false' in locale 'en_US'
Value 'a�Ñ�ª' *is* a valid Int in locale 'en_US'
'42' should be 43

isNumber('1') == 'false' in locale 'hi_IN'
Value '1' is *NOT* a valid Int in locale 'hi_IN':
...the value in hex is '31 '
Messages:
array (
  'value' =>
  array (
'notInt' => '\'dd-MM-yyyy\' does not appear to be an integer',
0 => '\'hi_IN\' does not appear to be an integer',
1 => '\'1\' does not appear to be an integer',
'isEmpty' => 'You must give a non-empty value for field \'value\'',
2 => 'You must give a non-empty value for field \'value\'',
  ),
)

isNumber('a�Ñ�ª') == 'false' in locale 'hi_IN'
Value 'a�Ñ�ª' is *NOT* a valid Int in locale 'hi_IN':
...the value in hex is 'e0 a5 a6 '
Messages:
array (
  'value' =>
  array (
'notInt' => '\'dd-MM-yyyy\' does not appear to be an integer',
0 => '\'hi_IN\' does not appear to be an integer',
1 => '\'0\' does not appear to be an integer',
'isEmpty' => 'You must give a non-empty value for field \'value\'',
2 => 'You must give a non-empty value for field \'value\'',
  ),
)

The Messages array certainly looks wrong, where it appears that 'hi_IN' got
mistaken for the value of the 'value' key of the array passed to setData() instead of the 'locale' key of an options array passed to one of the constructors. So then I said okay, I'll just get rid of all the options arrays on the constructors, and use a Zend_Locale object in the registry. Much of the documentation is written to assume that the Zend_Locale in effect is set globally, so I thought that passing the locale in an options array might not be working in all cases. I went ahead and made that change, leaving the LocalizedToNormalized filter in place. Doing that changed exactly one line in the above output: the value of isNumber('1') changed from false to true in the 'hi_IN' locale. But the bogus-looking Messages arrays were identical. For reference here is the exact source using the registry:

<?php
// Repro script, registry variant: instead of passing 'locale' options to the
// filters and validator, a Zend_Locale object is stored in Zend_Registry under
// the 'Zend_Locale' key before each locale's values are checked.

// Put library on include_path
set_include_path(implode(PATH_SEPARATOR, array(
    realpath('./library'),
    get_include_path()
)));

require 'Zend/Validate/Int.php';
require 'Zend/Filter/Input.php';
require 'Zend/Filter/Int.php';
require 'Zend/Filter/LocalizedToNormalized.php';
require 'Zend/Registry.php';


foreach (array('en_US', 'hi_IN') as $locale) {
    // Register the locale globally; the objects below are built without options.
    $locale_obj = new Zend_Locale($locale);
    Zend_Registry::set('Zend_Locale', $locale_obj);
    // NOTE(review): the second value is a literal '?' here; per the inline
    // note it stands for a Hindi-script digit that was presumably lost in
    // the paste - confirm before re-running.
    foreach (array('1', '?' /* digit 1 in Hindi script*/ ) as $value) {
        $int_filter = new Zend_Filter_Int();
        $norm_filter = new Zend_Filter_LocalizedToNormalized();
        // This first assignment is immediately overwritten by the next line,
        // so the chain that includes $norm_filter is the one in effect.
        $filters = array('value' => array('StringTrim', $int_filter));
        $filters = array('value' => array('StringTrim', $int_filter, $norm_filter));
        $validators = array('value' => new Zend_Validate_Int());
        $input = new Zend_Filter_Input($filters, $validators);
        $input->setData(array('value'=>$value));
        // isNumber() is called on the raw value, with the explicit locale
        // option commented out.
        $isnum = Zend_Locale_Format::isNumber($value/* , array('locale'=>$locale) */);
        echo "isNumber('$value') == '" . var_export($isnum, true) . "' in locale '$locale'\n";
        if ($input->isvalid()) {
            printf("Value '%s' *is* a valid Int in locale '%s'\n",
                       $value,                          $locale);
            // Relies on PHP's own string-to-number conversion of the raw
            // value, to show whether a "valid" value is usable in arithmetic.
            printf("'%d' should be 43\n", $value + 42);
        }
        else {
            printf("Value '%s' is *NOT* a valid Int in locale '%s':\n",
                       $value,                          $locale);
            // Since just printing to bash command prompt doesn't render
            // the UTF-8 3-byte character usefully, show the bytes in hex...
            printf("...the value in hex is '");
            for ($i = 0; $i < strlen($value); ++$i) {
                printf("%02x ", ord(substr($value, $i, 1)));
            }
            printf("'\n");
            // The format string has no placeholders, so the extra arguments
            // are not printed.
            printf("Messages:\n",
                       $value,                          $locale);
            echo var_export($input->getMessages(), true) . "\n";
        }
        echo "\n";
    }
}

At this point I'm completely stumped - I really think there is some sort of problem with the ZF code for filters/validators for Ints in the hi_IN locale (and the other 64 shown in the original report). But I can't pinpoint just what it is.

@weierophinney Thanks for looking at this, Matthew! Naturally I'm most interested in the problem as I reported it, where there is no involvement of filtering. Taking locale 'hi_IN' as a specific example (though the other 64 locales appear to behave the same):

  1. The inputs that Zend_Validate_Int rejects as invalid are only the ten single-ASCII-character digits '0', '1', .. '9'.
  2. Each of those ten values is accepted as valid by Zend_Validate_Digits.
  3. Seemingly any sequence of those characters (i.e. a value with more than one digit) is accepted by both Zend_Validate_Int and Zend_Validate_Digits.
  4. Each of the ten multibyte UTF-8 Hindi single digits suggested by @siad007 above is rejected as invalid by both Zend_Validate_Digits and Zend_Validate_Int.

However, I massaged the testcase a bit to include both filter and validate (as you suggested), using Zend_Filter_Input. I gave it both the ascii digit '1' and the Hindi character for one, in both the 'en_US' and 'hi_IN' locales as follows:

<?php
// Runs each candidate value through Zend_Filter_Input (StringTrim + Digits
// filters, Zend_Validate_Int validator) for two locales and prints the verdict.

// Make ./library resolvable by the require statements below.
$includeDirs = array(realpath('./library'), get_include_path());
set_include_path(implode(PATH_SEPARATOR, $includeDirs));

require 'Zend/Validate/Int.php';
require 'Zend/Filter/Input.php';

// NOTE(review): the second candidate is a literal '?'. The original inline
// note says it stands for the digit 1 in Hindi script, so the multibyte
// character was presumably lost when the script was pasted - confirm before
// re-running.
$candidates = array('1', '?');

foreach (array('en_US', 'hi_IN') as $locale) {
    foreach ($candidates as $value) {
        $input = new Zend_Filter_Input(
            array('value' => array('StringTrim', 'Digits')),
            array('value' => new Zend_Validate_Int(array('locale' => $locale)))
        );
        $input->setData(array('value' => $value));

        if ($input->isValid()) {
            printf("Value '%s' *is* a valid Int in locale '%s'\n\n", $value, $locale);
            continue;
        }

        printf("\nValue '%s' is *NOT* a valid Int in locale '%s':\n", $value, $locale);

        // A bash prompt does not render the 3-byte UTF-8 character usefully,
        // so dump the raw bytes of the value in hex as well.
        $hexBytes = '';
        for ($pos = 0, $length = strlen($value); $pos < $length; ++$pos) {
            $hexBytes .= sprintf("%02x ", ord($value[$pos]));
        }
        echo "...the value in hex is '" . $hexBytes . "'\n";

        echo "Messages:\n";
        echo var_export($input->getMessages(), true) . "\n\n";
    }
}

Running this produces the following, which to me looks right for en_US, but wrong for hi_IN:

$ php ./filter-validate-hindi-script.php
Value '1' *is* a valid Int in locale 'en_US'


Value 'a�¡�Ñ�¡�ª' is *NOT* a valid Int in locale 'en_US':
...the value in hex is 'e0 a5 a6 '
Messages:
array (
  'value' =>
  array (
'isEmpty' => 'You must give a non-empty value for field \'value\'',
  ),
)


Value '1' is *NOT* a valid Int in locale 'hi_IN':
...the value in hex is '31 '
Messages:
array (
  'value' =>
  array (
'notInt' => '\'1\' does not appear to be an integer',
  ),
)


Value 'a�¡�Ñ�¡�ª' is *NOT* a valid Int in locale 'hi_IN':
...the value in hex is 'e0 a5 a6 '
Messages:
array (
  'value' =>
  array (
'isEmpty' => 'You must give a non-empty value for field \'value\'',
  ),
)

I'm not very experienced with Zend_Filter_Input, so it's possible I've misused it in some way. While it was clear how to specify a locale to the Zend_Validate component, I didn't see a way to specify a locale to its Zend_Filter component. And the Zend Filter documentation explicitly states for both Digits and Int that there are no options it accepts (while, for example, locale is shown as an option for several other filters). And these filters do not have a setLocale() method like some of the others. However, I blindly experimented, and found that you can specify 'locale' as a key in an options array when creating the filter, or you can store the locale in the 'Zend_Locale' property of Zend_Registry, and these will influence the behavior.

With the filter getting the locale as well as the validator, at first it seems you have hit the nail on the head as you so often do - the Hindi digit for '1' is now accepted as a valid Int in the hi_IN locale as shown:

<?php
// Put library on include_path
set_include_path(implode(PATH_SEPARATOR, array(
    realpath('./library'),
    get_include_path()
)));

require 'Zend/Validate/Int.php';
require 'Zend/Filter/Input.php';
require 'Zend/Filter/Int.php';
require 'Zend/Filter/LocalizedToNormalized.php';


// Reproduction: feed one ASCII digit and one Devanagari digit through
// Zend_Filter_Input in each locale, passing the locale explicitly to the Int
// filter, the LocalizedToNormalized filter and the Int validator.
foreach (array('en_US', 'hi_IN') as $locale) {
    // The second value is spelled as UTF-8 byte escapes because the pasted
    // literal had been mangled into '?'. The bytes are the ones printed by the
    // hex dump in the recorded run of this script (e0 a5 a6, i.e. U+0966).
    // NOTE(review): the original comment called this "digit 1 in Hindi script",
    // but U+0966 is DEVANAGARI DIGIT ZERO (digit one would be e0 a5 a7) - confirm
    // which digit was intended.
    foreach (array('1', "\xe0\xa5\xa6") as $value) {
        $int_filter = new Zend_Filter_Int(array('locale' => $locale));
        // Only referenced by the alternative filter chain that is commented out below.
        $norm_filter = new Zend_Filter_LocalizedToNormalized(array('locale' => $locale));
        $filters = array('value' => array('StringTrim', $int_filter));
        // Uncomment to append the LocalizedToNormalized filter to the chain:
        // $filters = array('value' => array('StringTrim', $int_filter, $norm_filter));
        $validators = array('value' => new Zend_Validate_Int(array('locale' => $locale)));
        $input = new Zend_Filter_Input($filters, $validators);
        $input->setData(array('value' => $value));

        // Independent check of the raw value with Zend_Locale_Format, for comparison.
        $isnum = Zend_Locale_Format::isNumber($value, array('locale' => $locale));
        echo "isNumber('$value') == '" . var_export($isnum, true) . "' in locale '$locale'\n";

        if ($input->isValid()) {
            printf("Value '%s' *is* a valid Int in locale '%s'\n", $value, $locale);
            // Shows whether PHP arithmetic agrees that the raw value is an integer.
            printf("'%d' should be 43\n", $value + 42);
        } else {
            printf("Value '%s' is *NOT* a valid Int in locale '%s':\n", $value, $locale);
            // Since just printing to bash command prompt doesn't render
            // the UTF-8 3-byte character usefully, show the bytes in hex...
            printf("...the value in hex is '");
            $byte_count = strlen($value);
            for ($i = 0; $i < $byte_count; ++$i) {
                printf("%02x ", ord(substr($value, $i, 1)));
            }
            printf("'\n");
            // The format string has no placeholders, so no arguments are passed.
            printf("Messages:\n");
            echo var_export($input->getMessages(), true) . "\n";
        }
        echo "\n";
    }
}
$ php ./filter-validate-hindi-script-locale-filter.php
isNumber('1') == 'true' in locale 'en_US'
Value '1' *is* a valid Int in locale 'en_US'
'43' should be 43

isNumber('०') == 'false' in locale 'en_US'
Value '०' *is* a valid Int in locale 'en_US'
'42' should be 43

isNumber('1') == 'false' in locale 'hi_IN'
Value '1' *is* a valid Int in locale 'hi_IN'
'43' should be 43

isNumber('०') == 'false' in locale 'hi_IN'
Value '०' *is* a valid Int in locale 'hi_IN'
'42' should be 43

However, the Int filter also caused the Hindi digit to become a valid Int in the en_US locale as well as causing the Latin digit to become a valid Int in the hi_IN locale. Maybe the latter is reasonable, since the Latin digits are built in to the programming languages. But accepting the Hindi digit as a valid Int in the en_US locale sounds like a bug - if I validate an Int in the en_US locale, I should be able to convert it to the corresponding PHP int value by adding 0 to it or handing it to scanf - and that just doesn't work. And furthermore, if I'm in the hi_IN locale and get a valid Int, I should be able to produce a "normalized" representation from it that I can confidently use in arithmetic. It appears that neither one of those things happens, as shown by this final testcase.

If I remove the commenting of the line that includes the LocalizedToNormalized filter as the last element of the array of filters to be applied in the previous testcase, then I get the following output, which I truly can't explain. I don't know exactly what the bug is, and I guess I still could be misusing something.

$ php ./filter-validate-hindi-script-locale-filter.php
isNumber('1') == 'true' in locale 'en_US'
Value '1' *is* a valid Int in locale 'en_US'
'43' should be 43

isNumber('०') == 'false' in locale 'en_US'
Value '०' *is* a valid Int in locale 'en_US'
'42' should be 43

isNumber('1') == 'false' in locale 'hi_IN'
Value '1' is *NOT* a valid Int in locale 'hi_IN':
...the value in hex is '31 '
Messages:
array (
  'value' =>
  array (
'notInt' => '\'dd-MM-yyyy\' does not appear to be an integer',
0 => '\'hi_IN\' does not appear to be an integer',
1 => '\'1\' does not appear to be an integer',
'isEmpty' => 'You must give a non-empty value for field \'value\'',
2 => 'You must give a non-empty value for field \'value\'',
  ),
)

isNumber('०') == 'false' in locale 'hi_IN'
Value '०' is *NOT* a valid Int in locale 'hi_IN':
...the value in hex is 'e0 a5 a6 '
Messages:
array (
  'value' =>
  array (
'notInt' => '\'dd-MM-yyyy\' does not appear to be an integer',
0 => '\'hi_IN\' does not appear to be an integer',
1 => '\'0\' does not appear to be an integer',
'isEmpty' => 'You must give a non-empty value for field \'value\'',
2 => 'You must give a non-empty value for field \'value\'',
  ),
)

The Messages array certainly looks wrong, where it appears that 'hi_IN' got
mistaken for the value of the 'value' key of the array passed to setData() instead of the 'locale' key of an options array passed to one of the constructors. So then I said okay, I'll just get rid of all the options arrays on the constructors, and use a Zend_Locale object in the registry. Much of the documentation is written to assume that the Zend_Locale in effect is set globally, so I thought that passing the locale in an options array might not be working in all cases. I went ahead and made that change, leaving the LocalizedToNormalized filter in place. Doing that changed exactly one line in the above output: the value of isNumber('1') changed from false to true in the 'hi_IN' locale. But the bogus-looking Messages arrays were identical. For reference here is the exact source using the registry:

<?php
// Put library on include_path
set_include_path(implode(PATH_SEPARATOR, array(
    realpath('./library'),
    get_include_path()
)));

require 'Zend/Validate/Int.php';
require 'Zend/Filter/Input.php';
require 'Zend/Filter/Int.php';
require 'Zend/Filter/LocalizedToNormalized.php';
require 'Zend/Registry.php';


// Same reproduction as the options-array variant, except that no 'locale'
// option is passed to any filter or validator: the locale is stored as a
// Zend_Locale object in Zend_Registry under the key 'Zend_Locale' instead.
foreach (array('en_US', 'hi_IN') as $locale) {
    $locale_obj = new Zend_Locale($locale);
    Zend_Registry::set('Zend_Locale', $locale_obj);

    // The second value is spelled as UTF-8 byte escapes because the pasted
    // literal had been mangled into '?'. The bytes are the ones printed by the
    // hex dump in the recorded output (e0 a5 a6, i.e. U+0966).
    // NOTE(review): the original comment called this "digit 1 in Hindi script",
    // but U+0966 is DEVANAGARI DIGIT ZERO (digit one would be e0 a5 a7) - confirm
    // which digit was intended.
    foreach (array('1', "\xe0\xa5\xa6") as $value) {
        $int_filter = new Zend_Filter_Int();
        $norm_filter = new Zend_Filter_LocalizedToNormalized();
        // The chain without $norm_filter used to be assigned here and was then
        // immediately overwritten; only the effective assignment is kept.
        $filters = array('value' => array('StringTrim', $int_filter, $norm_filter));
        $validators = array('value' => new Zend_Validate_Int());
        $input = new Zend_Filter_Input($filters, $validators);
        $input->setData(array('value' => $value));

        // No options array here either (previously array('locale'=>$locale)).
        $isnum = Zend_Locale_Format::isNumber($value);
        echo "isNumber('$value') == '" . var_export($isnum, true) . "' in locale '$locale'\n";

        if ($input->isValid()) {
            printf("Value '%s' *is* a valid Int in locale '%s'\n", $value, $locale);
            // Shows whether PHP arithmetic agrees that the raw value is an integer.
            printf("'%d' should be 43\n", $value + 42);
        } else {
            printf("Value '%s' is *NOT* a valid Int in locale '%s':\n", $value, $locale);
            // Since just printing to bash command prompt doesn't render
            // the UTF-8 3-byte character usefully, show the bytes in hex...
            printf("...the value in hex is '");
            $byte_count = strlen($value);
            for ($i = 0; $i < $byte_count; ++$i) {
                printf("%02x ", ord(substr($value, $i, 1)));
            }
            printf("'\n");
            // The format string has no placeholders, so no arguments are passed.
            printf("Messages:\n");
            echo var_export($input->getMessages(), true) . "\n";
        }
        echo "\n";
    }
}

At this point I'm completely stumped - I really think there is some sort of problem with the ZF code for filters/validators for Ints in the hi_IN locale (and the other 64 shown in the original report). But I can't pinpoint just what it is.

@sootsnoot

This comment has been minimized.

Show comment Hide comment
@sootsnoot

sootsnoot Jul 18, 2013

As if I hadn't already ground out enough on this - I just noticed that in the first testcase of my previous comment I used the Digits filter (as suggested by @weierophinney), but then in the subsequent testcases I switch to using the Int filter instead (because I thought it more likely to respond to a locale option). Just for "fun"(??) I tweaked the final testcase to use the Digits filter instead of Int. The only effect it had was to change the Hindi digit in the 'en_US' locale from being considered a valid Int to being invalid (with the isEmpty message, i.e. the filter removed it before the validator saw it).

As if I hadn't already ground out enough on this - I just noticed that in the first testcase of my previous comment I used the Digits filter (as suggested by @weierophinney), but then in the subsequent testcases I switch to using the Int filter instead (because I thought it more likely to respond to a locale option). Just for "fun"(??) I tweaked the final testcase to use the Digits filter instead of Int. The only effect it had was to change the Hindi digit in the 'en_US' locale from being considered a valid Int to being invalid (with the isEmpty message, i.e. the filter removed it before the validator saw it).

@sootsnoot

This comment has been minimized.

Show comment Hide comment
@sootsnoot

sootsnoot Jul 20, 2013

I deleted a post I made yesterday because I did not seem to be getting results consistent with previous posts. I'm triple-checking the code and corresponding results in this post.

In the very last testcase of my previous long post (not the one I deleted), I intended to show the code with explicit options arrays specifying the locale removed and replaced by storing a Zend_Locale object in the Zend_Registry with key 'Zend_Locale'. But at the same time I added a LocalizedToNormalized filter, and it's never a good idea to make two changes at the same time when looking for changes in the output. And with some additional testing I found that I had specified the filters in the wrong order: if it's to be useful, you need to run the LocalizedToNormalized filter before the Int filter, not after it. And to top that off, the way I copy-pasted the code turned the Hindi digit for '1' into a '?' replacement character.

Below is a new test program with the locale specified in the registry, and focused more directly on the problem encountered in our application (whose bootstrap also stores the locale determined from the browser in the registry). Other tests have shown that Hindi digits don't produce numeric values, so I got rid of those because they're a nuisance to copy-paste, are not something this application intends to deal with, and they make the output too long. Other tests also demonstrated to me that proper use of the LocalizedToNormalized filter removed the comma from large values with latin digits like "1,234" in both the 'en_US' and 'hi_IN' locales; but that's not relevant to the problem encountered, so I'm not using that filter or values with commas. The actual problem centers on the treatment of single-ascii-digit vs multiple-ascii-digit integers in the 'hi_IN' locale (and presumably the other 64 listed) vs the 'en_US' locale (and others including multi-byte locales like 'ja_JP', 'zh_CN', 'lo_LA').

The reason that 'hi_IN' is a problem for me is that I test the completeness of our application's gettext translation usage by taking its message catalog for en_US and generating a hi_IN catalog from it automatically, where every message gets a "translation" consisting of the original English text with a couple of Hindi characters prepended to it. In this way I can set my browser's locale to 'hi_IN', and then run the web application looking for text that does not begin with those Hindi characters. Hindi was chosen simply because the characters stand out very clearly to the eye. The method also helps ensure that UTF-8 encoding is in place everywhere (e.g. in subject and text of application messages sent by email). We don't support Hindi as an input language on forms - but by convention our controllers use Zend_Filter_Input to validate request parameters. Normally the request parameter values will be auto-increment primary key values from the database, but obviously they can be typed manually, and we don't want to accept garbage. So in this situation I don't particularly need or want filtering, I just want to validate that the values as-is are integers and give an error if they're not. As a safety practice our controllers typically use filters 'HtmlEntities', 'StripTags', 'StringTrim' on all request parameters. And for validating integer values they use the 'Int' validator. This works fine in the 'en_US' locale and most others, but fails when testing for i18n completeness using 'hi_IN' - that's what triggered the bug report. My simplest workaround is just to use a different multi-byte locale with eye-catching glyphs, such as lo_LA.

Having said all that, this will be my last post on the subject, to provide a test program that I believe demonstrates the peculiar nature of the problem most clearly. If anyone has an explanation for why the behavior is correct, I'd love to hear it. And if anyone has questions or would like me to do something more with the test code, I'd be happy to try. But for the application in question, I've already switched from using hi_IN to lo_LA to look for missing translation hooks in the source, so I'm no longer seeing the problematic failures.

What I found, as demonstrated by the test program below, is that for the Int validator to work properly on single-digit ascii values in the 'hi_IN' locale, the value must be filtered specifically by the Int filter. Filtering with the Digits filter, or a filter that passes the value unchanged, causes a single digit to fail Int validation. If the value consists of more than one ascii digit, the filter used doesn't matter, the value validates successfully with Int.

Here is the test program:

<?php
// Put library on include_path
set_include_path(implode(PATH_SEPARATOR, array(
    realpath('./library'),
    get_include_path()
)));

require 'Zend/Registry.php';
require 'Zend/Filter/Digits.php';
require 'Zend/Filter/Int.php';
require 'Zend/Filter/Callback.php';
require 'Zend/Validate/Int.php';
require 'Zend/Validate/Digits.php';
require 'Zend/Validate/Callback.php';
require 'Zend/Filter/Input.php';

/**
 * Identity filter callback: hands the value back untouched.
 *
 * Wrapped in Zend_Filter_Callback by this script to act as the "Pass" filter
 * choice, i.e. a filter stage that changes nothing.
 *
 * @param  mixed $val Value to filter.
 * @return mixed The same value, unmodified.
 */
function passFilter($val)
{
    return $val;
}

/**
 * Validator callback that accepts every value.
 *
 * Wrapped in Zend_Validate_Callback by this script to act as the "Any"
 * validator choice.
 *
 * @param  mixed $val Value to validate (ignored).
 * @return bool Always true.
 */
function anyValid($val)
{
    return true;
}

// Runs every validator choice against every filter choice for each test value
// in each locale. The locale is stored as a Zend_Locale object in Zend_Registry
// under 'Zend_Locale'; no locale option is passed to the filters or validators.
foreach (array('en_US', 'lo_LA', 'hi_IN') as $locale) {
    printf("    =====================================\n");
    // The locale is passed as an argument instead of being interpolated into
    // the format string, so it can never be read as a conversion specification.
    printf("    ==== Testing with locale '%s' ====\n", $locale);
    printf("    =====================================\n");
    $locale_obj = new Zend_Locale($locale);
    Zend_Registry::set('Zend_Locale', $locale_obj);
    $validator_choices = array(
        'Int'    => new Zend_Validate_Int(),
        'Digits' => new Zend_Validate_Digits(),
        'Any'    => new Zend_Validate_Callback('anyValid'),
    );
    $filter_choices = array(
        'Int'    => new Zend_Filter_Int(),
        'Digits' => new Zend_Filter_Digits(),
        'Pass'   => new Zend_Filter_Callback('passFilter'),
    );
    // Filters applied ahead of the filter under test; kept in one place so the
    // chain that is run and the chain that is reported cannot drift apart.
    $base_filters = array('HtmlEntities', 'StripTags', 'StringTrim');
    foreach (array('12', '1',
                /* same two strings as above, but using Hindi digits */
                /* '१२', '१', */) as $value) {
        foreach ($validator_choices as $validator_name => $validator_object) {
            foreach ($filter_choices as $filter_name => $filter_object) {
                // Independent check of the raw value with Zend_Locale_Format.
                $isnum = Zend_Locale_Format::isNumber($value);
                echo "    isNumber('$value') == '" . var_export($isnum, true) . "' in locale '$locale'\n";
                $filters = array('value' => array_merge($base_filters, array($filter_object)));
                $validators = array('value' => $validator_object);
                $input = new Zend_Filter_Input($filters, $validators);
                $input->setData(array('value' => $value));
                printf("    Filters '%s', validator '%s':\n",
                       implode(",", array_merge($base_filters, array($filter_name))), $validator_name);
                if ($input->isValid()) {
                    $filtered = $input->getEscaped('value');
                    // Loose comparison on purpose: only report a change in value,
                    // not a change in type.
                    if ($filtered != $value) {
                        printf("    Raw value '%s' *was filtered* to '%s', which is valid in locale '%s'\n",
                               $value, $filtered, $locale);
                    }
                    // Cross-check the accepted value against PHP's own notion of an int.
                    if (!(is_numeric($filtered) && is_int(0 + $filtered))) {
                        printf("    ...Zend_Filter_Input says the raw value '%s' is valid, PHP says the filtered value '%s' is not an int\n", $value, $filtered);
                    }
                } else {
                    printf("    *** Raw value '%s' was *NOT* filtered to a valid value in locale '%s':\n",
                           $value, $locale);
                    if (is_numeric($value)) {
                        printf("    ...PHP says the raw value '%s' is numeric\n", $value);
                        if (is_int($value + 0)) {
                            printf("    ...PHP says the raw value '%s' + 0 is an int\n", $value);
                        }
                    }
                    // Since just printing to bash command prompt doesn't render
                    // the UTF-8 3-byte characters usefully, show the bytes in hex...
                    printf("    ...the raw value in hex is '");
                    $byte_count = strlen($value);
                    for ($i = 0; $i < $byte_count; ++$i) {
                        printf("%02x", ord(substr($value, $i, 1)));
                        // Space between bytes, but not after the last one.
                        if ($i + 1 < $byte_count) {
                            printf(" ");
                        }
                    }
                    printf("'\n");
                    // The format string has no placeholders, so no arguments are passed.
                    printf("    Messages:\n");
                    echo var_export($input->getMessages(), true) . "\n";
                }
                echo "\n";
            }
        }
    }
}

Here is the output:

=====================================
==== Testing with locale 'en_US' ====
=====================================
isNumber('12') == 'true' in locale 'en_US'
Filters 'HtmlEntities,StripTags,StringTrim,Int', validator 'Int':

isNumber('12') == 'true' in locale 'en_US'
Filters 'HtmlEntities,StripTags,StringTrim,Digits', validator 'Int':

isNumber('12') == 'true' in locale 'en_US'
Filters 'HtmlEntities,StripTags,StringTrim,Pass', validator 'Int':

isNumber('12') == 'true' in locale 'en_US'
Filters 'HtmlEntities,StripTags,StringTrim,Int', validator 'Digits':

isNumber('12') == 'true' in locale 'en_US'
Filters 'HtmlEntities,StripTags,StringTrim,Digits', validator 'Digits':

isNumber('12') == 'true' in locale 'en_US'
Filters 'HtmlEntities,StripTags,StringTrim,Pass', validator 'Digits':

isNumber('12') == 'true' in locale 'en_US'
Filters 'HtmlEntities,StripTags,StringTrim,Int', validator 'Any':

isNumber('12') == 'true' in locale 'en_US'
Filters 'HtmlEntities,StripTags,StringTrim,Digits', validator 'Any':

isNumber('12') == 'true' in locale 'en_US'
Filters 'HtmlEntities,StripTags,StringTrim,Pass', validator 'Any':

isNumber('1') == 'true' in locale 'en_US'
Filters 'HtmlEntities,StripTags,StringTrim,Int', validator 'Int':

isNumber('1') == 'true' in locale 'en_US'
Filters 'HtmlEntities,StripTags,StringTrim,Digits', validator 'Int':

isNumber('1') == 'true' in locale 'en_US'
Filters 'HtmlEntities,StripTags,StringTrim,Pass', validator 'Int':

isNumber('1') == 'true' in locale 'en_US'
Filters 'HtmlEntities,StripTags,StringTrim,Int', validator 'Digits':

isNumber('1') == 'true' in locale 'en_US'
Filters 'HtmlEntities,StripTags,StringTrim,Digits', validator 'Digits':

isNumber('1') == 'true' in locale 'en_US'
Filters 'HtmlEntities,StripTags,StringTrim,Pass', validator 'Digits':

isNumber('1') == 'true' in locale 'en_US'
Filters 'HtmlEntities,StripTags,StringTrim,Int', validator 'Any':

isNumber('1') == 'true' in locale 'en_US'
Filters 'HtmlEntities,StripTags,StringTrim,Digits', validator 'Any':

isNumber('1') == 'true' in locale 'en_US'
Filters 'HtmlEntities,StripTags,StringTrim,Pass', validator 'Any':

=====================================
==== Testing with locale 'hi_IN' ====
=====================================
isNumber('12') == 'true' in locale 'hi_IN'
Filters 'HtmlEntities,StripTags,StringTrim,Int', validator 'Int':

isNumber('12') == 'true' in locale 'hi_IN'
Filters 'HtmlEntities,StripTags,StringTrim,Digits', validator 'Int':

isNumber('12') == 'true' in locale 'hi_IN'
Filters 'HtmlEntities,StripTags,StringTrim,Pass', validator 'Int':

isNumber('12') == 'true' in locale 'hi_IN'
Filters 'HtmlEntities,StripTags,StringTrim,Int', validator 'Digits':

isNumber('12') == 'true' in locale 'hi_IN'
Filters 'HtmlEntities,StripTags,StringTrim,Digits', validator 'Digits':

isNumber('12') == 'true' in locale 'hi_IN'
Filters 'HtmlEntities,StripTags,StringTrim,Pass', validator 'Digits':

isNumber('12') == 'true' in locale 'hi_IN'
Filters 'HtmlEntities,StripTags,StringTrim,Int', validator 'Any':

isNumber('12') == 'true' in locale 'hi_IN'
Filters 'HtmlEntities,StripTags,StringTrim,Digits', validator 'Any':

isNumber('12') == 'true' in locale 'hi_IN'
Filters 'HtmlEntities,StripTags,StringTrim,Pass', validator 'Any':

isNumber('1') == 'true' in locale 'hi_IN'
Filters 'HtmlEntities,StripTags,StringTrim,Int', validator 'Int':

isNumber('1') == 'true' in locale 'hi_IN'
Filters 'HtmlEntities,StripTags,StringTrim,Digits', validator 'Int':
*** Raw value '1' was *NOT* filtered to a valid value in locale 'hi_IN':
...PHP says the raw value '1' is numeric
...PHP says the raw value '1' + 0 is an int
...the raw value in hex is '31'
Messages:
array (
  'value' =>
  array (
    'notInt' => '\'1\' does not appear to be an integer',
  ),
)

isNumber('1') == 'true' in locale 'hi_IN'
Filters 'HtmlEntities,StripTags,StringTrim,Pass', validator 'Int':
*** Raw value '1' was *NOT* filtered to a valid value in locale 'hi_IN':
...PHP says the raw value '1' is numeric
...PHP says the raw value '1' + 0 is an int
...the raw value in hex is '31'
Messages:
array (
  'value' =>
  array (
    'notInt' => '\'1\' does not appear to be an integer',
  ),
)

isNumber('1') == 'true' in locale 'hi_IN'
Filters 'HtmlEntities,StripTags,StringTrim,Int', validator 'Digits':

isNumber('1') == 'true' in locale 'hi_IN'
Filters 'HtmlEntities,StripTags,StringTrim,Digits', validator 'Digits':

isNumber('1') == 'true' in locale 'hi_IN'
Filters 'HtmlEntities,StripTags,StringTrim,Pass', validator 'Digits':

isNumber('1') == 'true' in locale 'hi_IN'
Filters 'HtmlEntities,StripTags,StringTrim,Int', validator 'Any':

isNumber('1') == 'true' in locale 'hi_IN'
Filters 'HtmlEntities,StripTags,StringTrim,Digits', validator 'Any':

isNumber('1') == 'true' in locale 'hi_IN'
Filters 'HtmlEntities,StripTags,StringTrim,Pass', validator 'Any':

I deleted a post I made yesterday because I did not seem to be getting results consistent with previous posts. I'm triple-checking the code and corresponding results in this post.

In the very last testcase of my previous long post (not the one I deleted), I intended to show the code with explicit options arrays specifying the locale removed and replaced by storing a Zend_Locale object in the Zend_Registry with key 'Zend_Locale'. But at the same time I added a LocalizedToNormalized filter, and it's never a good idea to make two changes at the same time when looking for changes in the output. And with some additional testing I found that I had specified the filters in the wrong order: if it's to be useful, you need to run the LocalizedToNormalized filter before the Int filter, not after it. And to top that off, the way I copy-pasted the code turned the Hindi digit for '1' into a '?' replacement character.

Below is a new test program with the locale specified in the registry, and focused more directly on the problem encountered in our application (whose bootstrap also stores the locale determined from the browser in the registry). Other tests have shown that Hindi digits don't produce numeric values, so I got rid of those because they're a nuisance to copy-paste, are not something this application intends to deal with, and they make the output too long. Other tests also demonstrated to me that proper use of the LocalizedToNormalized filter removed the comma from large values with latin digits like "1,234" in both the 'en_US' and 'hi_IN' locales; but that's not relevant to the problem encountered, so I'm not using that filter or values with commas. The actual problem centers on the treatment of single-ascii-digit vs multiple-ascii-digit integers in the 'hi_IN' locale (and presumably the other 64 listed) vs the 'en_US' locale (and others including multi-byte locales like 'ja_JP', 'zh_CN', 'lo_LA').

The reason that 'hi_IN' is a problem for me is that I test the completeness of our application's gettext translation usage by taking its message catalog for en_US and generating a hi_IN catalog from it automatically, where every message gets a "translation" consisting of the original English text with a couple of Hindi characters prepended to it. In this way I can set my browser's locale to 'hi_IN', and then run the web application looking for text that does not begin with those Hindi characters. Hindi was chosen simply because the characters stand out very clearly to the eye. The method also helps ensure that UTF-8 encoding is in place everywhere (e.g. in subject and text of application messages sent by email). We don't support Hindi as an input language on forms - but by convention our controllers use Zend_Filter_Input to validate request parameters. Normally the request parameter values will be auto-increment primary key values from the database, but obviously they can be typed manually, and we don't want to accept garbage. So in this situation I don't particularly need or want filtering, I just want to validate that the values as-is are integers and give an error if they're not. As a safety practice our controllers typically use filters 'HtmlEntities', 'StripTags', 'StringTrim' on all request parameters. And for validating integer values they use the 'Int' validator. This works fine in the 'en_US' locale and most others, but fails when testing for i18n completeness using 'hi_IN' - that's what triggered the bug report. My simplest workaround is just to use a different multi-byte locale with eye-catching glyphs, such as lo_LA.

Having said all that, this will be my last post on the subject, to provide a test program that I believe demonstrates the peculiar nature of the problem most clearly. If anyone has an explanation for why the behavior is correct, I'd love to hear it. And if anyone has questions or would like me to do something more with the test code, I'd be happy to try. But for the application in question, I've already switched from using hi_IN to lo_LA to look for missing translation hooks in the source, so I'm no longer seeing the problematic failures.

What I found, as demonstrated by the test program below, is that for the Int validator to work properly on single-digit ascii values in the 'hi_IN' locale, the value must be filtered specifically by the Int filter. Filtering with the Digits filter, or a filter that passes the value unchanged, causes a single digit to fail Int validation. If the value consists of more than one ascii digit, the filter used doesn't matter, the value validates successfully with Int.

Here is the test program:

<?php
// Put library on include_path
set_include_path(implode(PATH_SEPARATOR, array(
    realpath('./library'),
    get_include_path()
)));

require 'Zend/Registry.php';
require 'Zend/Filter/Digits.php';
require 'Zend/Filter/Int.php';
require 'Zend/Filter/Callback.php';
require 'Zend/Validate/Int.php';
require 'Zend/Validate/Digits.php';
require 'Zend/Validate/Callback.php';
require 'Zend/Filter/Input.php';

/**
 * Identity filter callback: hands the value back untouched.
 *
 * Wrapped in Zend_Filter_Callback by this script to act as the "Pass" filter
 * choice, i.e. a filter stage that changes nothing.
 *
 * @param  mixed $val Value to filter.
 * @return mixed The same value, unmodified.
 */
function passFilter($val)
{
    return $val;
}

/**
 * Validator callback that accepts every value.
 *
 * Wrapped in a Zend_Validate_Callback so that a validator which never rejects
 * can take part in the filter/validator test matrix.
 *
 * @param mixed $val Value to validate (ignored).
 * @return bool Always true.
 */
function anyValid($val)
{
    return true;
}

// Filters applied to every request parameter ahead of the filter under test.
$base_filters = array('HtmlEntities', 'StripTags', 'StringTrim');

// For each locale, push each test value through every filter/validator pairing
// using Zend_Filter_Input, and print diagnostics whenever validation fails or
// the filtered value differs from the raw one.
foreach (array('en_US', 'lo_LA', 'hi_IN') as $locale) {
    printf("    =====================================\n");
    printf("    ==== Testing with locale '%s' ====\n", $locale);
    printf("    =====================================\n");

    // NOTE(review): the validators/filters below are built without arguments;
    // presumably they pick up their locale from this registry entry - confirm.
    Zend_Registry::set('Zend_Locale', new Zend_Locale($locale));

    $validator_choices = array(
        'Int'    => new Zend_Validate_Int(),
        'Digits' => new Zend_Validate_Digits(),
        'Any'    => new Zend_Validate_Callback('anyValid'),
    );
    $filter_choices = array(
        'Int'    => new Zend_Filter_Int(),
        'Digits' => new Zend_Filter_Digits(),
        'Pass'   => new Zend_Filter_Callback('passFilter'),
    );

    foreach (array('12', '1',
                /* same two strings as above, but using Hindi digits */
                /* '१२', '१', */) as $value) {
        // Depends only on the value and the active locale, so compute it once
        // per value instead of once per filter/validator combination.
        $isnum = Zend_Locale_Format::isNumber($value);

        foreach ($validator_choices as $validator_name => $validator_object) {
            foreach ($filter_choices as $filter_name => $filter_object) {
                printf("    isNumber('%s') == '%s' in locale '%s'\n",
                       $value, var_export($isnum, true), $locale);

                $filters    = array('value' => array_merge($base_filters, array($filter_object)));
                $validators = array('value' => $validator_object);
                $input      = new Zend_Filter_Input($filters, $validators);
                $input->setData(array('value' => $value));

                printf("    Filters '%s', validator '%s':\n",
                       implode(",", array_merge($base_filters, array($filter_name))),
                       $validator_name);

                if ($input->isValid()) {
                    $filtered = $input->getEscaped('value');
                    // Compare as strings: a loose != treats numeric strings such
                    // as '1' and '1.0' as equal and would hide a filter change.
                    if ((string) $filtered !== $value) {
                        printf("    Raw value '%s' *was filtered* to '%s', which is valid in locale '%s'\n",
                               $value, $filtered, $locale);
                    }
                    // is_numeric() guards the arithmetic so a non-numeric string
                    // is never added to 0.
                    if (!(is_numeric($filtered) && is_int(0 + $filtered))) {
                        printf("    ...Zend_Filter_Input says the raw value '%s' is valid, PHP says the filtered value '%s' is not an int\n",
                               $value, $filtered);
                    }
                } else {
                    printf("    *** Raw value '%s' was *NOT* filtered to a valid value in locale '%s':\n",
                           $value, $locale);
                    if (is_numeric($value)) {
                        printf("    ...PHP says the raw value '%s' is numeric\n", $value);
                        if (is_int($value + 0)) {
                            printf("    ...PHP says the raw value '%s' + 0 is an int\n", $value);
                        }
                    }
                    // Since just printing to bash command prompt doesn't render
                    // the UTF-8 3-byte characters usefully, show the bytes in hex
                    // as space-separated two-digit pairs.
                    printf("    ...the raw value in hex is '%s'\n",
                           implode(' ', str_split(bin2hex($value), 2)));
                    printf("    Messages:\n");
                    echo var_export($input->getMessages(), true) . "\n";
                }
                echo "\n";
            }
        }
    }
}

Here is the output:

=====================================
==== Testing with locale 'en_US' ====
=====================================
isNumber('12') == 'true' in locale 'en_US'
Filters 'HtmlEntities,StripTags,StringTrim,Int', validator 'Int':

isNumber('12') == 'true' in locale 'en_US'
Filters 'HtmlEntities,StripTags,StringTrim,Digits', validator 'Int':

isNumber('12') == 'true' in locale 'en_US'
Filters 'HtmlEntities,StripTags,StringTrim,Pass', validator 'Int':

isNumber('12') == 'true' in locale 'en_US'
Filters 'HtmlEntities,StripTags,StringTrim,Int', validator 'Digits':

isNumber('12') == 'true' in locale 'en_US'
Filters 'HtmlEntities,StripTags,StringTrim,Digits', validator 'Digits':

isNumber('12') == 'true' in locale 'en_US'
Filters 'HtmlEntities,StripTags,StringTrim,Pass', validator 'Digits':

isNumber('12') == 'true' in locale 'en_US'
Filters 'HtmlEntities,StripTags,StringTrim,Int', validator 'Any':

isNumber('12') == 'true' in locale 'en_US'
Filters 'HtmlEntities,StripTags,StringTrim,Digits', validator 'Any':

isNumber('12') == 'true' in locale 'en_US'
Filters 'HtmlEntities,StripTags,StringTrim,Pass', validator 'Any':

isNumber('1') == 'true' in locale 'en_US'
Filters 'HtmlEntities,StripTags,StringTrim,Int', validator 'Int':

isNumber('1') == 'true' in locale 'en_US'
Filters 'HtmlEntities,StripTags,StringTrim,Digits', validator 'Int':

isNumber('1') == 'true' in locale 'en_US'
Filters 'HtmlEntities,StripTags,StringTrim,Pass', validator 'Int':

isNumber('1') == 'true' in locale 'en_US'
Filters 'HtmlEntities,StripTags,StringTrim,Int', validator 'Digits':

isNumber('1') == 'true' in locale 'en_US'
Filters 'HtmlEntities,StripTags,StringTrim,Digits', validator 'Digits':

isNumber('1') == 'true' in locale 'en_US'
Filters 'HtmlEntities,StripTags,StringTrim,Pass', validator 'Digits':

isNumber('1') == 'true' in locale 'en_US'
Filters 'HtmlEntities,StripTags,StringTrim,Int', validator 'Any':

isNumber('1') == 'true' in locale 'en_US'
Filters 'HtmlEntities,StripTags,StringTrim,Digits', validator 'Any':

isNumber('1') == 'true' in locale 'en_US'
Filters 'HtmlEntities,StripTags,StringTrim,Pass', validator 'Any':

=====================================
==== Testing with locale 'hi_IN' ====
=====================================
isNumber('12') == 'true' in locale 'hi_IN'
Filters 'HtmlEntities,StripTags,StringTrim,Int', validator 'Int':

isNumber('12') == 'true' in locale 'hi_IN'
Filters 'HtmlEntities,StripTags,StringTrim,Digits', validator 'Int':

isNumber('12') == 'true' in locale 'hi_IN'
Filters 'HtmlEntities,StripTags,StringTrim,Pass', validator 'Int':

isNumber('12') == 'true' in locale 'hi_IN'
Filters 'HtmlEntities,StripTags,StringTrim,Int', validator 'Digits':

isNumber('12') == 'true' in locale 'hi_IN'
Filters 'HtmlEntities,StripTags,StringTrim,Digits', validator 'Digits':

isNumber('12') == 'true' in locale 'hi_IN'
Filters 'HtmlEntities,StripTags,StringTrim,Pass', validator 'Digits':

isNumber('12') == 'true' in locale 'hi_IN'
Filters 'HtmlEntities,StripTags,StringTrim,Int', validator 'Any':

isNumber('12') == 'true' in locale 'hi_IN'
Filters 'HtmlEntities,StripTags,StringTrim,Digits', validator 'Any':

isNumber('12') == 'true' in locale 'hi_IN'
Filters 'HtmlEntities,StripTags,StringTrim,Pass', validator 'Any':

isNumber('1') == 'true' in locale 'hi_IN'
Filters 'HtmlEntities,StripTags,StringTrim,Int', validator 'Int':

isNumber('1') == 'true' in locale 'hi_IN'
Filters 'HtmlEntities,StripTags,StringTrim,Digits', validator 'Int':
*** Raw value '1' was *NOT* filtered to a valid value in locale 'hi_IN':
...PHP says the raw value '1' is numeric
...PHP says the raw value '1' + 0 is an int
...the raw value in hex is '31'
Messages:
array (
  'value' =>
  array (
    'notInt' => '\'1\' does not appear to be an integer',
  ),
)

isNumber('1') == 'true' in locale 'hi_IN'
Filters 'HtmlEntities,StripTags,StringTrim,Pass', validator 'Int':
*** Raw value '1' was *NOT* filtered to a valid value in locale 'hi_IN':
...PHP says the raw value '1' is numeric
...PHP says the raw value '1' + 0 is an int
...the raw value in hex is '31'
Messages:
array (
  'value' =>
  array (
    'notInt' => '\'1\' does not appear to be an integer',
  ),
)

isNumber('1') == 'true' in locale 'hi_IN'
Filters 'HtmlEntities,StripTags,StringTrim,Int', validator 'Digits':

isNumber('1') == 'true' in locale 'hi_IN'
Filters 'HtmlEntities,StripTags,StringTrim,Digits', validator 'Digits':

isNumber('1') == 'true' in locale 'hi_IN'
Filters 'HtmlEntities,StripTags,StringTrim,Pass', validator 'Digits':

isNumber('1') == 'true' in locale 'hi_IN'
Filters 'HtmlEntities,StripTags,StringTrim,Int', validator 'Any':

isNumber('1') == 'true' in locale 'hi_IN'
Filters 'HtmlEntities,StripTags,StringTrim,Digits', validator 'Any':

isNumber('1') == 'true' in locale 'hi_IN'
Filters 'HtmlEntities,StripTags,StringTrim,Pass', validator 'Any':
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment