-
-
Notifications
You must be signed in to change notification settings - Fork 6.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
enhance Inflector helper with ascii function #464
Changes from 5 commits
ea8b6e7
10c52d7
6546168
e4e1871
9ff2580
0aab6de
6b3abee
fc9fb80
26330b9
b7904c4
08aaeda
6f932d9
61591ea
b918fa5
8c1715d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -50,7 +50,7 @@ class Inflector | |
'/(ax|cris|test)is$/i' => '\1es', | ||
'/s$/' => 's', | ||
'/^$/' => '', | ||
'/$/' => 's', | ||
'/$/' => 's', | ||
); | ||
/** | ||
* @var array the rules for converting a word into its singular form. | ||
|
@@ -94,7 +94,7 @@ class Inflector | |
'/(n)ews$/i' => '\1\2ews', | ||
'/eaus$/' => 'eau', | ||
'/^(.*us)$/' => '\\1', | ||
'/s$/i' => '', | ||
'/s$/i' => '', | ||
); | ||
/** | ||
* @var array the special rules for converting a word between its plural form and singular form. | ||
|
@@ -214,59 +214,90 @@ class Inflector | |
'Yengeese' => 'Yengeese', | ||
); | ||
/** | ||
* @var array map of special chars and its translation. This is used by [[slug()]]. | ||
* @var array map of special chars and its translation. This is used by [[slug()]] and [[ascii()]]. | ||
*/ | ||
public static $transliteration = array( | ||
protected static $transliteration = array( | ||
'/Æ|Ǽ/' => 'AE', | ||
'/ä|æ|ǽ/' => 'ae', | ||
'/ö|œ/' => 'oe', | ||
'/ü/' => 'ue', | ||
'/Ä/' => 'Ae', | ||
'/Ü/' => 'Ue', | ||
'/Ö/' => 'Oe', | ||
'/À|Á|Â|Ã|Å|Ǻ|Ā|Ă|Ą|Ǎ/' => 'A', | ||
'/à|á|â|ã|å|ǻ|ā|ă|ą|ǎ|ª/' => 'a', | ||
'/Ç|Ć|Ĉ|Ċ|Č/' => 'C', | ||
'/ç|ć|ĉ|ċ|č/' => 'c', | ||
'/Ð|Ď|Đ/' => 'D', | ||
'/ð|ď|đ/' => 'd', | ||
'/È|É|Ê|Ë|Ē|Ĕ|Ė|Ę|Ě/' => 'E', | ||
'/è|é|ê|ë|ē|ĕ|ė|ę|ě/' => 'e', | ||
'/Ĝ|Ğ|Ġ|Ģ/' => 'G', | ||
'/ĝ|ğ|ġ|ģ/' => 'g', | ||
'/Ĥ|Ħ/' => 'H', | ||
'/ĥ|ħ/' => 'h', | ||
'/Ì|Í|Î|Ï|Ĩ|Ī|Ĭ|Ǐ|Į|İ/' => 'I', | ||
'/ì|í|î|ï|ĩ|ī|ĭ|ǐ|į|ı/' => 'i', | ||
'/Ĵ/' => 'J', | ||
'/ĵ/' => 'j', | ||
'/Ψ/' => 'PS', | ||
'/ψ/' => 'ps', | ||
'/À|Á|Â|Ã|Å|Ǻ|Ā|Ă|Ą|Ǎ|Ά/' => 'A', | ||
'/à|á|â|ã|å|ǻ|ā|ă|ą|ǎ|ª|ά/' => 'a', | ||
'/Б/' => 'B', | ||
'/β|б/' => 'b', | ||
'/Ç|Ć|Ĉ|Ċ|Č|Ц/' => 'C', | ||
'/ç|ć|ĉ|ċ|č|ц/' => 'c', | ||
'/Ч/' => 'Ch', | ||
'/ч/' => 'ch', | ||
'/©/' => '(c)', | ||
'/Ð|Ď|Đ|Δ|Д/' => 'D', | ||
'/ð|ď|đ|δ|д/' => 'd', | ||
'/È|É|Ê|Ë|Ē|Ĕ|Ė|Ę|Ě|Έ|Э/' => 'E', | ||
'/è|é|ê|ë|ē|ĕ|ė|ę|ě|ε|έ|э/' => 'e', | ||
'/Φ|Ф/' => 'F', | ||
'/φ|ƒ|ф/' => 'f', | ||
'/Ĝ|Ğ|Ġ|Ģ|Γ|Ґ/' => 'G', | ||
'/ĝ|ğ|ġ|ģ|γ|г|ґ/' => 'g', | ||
'/Ĥ|Ħ|Ή/' => 'H', | ||
'/ĥ|ħ|η|ή|н|х/' => 'h', | ||
'/Ì|Í|Î|Ï|Ĩ|Ī|Ĭ|Ǐ|Į|İ|Ί|И/' => 'I', | ||
'/ì|í|î|ï|ĩ|ī|ĭ|ǐ|į|ı|ι|ί|ϊ|ΐ|и/' => 'i', | ||
'/Ĵ|Й/' => 'J', | ||
'/ĵ|й/' => 'j', | ||
'/Ķ/' => 'K', | ||
'/ķ/' => 'k', | ||
'/Ĺ|Ļ|Ľ|Ŀ|Ł/' => 'L', | ||
'/ĺ|ļ|ľ|ŀ|ł/' => 'l', | ||
'/ķ|κ/' => 'k', | ||
'/Ĺ|Ļ|Ľ|Ŀ|Ł|Λ|Л/' => 'L', | ||
'/ĺ|ļ|ľ|ŀ|ł|λ|л/' => 'l', | ||
'/μ|м/' => 'm', | ||
'/Ñ|Ń|Ņ|Ň/' => 'N', | ||
'/ñ|ń|ņ|ň|ʼn/' => 'n', | ||
'/Ò|Ó|Ô|Õ|Ō|Ŏ|Ǒ|Ő|Ơ|Ø|Ǿ/' => 'O', | ||
'/ò|ó|ô|õ|ō|ŏ|ǒ|ő|ơ|ø|ǿ|º/' => 'o', | ||
'/ñ|ń|ņ|ň|ʼn|ν/' => 'n', | ||
'/Ò|Ó|Ô|Õ|Ō|Ŏ|Ǒ|Ő|Ơ|Ø|Ǿ|Ό/' => 'O', | ||
'/ò|ó|ô|õ|ō|ŏ|ǒ|ő|ơ|ø|ǿ|º|ο/' => 'o', | ||
'/Π/' => 'P', | ||
'/π|п/' => 'p', | ||
'/Ŕ|Ŗ|Ř/' => 'R', | ||
'/ŕ|ŗ|ř/' => 'r', | ||
'/Ś|Ŝ|Ş|Ș|Š/' => 'S', | ||
'/ś|ŝ|ş|ș|š|ſ/' => 's', | ||
'/Ţ|Ț|Ť|Ŧ/' => 'T', | ||
'/ţ|ț|ť|ŧ/' => 't', | ||
'/Ù|Ú|Û|Ũ|Ū|Ŭ|Ů|Ű|Ų|Ư|Ǔ|Ǖ|Ǘ|Ǚ|Ǜ/' => 'U', | ||
'/ù|ú|û|ũ|ū|ŭ|ů|ű|ų|ư|ǔ|ǖ|ǘ|ǚ|ǜ/' => 'u', | ||
'/Ý|Ÿ|Ŷ/' => 'Y', | ||
'/ý|ÿ|ŷ/' => 'y', | ||
'/Ŵ/' => 'W', | ||
'/ŵ/' => 'w', | ||
'/Ź|Ż|Ž/' => 'Z', | ||
'/ź|ż|ž/' => 'z', | ||
'/Æ|Ǽ/' => 'AE', | ||
'/ŕ|ŗ|ř|ρ|р/' => 'r', | ||
'/Ś|Ŝ|Ş|Ș|Š|Σ/' => 'S', | ||
'/ś|ŝ|ş|ș|š|ſ|σ|ς|с/' => 's', | ||
'/ß/' => 'ss', | ||
'/ẞ/' => 'SS', | ||
'/Ţ|Ț|Ť|Ŧ|τ/' => 'T', | ||
'/ţ|ț|ť|ŧ|т/' => 't', | ||
'/Ù|Ú|Û|Ũ|Ū|Ŭ|Ů|Ű|Ų|Ư|Ǔ|Ǖ|Ǘ|Ǚ|Ǜ|У/' => 'U', | ||
'/ù|ú|û|ũ|ū|ŭ|ů|ű|ų|ư|ǔ|ǖ|ǘ|ǚ|ǜ/' => 'u', | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Are you sure it is the same character? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @cebe yes... shouldn't we then convert the chars to their Unicode to make sure we are replacing the correct ones? I believe that allowing custom rules to override the ones provided would solve these issues, and everyone will be able to provide their own set of rules to add or override the ones provided... The list is huge... There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
The file is saved as utf8 so they are unicode, or what do you mean?
I think we already have the most important ones. We just need to make sure they are correct. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe we can try to automatically create rules using the Unicode table: http://www.utf8-zeichentabelle.de/ Not sure if it will work well. |
||
'/в/' => 'v', | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This one should be There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Another reference for unicode character table: There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @cebe All fine here. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @cebe If i'm not wrong CLDR have transliteration rules for every language. Why not just use it? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. IMHO Transliteration I believe, that we should only provide transliteration for romanic languages (maybe include What do you guys think? @qiangxue @creocoder @andersonamuller @cebe There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Better to support all languages based on Cyrillic script, not just Russian. They all have a very small set of different characters. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @resurtm http://www.unicode.org/charts/PDF/U0400.pdf Any other reference table to include? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is OK. I'm unsure whether Old Church Slavonic is used anywhere nowadays. It's like Old English—I've never seen any modern texts which use it. Looks meaningless as well. |
||
'/χ/' => 'x', | ||
'/Ý|Ÿ|Ŷ|Ύ|Ϋ/' => 'Y', | ||
'/ý|ÿ|ŷ|υ|ύ|ΰ|ы/' => 'y', | ||
'/Ŵ|Ω|Ώ/' => 'W', | ||
'/ŵ|ω|ώ/' => 'w', | ||
'/Ź|Ż|Ž|З/' => 'Z', | ||
'/ź|ż|ž|ζ|з/' => 'z', | ||
'/IJ/' => 'IJ', | ||
'/ij/' => 'ij', | ||
'/Œ/' => 'OE', | ||
'/ƒ/' => 'f' | ||
'/Ш|Щ/' => 'Sh', | ||
'/ш|щ/' => 'sh', | ||
'/Я/' => 'Ya', | ||
'/я/' => 'ya', | ||
'/Є/' => 'Ye', | ||
'/є/' => 'ye', | ||
'/Ї/' => 'Yi', | ||
'/ї/' => 'yi', | ||
'/Ё/' => 'Yo', | ||
'/ё/' => 'yo', | ||
'/Ю/' => 'Yu', | ||
'/ю/' => 'yu', | ||
'/Ж/' => 'Zh', | ||
'/ж/' => 'zh', | ||
'/ξ|Ξ/' => '3', | ||
'/θ/' => '8', | ||
'/ъ|ь|Ъ|Ы|Ь/' => '', | ||
); | ||
|
||
/** | ||
|
@@ -431,21 +462,20 @@ public static function tableize($class_name) | |
|
||
/** | ||
* Returns a string with all spaces converted to given replacement and | ||
* non word characters removed. Maps special characters to ASCII using | ||
* `Inflector::$transliteration` | ||
* non word characters removed. Maps special characters to ASCII using [[ascii()]] | ||
* @param string $string An arbitrary string to convert | ||
* @param string $replacement The replacement to use for spaces | ||
* @return string The converted string. | ||
*/ | ||
public static function slug($string, $replacement = '-') | ||
{ | ||
$map = static::$transliteration + array( | ||
'/[^\w\s]/' => ' ', | ||
'/\\s+/' => $replacement, | ||
'/(?<=[a-z])([A-Z])/' => $replacement . '\\1', | ||
str_replace(':rep', preg_quote($replacement, '/'), '/^[:rep]+|[:rep]+$/') => '' | ||
); | ||
return preg_replace(array_keys($map), array_values($map), $string); | ||
$map = array( | ||
'/[^\w\s]/' => ' ', | ||
'/\\s+/' => $replacement, | ||
'/(?<=[a-z])([A-Z])/' => $replacement . '\\1', | ||
str_replace(':rep', preg_quote($replacement, '/'), '/^[:rep]+|[:rep]+$/') => '' | ||
); | ||
return preg_replace(array_keys($map), array_values($map), static::ascii($string)); | ||
} | ||
|
||
/** | ||
|
@@ -475,4 +505,15 @@ public static function ordinalize($number) | |
default: return $number . 'th'; | ||
} | ||
} | ||
|
||
/**+ | ||
* Converts all special characters to the closest ascii character equivalent. | ||
* @param string $string the string to be converted. | ||
* @return string the translated | ||
*/ | ||
public static function ascii($string) | ||
{ | ||
$map = static::$transliteration; | ||
return preg_replace(array_keys($map), array_values($map), $string); | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why turn this into protected?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
imo they should all be customizable.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Also, the transliteration property has an incorrect map. Please check this: http://iamseanmurphy.com/creating-seo-friendly-urls-in-php-with-url-slug/ .
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I was inspired by that link; how come it is incorrect?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@tonydspaniard For example, there is no rule for Cyrillic `а`. Also, for example, the rule `'/ъ|ь|Ъ|Ы|Ь/' => ''` is incorrect because `Ы` should be `Y`. If you were inspired by this link, we should take these rules carefully ;)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@tonydspaniard I think it's a good idea to make some kind of converter to automatically convert the rules from the link to this format.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I agree with @cebe, working on this