Skip to content
This repository has been archived by the owner on Sep 18, 2021. It is now read-only.

Commit

Permalink
Merge pull request #58 from twitter/extend_gtld_and_cctld
Browse files Browse the repository at this point in the history
Extend gTLD and ccTLD lists (including Internationalized Domain Names)
  • Loading branch information
jakl committed Mar 11, 2014
2 parents 122dfa2 + d1acbbe commit ca5f893
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 18 deletions.
39 changes: 23 additions & 16 deletions src/com/twitter/Regex.java
Original file line number Diff line number Diff line change
Expand Up @@ -60,23 +60,30 @@ public class Regex {
// Character class used for Unicode characters in URLs: matches a literal '.' or any single character that is
// not in \p{Punct}, not whitespace (\s), not a separator (\p{Z}) and not in the General Punctuation block.
private static final String URL_VALID_UNICODE_CHARS = "[.[^\\p{Punct}\\s\\p{Z}\\p{InGeneralPunctuation}]]";

// Regex fragment for a generic top-level domain: a non-capturing alternation of literal gTLD names followed by a
// lookahead requiring that the next character is neither \p{Alnum} nor '@', or that the input ends.
// NOTE(review): this text is a rendered diff, so two string literals follow the declaration. The first (ending at
// the first ';') appears to be the removed value and the second the added value, which also lists non-ASCII
// (internationalized) names -- confirm against the commit.
// NOTE(review): \p{Alnum} is ASCII-only unless the pattern is compiled with UNICODE_CHARACTER_CLASS; confirm the
// lookahead behaves as intended when a non-ASCII name is followed by further non-ASCII letters.
private static final String URL_VALID_GTLD =
"(?:(?:academy|aero|agency|arpa|asia|bargains|berlin|bike|biz|blue|boutique|build|builders|buzz|cab|camera|camp|careers|cat|catering|" +
"center|ceo|cheap|cleaning|clothing|club|codes|coffee|com|community|company|computer|construction|contractors|cool|coop|cruises|" +
"dance|dating|democrat|diamonds|directory|domains|edu|education|email|enterprises|equipment|estate|events|expert|exposed|farm|" +
"flights|florist|gallery|gift|glass|gov|graphics|guitars|guru|holdings|holiday|house|immobilien|info|institute|int|international|" +
"jobs|kaufen|kim|kitchen|kiwi|land|lighting|limo|link|luxury|management|marketing|menu|mil|mobi|moda|monash|museum|nagoya|name|net|" +
"ninja|onl|org|partners|photo|photography|photos|pics|pink|plumbing|post|pro|properties|recipes|red|rentals|repair|report|rich|ruhr|" +
"sexy|shiksha|shoes|singles|social|solar|solutions|support|systems|tattoo|technology|tel|tienda|tips|today|tokyo|tools|training|" +
"travel|uno|ventures|viajes|voting|voyage|wang|watch|wed|wien|works|xxx|zone)(?=[^\\p{Alnum}@]|$))";
"(?:(?:academy|actor|aero|agency|arpa|asia|bar|bargains|berlin|best|bid|bike|biz|blue|boutique|build|builders|" +
"buzz|cab|camera|camp|cards|careers|cat|catering|center|ceo|cheap|christmas|cleaning|clothing|club|codes|" +
"coffee|com|community|company|computer|construction|contractors|cool|coop|cruises|dance|dating|democrat|" +
"diamonds|directory|domains|edu|education|email|enterprises|equipment|estate|events|expert|exposed|farm|fish|" +
"flights|florist|foundation|futbol|gallery|gift|glass|gov|graphics|guitars|guru|holdings|holiday|house|" +
"immobilien|industries|info|institute|int|international|jobs|kaufen|kim|kitchen|kiwi|koeln|kred|land|lighting|" +
"limo|link|luxury|management|mango|marketing|menu|mil|mobi|moda|monash|museum|nagoya|name|net|neustar|ninja|" +
"okinawa|onl|org|partners|parts|photo|photography|photos|pics|pink|plumbing|post|pro|productions|properties|" +
"pub|qpon|recipes|red|rentals|repair|report|reviews|rich|ruhr|sexy|shiksha|shoes|singles|social|solar|" +
"solutions|supplies|supply|support|systems|tattoo|technology|tel|tienda|tips|today|tokyo|tools|training|" +
"travel|uno|vacations|ventures|viajes|villas|vision|vote|voting|voto|voyage|wang|watch|wed|wien|wiki|works|" +
"xxx|xyz|zone|дети|онлайн|орг|сайт|بازار|شبكة|みんな|中信|中文网|公司|公益|在线|我爱你|政务|游戏|移动|网络|集团|삼성)" +
"(?=[^\\p{Alnum}@]|$))";
// Regex fragment for a country-code top-level domain: a non-capturing alternation of literal ccTLD names followed by
// a lookahead requiring that the next character is neither \p{Alnum} nor '@', or that the input ends.
// NOTE(review): this text is a rendered diff, so two string literals follow the declaration. The first (ending at
// the first ';') appears to be the removed value and the second the added value, which also lists non-ASCII
// (internationalized) names -- confirm against the commit.
// NOTE(review): \p{Alnum} is ASCII-only unless the pattern is compiled with UNICODE_CHARACTER_CLASS; confirm the
// lookahead behaves as intended when a non-ASCII name is followed by further non-ASCII letters.
private static final String URL_VALID_CCTLD =
"(?:(?:ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|" +
"bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|" +
"er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|" +
"hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|" +
"lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|" +
"nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|" +
"sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|" +
"va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw)(?=[^\\p{Alnum}@]|$))";
"(?:(?:ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bl|bm|bn|bo|bq|br|bs|" +
"bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cu|cv|cw|cx|cy|cz|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|" +
"et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|" +
"im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|" +
"me|mf|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|" +
"pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|ss|st|su|sv|" +
"sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|um|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|" +
"ye|yt|za|zm|zw|мон|рф|срб|укр|қаз|الاردن|الجزائر|السعودية|المغرب|امارات|ایران|بھارت|تونس|سودان|سورية|عمان|فلسطين|قطر|مصر|مليسيا|پاکستان|" +
"भारत|বাংলা|ভারত|ਭਾਰਤ|ભારત|இந்தியா|இலங்கை|சிங்கப்பூர்|భారత్|ලංකා|ไทย|გე|中国|中國|台湾|台灣|新加坡|" +
"香港|한국)(?=[^\\p{Alnum}@]|$))";
// Regex fragment for a Punycode-style label: the literal prefix "xn--" followed by one or more characters from
// [0-9a-z]. Hyphens after the prefix are not part of the character class and so are not matched.
private static final String URL_PUNYCODE = "(?:xn--[0-9a-z]+)";

private static final String URL_VALID_DOMAIN =
Expand Down
2 changes: 1 addition & 1 deletion test-data/twitter-text-conformance
Submodule twitter-text-conformance updated from dc8e01 to 80a225
5 changes: 4 additions & 1 deletion tests/com/twitter/RegexTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,9 @@ public void testAutoLinkUsernamesOrLists() {

// Checks Regex.VALID_URL against an ASCII URL and three URLs whose top-level domain is an internationalized
// name (one Japanese, one Chinese, one Arabic), expecting the value 8 from assertCaptureCount in every case.
// NOTE(review): assertCaptureCount is defined elsewhere; presumably it asserts the number of capture groups
// produced by the match -- confirm against the helper.
public void testValidURL() {
assertCaptureCount(8, Regex.VALID_URL, "http://example.com");
assertCaptureCount(8, Regex.VALID_URL, "http://はじめよう.みんな");
assertCaptureCount(8, Regex.VALID_URL, "http://はじめよう.香港");
assertCaptureCount(8, Regex.VALID_URL, "http://はじめよう.الجزائر");
}

public void testValidURLDoesNotCrashOnLongPaths() {
Expand Down Expand Up @@ -86,7 +89,7 @@ public void testValidURLWithoutProtocol() {
Regex.VALID_URL.matcher("t.co").matches());

assertFalse("Should not match a URL with invalid gTLD.",
Regex.VALID_URL.matcher("www.foo.bar").find());
Regex.VALID_URL.matcher("www.foo.baz").find());

assertTrue("Match a short URL with ccTLD and '/' but without protocol.",
Regex.VALID_URL.matcher("t.co/blahblah").matches());
Expand Down

0 comments on commit ca5f893

Please sign in to comment.