From 29c08f179a797c97aa88750bdabab237729582d3 Mon Sep 17 00:00:00 2001 From: Matt Sanford Date: Wed, 13 Jul 2011 07:36:28 -0700 Subject: [PATCH] Roll version 1.4.4. with the Japanese fixes form Keita, latest conformance, and comma fix for the extraction bug --- pkg/twitter-text-1.4.4.js | 975 ++++++++++++++++++++++++++++++++++++++ twitter-text.js | 2 +- 2 files changed, 976 insertions(+), 1 deletion(-) create mode 100644 pkg/twitter-text-1.4.4.js diff --git a/pkg/twitter-text-1.4.4.js b/pkg/twitter-text-1.4.4.js new file mode 100644 index 0000000..1eeb0b1 --- /dev/null +++ b/pkg/twitter-text-1.4.4.js @@ -0,0 +1,975 @@ +/*! + * twitter-text-js 1.4.4 + * + * Copyright 2011 Twitter, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this work except in compliance with the License. + * You may obtain a copy of the License below, or at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Apache License + * Version 2.0, January 2004 + * http://www.apache.org/licenses/ + * + * TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + * + * 1. Definitions. + * + * "License" shall mean the terms and conditions for use, reproduction, + * and distribution as defined by Sections 1 through 9 of this document. + * + * "Licensor" shall mean the copyright owner or entity authorized by + * the copyright owner that is granting the License. + * + * "Legal Entity" shall mean the union of the acting entity and all + * other entities that control, are controlled by, or are under common + * control with that entity. For the purposes of this definition, + * "control" means (i) the power, direct or indirect, to cause the + * direction or management of such entity, whether by contract or + * otherwise, or (ii) ownership of fifty percent (50%) or more of the + * outstanding shares, or (iii) beneficial ownership of such entity. + * + * "You" (or "Your") shall mean an individual or Legal Entity + * exercising permissions granted by this License. + * + * "Source" form shall mean the preferred form for making modifications, + * including but not limited to software source code, documentation + * source, and configuration files. + * + * "Object" form shall mean any form resulting from mechanical + * transformation or translation of a Source form, including but + * not limited to compiled object code, generated documentation, + * and conversions to other media types. + * + * "Work" shall mean the work of authorship, whether in Source or + * Object form, made available under the License, as indicated by a + * copyright notice that is included in or attached to the work + * (an example is provided in the Appendix below). + * + * "Derivative Works" shall mean any work, whether in Source or Object + * form, that is based on (or derived from) the Work and for which the + * editorial revisions, annotations, elaborations, or other modifications + * represent, as a whole, an original work of authorship. For the purposes + * of this License, Derivative Works shall not include works that remain + * separable from, or merely link (or bind by name) to the interfaces of, + * the Work and Derivative Works thereof. + * + * "Contribution" shall mean any work of authorship, including + * the original version of the Work and any modifications or additions + * to that Work or Derivative Works thereof, that is intentionally + * submitted to Licensor for inclusion in the Work by the copyright owner + * or by an individual or Legal Entity authorized to submit on behalf of + * the copyright owner. For the purposes of this definition, "submitted" + * means any form of electronic, verbal, or written communication sent + * to the Licensor or its representatives, including but not limited to + * communication on electronic mailing lists, source code control systems, + * and issue tracking systems that are managed by, or on behalf of, the + * Licensor for the purpose of discussing and improving the Work, but + * excluding communication that is conspicuously marked or otherwise + * designated in writing by the copyright owner as "Not a Contribution." + * + * "Contributor" shall mean Licensor and any individual or Legal Entity + * on behalf of whom a Contribution has been received by Licensor and + * subsequently incorporated within the Work. + * + * 2. Grant of Copyright License. Subject to the terms and conditions of + * this License, each Contributor hereby grants to You a perpetual, + * worldwide, non-exclusive, no-charge, royalty-free, irrevocable + * copyright license to reproduce, prepare Derivative Works of, + * publicly display, publicly perform, sublicense, and distribute the + * Work and such Derivative Works in Source or Object form. + * + * 3. Grant of Patent License. Subject to the terms and conditions of + * this License, each Contributor hereby grants to You a perpetual, + * worldwide, non-exclusive, no-charge, royalty-free, irrevocable + * (except as stated in this section) patent license to make, have made, + * use, offer to sell, sell, import, and otherwise transfer the Work, + * where such license applies only to those patent claims licensable + * by such Contributor that are necessarily infringed by their + * Contribution(s) alone or by combination of their Contribution(s) + * with the Work to which such Contribution(s) was submitted. If You + * institute patent litigation against any entity (including a + * cross-claim or counterclaim in a lawsuit) alleging that the Work + * or a Contribution incorporated within the Work constitutes direct + * or contributory patent infringement, then any patent licenses + * granted to You under this License for that Work shall terminate + * as of the date such litigation is filed. + * + * 4. Redistribution. You may reproduce and distribute copies of the + * Work or Derivative Works thereof in any medium, with or without + * modifications, and in Source or Object form, provided that You + * meet the following conditions: + * + * (a) You must give any other recipients of the Work or + * Derivative Works a copy of this License; and + * + * (b) You must cause any modified files to carry prominent notices + * stating that You changed the files; and + * + * (c) You must retain, in the Source form of any Derivative Works + * that You distribute, all copyright, patent, trademark, and + * attribution notices from the Source form of the Work, + * excluding those notices that do not pertain to any part of + * the Derivative Works; and + * + * (d) If the Work includes a "NOTICE" text file as part of its + * distribution, then any Derivative Works that You distribute must + * include a readable copy of the attribution notices contained + * within such NOTICE file, excluding those notices that do not + * pertain to any part of the Derivative Works, in at least one + * of the following places: within a NOTICE text file distributed + * as part of the Derivative Works; within the Source form or + * documentation, if provided along with the Derivative Works; or, + * within a display generated by the Derivative Works, if and + * wherever such third-party notices normally appear. The contents + * of the NOTICE file are for informational purposes only and + * do not modify the License. You may add Your own attribution + * notices within Derivative Works that You distribute, alongside + * or as an addendum to the NOTICE text from the Work, provided + * that such additional attribution notices cannot be construed + * as modifying the License. + * + * You may add Your own copyright statement to Your modifications and + * may provide additional or different license terms and conditions + * for use, reproduction, or distribution of Your modifications, or + * for any such Derivative Works as a whole, provided Your use, + * reproduction, and distribution of the Work otherwise complies with + * the conditions stated in this License. + * + * 5. Submission of Contributions. Unless You explicitly state otherwise, + * any Contribution intentionally submitted for inclusion in the Work + * by You to the Licensor shall be under the terms and conditions of + * this License, without any additional terms or conditions. + * Notwithstanding the above, nothing herein shall supersede or modify + * the terms of any separate license agreement you may have executed + * with Licensor regarding such Contributions. + * + * 6. Trademarks. This License does not grant permission to use the trade + * names, trademarks, service marks, or product names of the Licensor, + * except as required for reasonable and customary use in describing the + * origin of the Work and reproducing the content of the NOTICE file. + * + * 7. Disclaimer of Warranty. Unless required by applicable law or + * agreed to in writing, Licensor provides the Work (and each + * Contributor provides its Contributions) on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied, including, without limitation, any warranties or conditions + * of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + * PARTICULAR PURPOSE. You are solely responsible for determining the + * appropriateness of using or redistributing the Work and assume any + * risks associated with Your exercise of permissions under this License. + * + * 8. Limitation of Liability. In no event and under no legal theory, + * whether in tort (including negligence), contract, or otherwise, + * unless required by applicable law (such as deliberate and grossly + * negligent acts) or agreed to in writing, shall any Contributor be + * liable to You for damages, including any direct, indirect, special, + * incidental, or consequential damages of any character arising as a + * result of this License or out of the use or inability to use the + * Work (including but not limited to damages for loss of goodwill, + * work stoppage, computer failure or malfunction, or any and all + * other commercial damages or losses), even if such Contributor + * has been advised of the possibility of such damages. + * + * 9. Accepting Warranty or Additional Liability. While redistributing + * the Work or Derivative Works thereof, You may choose to offer, + * and charge a fee for, acceptance of support, warranty, indemnity, + * or other liability obligations and/or rights consistent with this + * License. However, in accepting such obligations, You may act only + * on Your own behalf and on Your sole responsibility, not on behalf + * of any other Contributor, and only if You agree to indemnify, + * defend, and hold each Contributor harmless for any liability + * incurred by, or claims asserted against, such Contributor by reason + * of your accepting any such warranty or additional liability. + */ + +if (!window.twttr) { + window.twttr = {}; +} + +(function() { + twttr.txt = {}; + twttr.txt.regexen = {}; + + var HTML_ENTITIES = { + '&': '&', + '>': '>', + '<': '<', + '"': '"', + "'": ' ' + }; + + // HTML escaping + twttr.txt.htmlEscape = function(text) { + return text && text.replace(/[&"'><]/g, function(character) { + return HTML_ENTITIES[character]; + }); + }; + + // Builds a RegExp + function regexSupplant(regex, flags) { + flags = flags || ""; + if (typeof regex !== "string") { + if (regex.global && flags.indexOf("g") < 0) { + flags += "g"; + } + if (regex.ignoreCase && flags.indexOf("i") < 0) { + flags += "i"; + } + if (regex.multiline && flags.indexOf("m") < 0) { + flags += "m"; + } + + regex = regex.source; + } + + return new RegExp(regex.replace(/#\{(\w+)\}/g, function(match, name) { + var newRegex = twttr.txt.regexen[name] || ""; + if (typeof newRegex !== "string") { + newRegex = newRegex.source; + } + return newRegex; + }), flags); + } + + // simple string interpolation + function stringSupplant(str, values) { + return str.replace(/#\{(\w+)\}/g, function(match, name) { + return values[name] || ""; + }); + } + + function addCharsToCharClass(charClass, start, end) { + for (var i = start; i <= end; i++) { + charClass.push(String.fromCharCode(i)); + } + return charClass; + } + + // Space is more than %20, U+3000 for example is the full-width space used with Kanji. Provide a short-hand + // to access both the list of characters and a pattern suitible for use with String#split + // Taken from: ActiveSupport::Multibyte::Handlers::UTF8Handler::UNICODE_WHITESPACE + var fromCode = String.fromCharCode; + var UNICODE_SPACES = [ + fromCode(0x0020), // White_Space # Zs SPACE + fromCode(0x0085), // White_Space # Cc + fromCode(0x00A0), // White_Space # Zs NO-BREAK SPACE + fromCode(0x1680), // White_Space # Zs OGHAM SPACE MARK + fromCode(0x180E), // White_Space # Zs MONGOLIAN VOWEL SEPARATOR + fromCode(0x2028), // White_Space # Zl LINE SEPARATOR + fromCode(0x2029), // White_Space # Zp PARAGRAPH SEPARATOR + fromCode(0x202F), // White_Space # Zs NARROW NO-BREAK SPACE + fromCode(0x205F), // White_Space # Zs MEDIUM MATHEMATICAL SPACE + fromCode(0x3000) // White_Space # Zs IDEOGRAPHIC SPACE + ]; + addCharsToCharClass(UNICODE_SPACES, 0x009, 0x00D); // White_Space # Cc [5] .. + addCharsToCharClass(UNICODE_SPACES, 0x2000, 0x200A); // White_Space # Zs [11] EN QUAD..HAIR SPACE + + twttr.txt.regexen.spaces_group = regexSupplant(UNICODE_SPACES.join("")); + twttr.txt.regexen.spaces = regexSupplant("[" + UNICODE_SPACES.join("") + "]"); + twttr.txt.regexen.punct = /\!'#%&'\(\)*\+,\\\-\.\/:;<=>\?@\[\]\^_{|}~/; + twttr.txt.regexen.atSigns = /[@@]/; + twttr.txt.regexen.extractMentions = regexSupplant(/(^|[^a-zA-Z0-9_])(#{atSigns})([a-zA-Z0-9_]{1,20})(?=(.|$))/g); + twttr.txt.regexen.extractReply = regexSupplant(/^(?:#{spaces})*#{atSigns}([a-zA-Z0-9_]{1,20})/); + twttr.txt.regexen.listName = /[a-zA-Z][a-zA-Z0-9_\-\u0080-\u00ff]{0,24}/; + + var nonLatinHashtagChars = []; + // Cyrillic + addCharsToCharClass(nonLatinHashtagChars, 0x0400, 0x04ff); // Cyrillic + addCharsToCharClass(nonLatinHashtagChars, 0x0500, 0x0527); // Cyrillic Supplement + // Hangul (Korean) + addCharsToCharClass(nonLatinHashtagChars, 0x1100, 0x11ff); // Hangul Jamo + addCharsToCharClass(nonLatinHashtagChars, 0x3130, 0x3185); // Hangul Compatibility Jamo + addCharsToCharClass(nonLatinHashtagChars, 0xA960, 0xA97F); // Hangul Jamo Extended-A + addCharsToCharClass(nonLatinHashtagChars, 0xAC00, 0xD7AF); // Hangul Syllables + addCharsToCharClass(nonLatinHashtagChars, 0xD7B0, 0xD7FF); // Hangul Jamo Extended-B + // Japanese and Chinese + addCharsToCharClass(nonLatinHashtagChars, 0x30A1, 0x30FA); // Katakana (full-width) + addCharsToCharClass(nonLatinHashtagChars, 0x30FC, 0x30FC); // Katakana Chouon (full-width) + addCharsToCharClass(nonLatinHashtagChars, 0xFF66, 0xFF9F); // Katakana (half-width) + addCharsToCharClass(nonLatinHashtagChars, 0xFF70, 0xFF70); // Katakana Chouon (half-width) + addCharsToCharClass(nonLatinHashtagChars, 0xFF10, 0xFF19); // \ + addCharsToCharClass(nonLatinHashtagChars, 0xFF21, 0xFF3A); // - Latin (full-width) + addCharsToCharClass(nonLatinHashtagChars, 0xFF41, 0xFF5A); // / + addCharsToCharClass(nonLatinHashtagChars, 0x3041, 0x3096); // Hiragana + addCharsToCharClass(nonLatinHashtagChars, 0x3400, 0x4DBF); // Kanji (CJK Extension A) + addCharsToCharClass(nonLatinHashtagChars, 0x4E00, 0x9FFF); // Kanji (Unified) + // -- Disabled as it breaks the Regex. + //addCharsToCharClass(nonLatinHashtagChars, 0x20000, 0x2A6DF); // Kanji (CJK Extension B) + addCharsToCharClass(nonLatinHashtagChars, 0x2A700, 0x2B73F); // Kanji (CJK Extension C) + addCharsToCharClass(nonLatinHashtagChars, 0x2B740, 0x2B81F); // Kanji (CJK Extension D) + addCharsToCharClass(nonLatinHashtagChars, 0x2F800, 0x2FA1F); // Kanji (CJK supplement) + addCharsToCharClass(nonLatinHashtagChars, 0x3005, 0x3005); // Kanji (CJK iteration mark) + + twttr.txt.regexen.nonLatinHashtagChars = regexSupplant(nonLatinHashtagChars.join("")); + // Latin accented characters (subtracted 0xD7 from the range, it's a confusable multiplication sign. Looks like "x") + twttr.txt.regexen.latinAccentChars = regexSupplant("ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþ\\303\\277"); + twttr.txt.regexen.latenAccents = regexSupplant(/[#{latinAccentChars}]+/); + + twttr.txt.regexen.endScreenNameMatch = regexSupplant(/^(?:#{atSigns}|[#{latinAccentChars}]|:\/\/)/); + + // A hashtag must contain characters, numbers and underscores, but not all numbers. + twttr.txt.regexen.hashtagBoundary = regexSupplant(/(?:^|$|#{spaces}|「|」|。|、|\.|!|!|\?|?|,)/); + twttr.txt.regexen.hashtagAlpha = regexSupplant(/[a-z_#{latinAccentChars}#{nonLatinHashtagChars}]/i); + twttr.txt.regexen.hashtagAlphaNumeric = regexSupplant(/[a-z0-9_#{latinAccentChars}#{nonLatinHashtagChars}]/i); + twttr.txt.regexen.autoLinkHashtags = regexSupplant(/(#{hashtagBoundary})(#|#)(#{hashtagAlphaNumeric}*#{hashtagAlpha}#{hashtagAlphaNumeric}*)/gi); + twttr.txt.regexen.autoLinkUsernamesOrLists = /(^|[^a-zA-Z0-9_]|RT:?)([@@]+)([a-zA-Z0-9_]{1,20})(\/[a-zA-Z][a-zA-Z0-9_\-]{0,24})?/g; + twttr.txt.regexen.autoLinkEmoticon = /(8\-\#|8\-E|\+\-\(|\`\@|\`O|\<\|:~\(|\}:o\{|:\-\[|\>o\<|X\-\/|\[:-\]\-I\-|\/\/\/\/Ö\\\\\\\\|\(\|:\|\/\)|∑:\*\)|\( \| \))/g; + + // URL related hash regex collection + twttr.txt.regexen.invalidDomainChars = stringSupplant("\u00A0#{punct}#{spaces_group}", twttr.txt.regexen); + twttr.txt.regexen.validPrecedingChars = regexSupplant(/(?:[^-\/"':!=A-Za-z0-9_@@]|^|\:)/); + + twttr.txt.regexen.validSubdomain = regexSupplant(/(?:[^#{invalidDomainChars}](?:[_-]|[^#{invalidDomainChars}])*)?[^#{invalidDomainChars}]\./); + twttr.txt.regexen.validDomainName = regexSupplant(/(?:[^#{invalidDomainChars}](?:[-]|[^#{invalidDomainChars}])*)?[^#{invalidDomainChars}]/); + twttr.txt.regexen.validDomain = regexSupplant(/(#{validSubdomain})*#{validDomainName}\.(?:xn--[a-z0-9]{2,}|[a-z]{2,})(?::[0-9]+)?/i); + + twttr.txt.regexen.validGeneralUrlPathChars = /[a-z0-9!\*';:=\+\$\/%#\[\]\-_,~|\.]/i; + // Allow URL paths to contain balanced parens + // 1. Used in Wikipedia URLs like /Primer_(film) + // 2. Used in IIS sessions like /S(dfd346)/ + twttr.txt.regexen.wikipediaDisambiguation = regexSupplant(/(?:\(#{validGeneralUrlPathChars}+\))/i); + // Allow @ in a url, but only in the middle. Catch things like http://example.com/@user + twttr.txt.regexen.validUrlPathChars = regexSupplant(/(?:#{wikipediaDisambiguation}|@#{validGeneralUrlPathChars}+\/|[\.,]?#{validGeneralUrlPathChars})/i); + + // Valid end-of-path chracters (so /foo. does not gobble the period). + // 1. Allow =&# for empty URL parameters and other URL-join artifacts + twttr.txt.regexen.validUrlPathEndingChars = regexSupplant(/(?:[\+\-a-z0-9=_#\/]|#{wikipediaDisambiguation})/i); + twttr.txt.regexen.validUrlQueryChars = /[a-z0-9!\*'\(\);:&=\+\$\/%#\[\]\-_\.,~|]/i; + twttr.txt.regexen.validUrlQueryEndingChars = /[a-z0-9_&=#\/]/i; + twttr.txt.regexen.extractUrl = regexSupplant( + '(' + // $1 total match + '(#{validPrecedingChars})' + // $2 Preceeding chracter + '(' + // $3 URL + '(https?:\\/\\/)' + // $4 Protocol + '(#{validDomain})' + // $5 Domain(s) and optional post number + '(\\/' + // $6 URL Path + '(?:' + + '#{validUrlPathChars}+#{validUrlPathEndingChars}|' + + '#{validUrlPathChars}+#{validUrlPathEndingChars}?|' + + '#{validUrlPathEndingChars}' + + ')?' + + ')?' + + '(\\?#{validUrlQueryChars}*#{validUrlQueryEndingChars})?' + // $7 Query String + ')' + + ')' + , "gi"); + + + // These URL validation pattern strings are based on the ABNF from RFC 3986 + twttr.txt.regexen.validateUrlUnreserved = /[a-z0-9\-._~]/i; + twttr.txt.regexen.validateUrlPctEncoded = /(?:%[0-9a-f]{2})/i; + twttr.txt.regexen.validateUrlSubDelims = /[!$&'()*+,;=]/i; + twttr.txt.regexen.validateUrlPchar = regexSupplant('(?:' + + '#{validateUrlUnreserved}|' + + '#{validateUrlPctEncoded}|' + + '#{validateUrlSubDelims}|' + + ':|@' + + ')', 'i'); + + twttr.txt.regexen.validateUrlScheme = /(?:[a-z][a-z0-9+\-.]*)/i; + twttr.txt.regexen.validateUrlUserinfo = regexSupplant('(?:' + + '#{validateUrlUnreserved}|' + + '#{validateUrlPctEncoded}|' + + '#{validateUrlSubDelims}|' + + ':' + + ')*', 'i'); + + twttr.txt.regexen.validateUrlDecOctet = /(?:[0-9]|(?:[1-9][0-9])|(?:1[0-9]{2})|(?:2[0-4][0-9])|(?:25[0-5]))/i; + twttr.txt.regexen.validateUrlIpv4 = regexSupplant(/(?:#{validateUrlDecOctet}(?:\.#{validateUrlDecOctet}){3})/i); + + // Punting on real IPv6 validation for now + twttr.txt.regexen.validateUrlIpv6 = /(?:\[[a-f0-9:\.]+\])/i; + + // Also punting on IPvFuture for now + twttr.txt.regexen.validateUrlIp = regexSupplant('(?:' + + '#{validateUrlIpv4}|' + + '#{validateUrlIpv6}' + + ')', 'i'); + + // This is more strict than the rfc specifies + twttr.txt.regexen.validateUrlSubDomainSegment = /(?:[a-z0-9](?:[a-z0-9_\-]*[a-z0-9])?)/i; + twttr.txt.regexen.validateUrlDomainSegment = /(?:[a-z0-9](?:[a-z0-9\-]*[a-z0-9])?)/i; + twttr.txt.regexen.validateUrlDomainTld = /(?:[a-z](?:[a-z0-9\-]*[a-z0-9])?)/i; + twttr.txt.regexen.validateUrlDomain = regexSupplant(/(?:(?:#{validateUrlSubDomainSegment]}\.)*(?:#{validateUrlDomainSegment]}\.)#{validateUrlDomainTld})/i); + + twttr.txt.regexen.validateUrlHost = regexSupplant('(?:' + + '#{validateUrlIp}|' + + '#{validateUrlDomain}' + + ')', 'i'); + + // Unencoded internationalized domains - this doesn't check for invalid UTF-8 sequences + twttr.txt.regexen.validateUrlUnicodeSubDomainSegment = /(?:(?:[a-z0-9]|[^\u0000-\u007f])(?:(?:[a-z0-9_\-]|[^\u0000-\u007f])*(?:[a-z0-9]|[^\u0000-\u007f]))?)/i; + twttr.txt.regexen.validateUrlUnicodeDomainSegment = /(?:(?:[a-z0-9]|[^\u0000-\u007f])(?:(?:[a-z0-9\-]|[^\u0000-\u007f])*(?:[a-z0-9]|[^\u0000-\u007f]))?)/i; + twttr.txt.regexen.validateUrlUnicodeDomainTld = /(?:(?:[a-z]|[^\u0000-\u007f])(?:(?:[a-z0-9\-]|[^\u0000-\u007f])*(?:[a-z0-9]|[^\u0000-\u007f]))?)/i; + twttr.txt.regexen.validateUrlUnicodeDomain = regexSupplant(/(?:(?:#{validateUrlUnicodeSubDomainSegment}\.)*(?:#{validateUrlUnicodeDomainSegment}\.)#{validateUrlUnicodeDomainTld})/i); + + twttr.txt.regexen.validateUrlUnicodeHost = regexSupplant('(?:' + + '#{validateUrlIp}|' + + '#{validateUrlUnicodeDomain}' + + ')', 'i'); + + twttr.txt.regexen.validateUrlPort = /[0-9]{1,5}/; + + twttr.txt.regexen.validateUrlUnicodeAuthority = regexSupplant( + '(?:(#{validateUrlUserinfo})@)?' + // $1 userinfo + '(#{validateUrlUnicodeHost})' + // $2 host + '(?::(#{validateUrlPort}))?' //$3 port + , "i"); + + twttr.txt.regexen.validateUrlAuthority = regexSupplant( + '(?:(#{validateUrlUserinfo})@)?' + // $1 userinfo + '(#{validateUrlHost})' + // $2 host + '(?::(#{validateUrlPort}))?' // $3 port + , "i"); + + twttr.txt.regexen.validateUrlPath = regexSupplant(/(\/#{validateUrlPchar}*)*/i); + twttr.txt.regexen.validateUrlQuery = regexSupplant(/(#{validateUrlPchar}|\/|\?)*/i); + twttr.txt.regexen.validateUrlFragment = regexSupplant(/(#{validateUrlPchar}|\/|\?)*/i); + + // Modified version of RFC 3986 Appendix B + twttr.txt.regexen.validateUrlUnencoded = regexSupplant( + '^' + // Full URL + '(?:' + + '([^:/?#]+):' + // $1 Scheme + ')' + + '(?://' + + '([^/?#]*)' + // $2 Authority + ')' + + '([^?#]*)' + // $3 Path + '(?:' + + '\\?([^#]*)' + // $4 Query + ')?' + + '(?:' + + '#(.*)' + // $5 Fragment + ')?$' + , "i"); + + + // Default CSS class for auto-linked URLs + var DEFAULT_URL_CLASS = "tweet-url"; + // Default CSS class for auto-linked lists (along with the url class) + var DEFAULT_LIST_CLASS = "list-slug"; + // Default CSS class for auto-linked usernames (along with the url class) + var DEFAULT_USERNAME_CLASS = "username"; + // Default CSS class for auto-linked hashtags (along with the url class) + var DEFAULT_HASHTAG_CLASS = "hashtag"; + // HTML attribute for robot nofollow behavior (default) + var HTML_ATTR_NO_FOLLOW = " rel=\"nofollow\""; + + // Simple object cloning function for simple objects + function clone(o) { + var r = {}; + for (var k in o) { + if (o.hasOwnProperty(k)) { + r[k] = o[k]; + } + } + + return r; + } + + twttr.txt.autoLink = function(text, options) { + options = clone(options || {}); + return twttr.txt.autoLinkUsernamesOrLists( + twttr.txt.autoLinkUrlsCustom( + twttr.txt.autoLinkHashtags(text, options), + options), + options); + }; + + + twttr.txt.autoLinkUsernamesOrLists = function(text, options) { + options = clone(options || {}); + + options.urlClass = options.urlClass || DEFAULT_URL_CLASS; + options.listClass = options.listClass || DEFAULT_LIST_CLASS; + options.usernameClass = options.usernameClass || DEFAULT_USERNAME_CLASS; + options.usernameUrlBase = options.usernameUrlBase || "http://twitter.com/"; + options.listUrlBase = options.listUrlBase || "http://twitter.com/"; + if (!options.suppressNoFollow) { + var extraHtml = HTML_ATTR_NO_FOLLOW; + } + + var newText = "", + splitText = twttr.txt.splitTags(text); + + for (var index = 0; index < splitText.length; index++) { + var chunk = splitText[index]; + + if (index !== 0) { + newText += ((index % 2 === 0) ? ">" : "<"); + } + + if (index % 4 !== 0) { + newText += chunk; + } else { + newText += chunk.replace(twttr.txt.regexen.autoLinkUsernamesOrLists, function(match, before, at, user, slashListname, offset, chunk) { + var after = chunk.slice(offset + match.length); + + var d = { + before: before, + at: at, + user: twttr.txt.htmlEscape(user), + slashListname: twttr.txt.htmlEscape(slashListname), + extraHtml: extraHtml, + preChunk: "", + chunk: twttr.txt.htmlEscape(chunk), + postChunk: "" + }; + for (var k in options) { + if (options.hasOwnProperty(k)) { + d[k] = options[k]; + } + } + + if (slashListname && !options.suppressLists) { + // the link is a list + var list = d.chunk = stringSupplant("#{user}#{slashListname}", d); + d.list = twttr.txt.htmlEscape(list.toLowerCase()); + return stringSupplant("#{before}#{at}#{chunk}", d); + } else { + if (after && after.match(twttr.txt.regexen.endScreenNameMatch)) { + // Followed by something that means we don't autolink + return match; + } else { + // this is a screen name + d.chunk = twttr.txt.htmlEscape(user); + d.dataScreenName = !options.suppressDataScreenName ? stringSupplant("data-screen-name=\"#{chunk}\" ", d) : ""; + return stringSupplant("#{before}#{at}#{preChunk}#{chunk}#{postChunk}", d); + } + } + }); + } + } + + return newText; + }; + + twttr.txt.autoLinkHashtags = function(text, options) { + options = clone(options || {}); + options.urlClass = options.urlClass || DEFAULT_URL_CLASS; + options.hashtagClass = options.hashtagClass || DEFAULT_HASHTAG_CLASS; + options.hashtagUrlBase = options.hashtagUrlBase || "http://twitter.com/search?q=%23"; + if (!options.suppressNoFollow) { + var extraHtml = HTML_ATTR_NO_FOLLOW; + } + + return text.replace(twttr.txt.regexen.autoLinkHashtags, function(match, before, hash, text) { + var d = { + before: before, + hash: twttr.txt.htmlEscape(hash), + preText: "", + text: twttr.txt.htmlEscape(text), + postText: "", + extraHtml: extraHtml + }; + + for (var k in options) { + if (options.hasOwnProperty(k)) { + d[k] = options[k]; + } + } + + return stringSupplant("#{before}#{hash}#{preText}#{text}#{postText}", d); + }); + }; + + + twttr.txt.autoLinkUrlsCustom = function(text, options) { + options = clone(options || {}); + if (!options.suppressNoFollow) { + options.rel = "nofollow"; + } + if (options.urlClass) { + options["class"] = options.urlClass; + delete options.urlClass; + } + + delete options.suppressNoFollow; + delete options.suppressDataScreenName; + + return text.replace(twttr.txt.regexen.extractUrl, function(match, all, before, url, protocol, domain, path, queryString) { + var tldComponents; + + if (protocol) { + var htmlAttrs = ""; + for (var k in options) { + htmlAttrs += stringSupplant(" #{k}=\"#{v}\" ", {k: k, v: options[k].toString().replace(/"/, """).replace(//, ">")}); + } + + var d = { + before: before, + htmlAttrs: htmlAttrs, + url: twttr.txt.htmlEscape(url) + }; + + return stringSupplant("#{before}#{url}", d); + } else { + return all; + } + }); + }; + + twttr.txt.extractMentions = function(text) { + var screenNamesOnly = [], + screenNamesWithIndices = twttr.txt.extractMentionsWithIndices(text); + + for (var i = 0; i < screenNamesWithIndices.length; i++) { + var screenName = screenNamesWithIndices[i].screenName; + screenNamesOnly.push(screenName); + } + + return screenNamesOnly; + }; + + twttr.txt.extractMentionsWithIndices = function(text) { + if (!text) { + return []; + } + + var possibleScreenNames = [], + position = 0; + + text.replace(twttr.txt.regexen.extractMentions, function(match, before, atSign, screenName, after) { + if (!after.match(twttr.txt.regexen.endScreenNameMatch)) { + var startPosition = text.indexOf(atSign + screenName, position); + position = startPosition + screenName.length + 1; + possibleScreenNames.push({ + screenName: screenName, + indices: [startPosition, position] + }); + } + }); + + return possibleScreenNames; + }; + + twttr.txt.extractReplies = function(text) { + if (!text) { + return null; + } + + var possibleScreenName = text.match(twttr.txt.regexen.extractReply); + if (!possibleScreenName) { + return null; + } + + return possibleScreenName[1]; + }; + + twttr.txt.extractUrls = function(text) { + var urlsOnly = [], + urlsWithIndices = twttr.txt.extractUrlsWithIndices(text); + + for (var i = 0; i < urlsWithIndices.length; i++) { + urlsOnly.push(urlsWithIndices[i].url); + } + + return urlsOnly; + }; + + twttr.txt.extractUrlsWithIndices = function(text) { + if (!text) { + return []; + } + + var urls = [], + position = 0; + + text.replace(twttr.txt.regexen.extractUrl, function(match, all, before, url, protocol, domain, path, query) { + var tldComponents; + + if (protocol) { + var startPosition = text.indexOf(url, position), + position = startPosition + url.length; + + urls.push({ + url: url, + indices: [startPosition, position] + }); + } + }); + + return urls; + }; + + twttr.txt.extractHashtags = function(text) { + var hashtagsOnly = [], + hashtagsWithIndices = twttr.txt.extractHashtagsWithIndices(text); + + for (var i = 0; i < hashtagsWithIndices.length; i++) { + hashtagsOnly.push(hashtagsWithIndices[i].hashtag); + } + + return hashtagsOnly; + }; + + twttr.txt.extractHashtagsWithIndices = function(text) { + if (!text) { + return []; + } + + var tags = [], + position = 0; + + text.replace(twttr.txt.regexen.autoLinkHashtags, function(match, before, hash, hashText) { + var startPosition = text.indexOf(hash + hashText, position); + position = startPosition + hashText.length + 1; + tags.push({ + hashtag: hashText, + indices: [startPosition, position] + }); + }); + + return tags; + }; + + // this essentially does text.split(/<|>/) + // except that won't work in IE, where empty strings are ommitted + // so "<>".split(/<|>/) => [] in IE, but is ["", "", ""] in all others + // but "<<".split("<") => ["", "", ""] + twttr.txt.splitTags = function(text) { + var firstSplits = text.split("<"), + secondSplits, + allSplits = [], + split; + + for (var i = 0; i < firstSplits.length; i += 1) { + split = firstSplits[i]; + if (!split) { + allSplits.push(""); + } else { + secondSplits = split.split(">"); + for (var j = 0; j < secondSplits.length; j += 1) { + allSplits.push(secondSplits[j]); + } + } + } + + return allSplits; + }; + + twttr.txt.hitHighlight = function(text, hits, options) { + var defaultHighlightTag = "em"; + + hits = hits || []; + options = options || {}; + + if (hits.length === 0) { + return text; + } + + var tagName = options.tag || defaultHighlightTag, + tags = ["<" + tagName + ">", ""], + chunks = twttr.txt.splitTags(text), + split, + i, + j, + result = "", + chunkIndex = 0, + chunk = chunks[0], + prevChunksLen = 0, + chunkCursor = 0, + startInChunk = false, + chunkChars = chunk, + flatHits = [], + index, + hit, + tag, + placed, + hitSpot; + + for (i = 0; i < hits.length; i += 1) { + for (j = 0; j < hits[i].length; j += 1) { + flatHits.push(hits[i][j]); + } + } + + for (index = 0; index < flatHits.length; index += 1) { + hit = flatHits[index]; + tag = tags[index % 2]; + placed = false; + + while (chunk != null && hit >= prevChunksLen + chunk.length) { + result += chunkChars.slice(chunkCursor); + if (startInChunk && hit === prevChunksLen + chunkChars.length) { + result += tag; + placed = true; + } + + if (chunks[chunkIndex + 1]) { + result += "<" + chunks[chunkIndex + 1] + ">"; + } + + prevChunksLen += chunkChars.length; + chunkCursor = 0; + chunkIndex += 2; + chunk = chunks[chunkIndex]; + chunkChars = chunk; + startInChunk = false; + } + + if (!placed && chunk != null) { + hitSpot = hit - prevChunksLen; + result += chunkChars.slice(chunkCursor, hitSpot) + tag; + chunkCursor = hitSpot; + if (index % 2 === 0) { + startInChunk = true; + } else { + startInChunk = false; + } + } else if(!placed) { + placed = true; + result += tag; + } + } + + if (chunk != null) { + if (chunkCursor < chunkChars.length) { + result += chunkChars.slice(chunkCursor); + } + for (index = chunkIndex + 1; index < chunks.length; index += 1) { + result += (index % 2 === 0 ? chunks[index] : "<" + chunks[index] + ">"); + } + } + + return result; + }; + + var MAX_LENGTH = 140; + + // Characters not allowed in Tweets + var INVALID_CHARACTERS = [ + // BOM + fromCode(0xFFFE), + fromCode(0xFEFF), + + // Special + fromCode(0xFFFF), + + // Directional Change + fromCode(0x202A), + fromCode(0x202B), + fromCode(0x202C), + fromCode(0x202D), + fromCode(0x202E) + ]; + + // Check the text for any reason that it may not be valid as a Tweet. This is meant as a pre-validation + // before posting to api.twitter.com. There are several server-side reasons for Tweets to fail but this pre-validation + // will allow quicker feedback. + // + // Returns false if this text is valid. Otherwise one of the following strings will be returned: + // + // "too_long": if the text is too long + // "empty": if the text is nil or empty + // "invalid_characters": if the text contains non-Unicode or any of the disallowed Unicode characters + twttr.txt.isInvalidTweet = function(text) { + if (!text) { + return "empty"; + } + + if (text.length > MAX_LENGTH) { + return "too_long"; + } + + for (var i = 0; i < INVALID_CHARACTERS.length; i++) { + if (text.indexOf(INVALID_CHARACTERS[i]) >= 0) { + return "invalid_characters"; + } + } + + return false + }; + + twttr.txt.isValidTweetText = function(text) { + return !twttr.txt.isInvalidTweet(text); + }; + + twttr.txt.isValidUsername = function(username) { + if (!username) { + return false; + } + + var extracted = twttr.txt.extractMentions(username); + + // Should extract the username minus the @ sign, hence the .slice(1) + return extracted.length === 1 && extracted[0] === username.slice(1); + }; + + var VALID_LIST_RE = regexSupplant(/^#{autoLinkUsernamesOrLists}$/); + + twttr.txt.isValidList = function(usernameList) { + var match = usernameList.match(VALID_LIST_RE); + + // Must have matched and had nothing before or after + return !!(match && match[1] == "" && match[4]); + }; + + twttr.txt.isValidHashtag = function(hashtag) { + if (!hashtag) { + return false; + } + + var extracted = twttr.txt.extractHashtags(hashtag); + + // Should extract the hashtag minus the # sign, hence the .slice(1) + return extracted.length === 1 && extracted[0] === hashtag.slice(1); + }; + + twttr.txt.isValidUrl = function(url, unicodeDomains) { + if (unicodeDomains == null) { + unicodeDomains = true; + } + + if (!url) { + return false; + } + + var urlParts = url.match(twttr.txt.regexen.validateUrlUnencoded); + + if (!urlParts || urlParts[0] !== url) { + return false; + } + + var scheme = urlParts[1], + authority = urlParts[2], + path = urlParts[3], + query = urlParts[4], + fragment = urlParts[5]; + + if (!( + isValidMatch(scheme, twttr.txt.regexen.validateUrlScheme) && scheme.match(/^https?$/i) && + isValidMatch(path, twttr.txt.regexen.validateUrlPath) && + isValidMatch(query, twttr.txt.regexen.validateUrlQuery, true) && + isValidMatch(fragment, twttr.txt.regexen.validateUrlFragment, true) + )) { + return false; + } + + return (unicodeDomains && isValidMatch(authority, twttr.txt.regexen.validateUrlUnicodeAuthority)) || + (!unicodeDomains && isValidMatch(authority, twttr.txt.regexen.validateUrlAuthority)); + }; + + function isValidMatch(string, regex, optional) { + if (!optional) { + // RegExp["$&"] is the text of the last match + // blank strings are ok, but are falsy, so we check stringiness instead of truthiness + return ((typeof string === "string") && string.match(regex) && RegExp["$&"] === string); + } + + // RegExp["$&"] is the text of the last match + return (!string || (string.match(regex) && RegExp["$&"] === string)); + } + + +}()); diff --git a/twitter-text.js b/twitter-text.js index c00beba..0741aa2 100644 --- a/twitter-text.js +++ b/twitter-text.js @@ -124,7 +124,7 @@ if (!window.twttr) { twttr.txt.regexen.endScreenNameMatch = regexSupplant(/^(?:#{atSigns}|[#{latinAccentChars}]|:\/\/)/); // A hashtag must contain characters, numbers and underscores, but not all numbers. - twttr.txt.regexen.hashtagBoundary = regexSupplant(/(?:^|$|#{spaces}|「|」|。|、|\.|!|!|\?|?)/); + twttr.txt.regexen.hashtagBoundary = regexSupplant(/(?:^|$|#{spaces}|「|」|。|、|\.|!|!|\?|?|,)/); twttr.txt.regexen.hashtagAlpha = regexSupplant(/[a-z_#{latinAccentChars}#{nonLatinHashtagChars}]/i); twttr.txt.regexen.hashtagAlphaNumeric = regexSupplant(/[a-z0-9_#{latinAccentChars}#{nonLatinHashtagChars}]/i); twttr.txt.regexen.autoLinkHashtags = regexSupplant(/(#{hashtagBoundary})(#|#)(#{hashtagAlphaNumeric}*#{hashtagAlpha}#{hashtagAlphaNumeric}*)/gi);