Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP

Loading…

Add twttr.txt.modifyIndices{FromUTF16ToUnicode(), FromUnicodeToUTF16()} #39

Merged
merged 2 commits into from

2 participants

@keitaf
Collaborator

extract*() in twitter-text-js extracts entities with UTF-16 based indices where Unicode supplementary characters are counted as two characters.

However, the Twitter API and twitter-text-rb produce indices based on Unicode code points, where Unicode supplementary characters are counted as single characters.

This will add 2 new methods, twttr.txt.modifyIndicesFromUTF16ToUnicode() and twttr.txt.modifyIndicesFromUnicodeToUTF16(), which can be used to modify indices from UTF-16-based to Unicode-based, and vice versa.

@j3h j3h commented on the diff
twitter-text.js
@@ -669,6 +669,30 @@ if (!window.twttr) {
return tags;
};
+ twttr.txt.modifyIndicesFromUnicodeToUTF16 = function(text, entities) {
+ twttr.txt.shiftIndices(text, entities, 1);
+ };
+
+ twttr.txt.modifyIndicesFromUTF16ToUnicode = function(text, entities) {
+ twttr.txt.shiftIndices(text, entities, -1);
+ };
+
+ twttr.txt.shiftIndices = function(text, entities, diff) {
+ for (var i = 0; i < text.length - 1; i++) {
+ var c1 = text.charCodeAt(i);
+ var c2 = text.charCodeAt(i + 1);
+ if (0xD800 <= c1 && c1 <= 0xDBFF && 0xDC00 <= c2 && c2 <= 0xDFFF) {
+ // supplementary character
@j3h Collaborator
j3h added a note

An i++ here would make explicit that we have already dealt with the next character as well.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
@j3h
Collaborator
j3h commented

LGTM

@keitaf keitaf merged commit 3347d04 into from
@caniszczyk caniszczyk deleted the branch
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Commits on Jan 31, 2012
  1. @keitaf
Commits on Feb 1, 2012
  1. @keitaf

    Remove 'countSupplementaryCharacterAsOne' option, and add modifyIndic…

    keitaf authored
    …es{FromUTF16ToUnicode, FromUnicodeToUTF16}.
This page is out of date. Refresh to see the latest.
Showing with 54 additions and 0 deletions.
  1. +30 −0 test/tests.js
  2. +24 −0 twitter-text.js
View
30 test/tests.js
@@ -33,6 +33,32 @@ test("twttr.txt.splitTags", function() {
}
});
+test("twttr.txt.extract", function() {
+ var text = "\uD801\uDC00 #hashtag \uD801\uDC00 #hashtag";
+ var extracted = twttr.txt.extractHashtagsWithIndices(text);
+ same(extracted, [{hashtag:"hashtag", indices:[3, 11]}, {hashtag:"hashtag", indices:[15, 23]}], "Hashtag w/ Supplementary character, UTF-16 indices");
+ twttr.txt.modifyIndicesFromUTF16ToUnicode(text, extracted);
+ same(extracted, [{hashtag:"hashtag", indices:[2, 10]}, {hashtag:"hashtag", indices:[13, 21]}], "Hashtag w/ Supplementary character, Unicode indices");
+ twttr.txt.modifyIndicesFromUnicodeToUTF16(text, extracted);
+ same(extracted, [{hashtag:"hashtag", indices:[3, 11]}, {hashtag:"hashtag", indices:[15, 23]}], "Hashtag w/ Supplementary character, UTF-16 indices");
+
+ text = "\uD801\uDC00 @mention \uD801\uDC00 @mention";
+ extracted = twttr.txt.extractMentionsOrListsWithIndices(text);
+ same(extracted, [{screenName:"mention", listSlug:"", indices:[3, 11]}, {screenName:"mention", listSlug:"", indices:[15, 23]}], "Mention w/ Supplementary character, UTF-16 indices");
+ twttr.txt.modifyIndicesFromUTF16ToUnicode(text, extracted);
+ same(extracted, [{screenName:"mention", listSlug:"", indices:[2, 10]}, {screenName:"mention", listSlug:"", indices:[13, 21]}], "Mention w/ Supplementary character");
+ twttr.txt.modifyIndicesFromUnicodeToUTF16(text, extracted);
+ same(extracted, [{screenName:"mention", listSlug:"", indices:[3, 11]}, {screenName:"mention", listSlug:"", indices:[15, 23]}], "Mention w/ Supplementary character, UTF-16 indices");
+
+ text = "\uD801\uDC00 http://twitter.com \uD801\uDC00 http://test.com";
+ extracted = twttr.txt.extractUrlsWithIndices(text);
+ same(extracted, [{url:"http://twitter.com", indices:[3, 21]}, {url:"http://test.com", indices:[25, 40]}], "URL w/ Supplementary character, UTF-16 indices");
+ twttr.txt.modifyIndicesFromUTF16ToUnicode(text, extracted);
+ same(extracted, [{url:"http://twitter.com", indices:[2, 20]}, {url:"http://test.com", indices:[23, 38]}], "URL w/ Supplementary character, Unicode indices");
+ twttr.txt.modifyIndicesFromUnicodeToUTF16(text, extracted);
+ same(extracted, [{url:"http://twitter.com", indices:[3, 21]}, {url:"http://test.com", indices:[25, 40]}], "URL w/ Supplementary character, UTF-16 indices");
+});
+
test("twttr.txt.autolink", function() {
// Username Overrides
ok(twttr.txt.autoLink("@tw", { before: "!" }).match(/!@<a[^>]+>tw<\/a>/), "Override before");
@@ -77,4 +103,8 @@ test("twttr.txt.autolink", function() {
for (i = 0; i < invalidChars.length; i++) {
equal(twttr.txt.extractUrls("http://twitt" + invalidChars[i] + "er.com").length, 0, 'Should not extract URL with invalid cahracter');
}
+
+ same(twttr.txt.autoLink("\uD801\uDC00 #hashtag \uD801\uDC00 @mention \uD801\uDC00 http://twitter.com"),
+ "\uD801\uDC00 <a href=\"http://twitter.com/#!/search?q=%23hashtag\" title=\"#hashtag\" class=\"tweet-url hashtag\" rel=\"nofollow\">#hashtag</a> \uD801\uDC00 @<a class=\"tweet-url username\" data-screen-name=\"mention\" href=\"http://twitter.com/mention\" rel=\"nofollow\">mention</a> \uD801\uDC00 <a href=\"http://twitter.com\" rel=\"nofollow\" >http://twitter.com</a>",
+ "Autolink hashtag/mentionURL w/ Supplementary character");
});
View
24 twitter-text.js
@@ -669,6 +669,30 @@ if (!window.twttr) {
return tags;
};
+ twttr.txt.modifyIndicesFromUnicodeToUTF16 = function(text, entities) {
+ twttr.txt.shiftIndices(text, entities, 1);
+ };
+
+ twttr.txt.modifyIndicesFromUTF16ToUnicode = function(text, entities) {
+ twttr.txt.shiftIndices(text, entities, -1);
+ };
+
+ twttr.txt.shiftIndices = function(text, entities, diff) {
+ for (var i = 0; i < text.length - 1; i++) {
+ var c1 = text.charCodeAt(i);
+ var c2 = text.charCodeAt(i + 1);
+ if (0xD800 <= c1 && c1 <= 0xDBFF && 0xDC00 <= c2 && c2 <= 0xDFFF) {
+ // supplementary character
@j3h Collaborator
j3h added a note

An i++ here would make explicit that we have already dealt with the next character as well.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
+ for (var j = 0; j < entities.length; j++) {
+ if (entities[j].indices[0] >= i) {
+ entities[j].indices[0] += diff;
+ entities[j].indices[1] += diff;
+ }
+ }
+ }
+ }
+ };
+
// this essentially does text.split(/<|>/)
// except that won't work in IE, where empty strings are ommitted
// so "<>".split(/<|>/) => [] in IE, but is ["", "", ""] in all others
Something went wrong with that request. Please try again.