Permalink
Browse files

Merge pull request #45 from twitter/fix_utf16_index

Fix a bug in modifyIndicesFromUnicodeToUTF16/modifyIndicesFromUTF16ToUnicode
  • Loading branch information...
2 parents f95f96f + d390ff2 commit 8bf6b07de7f16c31f70511f26eb8fb4890aa7fe2 @keitaf keitaf committed Feb 15, 2012
Showing with 76 additions and 14 deletions.
  1. +38 −0 test/tests.js
  2. +38 −14 twitter-text.js
View
@@ -57,6 +57,44 @@ test("twttr.txt.extract", function() {
same(extracted, [{url:"http://twitter.com", indices:[2, 20]}, {url:"http://test.com", indices:[23, 38]}], "URL w/ Supplementary character, Unicode indices");
twttr.txt.modifyIndicesFromUnicodeToUTF16(text, extracted);
same(extracted, [{url:"http://twitter.com", indices:[3, 21]}, {url:"http://test.com", indices:[25, 40]}], "URL w/ Supplementary character, UTF-16 indices");
+
+ var testCases = [
+ {text:"abc", indices:[[0,3]], unicode_indices:[[0,3]]},
+ {text:"\uD83D\uDE02abc", indices:[[2,5]], unicode_indices:[[1,4]]},
+ {text:"\uD83D\uDE02abc\uD83D\uDE02", indices:[[2,5]], unicode_indices:[[1,4]]},
+ {text:"\uD83D\uDE02abc\uD838\uDE02abc", indices:[[2,5], [7,10]], unicode_indices:[[1,4], [5,8]]},
+ {text:"\uD83D\uDE02abc\uD838\uDE02abc\uD83D\uDE02", indices:[[2,5], [7,10]], unicode_indices:[[1,4], [5,8]]},
+ {text:"\uD83D\uDE02\uD83D\uDE02abc", indices:[[4,7]], unicode_indices:[[2,5]]},
+ {text:"\uD83D\uDE02\uD83D\uDE02\uD83D\uDE02abc", indices:[[6,9]], unicode_indices:[[3,6]]},
+ {text:"\uD83D\uDE02\uD83D\uDE02\uD83D\uDE02abc\uD83D\uDE02", indices:[[6,9]], unicode_indices:[[3,6]]},
+ {text:"\uD83D\uDE02\uD83D\uDE02\uD83D\uDE02abc\uD83D\uDE02\uD83D\uDE02\uD83D\uDE02", indices:[[6,9]], unicode_indices:[[3,6]]},
+ {text:"\uD83D\uDE02\uD83D\uDE02\uD83D\uDE02\uD83D\uDE02abc\uD83D\uDE02\uD83D\uDE02\uD83D\uDE02\uD83D\uDE02abc\uD83D\uDE02\uD83D\uDE02\uD83D\uDE02\uD83D\uDE02", indices:[[8,11], [19,22]], unicode_indices:[[4,7], [11,14]]},
+ {text:"\uDE02\uD83D\uDE02\uD83D\uDE02\uD83D\uDE02abc\uD83D\uDE02\uD83D\uDE02\uD83D\uDE02\uD83D\uDE02abc\uD83D\uDE02\uD83D\uDE02\uD83D\uDE02\uD83D\uDE02", indices:[[7,10], [18,21]], unicode_indices:[[4,7], [11,14]]},
+ {text:"\uD83D\uDE02\uD83D\uDE02\uD83D\uDE02\uD83D\uDE02abc\uD83D\uDE02\uD83D\uDE02\uD83D\uDE02\uD83D\uDE02abc\uD83D\uDE02\uD83D\uDE02\uD83D\uDE02\uDE02", indices:[[8,11], [19,22]], unicode_indices:[[4,7], [11,14]]},
+ {text:"\uD83D\uDE02\uD83D\uDE02\uD83D\uDE02\uD83D\uDE02abc\uD83D\uDE02\uD83D\uDE02\uD83D\uDE02\uD83D\uDE02abc\uD83D\uDE02\uD83D\uDE02\uD83D\uD83D\uDE02\uDE02", indices:[[8,11], [19,22]], unicode_indices:[[4,7], [11,14]]},
+ {text:"\uD83Dabc\uD83D", indices:[[1,4]], unicode_indices:[[1,4]]},
+ {text:"\uDE02abc\uDE02", indices:[[1,4]], unicode_indices:[[1,4]]},
+ {text:"\uDE02\uDE02abc\uDE02\uDE02", indices:[[2,5]], unicode_indices:[[2,5]]},
+ {text:"abcabc", indices:[[0,3], [3,6]], unicode_indices:[[0,3], [3,6]]},
+ {text:"abc\uD83D\uDE02abc", indices:[[0,3], [5,8]], unicode_indices:[[0,3], [4,7]]},
+ {text:"aa", indices:[[0,1], [1,2]], unicode_indices:[[0,1], [1,2]]},
+ {text:"\uD83D\uDE02a\uD83D\uDE02a\uD83D\uDE02", indices:[[2,3], [5,6]], unicode_indices:[[1,2], [3,4]]}
+ ];
+
+ for (var i = 0; i < testCases.length; i++) {
+ entities = [];
+ for (var j = 0; j < testCases[i].indices.length; j++) {
+ entities.push({indices:testCases[i].indices[j]});
+ }
+ twttr.txt.modifyIndicesFromUTF16ToUnicode(testCases[i].text, entities);
+ for (var j = 0; j < testCases[i].indices.length; j++) {
+ same(entities[j].indices, testCases[i].unicode_indices[j], "Convert UTF16 indices to Unicode indices for text '" + testCases[i].text +"'");
+ }
+ twttr.txt.modifyIndicesFromUnicodeToUTF16(testCases[i].text, entities);
+ for (var j = 0; j < testCases[i].indices.length; j++) {
+ same(entities[j].indices, testCases[i].indices[j], "Convert Unicode indices to UTF16 indices for text '" + testCases[i].text +"'");
+ }
+ }
});
test("twttr.txt.autolink", function() {
View
@@ -767,27 +767,51 @@ if (typeof twttr === "undefined" || twttr === null) {
};
// Rewrites, in place, the `indices` of each entity from Unicode code point
// offsets into UTF-16 code unit offsets for `text`.
// Delegates to twttr.txt.convertUnicodeIndices with indicesInUTF16 = false,
// i.e. the incoming indices are treated as code point offsets.
twttr.txt.modifyIndicesFromUnicodeToUTF16 = function(text, entities) {
- twttr.txt.shiftIndices(text, entities, 1);
+ twttr.txt.convertUnicodeIndices(text, entities, false);
};
// Rewrites, in place, the `indices` of each entity from UTF-16 code unit
// offsets into Unicode code point offsets for `text`.
// Delegates to twttr.txt.convertUnicodeIndices with indicesInUTF16 = true,
// i.e. the incoming indices are treated as UTF-16 code unit offsets.
twttr.txt.modifyIndicesFromUTF16ToUnicode = function(text, entities) {
- twttr.txt.shiftIndices(text, entities, -1);
+ twttr.txt.convertUnicodeIndices(text, entities, true);
};
- twttr.txt.shiftIndices = function(text, entities, diff) {
- for (var i = 0; i < text.length - 1; i++) {
- var c1 = text.charCodeAt(i);
- var c2 = text.charCodeAt(i + 1);
- if (0xD800 <= c1 && c1 <= 0xDBFF && 0xDC00 <= c2 && c2 <= 0xDFFF) {
- // supplementary character
- i++; // skip surrogate pair character
- for (var j = 0; j < entities.length; j++) {
- if (entities[j].indices[0] >= i) {
- entities[j].indices[0] += diff;
- entities[j].indices[1] += diff;
- }
+ twttr.txt.convertUnicodeIndices = function(text, entities, indicesInUTF16) {
+ if (entities.length == 0) {
+ return;
+ }
+
+ var charIndex = 0;
+ var codePointIndex = 0;
+
+ // sort entities by start index
+ entities.sort(function(a,b){ return a.indices[0] - b.indices[0]; });
+ var entityIndex = 0;
+ var entity = entities[0];
+
+ while (charIndex < text.length) {
+ if (entity.indices[0] == (indicesInUTF16 ? charIndex : codePointIndex)) {
+ var len = entity.indices[1] - entity.indices[0];
+ entity.indices[0] = indicesInUTF16 ? codePointIndex : charIndex;
+ entity.indices[1] = entity.indices[0] + len;
+
+ entityIndex++;
+ if (entityIndex == entities.length) {
+ // no more entity
+ break;
+ }
+ entity = entities[entityIndex];
+ }
+
+ var c = text.charCodeAt(charIndex);
+ if (0xD800 <= c && c <= 0xDBFF && charIndex < text.length - 1) {
+ // Found high surrogate char
+ c = text.charCodeAt(charIndex + 1);
+ if (0xDC00 <= c && c <= 0xDFFF) {
+ // Found surrogate pair
+ charIndex++;
}
}
+ codePointIndex++;
+ charIndex++;
}
};

0 comments on commit 8bf6b07

Please sign in to comment.