Skip to content
This repository has been archived by the owner on Sep 18, 2021. It is now read-only.

Commit

Permalink
Merge pull request #45 from twitter/fix_utf16_index
Browse files Browse the repository at this point in the history
Fix a bug in modifyIndicesFromUnicodeToUTF16/modifyIndicesFromUTF16ToUnicode
  • Loading branch information
Keita Fujii committed Feb 15, 2012
2 parents f95f96f + d390ff2 commit 8bf6b07
Show file tree
Hide file tree
Showing 2 changed files with 76 additions and 14 deletions.
38 changes: 38 additions & 0 deletions test/tests.js
Expand Up @@ -57,6 +57,44 @@ test("twttr.txt.extract", function() {
same(extracted, [{url:"http://twitter.com", indices:[2, 20]}, {url:"http://test.com", indices:[23, 38]}], "URL w/ Supplementary character, Unicode indices"); same(extracted, [{url:"http://twitter.com", indices:[2, 20]}, {url:"http://test.com", indices:[23, 38]}], "URL w/ Supplementary character, Unicode indices");
twttr.txt.modifyIndicesFromUnicodeToUTF16(text, extracted); twttr.txt.modifyIndicesFromUnicodeToUTF16(text, extracted);
same(extracted, [{url:"http://twitter.com", indices:[3, 21]}, {url:"http://test.com", indices:[25, 40]}], "URL w/ Supplementary character, UTF-16 indices"); same(extracted, [{url:"http://twitter.com", indices:[3, 21]}, {url:"http://test.com", indices:[25, 40]}], "URL w/ Supplementary character, UTF-16 indices");

var testCases = [
{text:"abc", indices:[[0,3]], unicode_indices:[[0,3]]},
{text:"\uD83D\uDE02abc", indices:[[2,5]], unicode_indices:[[1,4]]},
{text:"\uD83D\uDE02abc\uD83D\uDE02", indices:[[2,5]], unicode_indices:[[1,4]]},
{text:"\uD83D\uDE02abc\uD838\uDE02abc", indices:[[2,5], [7,10]], unicode_indices:[[1,4], [5,8]]},
{text:"\uD83D\uDE02abc\uD838\uDE02abc\uD83D\uDE02", indices:[[2,5], [7,10]], unicode_indices:[[1,4], [5,8]]},
{text:"\uD83D\uDE02\uD83D\uDE02abc", indices:[[4,7]], unicode_indices:[[2,5]]},
{text:"\uD83D\uDE02\uD83D\uDE02\uD83D\uDE02abc", indices:[[6,9]], unicode_indices:[[3,6]]},
{text:"\uD83D\uDE02\uD83D\uDE02\uD83D\uDE02abc\uD83D\uDE02", indices:[[6,9]], unicode_indices:[[3,6]]},
{text:"\uD83D\uDE02\uD83D\uDE02\uD83D\uDE02abc\uD83D\uDE02\uD83D\uDE02\uD83D\uDE02", indices:[[6,9]], unicode_indices:[[3,6]]},
{text:"\uD83D\uDE02\uD83D\uDE02\uD83D\uDE02\uD83D\uDE02abc\uD83D\uDE02\uD83D\uDE02\uD83D\uDE02\uD83D\uDE02abc\uD83D\uDE02\uD83D\uDE02\uD83D\uDE02\uD83D\uDE02", indices:[[8,11], [19,22]], unicode_indices:[[4,7], [11,14]]},
{text:"\uDE02\uD83D\uDE02\uD83D\uDE02\uD83D\uDE02abc\uD83D\uDE02\uD83D\uDE02\uD83D\uDE02\uD83D\uDE02abc\uD83D\uDE02\uD83D\uDE02\uD83D\uDE02\uD83D\uDE02", indices:[[7,10], [18,21]], unicode_indices:[[4,7], [11,14]]},
{text:"\uD83D\uDE02\uD83D\uDE02\uD83D\uDE02\uD83D\uDE02abc\uD83D\uDE02\uD83D\uDE02\uD83D\uDE02\uD83D\uDE02abc\uD83D\uDE02\uD83D\uDE02\uD83D\uDE02\uDE02", indices:[[8,11], [19,22]], unicode_indices:[[4,7], [11,14]]},
{text:"\uD83D\uDE02\uD83D\uDE02\uD83D\uDE02\uD83D\uDE02abc\uD83D\uDE02\uD83D\uDE02\uD83D\uDE02\uD83D\uDE02abc\uD83D\uDE02\uD83D\uDE02\uD83D\uD83D\uDE02\uDE02", indices:[[8,11], [19,22]], unicode_indices:[[4,7], [11,14]]},
{text:"\uD83Dabc\uD83D", indices:[[1,4]], unicode_indices:[[1,4]]},
{text:"\uDE02abc\uDE02", indices:[[1,4]], unicode_indices:[[1,4]]},
{text:"\uDE02\uDE02abc\uDE02\uDE02", indices:[[2,5]], unicode_indices:[[2,5]]},
{text:"abcabc", indices:[[0,3], [3,6]], unicode_indices:[[0,3], [3,6]]},
{text:"abc\uD83D\uDE02abc", indices:[[0,3], [5,8]], unicode_indices:[[0,3], [4,7]]},
{text:"aa", indices:[[0,1], [1,2]], unicode_indices:[[0,1], [1,2]]},
{text:"\uD83D\uDE02a\uD83D\uDE02a\uD83D\uDE02", indices:[[2,3], [5,6]], unicode_indices:[[1,2], [3,4]]}
];

// Round-trip every fixture: UTF-16 indices -> code point indices -> UTF-16
// indices, checking the result of each conversion against the fixture.
for (var i = 0; i < testCases.length; i++) {
  var j;
  // Declared locally so the loop does not create or overwrite an outer
  // `entities` binding.
  var entities = [];
  for (j = 0; j < testCases[i].indices.length; j++) {
    // Copy each [start, end] pair. The converters rewrite `indices` in place,
    // so sharing the fixture's array would corrupt the fixture and make the
    // final comparison below compare an array with itself (always passing).
    entities.push({indices:testCases[i].indices[j].slice()});
  }

  twttr.txt.modifyIndicesFromUTF16ToUnicode(testCases[i].text, entities);
  for (j = 0; j < testCases[i].indices.length; j++) {
    same(entities[j].indices, testCases[i].unicode_indices[j], "Convert UTF16 indices to Unicode indices for text '" + testCases[i].text +"'");
  }

  // Convert back; the entities must land on the original UTF-16 indices.
  twttr.txt.modifyIndicesFromUnicodeToUTF16(testCases[i].text, entities);
  for (j = 0; j < testCases[i].indices.length; j++) {
    same(entities[j].indices, testCases[i].indices[j], "Convert Unicode indices to UTF16 indices for text '" + testCases[i].text +"'");
  }
}
}); });


test("twttr.txt.autolink", function() { test("twttr.txt.autolink", function() {
Expand Down
52 changes: 38 additions & 14 deletions twitter-text.js
Expand Up @@ -767,27 +767,51 @@ if (typeof twttr === "undefined" || twttr === null) {
}; };


twttr.txt.modifyIndicesFromUnicodeToUTF16 = function(text, entities) { twttr.txt.modifyIndicesFromUnicodeToUTF16 = function(text, entities) {
twttr.txt.shiftIndices(text, entities, 1); twttr.txt.convertUnicodeIndices(text, entities, false);
}; };


twttr.txt.modifyIndicesFromUTF16ToUnicode = function(text, entities) { twttr.txt.modifyIndicesFromUTF16ToUnicode = function(text, entities) {
twttr.txt.shiftIndices(text, entities, -1); twttr.txt.convertUnicodeIndices(text, entities, true);
}; };


twttr.txt.shiftIndices = function(text, entities, diff) { twttr.txt.convertUnicodeIndices = function(text, entities, indicesInUTF16) {
for (var i = 0; i < text.length - 1; i++) { if (entities.length == 0) {
var c1 = text.charCodeAt(i); return;
var c2 = text.charCodeAt(i + 1); }
if (0xD800 <= c1 && c1 <= 0xDBFF && 0xDC00 <= c2 && c2 <= 0xDFFF) {
// supplementary character var charIndex = 0;
i++; // skip surrogate pair character var codePointIndex = 0;
for (var j = 0; j < entities.length; j++) {
if (entities[j].indices[0] >= i) { // sort entities by start index
entities[j].indices[0] += diff; entities.sort(function(a,b){ return a.indices[0] - b.indices[0]; });
entities[j].indices[1] += diff; var entityIndex = 0;
} var entity = entities[0];

while (charIndex < text.length) {
if (entity.indices[0] == (indicesInUTF16 ? charIndex : codePointIndex)) {
var len = entity.indices[1] - entity.indices[0];
entity.indices[0] = indicesInUTF16 ? codePointIndex : charIndex;
entity.indices[1] = entity.indices[0] + len;

entityIndex++;
if (entityIndex == entities.length) {
// no more entity
break;
}
entity = entities[entityIndex];
}

var c = text.charCodeAt(charIndex);
if (0xD800 <= c && c <= 0xDBFF && charIndex < text.length - 1) {
// Found high surrogate char
c = text.charCodeAt(charIndex + 1);
if (0xDC00 <= c && c <= 0xDFFF) {
// Found surrogate pair
charIndex++;
} }
} }
codePointIndex++;
charIndex++;
} }
}; };


Expand Down

0 comments on commit 8bf6b07

Please sign in to comment.