Skip to content

Commit

Permalink
Merge pull request #52 from vmg/vmg/refactor
Browse files Browse the repository at this point in the history
The great UTF8 Refactoring
  • Loading branch information
Vicent Marti committed May 20, 2016
2 parents e1d1a52 + 0a29fcc commit f3e8a28
Show file tree
Hide file tree
Showing 12 changed files with 764 additions and 444 deletions.
19 changes: 0 additions & 19 deletions Rakefile
Expand Up @@ -48,22 +48,3 @@ file package('.gem') => %w[pkg/ rinku.gemspec] + $spec.files do |f|
sh "gem build rinku.gemspec"
mv File.basename(f.name), f.name
end

# GEMSPEC HELPERS ==========================================================
task :gather => 'sundown:checkout' do |t|
files =
FileList[
'sundown/src/{buffer,autolink}.h',
'sundown/src/{buffer,autolink}.c',
]
cp files, 'ext/rinku/',
:preserve => true,
:verbose => true
end

task 'sundown:checkout' do |t|
unless File.exists?('sundown/src/markdown.h')
sh 'git submodule init'
sh 'git submodule update'
end
end
241 changes: 114 additions & 127 deletions ext/rinku/autolink.c
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2011, Vicent Marti
* Copyright (c) 2016, GitHub, Inc
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
Expand All @@ -13,21 +13,22 @@
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/

#include "buffer.h"
#include "autolink.h"

#include <string.h>
#include <assert.h>
#include <stdlib.h>
#include <stdio.h>
#include <ctype.h>
#include <stdbool.h>

#include "buffer.h"
#include "autolink.h"
#include "utf8.h"

#if defined(_WIN32)
#define strncasecmp _strnicmp
#endif

int
sd_autolink_issafe(const uint8_t *link, size_t link_len)
bool
autolink_issafe(const uint8_t *link, size_t link_len)
{
static const size_t valid_uris_count = 5;
static const char *valid_uris[] = {
Expand All @@ -41,47 +42,53 @@ sd_autolink_issafe(const uint8_t *link, size_t link_len)

if (link_len > len &&
strncasecmp((char *)link, valid_uris[i], len) == 0 &&
isalnum(link[len]))
return 1;
rinku_isalnum(link[len]))
return true;
}

return 0;
return false;
}

static size_t
autolink_delim(uint8_t *data, size_t link_end, size_t max_rewind, size_t size)
static bool
autolink_delim(const uint8_t *data, struct autolink_pos *link)
{
uint8_t cclose, copen = 0;
size_t i;

for (i = 0; i < link_end; ++i)
for (i = link->start; i < link->end; ++i)
if (data[i] == '<') {
link_end = i;
link->end = i;
break;
}

while (link_end > 0) {
if (strchr("?!.,:", data[link_end - 1]) != NULL)
link_end--;
while (link->end > link->start) {
if (strchr("?!.,:", data[link->end - 1]) != NULL)
link->end--;

else if (data[link_end - 1] == ';') {
size_t new_end = link_end - 2;
else if (data[link->end - 1] == ';') {
size_t new_end = link->end - 2;

while (new_end > 0 && isalpha(data[new_end]))
while (new_end > 0 && rinku_isalnum(data[new_end]))
new_end--;

if (new_end < link_end - 2 && data[new_end] == '&')
link_end = new_end;
else
link_end--;
if (new_end < link->end - 2) {
if (new_end > 0 && data[new_end] == '#')
new_end--;

if (data[new_end] == '&') {
link->end = new_end;
continue;
}
}
link->end--;
}
else break;
}

if (link_end == 0)
return 0;
if (link->end == link->start)
return false;

cclose = data[link_end - 1];
cclose = data[link->end - 1];

switch (cclose) {
case '"': copen = '"'; break;
Expand All @@ -94,7 +101,7 @@ autolink_delim(uint8_t *data, size_t link_end, size_t max_rewind, size_t size)
if (copen != 0) {
size_t closing = 0;
size_t opening = 0;
size_t i = 0;
size_t i = link->start;

/* Try to close the final punctuation sign in this same line;
* if we managed to close it outside of the URL, that means that it's
Expand All @@ -116,7 +123,7 @@ autolink_delim(uint8_t *data, size_t link_end, size_t max_rewind, size_t size)
* => foo http://www.pokemon.com/Pikachu_(Electric)
*/

while (i < link_end) {
while (i < link->end) {
if (data[i] == copen)
opening++;
else if (data[i] == cclose)
Expand All @@ -126,170 +133,150 @@ autolink_delim(uint8_t *data, size_t link_end, size_t max_rewind, size_t size)
}

if (closing != opening)
link_end--;
link->end--;
}

return link_end;
return true;
}

static size_t
check_domain(uint8_t *data, size_t size, int allow_short)
static bool
check_domain(const uint8_t *data, size_t size,
struct autolink_pos *link, bool allow_short)
{
size_t i, np = 0;

if (!isalnum(data[0]))
return 0;
if (!rinku_isalnum(data[link->start]))
return false;

for (i = 1; i < size - 1; ++i) {
for (i = link->start + 1; i < size - 1; ++i) {
if (data[i] == '.') np++;
else if (!isalnum(data[i]) && data[i] != '-') break;
else if (!rinku_isalnum(data[i]) && data[i] != '-') break;
}

link->end = i;

if (allow_short) {
/* We don't need a valid domain in the strict sense (with
* at least one dot); so just make sure it's composed of valid
* domain characters and return the length of the valid
* sequence. */
return i;
return true;
} else {
/* a valid domain needs to have at least a dot.
* that's as far as we get */
return np ? i : 0;
return (np > 0);
}
}

size_t
sd_autolink__www(
size_t *rewind_p,
struct buf *link,
uint8_t *data,
size_t max_rewind,
bool
autolink__www(
struct autolink_pos *link,
const uint8_t *data,
size_t pos,
size_t size,
unsigned int flags)
{
size_t link_end;

if (max_rewind > 0 && !ispunct(data[-1]) && !isspace(data[-1]))
return 0;

if (size < 4 || memcmp(data, "www.", strlen("www.")) != 0)
return 0;
int32_t boundary;
assert(data[pos] == 'w' || data[pos] == 'W');

link_end = check_domain(data, size, 0);
if ((size - pos) < 4 ||
(data[pos + 1] != 'w' && data[pos + 1] != 'W') ||
(data[pos + 2] != 'w' && data[pos + 2] != 'W') ||
data[pos + 3] != '.')
return false;

if (link_end == 0)
return 0;
boundary = utf8proc_rewind(data, pos);
if (boundary &&
!utf8proc_is_space(boundary) &&
!utf8proc_is_punctuation(boundary))
return false;

while (link_end < size && !isspace(data[link_end]))
link_end++;
link->start = pos;
link->end = 0;

link_end = autolink_delim(data, link_end, max_rewind, size);
if (!check_domain(data, size, link, false))
return false;

if (link_end == 0)
return 0;

bufput(link, data, link_end);
*rewind_p = 0;

return (int)link_end;
link->end = utf8proc_find_space(data, link->end, size);
return autolink_delim(data, link);
}

size_t
sd_autolink__email(
size_t *rewind_p,
struct buf *link,
uint8_t *data,
size_t max_rewind,
bool
autolink__email(
struct autolink_pos *link,
const uint8_t *data,
size_t pos,
size_t size,
unsigned int flags)
{
size_t link_end, rewind;
int nb = 0, np = 0;
assert(data[pos] == '@');

link->start = pos;
link->end = pos;

for (rewind = 0; rewind < max_rewind; ++rewind) {
uint8_t c = data[-rewind - 1];
for (; link->start > 0; link->start--) {
uint8_t c = data[link->start - 1];

if (isalnum(c))
if (rinku_isalnum(c))
continue;

if (strchr(".+-_", c) != NULL)
if (strchr(".+-_%", c) != NULL)
continue;

break;
}

if (rewind == 0)
return 0;
if (link->start == pos)
return false;

for (link_end = 0; link_end < size; ++link_end) {
uint8_t c = data[link_end];
for (; link->end < size; link->end++) {
uint8_t c = data[link->end];

if (isalnum(c))
if (rinku_isalnum(c))
continue;

if (c == '@')
nb++;
else if (c == '.' && link_end < size - 1)
else if (c == '.' && link->end < size - 1)
np++;
else if (c != '-' && c != '_')
break;
}

if (link_end < 2 || nb != 1 || np == 0)
return 0;

link_end = autolink_delim(data, link_end, max_rewind, size);

if (link_end == 0)
return 0;
if ((link->end - pos) < 2 || nb != 1 || np == 0)
return false;

bufput(link, data - rewind, link_end + rewind);
*rewind_p = rewind;

return link_end;
return autolink_delim(data, link);
}

size_t
sd_autolink__url(
size_t *rewind_p,
struct buf *link,
uint8_t *data,
size_t max_rewind,
bool
autolink__url(
struct autolink_pos *link,
const uint8_t *data,
size_t pos,
size_t size,
unsigned int flags)
{
size_t link_end, rewind = 0, domain_len;

if (size < 4 || data[1] != '/' || data[2] != '/')
return 0;

while (rewind < max_rewind && isalpha(data[-rewind - 1]))
rewind++;

if (!sd_autolink_issafe(data - rewind, size + rewind))
return 0;

link_end = strlen("://");
assert(data[pos] == ':');

domain_len = check_domain(
data + link_end,
size - link_end,
flags & SD_AUTOLINK_SHORT_DOMAINS);
if ((size - pos) < 4 || data[pos + 1] != '/' || data[pos + 2] != '/')
return false;

if (domain_len == 0)
return 0;
link->start = pos + 3;
link->end = 0;

link_end += domain_len;
while (link_end < size && !isspace(data[link_end]))
link_end++;
if (!check_domain(data, size, link, flags & AUTOLINK_SHORT_DOMAINS))
return false;

link_end = autolink_delim(data, link_end, max_rewind, size);
link->start = pos;
link->end = utf8proc_find_space(data, link->end, size);

if (link_end == 0)
return 0;
while (link->start && rinku_isalpha(data[link->start - 1]))
link->start--;

bufput(link, data - rewind, link_end + rewind);
*rewind_p = rewind;
if (!autolink_issafe(data + link->start, size - link->start))
return false;

return link_end;
return autolink_delim(data, link);
}

0 comments on commit f3e8a28

Please sign in to comment.