Skip to content

Loading…

add string encoding support for 1.9 #1

Closed
wants to merge 6 commits into from

4 participants

@brianmario

The code could stand to be cleaned up in a few places, but the functionality is there

@igrigorik

Silly question.. Any reason why this wasn't merged 2 years ago? :)

@thbar

Hello! Is there still a plan to merge this?

@thbar

Somehow related, in HTTPClient which handles this automatically (in case it helps the poor soul coming up here):

@sferik
Collaborator

This code no longer merges into master. If somebody rebases this branch, I will gladly merge it.

@brianmario

I'm cleaning up some old pulls, going to close this one out. Someone could use this as a starting point to add encoding support against master though...

@brianmario brianmario closed this
@brianmario brianmario deleted the brianmario:string-encoding branch
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Commits on Oct 21, 2010
  1. @brianmario
  2. @brianmario

    get rid of compiler warning

    brianmario committed
  3. @brianmario
  4. @brianmario

    add support for string encodings in 1.9.2, respecting Encoding.defaul…

    brianmario committed
    …t_internal if it's set. For the body, we try and find the encoding in the Content-Type header and use that if we were able to find a matching ruby string encoding
  5. @brianmario

    update comment for strnstr

    brianmario committed
  6. @brianmario
Showing with 167 additions and 11 deletions.
  1. +2 −0 ext/ruby_http_parser/extconf.rb
  2. +98 −9 ext/ruby_http_parser/ruby_http_parser.c
  3. +67 −2 spec/parser_spec.rb
View
2 ext/ruby_http_parser/extconf.rb
@@ -2,6 +2,8 @@
http_parser_dir = File.expand_path('../vendor/http-parser', __FILE__)
$CFLAGS << " -I#{http_parser_dir} "
+$CFLAGS << "-ggdb3 -O0 -Wall -Wextra" if ENV['DEBUG']
+have_func("strnstr")
dir_config("ruby_http_parser")
create_makefile("ruby_http_parser")
View
107 ext/ruby_http_parser/ruby_http_parser.c
@@ -2,14 +2,67 @@
#include "ext_help.h"
#include "http_parser.c"
+/*
+ * Find the first occurrence of find in s, where the search is limited to the
+ * first slen characters of s.
+ *
+ * NOTE: we use strnstr below for header matching so we don't need to
+ * memcpy then memcmp from the input buffer
+ */
+#ifndef HAVE_STRNSTR
+static char *strnstr(const char *s, const char *find, size_t slen) {
+ char c, sc;
+ size_t len;
+
+ if ((c = *find++) != '\0') {
+ len = strlen(find);
+ do {
+ do {
+ if (slen-- < 1 || (sc = *s++) == '\0')
+ return (NULL);
+ } while (sc != c);
+ if (len > slen)
+ return (NULL);
+ } while (strncmp(s, find, len) != 0);
+ s--;
+ }
+ return ((char *)s);
+}
+#endif
+
#define GET_WRAPPER(N, from) ParserWrapper *N = (ParserWrapper *)(from)->data;
+
+#ifdef HAVE_RUBY_ENCODING_H
+#include <ruby/encoding.h>
+#define STR_NEW(str, len, rb_enc) \
+ ({ \
+ VALUE _string; \
+ rb_encoding *internal_encoding = rb_default_internal_encoding(); \
+ if(rb_enc) { \
+ _string = rb_enc_str_new(str, len, rb_enc); \
+ } else { \
+ _string = rb_str_new(str, len); \
+ } \
+ if(internal_encoding) { \
+ _string = rb_str_export_to_enc(_string, internal_encoding); \
+ } \
+ _string; \
+ })
+
+#else
+
+#define STR_NEW(str, len, rb_enc) \
+ rb_str_new(str, len)
+
+#endif
+
#define HASH_CAT(h, k, ptr, len) \
do { \
VALUE __v = rb_hash_aref(h, k); \
if (__v != Qnil) { \
rb_str_cat(__v, ptr, len); \
} else { \
- rb_hash_aset(h, k, rb_str_new(ptr, len)); \
+ rb_hash_aset(h, k, STR_NEW(ptr, len, 0)); \
} \
} while(0)
@@ -29,6 +82,9 @@ typedef struct ParserWrapper {
VALUE on_message_complete;
VALUE last_field_name;
+#ifdef HAVE_RUBY_ENCODING_H
+ rb_encoding *body_encoding;
+#endif
const char *last_field_name_at;
size_t last_field_name_length;
@@ -91,10 +147,10 @@ static VALUE sCall;
int on_message_begin(http_parser *parser) {
GET_WRAPPER(wrapper, parser);
- wrapper->request_url = rb_str_new2("");
- wrapper->request_path = rb_str_new2("");
- wrapper->query_string = rb_str_new2("");
- wrapper->fragment = rb_str_new2("");
+ wrapper->request_url = STR_NEW("", 0, 0);
+ wrapper->request_path = STR_NEW("", 0, 0);
+ wrapper->query_string = STR_NEW("", 0, 0);
+ wrapper->fragment = STR_NEW("", 0, 0);
wrapper->headers = rb_hash_new();
if (wrapper->on_message_begin != Qnil) {
@@ -147,7 +203,27 @@ int on_header_value(http_parser *parser, const char *at, size_t length) {
GET_WRAPPER(wrapper, parser);
if (wrapper->last_field_name == Qnil) {
- wrapper->last_field_name = rb_str_new(wrapper->last_field_name_at, wrapper->last_field_name_length);
+#ifdef HAVE_RUBY_ENCODING_H
+ rb_encoding *rb_enc = NULL;
+ char *enc = NULL;
+ char *found = strnstr(wrapper->last_field_name_at, "Content-Type", wrapper->last_field_name_length);
+ if(found) {
+ enc = strnstr(at, "charset=", length);
+ if (enc) {
+ enc += sizeof("charset=")-1;
+ size_t enc_len = (size_t)length-(enc-at);
+ char encoding[enc_len+1];
+ memcpy(encoding, enc, enc_len);
+ encoding[enc_len] = 0;
+ rb_enc = rb_enc_find(encoding);
+ if (rb_enc) {
+ wrapper->body_encoding = rb_enc;
+ }
+ }
+ }
+#endif
+
+ wrapper->last_field_name = STR_NEW(wrapper->last_field_name_at, wrapper->last_field_name_length, 0);
wrapper->last_field_name_at = NULL;
wrapper->last_field_name_length = 0;
}
@@ -171,7 +247,17 @@ int on_body(http_parser *parser, const char *at, size_t length) {
GET_WRAPPER(wrapper, parser);
if (wrapper->on_body != Qnil) {
- rb_funcall(wrapper->on_body, sCall, 1, rb_str_new(at, length));
+ VALUE body_chunk = Qnil;
+#ifdef HAVE_RUBY_ENCODING_H
+ if (wrapper->body_encoding) {
+ body_chunk = STR_NEW(at, length, wrapper->body_encoding);
+ } else {
+ body_chunk = STR_NEW(at, length, 0);
+ }
+#else
+ body_chunk = STR_NEW(at, length, 0);
+#endif
+ rb_funcall(wrapper->on_body, sCall, 1, body_chunk);
}
return 0;
@@ -204,6 +290,9 @@ VALUE Parser_alloc_by_type(VALUE klass, enum http_parser_type type) {
ParserWrapper *wrapper = ALLOC_N(ParserWrapper, 1);
wrapper->type = type;
wrapper->parser.data = wrapper;
+#ifdef HAVE_RUBY_ENCODING_H
+ wrapper->body_encoding = NULL;
+#endif
ParserWrapper_init(wrapper);
@@ -225,7 +314,7 @@ VALUE ResponseParser_alloc(VALUE klass) {
VALUE Parser_execute(VALUE self, VALUE data) {
ParserWrapper *wrapper = NULL;
char *ptr = RSTRING_PTR(data);
- long len = RSTRING_LEN(data);
+ unsigned long len = RSTRING_LEN(data);
DATA_GET(self, ParserWrapper, wrapper);
@@ -312,7 +401,7 @@ VALUE Parser_http_method(VALUE self) {
DATA_GET(self, ParserWrapper, wrapper);
if (wrapper->parser.type == HTTP_REQUEST)
- return rb_str_new2(http_method_str(wrapper->parser.method));
+ return STR_NEW(http_method_str(wrapper->parser.method), strlen(http_method_str(wrapper->parser.method)), 0);
else
return Qnil;
}
View
69 spec/parser_spec.rb
@@ -2,27 +2,37 @@
require "json"
describe HTTP::Parser do
- before do
+ before(:each) do
@parser = HTTP::Parser.new
@headers = nil
@body = ""
+ @encoding = 'utf-8'
@started = false
@done = false
@parser.on_message_begin = proc{ @started = true }
@parser.on_headers_complete = proc { |e| @headers = e }
- @parser.on_body = proc { |chunk| @body << chunk }
+ @parser.on_body = proc { |chunk|
+ @body << chunk
+ }
@parser.on_message_complete = proc{ @done = true }
end
it "should implement basic api" do
+ if defined? Encoding
+ orig_internal_enc = Encoding.default_internal
+ Encoding.default_internal = nil
+ @rb_encoding = Encoding.find(@encoding)
+ @body.encode!(@rb_encoding)
+ end
@parser <<
"GET /test?ok=1 HTTP/1.1\r\n" +
"User-Agent: curl/7.18.0\r\n" +
"Host: 0.0.0.0:5000\r\n" +
"Accept: */*\r\n" +
"Content-Length: 5\r\n" +
+ "Content-Type: text/plain; encoding=#{@encoding}\r\n" +
"\r\n" +
"World"
@@ -45,6 +55,61 @@
@parser.headers['Host'].should == '0.0.0.0:5000'
@body.should == "World"
+ if defined? Encoding
+ @body.encoding.should eql(@rb_encoding)
+ Encoding.default_internal = orig_internal_enc
+ end
+ end
+
+ if defined? Encoding
+ it "should implement basic api respecting Encoding.default_internal" do
+ orig_internal_enc = Encoding.default_internal
+ enc = 'Windows-1256'
+ rb_enc = Encoding.find enc
+ Encoding.default_internal = rb_enc
+ @body.encode!(rb_enc)
+
+ @parser <<
+ "GET /test?ok=1 HTTP/1.1\r\n" +
+ "User-Agent: curl/7.18.0\r\n" +
+ "Host: 0.0.0.0:5000\r\n" +
+ "Accept: */*\r\n" +
+ "Content-Length: 5\r\n" +
+ "Content-Type: text/plain; encoding=#{@encoding}\r\n" +
+ "\r\n" +
+ "World"
+
+ @started.should be_true
+ @done.should be_true
+
+ @parser.http_major.should == 1
+ @parser.http_minor.should == 1
+ @parser.http_version.should == [1,1]
+ @parser.http_method.should == 'GET'
+ @parser.http_method.encoding.should eql(rb_enc)
+ @parser.status_code.should be_nil
+
+ @parser.request_url.should == '/test?ok=1'
+ @parser.request_url.encoding.should eql(rb_enc)
+ @parser.request_path.should == '/test'
+ @parser.request_path.encoding.should eql(rb_enc)
+ @parser.query_string.should == 'ok=1'
+ @parser.query_string.encoding.should eql(rb_enc)
+ @parser.fragment.should be_empty
+
+ @parser.headers.should == @headers
+ @parser.headers.keys.each do |key|
+ key.encoding.should eql(rb_enc)
+ end
+ @parser.headers['User-Agent'].should == 'curl/7.18.0'
+ @parser.headers['User-Agent'].encoding.should eql(rb_enc)
+ @parser.headers['Host'].should == '0.0.0.0:5000'
+ @parser.headers['Host'].encoding.should eql(rb_enc)
+
+ @body.should == "World"
+ @body.encoding.should eql(rb_enc)
+ Encoding.default_internal = orig_internal_enc
+ end
end
%w[ request response ].each do |type|
Something went wrong with that request. Please try again.