thirdparty: update picohttpparser (#20843)

enghitalo · web-flow · commit 09c35acb091f · 2024-02-16T11:36:22.000+02:00
diff --git a/thirdparty/picohttpparser/src/picohttpparser.c b/thirdparty/picohttpparser/src/picohttpparser.c
@@ -241,6 +241,41 @@ static const char *is_complete(const char *buf, const char *buf_end, size_t last
         *valp_ += res_;                                                                                                            \
     } while (0)
 
+/* returned pointer is always within [buf, buf_end), or null */
+static const char *parse_token(const char *buf, const char *buf_end, const char **token, size_t *token_len, char next_char,
+                               int *ret)
+{
+    /* We use pcmpestri to detect non-token characters. This instruction can take no more than eight character ranges (8*2*8=128
+     * bits that is the size of a SSE register). Due to this restriction, characters `|` and `~` are handled in the slow loop. */
+    static const char ALIGNED(16) ranges[] = "\x00 "  /* control chars and up to SP */
+                                             "\"\""   /* 0x22 */
+                                             "()"     /* 0x28,0x29 */
+                                             ",,"     /* 0x2c */
+                                             "//"     /* 0x2f */
+                                             ":@"     /* 0x3a-0x40 */
+                                             "[]"     /* 0x5b-0x5d */
+                                             "{\xff"; /* 0x7b-0xff */
+    const char *buf_start = buf;
+    int found;
+    buf = findchar_fast(buf, buf_end, ranges, sizeof(ranges) - 1, &found);
+    if (!found) {
+        CHECK_EOF();
+    }
+    while (1) {
+        if (*buf == next_char) {
+            break;
+        } else if (!token_char_map[(unsigned char)*buf]) {
+            *ret = -1;
+            return NULL;
+        }
+        ++buf;
+        CHECK_EOF();
+    }
+    *token = buf_start;
+    *token_len = buf - buf_start;
+    return buf;
+}
+
 /* returned pointer is always within [buf, buf_end), or null */
 static const char *parse_http_version(const char *buf, const char *buf_end, int *minor_version, int *ret)
 {
@@ -280,31 +315,10 @@ static const char *parse_headers(const char *buf, const char *buf_end, struct ph
         if (!(*num_headers != 0 && (*buf == ' ' || *buf == '\t'))) {
             /* parsing name, but do not discard SP before colon, see
              * http://www.mozilla.org/security/announce/2006/mfsa2006-33.html */
-            headers[*num_headers].name = buf;
-            static const char ALIGNED(16) ranges1[] = "\x00 "  /* control chars and up to SP */
-                                                      "\"\""   /* 0x22 */
-                                                      "()"     /* 0x28,0x29 */
-                                                      ",,"     /* 0x2c */
-                                                      "//"     /* 0x2f */
-                                                      ":@"     /* 0x3a-0x40 */
-                                                      "[]"     /* 0x5b-0x5d */
-                                                      "{\377"; /* 0x7b-0xff */
-            int found;
-            buf = findchar_fast(buf, buf_end, ranges1, sizeof(ranges1) - 1, &found);
-            if (!found) {
-                CHECK_EOF();
-            }
-            while (1) {
-                if (*buf == ':') {
-                    break;
-                } else if (!token_char_map[(unsigned char)*buf]) {
-                    *ret = -1;
-                    return NULL;
-                }
-                ++buf;
-                CHECK_EOF();
+            if ((buf = parse_token(buf, buf_end, &headers[*num_headers].name, &headers[*num_headers].name_len, ':', ret)) == NULL) {
+                return NULL;
             }
-            if ((headers[*num_headers].name_len = buf - headers[*num_headers].name) == 0) {
+            if (headers[*num_headers].name_len == 0) {
                 *ret = -1;
                 return NULL;
             }
@@ -352,13 +366,17 @@ static const char *parse_request(const char *buf, const char *buf_end, const cha
     }
 
     /* parse request line */
-    ADVANCE_TOKEN(*method, *method_len);
+    if ((buf = parse_token(buf, buf_end, method, method_len, ' ', ret)) == NULL) {
+        return NULL;
+    }
     do {
         ++buf;
+        CHECK_EOF();
     } while (*buf == ' ');
     ADVANCE_TOKEN(*path, *path_len);
     do {
         ++buf;
+        CHECK_EOF();
     } while (*buf == ' ');
     if (*method_len == 0 || *path_len == 0) {
         *ret = -1;
@@ -422,6 +440,7 @@ static const char *parse_response(const char *buf, const char *buf_end, int *min
     }
     do {
         ++buf;
+        CHECK_EOF();
     } while (*buf == ' ');
     /* parse status code, we want at least [:digit:][:digit:][:digit:]<other char> to try to parse */
     if (buf_end - buf < 4) {
@@ -430,14 +449,15 @@ static const char *parse_response(const char *buf, const char *buf_end, int *min
     }
     PARSE_INT_3(status);
 
-    /* get message includig preceding space */
+    /* get message including preceding space */
     if ((buf = get_token_to_eol(buf, buf_end, msg, msg_len, ret)) == NULL) {
         return NULL;
     }
     if (*msg_len == 0) {
         /* ok */
     } else if (**msg == ' ') {
-        /* remove preceding space */
+        /* Remove preceding space. Successful return from `get_token_to_eol` guarantees that we would hit something other than SP
+         * before running past the end of the given buffer. */
         do {
             ++*msg;
             --*msg_len;
@@ -525,6 +545,8 @@ ssize_t phr_decode_chunked(struct phr_chunked_decoder *decoder, char *buf, size_
     size_t dst = 0, src = 0, bufsz = *_bufsz;
     ssize_t ret = -2; /* incomplete */
 
+    decoder->_total_read += bufsz;
+
     while (1) {
         switch (decoder->_state) {
         case CHUNKED_IN_CHUNK_SIZE:
@@ -537,6 +559,18 @@ ssize_t phr_decode_chunked(struct phr_chunked_decoder *decoder, char *buf, size_
                         ret = -1;
                         goto Exit;
                     }
+                    /* the only characters that may appear after the chunk size are BWS, semicolon, or CRLF */
+                    switch (buf[src]) {
+                    case ' ':
+                    case '\011':
+                    case ';':
+                    case '\012':
+                    case '\015':
+                        break;
+                    default:
+                        ret = -1;
+                        goto Exit;
+                    }
                     break;
                 }
                 if (decoder->_hex_count == sizeof(size_t) * 2) {
@@ -632,6 +666,12 @@ ssize_t phr_decode_chunked(struct phr_chunked_decoder *decoder, char *buf, size_
     if (dst != src)
         memmove(buf + dst, buf + src, bufsz - src);
     *_bufsz = dst;
+    /* if incomplete but the overhead of the chunked encoding is >=100KB and >80%, signal an error */
+    if (ret == -2) {
+        decoder->_total_overhead += bufsz - dst;
+        if (decoder->_total_overhead >= 100 * 1024 && decoder->_total_read - decoder->_total_overhead < decoder->_total_read / 4)
+            ret = -1;
+    }
     return ret;
 }
 
diff --git a/thirdparty/picohttpparser/src/picohttpparser.h b/thirdparty/picohttpparser/src/picohttpparser.h
@@ -27,6 +27,7 @@
 #ifndef picohttpparser_h
 #define picohttpparser_h
 
+#include <stdint.h>
 #include <sys/types.h>
 
 #ifdef _MSC_VER
@@ -39,12 +40,12 @@ extern "C" {
 
 /* contains name and value of a header (name == NULL if is a continuing line
  * of a multiline header */
-typedef struct phr_header {
+struct phr_header {
     const char *name;
     size_t name_len;
     const char *value;
     size_t value_len;
-}phr_header;
+};
 
 /* returns number of bytes consumed if successful, -2 if request is partial,
  * -1 if failed */
@@ -64,6 +65,8 @@ struct phr_chunked_decoder {
     char consume_trailer;       /* if trailing headers should be consumed */
     char _hex_count;
     char _state;
+    uint64_t _total_read;
+    uint64_t _total_overhead;
 };
 
 /* the function rewrites the buffer given as (buf, bufsz) removing the chunked-
@@ -72,8 +75,8 @@ struct phr_chunked_decoder {
  * repeatedly call the function while it returns -2 (incomplete) every time
  * supplying newly arrived data.  If the end of the chunked-encoded data is
  * found, the function returns a non-negative number indicating the number of
- * octets left undecoded at the tail of the supplied buffer.  Returns -1 on
- * error.
+ * octets left undecoded, that starts from the offset returned by `*bufsz`.
+ * Returns -1 on error.
  */
 ssize_t phr_decode_chunked(struct phr_chunked_decoder *decoder, char *buf, size_t *bufsz);