Permalink
Browse files

make scriptfilter work

  • Loading branch information...
1 parent be72d1d commit 457f0ae359ae1db09c4409bfca82119fff5f0f92 @thejh committed Jun 3, 2012
Showing with 134 additions and 18 deletions.
  1. +11 −0 conf/domains.blacklist
  2. +15 −0 conf/script_texts.blacklist
  3. +1 −1 src/Makefile
  4. +43 −2 src/blacklist.c
  5. +1 −1 src/blacklist.h
  6. +63 −14 src/main.c
@@ -1,5 +1,6 @@
b platform.twitter.com
b search.twitter.com
+b p.twitter.com
b facebook.net
b facebook.com
b chartbeat.net
@@ -77,6 +78,16 @@ b ic.tynt.com
b getclicky.com
b googletagservices.com
b maxymiser.com
+b webtrekk.net
+b effectivemeasure.net
+b edgesuite.net
+b gaug.es
# unfortunate... but I can't think of a better way right now
b disqus.com
+
+
+
+f heise.de
+f golem.de
+f gulli.com
@@ -0,0 +1,15 @@
+src="http://rl.heise.de/
+src="http://ad-emea.doubleclick.net/
+img id="avw_pixel_intern" src="/avw-bin/ivw/
+src="http://heise.ivwbox.de/
+src="http://heise.met.vgwort.de/
+document.write("<IMG SRC=\"" + IVW
+document.write("<img src=\""+IVW
+src="http://ad.de.doubleclick.net
+<img src='http://ads.golem.de/
+.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
+GS_googleEnableAllServices();
+GA_googleAddSlot
+GA_googleFetchAds()
+GA_googleFillSlot
+var _comscore = _comscore || [];
View
@@ -1,4 +1,4 @@
-LIBS = -lev `pkg-config --libs glib-2.0`
+LIBS = -lev -lz `pkg-config --libs glib-2.0`
FLAGS = -g -Wall -Werror -Wno-strict-aliasing -Wno-unused-variable -Wno-unused-function -std=c99 -Wno-missing-braces `pkg-config --cflags glib-2.0`
OBJS = headers.o memory.o main.o ev_helpers.o outstream.o helpers.o easy_hashtable.o blacklist.o
View
@@ -1,4 +1,5 @@
#define _XOPEN_SOURCE 500
+#define _GNU_SOURCE
#include <stdio.h>
#include <assert.h>
@@ -19,6 +20,9 @@ struct bl_entry {
struct bl_entry *blacklist = NULL;
int blacklist_size = 0;
+char **script_badwords = NULL;
+int script_badwords_size = 0;
+
static struct bl_entry *get_bl_entry(char *host) {
char predotted_bl_entry[1030];
@@ -46,6 +50,7 @@ static struct bl_entry *get_bl_entry_by_url(char *url, int url_size) {
}
void reload_blacklist() {
+ // domains blacklist
FILE *f = fopen("../conf/domains.blacklist", "r");
char line[1024];
blacklist_size = 0;
@@ -63,7 +68,7 @@ void reload_blacklist() {
char *n_i = strchr(line, '\n');
if (r_i != NULL) *r_i = '\0';
if (n_i != NULL) *n_i = '\0';
- blacklist[i++].domain = strdup(line+2);
+ blacklist[i].domain = strdup(line+2);
switch (line[0]) {
case 'b': {
blacklist[i].forbidden = 1;
@@ -76,9 +81,31 @@ void reload_blacklist() {
break;
}
}
+ i++;
}
assert(i == blacklist_size);
fclose(f);
+
+ // script badwords
+ f = fopen("../conf/script_texts.blacklist", "r");
+ script_badwords_size = 0;
+ while (fgets(line, 1024, f) != NULL) {
+ if (strlen(line) < 3) continue;
+ script_badwords_size++;
+ }
+ script_badwords = malloc(sizeof(char *) * script_badwords_size);
+ rewind(f);
+ i = 0;
+ while (fgets(line, 1024, f) != NULL) {
+ if (strlen(line) < 3) continue;
+ char *r_i = strchr(line, '\r');
+ char *n_i = strchr(line, '\n');
+ if (r_i != NULL) *r_i = '\0';
+ if (n_i != NULL) *n_i = '\0';
+ script_badwords[i++] = strdup(line);
+ }
+ assert(i == script_badwords_size);
+ fclose(f);
}
int bl_check(char *host) {
@@ -87,9 +114,23 @@ int bl_check(char *host) {
return e->forbidden;
}
+/*
+ * Script-block filter: scans the buffered block for any of the blacklisted
+ * text fragments held in script_badwords.
+ *
+ * buf and buf_len are in-out parameters. The input buffer is only read; it is
+ * never freed or modified here. On success *buf points to newly allocated,
+ * NUL-terminated memory owned by the caller:
+ *   - if a fragment was found, the 9-byte text "</script>" (the script body
+ *     is dropped and only the closing tag is kept);
+ *   - otherwise, a copy of the original *buf_len bytes.
+ * If the allocation fails, *buf is set to NULL and *buf_len to 0, so the
+ * caller never receives the input pointer back as if it were a fresh buffer.
+ *
+ * Returns 0 on success, 1 if memory could not be allocated.
+ */
+static char simple_data_filter(char **buf, size_t *buf_len) {
+    static const char replacement[] = "</script>";
+    const char *input = *buf;
+    size_t input_len = *buf_len;
+    int matched = 0;
+
+    for (int i = 0; i < script_badwords_size; i++) {
+        size_t word_len = strlen(script_badwords[i]);
+        // memmem() reports a hit for an empty needle; a blank entry must not
+        // cause every script block to be dropped.
+        if (word_len == 0) continue;
+        if (memmem(input, input_len, script_badwords[i], word_len) != NULL) {
+            matched = 1;
+            break;
+        }
+    }
+
+    const char *src = matched ? replacement : input;
+    size_t out_len = matched ? sizeof replacement - 1 : input_len;
+
+    // One extra byte for the terminator also keeps the request non-zero, so a
+    // NULL result always means failure rather than a legal malloc(0) reply.
+    char *out = malloc(out_len + 1);
+    if (out == NULL) {
+        *buf = NULL;
+        *buf_len = 0;
+        return 1;
+    }
+    if (out_len > 0) memcpy(out, src, out_len);
+    out[out_len] = '\0';
+
+    *buf = out;
+    *buf_len = out_len;
+    return 0;
+}
+
+
// Selects the body filter to use for a response to the given request URL.
// The URL is looked up with get_bl_entry_by_url(); NULL is returned when no
// entry matches or when the matching entry has its `filter` field set to 0.
// Otherwise the added line returns simple_data_filter (the removed line
// returned NULL in that case as well, behind a `YADA` marker).
data_filter *bl_get_data_filter(char *url, int url_size) {
struct bl_entry *e = get_bl_entry_by_url(url, url_size);
if (e == NULL || e->filter == 0) return NULL;
- YADA return NULL;
+ return simple_data_filter;
}
View
@@ -1,4 +1,4 @@
-typedef char data_filter(char *, int); // FIXME!
+typedef char data_filter(char **buf, size_t *buf_len); // FIXME!
void reload_blacklist();
int bl_check(char *host);
View
@@ -16,6 +16,7 @@
#include <signal.h>
#include <libev/ev.h>
+#include <zlib.h>
#include "../deps/http-parser/http_parser.h"
@@ -86,6 +87,7 @@ enum data_filter_state {
FILTER_INACTIVE_AWAIT_P,
FILTER_INACTIVE_AWAIT_T,
FILTER_INACTIVE_AWAIT_CLOSEBRACKET_OR_SPACE,
+ FILTER_INACTIVE_AWAIT_CLOSEBRACKET,
FILTER_ACTIVE_AWAIT_OPENBRACKET,
FILTER_ACTIVE_AWAIT_SLASH,
FILTER_ACTIVE_AWAIT_S,
@@ -111,6 +113,8 @@ struct http_agent {
data_filter *data_filter;
enum data_filter_state data_filter_state;
unsigned char *data_filter_buffer;
+ int data_filter_buffer_index;
+ int data_filter_buffer_size;
char ungzip_needed;
z_stream ungzipper;
@@ -315,6 +319,9 @@ int on_server_headers_complete(http_parser *p) {
int status_len = asprintf(&status_str, "%i", p->status_code);
a->data_filter = bl_get_data_filter(a->client->url, a->client->url_size);
+ a->data_filter_buffer = 0;
+ a->data_filter_buffer_index = 0;
+ a->data_filter_buffer_size = 0;
if (a->data_filter != NULL) {
a->data_filter_state = FILTER_INACTIVE_AWAIT_OPENBRACKET;
struct http_header *h = a->response_headers;
@@ -343,10 +350,10 @@ int on_server_headers_complete(http_parser *p) {
if (a->data_filter != NULL) {
if (strcasecmp(h->key, "Content-Encoding") == 0 && strcasecmp(h->value, "gzip") == 0) {
a->ungzip_needed = 1;
- assert(inflateInit(&a->ungzipper) == Z_OK);
a->ungzipper.zalloc = Z_NULL;
a->ungzipper.zfree = Z_NULL;
a->ungzipper.opaque = NULL;
+ assert(inflateInit2(&a->ungzipper, 16+MAX_WBITS) == Z_OK);
goto discard_header;
}
}
@@ -400,29 +407,35 @@ int on_server_body(http_parser *p, const char *data, size_t size) {
size_t dsize = size;
if (a->ungzip_needed) {
- assert(a->ungzipper.avail_in == 0);
a->ungzipper.next_in = (unsigned char *) data;
a->ungzipper.avail_in = size;
size_t buffer_size = 2 * size;
- a->avail_out = buffer_size;
+ a->ungzipper.avail_out = buffer_size;
d = malloc(buffer_size);
- a->next_out = (unsigned char *) d;
+ a->ungzipper.next_out = (unsigned char *) d;
while (1) {
int res = inflate(&a->ungzipper, Z_SYNC_FLUSH);
assert(res == Z_OK || res == Z_STREAM_END);
if (a->ungzipper.avail_in == 0) break;
- int written = a->next_out - d;
+ int written = ((char *)a->ungzipper.next_out) - d;
d = realloc(d, buffer_size * 2);
- a->next_out = d + written;
- a->avail_out += buffer_size;
+ a->ungzipper.next_out = (Bytef *) (d + written);
+ a->ungzipper.avail_out += buffer_size;
buffer_size *= 2;
}
- dsize = a->next_out - d;
+ dsize = ((char *)a->ungzipper.next_out) - d;
}
if (a->data_filter) {
for (int i=0; i<dsize; i++) {
char c = d[i];
+ if (a->data_filter_buffer != NULL) {
+ if (a->data_filter_buffer_index == a->data_filter_buffer_size) {
+ a->data_filter_buffer_size *= 2;
+ a->data_filter_buffer = safe_realloc(a->data_filter_buffer, a->data_filter_buffer_size);
+ }
+ a->data_filter_buffer[a->data_filter_buffer_index++] = c;
+ }
switch (a->data_filter_state) {
case FILTER_INACTIVE_AWAIT_OPENBRACKET:
if (c == '<') a->data_filter_state = FILTER_INACTIVE_AWAIT_S;
@@ -452,16 +465,35 @@ int on_server_body(http_parser *p, const char *data, size_t size) {
else a->data_filter_state = FILTER_INACTIVE_AWAIT_OPENBRACKET;
break;
case FILTER_INACTIVE_AWAIT_CLOSEBRACKET_OR_SPACE:
- if (c == '>' || c == ' ') {
- // FIXME XXX TODO
- a->data_filter_buffer =
+ if (c != '>' && c != ' ') {
+ a->data_filter_state = FILTER_INACTIVE_AWAIT_OPENBRACKET;
+ break;
+ }
+ a->data_filter_state = FILTER_INACTIVE_AWAIT_CLOSEBRACKET;
+ // NO `break;`!
+ case FILTER_INACTIVE_AWAIT_CLOSEBRACKET:
+ if (c == '>') {
+ // first send out the chunk we already have
+ char *prescript_d = d;
+ size_t prescript_size = i + 1; // include this char!
+ chunkify(&prescript_d, &prescript_size, 0);
+ if (outstream_send(&a->client->outstream, prescript_d, prescript_size) != 0) return 1;
+
+ // now start buffering the rest
+ assert(a->data_filter_buffer == NULL);
+ a->data_filter_buffer_size = 1024;
+ a->data_filter_buffer = malloc(a->data_filter_buffer_size);
a->data_filter_state = FILTER_ACTIVE_AWAIT_OPENBRACKET;
+ a->data_filter_buffer_index = 0;
}
- else a->data_filter_state = FILTER_INACTIVE_AWAIT_OPENBRACKET;
break;
case FILTER_ACTIVE_AWAIT_OPENBRACKET:
- if (c == '<') a->data_filter_state = FILTER_ACTIVE_AWAIT_S;
+ if (c == '<') a->data_filter_state = FILTER_ACTIVE_AWAIT_SLASH;
+ break;
+ case FILTER_ACTIVE_AWAIT_SLASH:
+ if (c == '/') a->data_filter_state = FILTER_ACTIVE_AWAIT_S;
+ else a->data_filter_state = FILTER_ACTIVE_AWAIT_OPENBRACKET;
break;
case FILTER_ACTIVE_AWAIT_S:
if (c == 's' || c == 'S') a->data_filter_state = FILTER_ACTIVE_AWAIT_C;
@@ -489,13 +521,30 @@ int on_server_body(http_parser *p, const char *data, size_t size) {
break;
case FILTER_ACTIVE_AWAIT_CLOSEBRACKET:
if (c == '>') {
- // FIXME XXX TODO
+ // process the script block and send the result
+ char *script_data = (char *) a->data_filter_buffer;
+ size_t script_data_length = a->data_filter_buffer_index;
+ a->data_filter(&script_data, &script_data_length);
+ if (script_data_length > 0) {
+ chunkify(&script_data, &script_data_length, 1);
+ outstream_send(&a->client->outstream, script_data, script_data_length);
+ } else {
+ free(script_data);
+ }
+ free(a->data_filter_buffer);
+
+ // change the mode back
+ d += i + 1;
+ dsize -= i + 1;
+ i = -1;
+ a->data_filter_buffer = NULL;
a->data_filter_state = FILTER_INACTIVE_AWAIT_OPENBRACKET;
}
else a->data_filter_state = FILTER_ACTIVE_AWAIT_OPENBRACKET;
break;
}
}
+ if (a->data_filter_buffer != NULL) return 0;
}
assert(dsize != 0);

0 comments on commit 457f0ae

Please sign in to comment.