diff --git a/source/net/yacy/http/servlets/UrlProxyServlet.java b/source/net/yacy/http/servlets/UrlProxyServlet.java index 88816659bc..7b3e69dccb 100644 --- a/source/net/yacy/http/servlets/UrlProxyServlet.java +++ b/source/net/yacy/http/servlets/UrlProxyServlet.java @@ -111,181 +111,178 @@ public void service (ServletRequest req, ServletResponse res) throws ServletExce if ("CONNECT".equalsIgnoreCase(request.getMethod())) { return; - } else { - - final Continuation continuation = ContinuationSupport.getContinuation(request); + } + final Continuation continuation = ContinuationSupport.getContinuation(request); - if (!continuation.isInitial()) { - response.sendError(HttpServletResponse.SC_GATEWAY_TIMEOUT); // Need better test that isInitial - return; - } - // 2 - get target url - URL proxyurl = null; - String strARGS = request.getQueryString(); - if (strARGS == null) { - response.sendError(HttpServletResponse.SC_NOT_FOUND,"url parameter missing"); - return; - } + if (!continuation.isInitial()) { + response.sendError(HttpServletResponse.SC_GATEWAY_TIMEOUT); // Need better test that isInitial + return; + } + // 2 - get target url + URL proxyurl = null; + String strARGS = request.getQueryString(); + if (strARGS == null) { + response.sendError(HttpServletResponse.SC_NOT_FOUND,"url parameter missing"); + return; + } - if (strARGS.startsWith("url=")) { - final String strUrl = strARGS.substring(4); // strip "url=" + if (strARGS.startsWith("url=")) { + final String strUrl = strARGS.substring(4); // strip "url=" - try { - proxyurl = new URL(strUrl); - } catch (final MalformedURLException e) { - proxyurl = new URL(URLDecoder.decode(strUrl, UTF8.charset.name())); + try { + proxyurl = new URL(strUrl); + } catch (final MalformedURLException e) { + proxyurl = new URL(URLDecoder.decode(strUrl, UTF8.charset.name())); - } - } - if (proxyurl == null) { - response.sendError(HttpServletResponse.SC_NOT_FOUND,"url parameter missing"); - return; } + } + if (proxyurl == null) { + response.sendError(HttpServletResponse.SC_NOT_FOUND,"url parameter missing"); + return; + } - String hostwithport = proxyurl.getHost(); - if (proxyurl.getPort() != -1) { - hostwithport += ":" + proxyurl.getPort(); - } - // 4 - get target url - RequestHeader yacyRequestHeader = ProxyHandler.convertHeaderFromJetty(request); - yacyRequestHeader.remove(RequestHeader.KEEP_ALIVE); - yacyRequestHeader.remove(HeaderFramework.CONTENT_LENGTH); - - final HashMap prop = new HashMap(); - prop.put(HeaderFramework.CONNECTION_PROP_HTTP_VER, HeaderFramework.HTTP_VERSION_1_1); - prop.put(HeaderFramework.CONNECTION_PROP_HOST, hostwithport); - prop.put(HeaderFramework.CONNECTION_PROP_PATH, proxyurl.getPath().replaceAll(" ", "%20")); - if (proxyurl.getQuery() != null) prop.put(HeaderFramework.CONNECTION_PROP_ARGS, proxyurl.getQuery()); - prop.put(HeaderFramework.CONNECTION_PROP_CLIENTIP, Domains.LOCALHOST); + String hostwithport = proxyurl.getHost(); + if (proxyurl.getPort() != -1) { + hostwithport += ":" + proxyurl.getPort(); + } + // 4 - get target url + RequestHeader yacyRequestHeader = ProxyHandler.convertHeaderFromJetty(request); + yacyRequestHeader.remove(RequestHeader.KEEP_ALIVE); + yacyRequestHeader.remove(HeaderFramework.CONTENT_LENGTH); + + final HashMap prop = new HashMap(); + prop.put(HeaderFramework.CONNECTION_PROP_HTTP_VER, HeaderFramework.HTTP_VERSION_1_1); + prop.put(HeaderFramework.CONNECTION_PROP_HOST, hostwithport); + prop.put(HeaderFramework.CONNECTION_PROP_PATH, proxyurl.getPath().replaceAll(" ", "%20")); + if (proxyurl.getQuery() != null) prop.put(HeaderFramework.CONNECTION_PROP_ARGS, proxyurl.getQuery()); + prop.put(HeaderFramework.CONNECTION_PROP_CLIENTIP, Domains.LOCALHOST); - yacyRequestHeader.put(HeaderFramework.HOST, hostwithport ); - yacyRequestHeader.put(HeaderFramework.CONNECTION_PROP_PATH, proxyurl.getPath()); + yacyRequestHeader.put(HeaderFramework.HOST, hostwithport ); + yacyRequestHeader.put(HeaderFramework.CONNECTION_PROP_PATH, proxyurl.getPath()); - // 4 & 5 get & index target url - final ByteArrayOutputStream tmpproxyout = new ByteArrayOutputStream(); - HTTPDProxyHandler.doGet(prop, yacyRequestHeader, tmpproxyout, ClientIdentification.yacyProxyAgent); + // 4 & 5 get & index target url + final ByteArrayOutputStream tmpproxyout = new ByteArrayOutputStream(); + HTTPDProxyHandler.doGet(prop, yacyRequestHeader, tmpproxyout, ClientIdentification.yacyProxyAgent); - // reparse header to extract content-length and mimetype - final ResponseHeader proxyResponseHeader = new ResponseHeader(200); // - InputStream proxyout = new ByteArrayInputStream(tmpproxyout.toByteArray()); - String line = readLine(proxyout); - while (line != null && !line.equals("")) { - int p; - if ((p = line.indexOf(':')) >= 0) { - // store a property - proxyResponseHeader.put(line.substring(0, p).trim(), line.substring(p + 1).trim()); - } - line = readLine(proxyout); - } - if (line == null) { - response.sendError(HttpServletResponse.SC_INTERNAL_SERVER_ERROR,"Proxy Header missing"); - return; + // reparse header to extract content-length and mimetype + final ResponseHeader proxyResponseHeader = new ResponseHeader(200); // + InputStream proxyout = new ByteArrayInputStream(tmpproxyout.toByteArray()); + String line = readLine(proxyout); + while (line != null && !line.equals("")) { + int p; + if ((p = line.indexOf(':')) >= 0) { + // store a property + proxyResponseHeader.put(line.substring(0, p).trim(), line.substring(p + 1).trim()); } + line = readLine(proxyout); + } + if (line == null) { + response.sendError(HttpServletResponse.SC_INTERNAL_SERVER_ERROR,"Proxy Header missing"); + return; + } - if (proxyResponseHeader.containsKey(HeaderFramework.LOCATION)) { - // rewrite location header - String location = proxyResponseHeader.get(HeaderFramework.LOCATION); - if (location.startsWith("http")) { - location = request.getServletPath() + "?url=" + location; - } else { - location = request.getServletPath() + "?url=http://" + hostwithport + "/" + location; - } - response.addHeader(HeaderFramework.LOCATION, location); - } + if (proxyResponseHeader.containsKey(HeaderFramework.LOCATION)) { + // rewrite location header + String location = proxyResponseHeader.get(HeaderFramework.LOCATION); + if (location.startsWith("http")) { + location = request.getServletPath() + "?url=" + location; + } else { + location = request.getServletPath() + "?url=http://" + hostwithport + "/" + location; + } + response.addHeader(HeaderFramework.LOCATION, location); + } - final int httpStatus = proxyResponseHeader.getStatusCode(); - final String mimeType = proxyResponseHeader.getContentType(); - response.setStatus(httpStatus); - response.setContentType(mimeType); - - if ((httpStatus < HttpServletResponse.SC_BAD_REQUEST) && (mimeType != null) && mimeType.startsWith("text")) { - if (proxyResponseHeader.containsKey(HeaderFramework.TRANSFER_ENCODING) && proxyResponseHeader.get(HeaderFramework.TRANSFER_ENCODING).contains("chunked")) { - proxyout = new ChunkedInputStream(proxyout); - } + final int httpStatus = proxyResponseHeader.getStatusCode(); + final String mimeType = proxyResponseHeader.getContentType(); + response.setStatus(httpStatus); + response.setContentType(mimeType); + + if ((httpStatus < HttpServletResponse.SC_BAD_REQUEST) && (mimeType != null) && mimeType.startsWith("text")) { + if (proxyResponseHeader.containsKey(HeaderFramework.TRANSFER_ENCODING) && proxyResponseHeader.get(HeaderFramework.TRANSFER_ENCODING).contains("chunked")) { + proxyout = new ChunkedInputStream(proxyout); + } - // 7 - modify target content - final String servletstub = request.getScheme() + "://" + request.getServerName() + ":" + request.getServerPort() + request.getServletPath() + "?url="; - Document doc; - try { - doc = Jsoup.parse(proxyout, UTF8.charset.name(), proxyurl.toString()); - } catch (IOException eio) { - response.sendError(HttpServletResponse.SC_INTERNAL_SERVER_ERROR,"Proxy: parser error on " + proxyurl.toString()); - return; - } + // 7 - modify target content + final String servletstub = request.getScheme() + "://" + request.getServerName() + ":" + request.getServerPort() + request.getServletPath() + "?url="; + Document doc; + try { + doc = Jsoup.parse(proxyout, UTF8.charset.name(), proxyurl.toString()); + } catch (IOException eio) { + response.sendError(HttpServletResponse.SC_INTERNAL_SERVER_ERROR,"Proxy: parser error on " + proxyurl.toString()); + return; + } - Element bde = doc.body(); // start with body element to rewrite href links - // rewrite all href with abs proxy url (must be abs because of head tag - Elements taglist = bde.getElementsByAttribute("href"); - final Switchboard sb = Switchboard.getSwitchboard(); - for (Element e : taglist) { - if (e.tagName().equals("a")) { // get tag - String absurl = e.absUrl("href"); // get href attribut as abs url - if (absurl.startsWith("data:") || absurl.startsWith("#") || absurl.startsWith("mailto:") || absurl.startsWith("javascript:")) { - continue; - } else { - if (sb.getConfig("proxyURL.rewriteURLs", "all").equals("domainlist")) { - try { - if (sb.crawlStacker.urlInAcceptedDomain(new DigestURL(absurl)) != null) { - continue; - } - } catch (MalformedURLException ex) { - ConcurrentLog.fine("PROXY", "ProxyServlet: malformed url for url-rewirte " + absurl); - continue; - } + Element bde = doc.body(); // start with body element to rewrite href links + // rewrite all href with abs proxy url (must be abs because of head tag + Elements taglist = bde.getElementsByAttribute("href"); + final Switchboard sb = Switchboard.getSwitchboard(); + for (Element e : taglist) { + if (e.tagName().equals("a")) { // get tag + String absurl = e.absUrl("href"); // get href attribut as abs url + if (absurl.startsWith("data:") || absurl.startsWith("#") || absurl.startsWith("mailto:") || absurl.startsWith("javascript:")) { + continue; + } + if (sb.getConfig("proxyURL.rewriteURLs", "all").equals("domainlist")) { + try { + if (sb.crawlStacker.urlInAcceptedDomain(new DigestURL(absurl)) != null) { + continue; } - e.attr("href", servletstub + absurl); // rewrite with abs proxy-url + } catch (MalformedURLException ex) { + ConcurrentLog.fine("PROXY", "ProxyServlet: malformed url for url-rewirte " + absurl); + continue; } } + e.attr("href", servletstub + absurl); // rewrite with abs proxy-url } + } - Element hd = doc.head(); - if (hd != null) { - // add a base url if not exist (to make sure relative links point to original) - Elements basetags = hd.getElementsByTag("base"); - if (basetags.isEmpty()) { - Element newbasetag = hd.prependElement("base"); - String basestr = proxyurl.getProtocol() + "://" + hostwithport + proxyurl.getPath(); //+directory; - newbasetag.attr("href", basestr); - } + Element hd = doc.head(); + if (hd != null) { + // add a base url if not exist (to make sure relative links point to original) + Elements basetags = hd.getElementsByTag("base"); + if (basetags.isEmpty()) { + Element newbasetag = hd.prependElement("base"); + String basestr = proxyurl.getProtocol() + "://" + hostwithport + proxyurl.getPath(); //+directory; + newbasetag.attr("href", basestr); } - - // 8 - add interaction elements (e.g. proxy exit button to switch back to original url) - // TODO: use a template file for - if (_stopProxyText != null) { - bde.prepend("
" - + "
"); } - // 9 - deliver to client - byte[] sbb = UTF8.getBytes(doc.toString()); + // 8 - add interaction elements (e.g. proxy exit button to switch back to original url) + // TODO: use a template file for + if (_stopProxyText != null) { + bde.prepend("
" + + "
"); + } - // add some proxy-headers to response header - if (proxyResponseHeader.containsKey(HeaderFramework.SERVER)) { - response.setHeader(HeaderFramework.SERVER, proxyResponseHeader.get(HeaderFramework.SERVER)); - } - if (proxyResponseHeader.containsKey(HeaderFramework.DATE)) { - response.setHeader(HeaderFramework.DATE, proxyResponseHeader.get(HeaderFramework.DATE)); - } - if (proxyResponseHeader.containsKey(HeaderFramework.LAST_MODIFIED)) { - response.setHeader(HeaderFramework.LAST_MODIFIED, proxyResponseHeader.get(HeaderFramework.LAST_MODIFIED)); - } - if (proxyResponseHeader.containsKey(HeaderFramework.EXPIRES)) { - response.setHeader(HeaderFramework.EXPIRES, proxyResponseHeader.get(HeaderFramework.EXPIRES)); - } + // 9 - deliver to client + byte[] sbb = UTF8.getBytes(doc.toString()); + + // add some proxy-headers to response header + if (proxyResponseHeader.containsKey(HeaderFramework.SERVER)) { + response.setHeader(HeaderFramework.SERVER, proxyResponseHeader.get(HeaderFramework.SERVER)); + } + if (proxyResponseHeader.containsKey(HeaderFramework.DATE)) { + response.setHeader(HeaderFramework.DATE, proxyResponseHeader.get(HeaderFramework.DATE)); + } + if (proxyResponseHeader.containsKey(HeaderFramework.LAST_MODIFIED)) { + response.setHeader(HeaderFramework.LAST_MODIFIED, proxyResponseHeader.get(HeaderFramework.LAST_MODIFIED)); + } + if (proxyResponseHeader.containsKey(HeaderFramework.EXPIRES)) { + response.setHeader(HeaderFramework.EXPIRES, proxyResponseHeader.get(HeaderFramework.EXPIRES)); + } - response.setIntHeader(HeaderFramework.CONTENT_LENGTH, sbb.length); - response.getOutputStream().write(sbb); + response.setIntHeader(HeaderFramework.CONTENT_LENGTH, sbb.length); + response.getOutputStream().write(sbb); - } else { - if (httpStatus >= HttpServletResponse.SC_BAD_REQUEST) { - response.sendError(httpStatus,"Site " + proxyurl + " returned with status"); - return; - } - if ((response.getHeader(HeaderFramework.CONTENT_LENGTH) == null) && prop.containsKey(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_SIZE)) { - response.setHeader(HeaderFramework.CONTENT_LENGTH, (String) prop.get(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_SIZE)); - } - FileUtils.copy(proxyout, response.getOutputStream()); + } else { + if (httpStatus >= HttpServletResponse.SC_BAD_REQUEST) { + response.sendError(httpStatus,"Site " + proxyurl + " returned with status"); + return; + } + if ((response.getHeader(HeaderFramework.CONTENT_LENGTH) == null) && prop.containsKey(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_SIZE)) { + response.setHeader(HeaderFramework.CONTENT_LENGTH, (String) prop.get(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_SIZE)); } + FileUtils.copy(proxyout, response.getOutputStream()); } } diff --git a/source/net/yacy/http/servlets/YaCyProxyServlet.java b/source/net/yacy/http/servlets/YaCyProxyServlet.java index de64b8f191..a2cd87283d 100644 --- a/source/net/yacy/http/servlets/YaCyProxyServlet.java +++ b/source/net/yacy/http/servlets/YaCyProxyServlet.java @@ -81,207 +81,205 @@ public void service (ServletRequest req, ServletResponse res) throws ServletExce if ("CONNECT".equalsIgnoreCase(request.getMethod())) { return; - } else { - - final Continuation continuation = ContinuationSupport.getContinuation(request); + } + final Continuation continuation = ContinuationSupport.getContinuation(request); - if (!continuation.isInitial()) { - response.sendError(HttpServletResponse.SC_GATEWAY_TIMEOUT); // Need better test that isInitial - return; - } - URL proxyurl = null; - String strARGS = request.getQueryString(); - if (strARGS == null) { - response.sendError(HttpServletResponse.SC_NOT_FOUND,"url parameter missing"); - return; - } + if (!continuation.isInitial()) { + response.sendError(HttpServletResponse.SC_GATEWAY_TIMEOUT); // Need better test that isInitial + return; + } + URL proxyurl = null; + String strARGS = request.getQueryString(); + if (strARGS == null) { + response.sendError(HttpServletResponse.SC_NOT_FOUND,"url parameter missing"); + return; + } - if (strARGS.startsWith("url=")) { - final String strUrl = strARGS.substring(4); // strip "url=" + if (strARGS.startsWith("url=")) { + final String strUrl = strARGS.substring(4); // strip "url=" - try { - proxyurl = new URL(strUrl); - } catch (final MalformedURLException e) { - proxyurl = new URL(URLDecoder.decode(strUrl, UTF8.charset.name())); + try { + proxyurl = new URL(strUrl); + } catch (final MalformedURLException e) { + proxyurl = new URL(URLDecoder.decode(strUrl, UTF8.charset.name())); - } - } - if (proxyurl == null) { - response.sendError(HttpServletResponse.SC_NOT_FOUND,"url parameter missing"); - return; } + } + if (proxyurl == null) { + response.sendError(HttpServletResponse.SC_NOT_FOUND,"url parameter missing"); + return; + } - String hostwithport = proxyurl.getHost(); - if (proxyurl.getPort() != -1) { - hostwithport += ":" + proxyurl.getPort(); - } - RequestHeader yacyRequestHeader = ProxyHandler.convertHeaderFromJetty(request); - yacyRequestHeader.remove(RequestHeader.KEEP_ALIVE); - yacyRequestHeader.remove(HeaderFramework.CONTENT_LENGTH); - - final HashMap prop = new HashMap(); - prop.put(HeaderFramework.CONNECTION_PROP_HTTP_VER, HeaderFramework.HTTP_VERSION_1_1); - prop.put(HeaderFramework.CONNECTION_PROP_HOST, hostwithport); - prop.put(HeaderFramework.CONNECTION_PROP_PATH, proxyurl.getPath().replaceAll(" ", "%20")); - prop.put(HeaderFramework.CONNECTION_PROP_CLIENTIP, Domains.LOCALHOST); + String hostwithport = proxyurl.getHost(); + if (proxyurl.getPort() != -1) { + hostwithport += ":" + proxyurl.getPort(); + } + RequestHeader yacyRequestHeader = ProxyHandler.convertHeaderFromJetty(request); + yacyRequestHeader.remove(RequestHeader.KEEP_ALIVE); + yacyRequestHeader.remove(HeaderFramework.CONTENT_LENGTH); + + final HashMap prop = new HashMap(); + prop.put(HeaderFramework.CONNECTION_PROP_HTTP_VER, HeaderFramework.HTTP_VERSION_1_1); + prop.put(HeaderFramework.CONNECTION_PROP_HOST, hostwithport); + prop.put(HeaderFramework.CONNECTION_PROP_PATH, proxyurl.getPath().replaceAll(" ", "%20")); + prop.put(HeaderFramework.CONNECTION_PROP_CLIENTIP, Domains.LOCALHOST); - yacyRequestHeader.put(HeaderFramework.HOST, hostwithport ); - yacyRequestHeader.put(HeaderFramework.CONNECTION_PROP_PATH, proxyurl.getPath()); + yacyRequestHeader.put(HeaderFramework.HOST, hostwithport ); + yacyRequestHeader.put(HeaderFramework.CONNECTION_PROP_PATH, proxyurl.getPath()); - final ByteArrayOutputStream tmpproxyout = new ByteArrayOutputStream(); - HTTPDProxyHandler.doGet(prop, yacyRequestHeader, tmpproxyout, ClientIdentification.yacyProxyAgent); - - // reparse header to extract content-length and mimetype - final ResponseHeader proxyResponseHeader = new ResponseHeader(200); // - final InputStream proxyout = new ByteArrayInputStream(tmpproxyout.toByteArray()); - String line = readLine(proxyout); - while (line != null && !line.equals("")) { - int p; - if ((p = line.indexOf(':')) >= 0) { - // store a property - proxyResponseHeader.add(line.substring(0, p).trim(), line.substring(p + 1).trim()); - } - line = readLine(proxyout); - } - if (line == null) { - response.sendError(HttpServletResponse.SC_INTERNAL_SERVER_ERROR,"Proxy Header missing"); - return; + final ByteArrayOutputStream tmpproxyout = new ByteArrayOutputStream(); + HTTPDProxyHandler.doGet(prop, yacyRequestHeader, tmpproxyout, ClientIdentification.yacyProxyAgent); + + // reparse header to extract content-length and mimetype + final ResponseHeader proxyResponseHeader = new ResponseHeader(200); // + final InputStream proxyout = new ByteArrayInputStream(tmpproxyout.toByteArray()); + String line = readLine(proxyout); + while (line != null && !line.equals("")) { + int p; + if ((p = line.indexOf(':')) >= 0) { + // store a property + proxyResponseHeader.add(line.substring(0, p).trim(), line.substring(p + 1).trim()); } + line = readLine(proxyout); + } + if (line == null) { + response.sendError(HttpServletResponse.SC_INTERNAL_SERVER_ERROR,"Proxy Header missing"); + return; + } - final int httpStatus = Integer.parseInt((String) prop.get(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_STATUS)); + final int httpStatus = Integer.parseInt((String) prop.get(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_STATUS)); - String directory = ""; - if (proxyurl.getPath().lastIndexOf('/') > 0) { - directory = proxyurl.getPath().substring(0, proxyurl.getPath().lastIndexOf('/')); - } - - if (response.getHeader(HeaderFramework.LOCATION) != null) { - // rewrite location header - String location = response.getHeader(HeaderFramework.LOCATION); - if (location.startsWith("http")) { - location = request.getServletPath() + "?url=" + location; - } else { - location = request.getServletPath() + "?url=http://" + hostwithport + "/" + location; - } - response.addHeader(HeaderFramework.LOCATION, location); + String directory = ""; + if (proxyurl.getPath().lastIndexOf('/') > 0) { + directory = proxyurl.getPath().substring(0, proxyurl.getPath().lastIndexOf('/')); + } + + if (response.getHeader(HeaderFramework.LOCATION) != null) { + // rewrite location header + String location = response.getHeader(HeaderFramework.LOCATION); + if (location.startsWith("http")) { + location = request.getServletPath() + "?url=" + location; + } else { + location = request.getServletPath() + "?url=http://" + hostwithport + "/" + location; } + response.addHeader(HeaderFramework.LOCATION, location); + } - final String mimeType = proxyResponseHeader.getContentType(); - response.setContentType(mimeType); - response.setStatus(httpStatus); - - if ((mimeType != null) && (mimeType.startsWith("text/html") || mimeType.startsWith("text"))) { - final StringWriter buffer = new StringWriter(); - - if (proxyResponseHeader.containsKey(HeaderFramework.TRANSFER_ENCODING) && proxyResponseHeader.get(HeaderFramework.TRANSFER_ENCODING).contains("chunked")) { - FileUtils.copy(new ChunkedInputStream(proxyout), buffer, UTF8.charset); - } else { - FileUtils.copy(proxyout, buffer, UTF8.charset); - } - final String sbuffer = buffer.toString(); - - final Pattern p = Pattern.compile("(href=\"|src=\")([^\"]+)|(href='|src=')([^']+)|(url\\(')([^']+)|(url\\(\")([^\"]+)|(url\\()([^\\)]+)"); - final Matcher m = p.matcher(sbuffer); - final StringBuffer result = new StringBuffer(80); - final Switchboard sb = Switchboard.getSwitchboard(); - final String servletstub = request.getServletPath()+"?url="; - while (m.find()) { - String init = null; - if (m.group(1) != null) { init = m.group(1); } - if (m.group(3) != null) { init = m.group(3); } - if (m.group(5) != null) { init = m.group(5); } - if (m.group(7) != null) { init = m.group(7); } - if (m.group(9) != null) { init = m.group(9); } - String url = null; - if (m.group(2) != null) { url = m.group(2); } - if (m.group(4) != null) { url = m.group(4); } - if (m.group(6) != null) { url = m.group(6); } - if (m.group(8) != null) { url = m.group(8); } - if (m.group(10) != null) { url = m.group(10); } - if (url.startsWith("data:") || url.startsWith("#") || url.startsWith("mailto:") || url.startsWith("javascript:")) { - String newurl = init + url; - newurl = newurl.replaceAll("\\$", "\\\\\\$"); - m.appendReplacement(result, newurl); + final String mimeType = proxyResponseHeader.getContentType(); + response.setContentType(mimeType); + response.setStatus(httpStatus); + + if ((mimeType != null) && (mimeType.startsWith("text/html") || mimeType.startsWith("text"))) { + final StringWriter buffer = new StringWriter(); - } else if (url.startsWith("http")) { - // absoulte url of form href="http://domain.com/path" - if (sb.getConfig("proxyURL.rewriteURLs", "all").equals("domainlist")) { - try { - if (sb.crawlStacker.urlInAcceptedDomain(new DigestURL(url)) != null) { - continue; - } - } catch (final MalformedURLException e) { - ConcurrentLog.fine("PROXY","ProxyServlet: malformed url for url-rewirte " + url); + if (proxyResponseHeader.containsKey(HeaderFramework.TRANSFER_ENCODING) && proxyResponseHeader.get(HeaderFramework.TRANSFER_ENCODING).contains("chunked")) { + FileUtils.copy(new ChunkedInputStream(proxyout), buffer, UTF8.charset); + } else { + FileUtils.copy(proxyout, buffer, UTF8.charset); + } + final String sbuffer = buffer.toString(); + + final Pattern p = Pattern.compile("(href=\"|src=\")([^\"]+)|(href='|src=')([^']+)|(url\\(')([^']+)|(url\\(\")([^\"]+)|(url\\()([^\\)]+)"); + final Matcher m = p.matcher(sbuffer); + final StringBuffer result = new StringBuffer(80); + final Switchboard sb = Switchboard.getSwitchboard(); + final String servletstub = request.getServletPath()+"?url="; + while (m.find()) { + String init = null; + if (m.group(1) != null) { init = m.group(1); } + if (m.group(3) != null) { init = m.group(3); } + if (m.group(5) != null) { init = m.group(5); } + if (m.group(7) != null) { init = m.group(7); } + if (m.group(9) != null) { init = m.group(9); } + String url = null; + if (m.group(2) != null) { url = m.group(2); } + if (m.group(4) != null) { url = m.group(4); } + if (m.group(6) != null) { url = m.group(6); } + if (m.group(8) != null) { url = m.group(8); } + if (m.group(10) != null) { url = m.group(10); } + if (url.startsWith("data:") || url.startsWith("#") || url.startsWith("mailto:") || url.startsWith("javascript:")) { + String newurl = init + url; + newurl = newurl.replaceAll("\\$", "\\\\\\$"); + m.appendReplacement(result, newurl); + + } else if (url.startsWith("http")) { + // absoulte url of form href="http://domain.com/path" + if (sb.getConfig("proxyURL.rewriteURLs", "all").equals("domainlist")) { + try { + if (sb.crawlStacker.urlInAcceptedDomain(new DigestURL(url)) != null) { continue; } + } catch (final MalformedURLException e) { + ConcurrentLog.fine("PROXY","ProxyServlet: malformed url for url-rewirte " + url); + continue; } + } - String newurl = init + servletstub + url; - newurl = newurl.replaceAll("\\$", "\\\\\\$"); - m.appendReplacement(result, newurl); + String newurl = init + servletstub + url; + newurl = newurl.replaceAll("\\$", "\\\\\\$"); + m.appendReplacement(result, newurl); - } else if (url.startsWith("//")) { - // absoulte url but same protocol of form href="//domain.com/path" - final String complete_url = proxyurl.getProtocol() + ":" + url; - if (sb.getConfig("proxyURL.rewriteURLs", "all").equals("domainlist")) { - try { - if (sb.crawlStacker.urlInAcceptedDomain(new DigestURL(complete_url)) != null) { - continue; - } - } catch (MalformedURLException ex) { - ConcurrentLog.fine("PROXY","ProxyServlet: malformed url for url-rewirte " + complete_url); - continue; - } + } else if (url.startsWith("//")) { + // absoulte url but same protocol of form href="//domain.com/path" + final String complete_url = proxyurl.getProtocol() + ":" + url; + if (sb.getConfig("proxyURL.rewriteURLs", "all").equals("domainlist")) { + try { + if (sb.crawlStacker.urlInAcceptedDomain(new DigestURL(complete_url)) != null) { + continue; + } + } catch (MalformedURLException ex) { + ConcurrentLog.fine("PROXY","ProxyServlet: malformed url for url-rewirte " + complete_url); + continue; } + } - String newurl = init + servletstub + complete_url; - newurl = newurl.replaceAll("\\$", "\\\\\\$"); - m.appendReplacement(result, newurl); + String newurl = init + servletstub + complete_url; + newurl = newurl.replaceAll("\\$", "\\\\\\$"); + m.appendReplacement(result, newurl); + + } else if (url.startsWith("/")) { + // absolute path of form href="/absolute/path/to/linked/page" + String newurl = init + servletstub + "http://" + hostwithport + url; + newurl = newurl.replaceAll("\\$", "\\\\\\$"); + m.appendReplacement(result, newurl); - } else if (url.startsWith("/")) { - // absolute path of form href="/absolute/path/to/linked/page" - String newurl = init + servletstub + "http://" + hostwithport + url; + } else { + // relative path of form href="relative/path" + try { + MultiProtocolURL target = new MultiProtocolURL("http://" + hostwithport + directory + "/" + url); + String newurl = init + servletstub + target.toString(); newurl = newurl.replaceAll("\\$", "\\\\\\$"); m.appendReplacement(result, newurl); - - } else { - // relative path of form href="relative/path" - try { - MultiProtocolURL target = new MultiProtocolURL("http://" + hostwithport + directory + "/" + url); - String newurl = init + servletstub + target.toString(); - newurl = newurl.replaceAll("\\$", "\\\\\\$"); - m.appendReplacement(result, newurl); - } catch (final MalformedURLException e) {} - } + } catch (final MalformedURLException e) {} } - m.appendTail(result); - - byte[] sbb = UTF8.getBytes(result.toString()); + } + m.appendTail(result); + + byte[] sbb = UTF8.getBytes(result.toString()); - // add some proxy-headers to response header - response.setContentType(proxyResponseHeader.getContentType()); - if (proxyResponseHeader.containsKey(HeaderFramework.SERVER)) { - response.addHeader(HeaderFramework.SERVER, proxyResponseHeader.get(HeaderFramework.SERVER)); - } - if (proxyResponseHeader.containsKey(HeaderFramework.DATE)) { - response.addHeader(HeaderFramework.DATE, proxyResponseHeader.get(HeaderFramework.DATE)); - } - if (proxyResponseHeader.containsKey(HeaderFramework.LAST_MODIFIED)) { - response.addHeader(HeaderFramework.LAST_MODIFIED, proxyResponseHeader.get(HeaderFramework.LAST_MODIFIED)); - } - if (proxyResponseHeader.containsKey(HeaderFramework.EXPIRES)) { - response.addHeader(HeaderFramework.EXPIRES, proxyResponseHeader.get(HeaderFramework.EXPIRES)); - } - - response.setIntHeader(HeaderFramework.CONTENT_LENGTH, sbb.length); - response.getOutputStream().write(sbb); - - } else { - if ((response.getHeader(HeaderFramework.CONTENT_LENGTH) == null) && prop.containsKey(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_SIZE)) { - response.addHeader(HeaderFramework.CONTENT_LENGTH, (String) prop.get(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_SIZE)); - } - FileUtils.copy(proxyout, response.getOutputStream()); + // add some proxy-headers to response header + response.setContentType(proxyResponseHeader.getContentType()); + if (proxyResponseHeader.containsKey(HeaderFramework.SERVER)) { + response.addHeader(HeaderFramework.SERVER, proxyResponseHeader.get(HeaderFramework.SERVER)); } + if (proxyResponseHeader.containsKey(HeaderFramework.DATE)) { + response.addHeader(HeaderFramework.DATE, proxyResponseHeader.get(HeaderFramework.DATE)); + } + if (proxyResponseHeader.containsKey(HeaderFramework.LAST_MODIFIED)) { + response.addHeader(HeaderFramework.LAST_MODIFIED, proxyResponseHeader.get(HeaderFramework.LAST_MODIFIED)); + } + if (proxyResponseHeader.containsKey(HeaderFramework.EXPIRES)) { + response.addHeader(HeaderFramework.EXPIRES, proxyResponseHeader.get(HeaderFramework.EXPIRES)); + } + + response.setIntHeader(HeaderFramework.CONTENT_LENGTH, sbb.length); + response.getOutputStream().write(sbb); + + } else { + if ((response.getHeader(HeaderFramework.CONTENT_LENGTH) == null) && prop.containsKey(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_SIZE)) { + response.addHeader(HeaderFramework.CONTENT_LENGTH, (String) prop.get(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_SIZE)); + } + FileUtils.copy(proxyout, response.getOutputStream()); } }