Skip to content

Commit

Permalink
Encode URLs before fetch. Fixes OpenRefine#6140
Browse files Browse the repository at this point in the history
  • Loading branch information
tfmorris committed Nov 8, 2023
1 parent ac8b9fb commit 726f07b
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 15 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -37,12 +37,17 @@ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT

import java.io.IOException;
import java.io.Serializable;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import java.util.concurrent.TimeUnit;

import com.google.refine.operations.OperationDescription;
import org.apache.hc.client5.http.classic.methods.HttpGet;
import org.apache.hc.core5.http.Header;
import org.apache.hc.core5.http.message.BasicHeader;

Expand Down Expand Up @@ -322,14 +327,15 @@ Serializable cachedFetch(String urlString) {
}

Serializable fetch(String urlString, Header[] headers) {
try { // HttpClients.createDefault()) {
try {
return _httpClient.getAsString(urlString, headers);
} catch (IOException e) {
return _onError == OnError.StoreError ? new EvalError(e) : null;
}
} catch (Exception e) {
return _onError == OnError.StoreError ? new EvalError(e.getMessage()) : null;
try {
// Do a little dance to get all the pieces of the URL correctly encoded, if possible
// TODO: Should we try to undo any existing percent encoding, so we don't end up double encoded?
URL url = new URL(urlString);
URI uri = new URI(url.getProtocol(), url.getUserInfo(), url.getHost(), url.getPort(), url.getPath(),
url.getQuery(), url.getRef());
return _httpClient.getAsString(uri.toASCIIString(), headers);
} catch (URISyntaxException | IOException e) {
return _onError == OnError.StoreError ? new EvalError(e) : null;
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
import java.util.Random;
import java.util.concurrent.TimeUnit;

import com.google.common.net.UrlEscapers;
import org.slf4j.LoggerFactory;
import org.testng.Assert;
import org.testng.annotations.BeforeMethod;
Expand Down Expand Up @@ -168,7 +169,7 @@ public void testUrlCaching() throws Exception {
null);

// We have 100 rows and 500 ms per row but only two distinct
// values so we should not wait much more than ~1000 ms to get the
// values, so we should not wait much more than ~1000 ms to get the
// results.
runAndWait(op, 1500);

Expand All @@ -190,19 +191,24 @@ public void testUrlCaching() throws Exception {
public void testInvalidUrl() throws Exception {
try (MockWebServer server = new MockWebServer()) {
server.start();
HttpUrl url = server.url("/random");
server.enqueue(new MockResponse());
server.enqueue(new MockResponse().setBody("random string"));
server.enqueue(new MockResponse().setBody("Ahmed"));

Row row0 = new Row(2);
row0.setCell(0, new Cell("auinrestrsc", null)); // malformed -> error
project.rows.add(row0);
Row row1 = new Row(2);
row1.setCell(0, new Cell(url.toString(), null)); // fine
row1.setCell(0, new Cell(server.url("/random").toString(), null)); // fine
project.rows.add(row1);
Row row2 = new Row(2);
// well-formed but not resolvable.
row2.setCell(0, new Cell("http://domain.invalid/random", null));
project.rows.add(row2);
Row row3 = new Row(2);
// needs encoding
final String ARABIC = "احمد";
row3.setCell(0, new Cell(server.url("/") + ARABIC, null));
project.rows.add(row3);

EngineDependentOperation op = new ColumnAdditionByFetchingURLsOperation(engine_config,
"fruits",
Expand All @@ -218,9 +224,16 @@ public void testInvalidUrl() throws Exception {

int newCol = project.columnModel.getColumnByName("junk").getCellIndex();
// Inspect rows
Assert.assertTrue(project.rows.get(0).getCellValue(newCol) instanceof EvalError);
Assert.assertNotNull(project.rows.get(1).getCellValue(newCol));
Assert.assertTrue(ExpressionUtils.isError(project.rows.get(2).getCellValue(newCol)));
assertTrue(project.rows.get(0).getCellValue(newCol) instanceof EvalError);
assertEquals(project.rows.get(1).getCellValue(newCol), "random string");
assertTrue(ExpressionUtils.isError(project.rows.get(2).getCellValue(newCol)));
assertEquals(project.rows.get(3).getCellValue(newCol), "Ahmed");

RecordedRequest request1 = server.takeRequest();
assertEquals(request1.getPath(), "/random");
RecordedRequest request2 = server.takeRequest();
// verify that path was correctly escaped before request was sent
assertEquals(request2.getPath().substring(1), UrlEscapers.urlPathSegmentEscaper().escape(ARABIC));
}
}

Expand Down

0 comments on commit 726f07b

Please sign in to comment.