Skip to content

Commit

Permalink
[util] Fix parsing of URI with percent-encoded spaces
Browse files (browse the repository at this point in the history)
  • Loading branch information
valfirst committed May 31, 2024
1 parent a38937f commit 108f14a
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 16 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,8 @@ class HeadlessCrawlerTableTransformerTests

private static final String CRAWLING_RELATIVE_URL = "/page";

private static final String OUTGOING_ABSOLUT_URL = "http://some.url/path";
private static final String OUTGOING_RELATIVE_URL = "/path";
private static final String OUTGOING_ABSOLUTE_URL = MAIN_APP_PAGE + OUTGOING_RELATIVE_URL;

private static final String EXCLUDE_EXTENSIONS_REGEX = "js|css";
private static final String EXCLUDE_URLS_REGEX = ".*broken-link*";
Expand Down Expand Up @@ -129,7 +130,7 @@ void testFetchUrlsSuccessfully(String mainAppPageRelativeUrl, Set<String> seedRe
transformer.setExcludeUrlsRegex(EXCLUDE_URLS_REGEX);
transformer.setMainPageUrlProperty(MAIN_APP_PROP);
Set<String> urls = testFetchUrls(mainAppPageRelativeUrl, expectedSeedRelativeUrls);
assertThat(urls, equalTo(Set.of(OUTGOING_ABSOLUT_URL)));
assertThat(urls, equalTo(Set.of(OUTGOING_ABSOLUTE_URL)));
verifyNoInteractions(redirectsProvider);
assertThat(logger.getLoggingEvents(), is(List.of(getMainAppPageWarn())));
}
Expand All @@ -153,7 +154,7 @@ void shouldFilterUrlsWhenLastRedirectUrlAlreadyInTheSet() throws IOException, In
transformer.setFilterRedirects(true);
transformer.setSeedRelativeUrls(toSet(PATH2, PATH3));
transformer.setMainPageUrlProperty(MAIN_APP_PROP);
URI outgoingURI = URI.create(OUTGOING_ABSOLUT_URL);
URI outgoingURI = URI.create(OUTGOING_ABSOLUTE_URL);
when(redirectsProvider.getRedirects(outgoingURI)).thenReturn(List.of(outgoingURI));
Set<String> urls = testFetchUrls(ROOT, asList(PATH2, SLASH_PATH3));
assertThat(urls, equalTo(Set.of()));
Expand All @@ -167,11 +168,11 @@ void shouldTreatInvalidStatusCodeAsNoRedirects() throws IOException, Interrupted
transformer.setFilterRedirects(true);
transformer.setSeedRelativeUrls(toSet(PATH2, PATH3));
transformer.setMainPageUrlProperty(MAIN_APP_PROP);
URI outgoingURI = URI.create(OUTGOING_ABSOLUT_URL);
URI outgoingURI = URI.create(OUTGOING_ABSOLUTE_URL);
var httpResponseException = new HttpResponseException(HttpStatus.SC_NOT_FOUND, "");
when(redirectsProvider.getRedirects(outgoingURI)).thenThrow(httpResponseException);
Set<String> urls = testFetchUrls(ROOT, List.of(PATH2, SLASH_PATH3));
assertThat(urls, equalTo(Set.of(OUTGOING_ABSOLUT_URL)));
assertThat(urls, equalTo(Set.of(OUTGOING_ABSOLUTE_URL)));
assertThat(logger.getLoggingEvents(), is(List.of(getMainAppPageWarn(), warn(httpResponseException,
"Exception during redirects receiving"))));
}
Expand All @@ -182,10 +183,10 @@ void shouldNotFilterUrlsWhenLastRedirectUrlNotInTheSet() throws IOException, Int
transformer.setFilterRedirects(true);
transformer.setSeedRelativeUrls(toSet(PATH2, PATH3));
transformer.setMainPageUrlProperty(MAIN_APP_PROP);
URI outgoingURI = URI.create(OUTGOING_ABSOLUT_URL);
URI outgoingURI = URI.create(OUTGOING_ABSOLUTE_URL);
when(redirectsProvider.getRedirects(outgoingURI)).thenReturn(List.of(URI.create("http://some.url/other")));
Set<String> urls = testFetchUrls(ROOT, asList(PATH2, SLASH_PATH3));
assertThat(urls, equalTo(Set.of(OUTGOING_ABSOLUT_URL)));
assertThat(urls, equalTo(Set.of(OUTGOING_ABSOLUTE_URL)));
assertThat(logger.getLoggingEvents(), is(List.of(getMainAppPageWarn(),
info(REDIRECT_FILTER_LOG, System.lineSeparator(), "http://some.url/path -> http://some.url/other"))));
}
Expand All @@ -195,7 +196,7 @@ void testFetchUrlsTwice() throws IOException, InterruptedException
{
transformer.setSeedRelativeUrls(toSet(SEED));
Set<String> urls = testFetchUrls(DEFAULT_RELATIVE_URL, List.of(SEED));
assertThat(urls, equalTo(Set.of(OUTGOING_ABSOLUT_URL)));
assertThat(urls, equalTo(Set.of(OUTGOING_ABSOLUTE_URL)));
TableProperties tableProperties = buildTableProperties();
Set<String> urls2 = transformer.fetchUrls(tableProperties);
verifyNoMoreInteractions(crawlControllerFactory);
Expand All @@ -214,10 +215,10 @@ void testFetchUrlsTwiceWithSameProperties() throws IOException, InterruptedExcep
transformer.setSeedRelativeUrls(toSet(seedRelativeUrlsProperty));
Set<String> urls = runUrlFetching(mainAppPage, tableProperties,
List.of(seedRelativeUrlsProperty), crawlController, ordered);
assertThat(urls, equalTo(Set.of(OUTGOING_ABSOLUT_URL)));
assertThat(urls, equalTo(Set.of(OUTGOING_ABSOLUTE_URL)));
Set<String> urls2 = transformer.fetchUrls(tableProperties);
verifyNoMoreInteractions(crawlControllerFactory, crawlController);
assertThat(urls2, equalTo(Set.of(OUTGOING_ABSOLUT_URL)));
assertThat(urls2, equalTo(Set.of(OUTGOING_ABSOLUTE_URL)));
assertSame(urls, urls2);
verifyNoInteractions(redirectsProvider);
}
Expand All @@ -228,7 +229,7 @@ void testFetchUrlsWhenSeedRelativeUrlsAreSetViaConfiguration() throws IOExceptio
String seedRelativeUrl = "/fromConfig";
transformer.setSeedRelativeUrls(Set.of(seedRelativeUrl));
Set<String> urls = testFetchUrls(DEFAULT_RELATIVE_URL, List.of(seedRelativeUrl));
assertThat(urls, equalTo(Set.of(OUTGOING_ABSOLUT_URL)));
assertThat(urls, equalTo(Set.of(OUTGOING_ABSOLUTE_URL)));
verifyNoInteractions(redirectsProvider);
}

Expand Down Expand Up @@ -261,7 +262,7 @@ private Set<String> runUrlFetching(String mainAppPage, TableProperties tableProp
{
LinkCrawler linkCrawler = ((LinkCrawlerFactory) factory).newInstance();
HtmlParseData htmlParseData = new HtmlParseData();
String outgoingUrl = UriUtils.buildNewUrl(mainAppPage, OUTGOING_ABSOLUT_URL).toString();
String outgoingUrl = UriUtils.buildNewUrl(mainAppPage, OUTGOING_RELATIVE_URL).toString();
htmlParseData.setOutgoingUrls(Set.of(createWebUrl(outgoingUrl)));
String crawlingPageUrl = UriUtils.buildNewUrl(mainAppPage, CRAWLING_RELATIVE_URL).toString();
WebURL crawlingPageWebUrl = createWebUrl(crawlingPageUrl);
Expand Down
11 changes: 7 additions & 4 deletions vividus-util/src/main/java/org/vividus/util/UriUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -317,15 +317,18 @@ public static URI buildNewUrl(URI url, String relativeUrl)
return new URI(url.getScheme(), url.getSchemeSpecificPart(), decodedFragment);
}

URI parsedRelativeUrl = new URI(removeFragment(decodeUrl(normalizedRelativeUrl), decodedFragment));
String path = StringUtils.repeat(SLASH, indexOfFirstNonSlashChar - 1) + parsedRelativeUrl.getRawPath();
String parsedRelativeUrl = removeFragment(decodeUrl(normalizedRelativeUrl), decodedFragment);
String[] parts = StringUtils.split(parsedRelativeUrl, "?", 2);
String rawPath = parts.length > 0 ? parts[0] : "";
String query = parts.length > 1 ? parts[1] : null;
String path = StringUtils.repeat(SLASH, indexOfFirstNonSlashChar - 1) + rawPath;
if (!path.isEmpty() && path.charAt(0) != '/')
{
throw new IllegalArgumentException(String
.format("Relative path '%s' for '%s' should start with forward slash ('/')", path, url));
}
String uriAsString = createUriAsString(url.getScheme(), url.getRawAuthority(), path,
parsedRelativeUrl.getQuery(), decodedFragment);
String uriAsString = createUriAsString(url.getScheme(), url.getRawAuthority(), path, query,
decodedFragment);

return buildUrl(uriAsString, decodedFragment);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,7 @@ void testCreateUriWithException()
"https://test:pas%40dsad@host.com, '', https://test:pas%40dsad@host.com",
"http://localhost:4200, /m/cool#%E7%94%A2%E5%93%81%E6%A6%82%E8%A6%BD%20overview, http://localhost:4200/m/cool#%E7%94%A2%E5%93%81%E6%A6%82%E8%A6%BD%20overview",
"http://localhost:4200, /m/cool#產品概覽 overview, http://localhost:4200/m/cool#%E7%94%A2%E5%93%81%E6%A6%82%E8%A6%BD%20overview",
"https://somehost.il/, /%D7%92'%D7%95%D7%A0%D7%A1%D7%95%D7%A0%D7%A1-%D7%98%D7%99%D7%A4%D7%95%D7%AA-%D7%A9%D7%9C-%D7%91%D7%A8%D7%A7%20%D7%AA%D7%A8%D7%A1%D7%99%D7%A1-%D7%9E%D7%A8%D7%9B%D7%9A-%D7%A9%D7%99%D7%A2%D7%A8-%D7%9C%D7%99%D7%9C%D7%93%D7%99%D7%9D, https://somehost.il/%D7%92'%D7%95%D7%A0%D7%A1%D7%95%D7%A0%D7%A1-%D7%98%D7%99%D7%A4%D7%95%D7%AA-%D7%A9%D7%9C-%D7%91%D7%A8%D7%A7%20%D7%AA%D7%A8%D7%A1%D7%99%D7%A1-%D7%9E%D7%A8%D7%9B%D7%9A-%D7%A9%D7%99%D7%A2%D7%A8-%D7%9C%D7%99%D7%9C%D7%93%D7%99%D7%9D"
// CHECKSTYLE:ON
})
void testBuildNewUri(String baseUrl, String relativeUrl, String expectedUrl)
Expand Down

0 comments on commit 108f14a

Please sign in to comment.