From 0a1ce96bee867b3129e336b5b8620a2eaf8ec514 Mon Sep 17 00:00:00 2001 From: "Chrome Release Bot (LUCI)" Date: Thu, 21 Mar 2024 17:19:53 +0000 Subject: [PATCH] Publish DEPS for 114.0.5735.358 git-subtree-dir: url git-subtree-split: 1759c6ae9316996b9f150c0ce9d0ca78a3d15c02 --- BUILD.gn | 405 +++ DEPS | 18 + DIR_METADATA | 11 + OWNERS | 9 + README.md | 75 + android/OWNERS | 1 + android/gurl_android.cc | 160 + android/gurl_android.h | 36 + android/gurl_java_test_helper.cc | 72 + android/java/src/org/chromium/url/GURL.java | 413 +++ .../src/org/chromium/url/IDNStringUtil.java | 33 + android/java/src/org/chromium/url/Origin.java | 114 + android/java/src/org/chromium/url/Parsed.java | 141 + android/java/src/org/chromium/url/URI.java | 61 + android/javatests/DEPS | 3 + .../src/org/chromium/url/GURLJavaTest.java | 314 ++ .../org/chromium/url/GURLJavaTestHelper.java | 34 + .../org/chromium/url/JUnitTestGURLsTest.java | 73 + .../src/org/chromium/url/OriginJavaTest.java | 99 + .../chromium/url/OriginJavaTestHelper.java | 23 + .../src/org/chromium/url/ShadowGURLTest.java | 70 + android/origin_android.cc | 87 + android/origin_java_test_helper.cc | 37 + android/parsed_android.cc | 96 + android/parsed_android.h | 22 + android/robolectric_test_main.cc | 15 + .../src/org/chromium/url/JUnitTestGURLs.java | 174 ++ .../java/src/org/chromium/url/ShadowGURL.java | 62 + features.gni | 16 + gurl.cc | 578 ++++ gurl.h | 534 ++++ gurl_abstract_tests.h | 119 + gurl_fuzzer.cc | 89 + gurl_fuzzer.dict | 432 +++ gurl_unittest.cc | 1180 +++++++ ipc/BUILD.gn | 38 + ipc/OWNERS | 2 + ipc/url_ipc_export.h | 29 + ipc/url_param_traits.cc | 56 + ipc/url_param_traits.h | 33 + ipc/url_param_traits_unittest.cc | 159 + mojom/BUILD.gn | 141 + mojom/DEPS | 3 + mojom/DIR_METADATA | 11 + mojom/OWNERS | 4 + mojom/origin.mojom | 19 + mojom/origin_mojom_traits.cc | 34 + mojom/origin_mojom_traits.h | 39 + mojom/scheme_host_port.mojom | 13 + mojom/scheme_host_port_mojom_traits.cc | 27 + 
mojom/scheme_host_port_mojom_traits.h | 30 + .../scheme_host_port_mojom_traits_unittest.cc | 36 + mojom/url.mojom | 13 + mojom/url_gurl_mojom_traits.cc | 40 + mojom/url_gurl_mojom_traits.h | 25 + mojom/url_gurl_mojom_traits_unittest.cc | 209 ++ mojom/url_test.mojom | 16 + origin.cc | 482 +++ origin.h | 496 +++ origin_abstract_tests.cc | 104 + origin_abstract_tests.h | 527 ++++ origin_unittest.cc | 777 +++++ run_all_perftests.cc | 14 + run_all_unittests.cc | 27 + scheme_host_port.cc | 278 ++ scheme_host_port.h | 173 ++ scheme_host_port_unittest.cc | 294 ++ third_party/mozilla/LICENSE.txt | 65 + third_party/mozilla/README.chromium | 8 + third_party/mozilla/url_parse.cc | 963 ++++++ third_party/mozilla/url_parse.h | 377 +++ url_canon.cc | 15 + url_canon.h | 1037 +++++++ url_canon_etc.cc | 428 +++ url_canon_filesystemurl.cc | 135 + url_canon_fileurl.cc | 247 ++ url_canon_host.cc | 442 +++ url_canon_icu.cc | 114 + url_canon_icu.h | 41 + url_canon_icu_unittest.cc | 168 + url_canon_internal.cc | 502 +++ url_canon_internal.h | 471 +++ url_canon_internal_file.h | 135 + url_canon_ip.cc | 690 +++++ url_canon_ip.h | 60 + url_canon_mailtourl.cc | 127 + url_canon_path.cc | 474 +++ url_canon_pathurl.cc | 144 + url_canon_query.cc | 149 + url_canon_relative.cc | 623 ++++ url_canon_stdstring.cc | 30 + url_canon_stdstring.h | 132 + url_canon_stdurl.cc | 209 ++ url_canon_unittest.cc | 2748 +++++++++++++++++ url_constants.cc | 61 + url_constants.h | 70 + url_features.cc | 35 + url_features.h | 33 + url_file.h | 101 + url_idna_icu.cc | 144 + url_idna_icu_alternatives_android.cc | 40 + url_idna_icu_alternatives_ios.mm | 28 + url_parse_file.cc | 198 ++ url_parse_internal.h | 96 + url_parse_perftest.cc | 135 + url_parse_unittest.cc | 687 +++++ url_test_utils.h | 39 + url_util.cc | 933 ++++++ url_util.h | 314 ++ url_util_internal.h | 23 + url_util_unittest.cc | 631 ++++ 111 files changed, 23857 insertions(+) create mode 100644 BUILD.gn create mode 100644 DEPS create mode 100644 DIR_METADATA 
create mode 100644 OWNERS create mode 100644 README.md create mode 100644 android/OWNERS create mode 100644 android/gurl_android.cc create mode 100644 android/gurl_android.h create mode 100644 android/gurl_java_test_helper.cc create mode 100644 android/java/src/org/chromium/url/GURL.java create mode 100644 android/java/src/org/chromium/url/IDNStringUtil.java create mode 100644 android/java/src/org/chromium/url/Origin.java create mode 100644 android/java/src/org/chromium/url/Parsed.java create mode 100644 android/java/src/org/chromium/url/URI.java create mode 100644 android/javatests/DEPS create mode 100644 android/javatests/src/org/chromium/url/GURLJavaTest.java create mode 100644 android/javatests/src/org/chromium/url/GURLJavaTestHelper.java create mode 100644 android/javatests/src/org/chromium/url/JUnitTestGURLsTest.java create mode 100644 android/javatests/src/org/chromium/url/OriginJavaTest.java create mode 100644 android/javatests/src/org/chromium/url/OriginJavaTestHelper.java create mode 100644 android/junit/src/org/chromium/url/ShadowGURLTest.java create mode 100644 android/origin_android.cc create mode 100644 android/origin_java_test_helper.cc create mode 100644 android/parsed_android.cc create mode 100644 android/parsed_android.h create mode 100644 android/robolectric_test_main.cc create mode 100644 android/test/java/src/org/chromium/url/JUnitTestGURLs.java create mode 100644 android/test/java/src/org/chromium/url/ShadowGURL.java create mode 100644 features.gni create mode 100644 gurl.cc create mode 100644 gurl.h create mode 100644 gurl_abstract_tests.h create mode 100644 gurl_fuzzer.cc create mode 100644 gurl_fuzzer.dict create mode 100644 gurl_unittest.cc create mode 100644 ipc/BUILD.gn create mode 100644 ipc/OWNERS create mode 100644 ipc/url_ipc_export.h create mode 100644 ipc/url_param_traits.cc create mode 100644 ipc/url_param_traits.h create mode 100644 ipc/url_param_traits_unittest.cc create mode 100644 mojom/BUILD.gn create mode 100644 mojom/DEPS 
create mode 100644 mojom/DIR_METADATA create mode 100644 mojom/OWNERS create mode 100644 mojom/origin.mojom create mode 100644 mojom/origin_mojom_traits.cc create mode 100644 mojom/origin_mojom_traits.h create mode 100644 mojom/scheme_host_port.mojom create mode 100644 mojom/scheme_host_port_mojom_traits.cc create mode 100644 mojom/scheme_host_port_mojom_traits.h create mode 100644 mojom/scheme_host_port_mojom_traits_unittest.cc create mode 100644 mojom/url.mojom create mode 100644 mojom/url_gurl_mojom_traits.cc create mode 100644 mojom/url_gurl_mojom_traits.h create mode 100644 mojom/url_gurl_mojom_traits_unittest.cc create mode 100644 mojom/url_test.mojom create mode 100644 origin.cc create mode 100644 origin.h create mode 100644 origin_abstract_tests.cc create mode 100644 origin_abstract_tests.h create mode 100644 origin_unittest.cc create mode 100644 run_all_perftests.cc create mode 100644 run_all_unittests.cc create mode 100644 scheme_host_port.cc create mode 100644 scheme_host_port.h create mode 100644 scheme_host_port_unittest.cc create mode 100644 third_party/mozilla/LICENSE.txt create mode 100644 third_party/mozilla/README.chromium create mode 100644 third_party/mozilla/url_parse.cc create mode 100644 third_party/mozilla/url_parse.h create mode 100644 url_canon.cc create mode 100644 url_canon.h create mode 100644 url_canon_etc.cc create mode 100644 url_canon_filesystemurl.cc create mode 100644 url_canon_fileurl.cc create mode 100644 url_canon_host.cc create mode 100644 url_canon_icu.cc create mode 100644 url_canon_icu.h create mode 100644 url_canon_icu_unittest.cc create mode 100644 url_canon_internal.cc create mode 100644 url_canon_internal.h create mode 100644 url_canon_internal_file.h create mode 100644 url_canon_ip.cc create mode 100644 url_canon_ip.h create mode 100644 url_canon_mailtourl.cc create mode 100644 url_canon_path.cc create mode 100644 url_canon_pathurl.cc create mode 100644 url_canon_query.cc create mode 100644 url_canon_relative.cc create 
mode 100644 url_canon_stdstring.cc create mode 100644 url_canon_stdstring.h create mode 100644 url_canon_stdurl.cc create mode 100644 url_canon_unittest.cc create mode 100644 url_constants.cc create mode 100644 url_constants.h create mode 100644 url_features.cc create mode 100644 url_features.h create mode 100644 url_file.h create mode 100644 url_idna_icu.cc create mode 100644 url_idna_icu_alternatives_android.cc create mode 100644 url_idna_icu_alternatives_ios.mm create mode 100644 url_parse_file.cc create mode 100644 url_parse_internal.h create mode 100644 url_parse_perftest.cc create mode 100644 url_parse_unittest.cc create mode 100644 url_test_utils.h create mode 100644 url_util.cc create mode 100644 url_util.h create mode 100644 url_util_internal.h create mode 100644 url_util_unittest.cc diff --git a/BUILD.gn b/BUILD.gn new file mode 100644 index 00000000000..b5d6f606382 --- /dev/null +++ b/BUILD.gn @@ -0,0 +1,405 @@ +# Copyright 2013 The Chromium Authors +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. 
+ +import("//build/buildflag_header.gni") +import("//testing/libfuzzer/fuzzer_test.gni") +import("//testing/test.gni") +import("features.gni") + +import("//build/config/cronet/config.gni") + +if (is_android || is_robolectric) { + import("//build/config/android/rules.gni") +} + +buildflag_header("buildflags") { + header = "buildflags.h" + flags = [ "USE_PLATFORM_ICU_ALTERNATIVES=$use_platform_icu_alternatives" ] +} + +component("url") { + sources = [ + "gurl.cc", + "gurl.h", + "origin.cc", + "origin.h", + "scheme_host_port.cc", + "scheme_host_port.h", + "third_party/mozilla/url_parse.cc", + "third_party/mozilla/url_parse.h", + "url_canon.cc", + "url_canon.h", + "url_canon_etc.cc", + "url_canon_filesystemurl.cc", + "url_canon_fileurl.cc", + "url_canon_host.cc", + "url_canon_internal.cc", + "url_canon_internal.h", + "url_canon_internal_file.h", + "url_canon_ip.cc", + "url_canon_ip.h", + "url_canon_mailtourl.cc", + "url_canon_path.cc", + "url_canon_pathurl.cc", + "url_canon_query.cc", + "url_canon_relative.cc", + "url_canon_stdstring.cc", + "url_canon_stdstring.h", + "url_canon_stdurl.cc", + "url_constants.cc", + "url_constants.h", + "url_features.cc", + "url_features.h", + "url_file.h", + "url_parse_file.cc", + "url_parse_internal.h", + "url_util.cc", + "url_util.h", + "url_util_internal.h", + ] + + defines = [ "IS_URL_IMPL" ] + + public_deps = [ "//base" ] + + deps = [ "//base/third_party/dynamic_annotations" ] + + if (is_win) { + # Don't conflict with Windows' "url.dll". + output_name = "url_lib" + } + + # ICU support. + if (use_platform_icu_alternatives) { + if (is_android) { + sources += [ "url_idna_icu_alternatives_android.cc" ] + deps += [ + ":buildflags", + ":url_java", + ":url_jni_headers", + "//base", + "//base/third_party/dynamic_annotations", + ] + } else if (is_ios) { + sources += [ "url_idna_icu_alternatives_ios.mm" ] + } else { + assert(false, + "ICU alternative is not implemented for platform: " + target_os) + } + } else { + # Use ICU. 
+ sources += [ + "url_canon_icu.cc", + "url_canon_icu.h", + "url_idna_icu.cc", + ] + deps += [ + "//base:i18n", + "//third_party/icu", + ] + } +} + +if (is_android || is_robolectric) { + generate_jni("url_jni_headers") { + sources = [ + "android/java/src/org/chromium/url/IDNStringUtil.java", + "android/java/src/org/chromium/url/Origin.java", + ] + } + + generate_jni("gurl_jni_headers") { + sources = [ + "android/java/src/org/chromium/url/GURL.java", + "android/java/src/org/chromium/url/Parsed.java", + ] + } + + source_set("gurl_android") { + sources = [ + "android/gurl_android.cc", + "android/gurl_android.h", + "android/parsed_android.cc", + "android/parsed_android.h", + ] + + deps = [ + ":gurl_jni_headers", + ":url", + "//base:base", + ] + + if (is_robolectric) { + # Make jni.h available. + configs += [ "//third_party/jdk" ] + } + } + + static_library("origin_android") { + sources = [ "android/origin_android.cc" ] + + deps = [ + ":gurl_android", + ":url", + ":url_jni_headers", + "//base", + ] + } +} + +if (is_android) { + android_library("url_java") { + sources = [ "android/java/src/org/chromium/url/IDNStringUtil.java" ] + deps = [ "//base:jni_java" ] + } +} + +if (is_android && !is_cronet_build) { + android_library("gurl_java") { + sources = [ + "android/java/src/org/chromium/url/GURL.java", + "android/java/src/org/chromium/url/Parsed.java", + "android/java/src/org/chromium/url/URI.java", + ] + deps = [ + "//base:base_java", + "//base:jni_java", + "//build/android:build_java", + "//third_party/android_deps:com_google_errorprone_error_prone_annotations_java", + "//third_party/androidx:androidx_annotation_annotation_java", + "//url/mojom:url_mojom_gurl_java", + ] + annotation_processor_deps = [ "//base/android/jni_generator:jni_processor" ] + } + + android_library("origin_java") { + sources = [ "android/java/src/org/chromium/url/Origin.java" ] + deps = [ + ":gurl_java", + "//base:jni_java", + "//build/android:build_java", + "//mojo/public/java:bindings_java", + 
"//mojo/public/mojom/base:base_java", + "//url/mojom:url_mojom_origin_java", + ] + annotation_processor_deps = [ "//base/android/jni_generator:jni_processor" ] + } +} + +source_set("url_test_support") { + testonly = true + + sources = [ + "gurl_abstract_tests.h", + "origin_abstract_tests.cc", + "origin_abstract_tests.h", + ] + + public_deps = [ + ":url", + "//base", + "//base/test:test_support", + "//testing/gmock", + "//testing/gtest", + ] +} + +test("url_unittests") { + sources = [ + "gurl_unittest.cc", + "origin_unittest.cc", + "run_all_unittests.cc", + "scheme_host_port_unittest.cc", + "url_canon_icu_unittest.cc", + "url_canon_unittest.cc", + "url_parse_unittest.cc", + "url_test_utils.h", + "url_util_unittest.cc", + ] + + deps = [ + ":url", + ":url_test_support", + "//base", + "//base/test:test_support", + "//testing/gmock", + "//testing/gtest", + ] + + if (use_platform_icu_alternatives) { + # Unit tests that are not supported by the current ICU alternatives on Android. + if (is_android) { + sources -= [ + "url_canon_icu_unittest.cc", + "url_canon_unittest.cc", + ] + deps += [ ":url_java" ] + } + + # Unit tests that are not supported by the current ICU alternatives on iOS. 
+ if (is_ios) { + sources -= [ + "origin_unittest.cc", + "scheme_host_port_unittest.cc", + "url_canon_icu_unittest.cc", + "url_canon_unittest.cc", + ] + } + } else { # !use_platform_icu_alternatives + deps += [ "//third_party/icu:icuuc" ] + } + + if (!is_ios && !is_cronet_build) { + sources += [ + "mojom/scheme_host_port_mojom_traits_unittest.cc", + "mojom/url_gurl_mojom_traits_unittest.cc", + ] + deps += [ + "//mojo/core/embedder", + "//mojo/public/cpp/test_support:test_utils", + "//url/ipc:url_ipc_unittests", + "//url/mojom:mojom_traits", + "//url/mojom:test_url_mojom_gurl", + ] + } +} + +test("url_perftests") { + sources = [ + "run_all_perftests.cc", + "url_parse_perftest.cc", + ] + + deps = [ + ":url", + "//base", + "//base/test:test_support", + "//testing/gtest", + ] +} + +fuzzer_test("gurl_fuzzer") { + sources = [ "gurl_fuzzer.cc" ] + deps = [ + ":url", + "//base", + "//base:i18n", + ] + dict = "gurl_fuzzer.dict" +} + +if (is_android && !is_cronet_build) { + source_set("android_test_helper") { + testonly = true + sources = [ + "android/gurl_java_test_helper.cc", + "android/origin_java_test_helper.cc", + ] + deps = [ + ":gurl_android", + ":j_test_jni_headers", + ":origin_android", + ":url", + "//base/test:test_support", + ] + } + + android_library("android_test_helper_java") { + testonly = true + annotation_processor_deps = [ "//base/android/jni_generator:jni_processor" ] + sources = [ + "android/javatests/src/org/chromium/url/GURLJavaTestHelper.java", + "android/javatests/src/org/chromium/url/OriginJavaTestHelper.java", + ] + deps = [ + ":gurl_java", + ":origin_java", + "//base:base_java_test_support", + "//base:jni_java", + ] + } + + # Targets depending on gurl_junit_test_support do not need to bypass platform + # checks. 
+ android_library("gurl_junit_test_support") { + testonly = true + sources = [ "android/test/java/src/org/chromium/url/JUnitTestGURLs.java" ] + deps = [ ":gurl_java" ] + } + + # Unlike gurl_junit_test_support targets depending on gurl_junit_shadows must + # bypass platform checks. + robolectric_library("gurl_junit_shadows") { + sources = [ "android/test/java/src/org/chromium/url/ShadowGURL.java" ] + deps = [ + ":gurl_java", + ":gurl_junit_test_support", + ] + } + + android_library("url_java_unit_tests") { + testonly = true + sources = [ + "android/javatests/src/org/chromium/url/GURLJavaTest.java", + "android/javatests/src/org/chromium/url/JUnitTestGURLsTest.java", + "android/javatests/src/org/chromium/url/OriginJavaTest.java", + ] + deps = [ + ":android_test_helper_java", + ":gurl_java", + ":gurl_junit_test_support", + ":origin_java", + "//base:base_java", + "//base:base_java_test_support", + "//base:jni_java", + "//content/public/test/android:content_java_test_support", + "//mojo/public/mojom/base:base_java", + "//third_party/androidx:androidx_core_core_java", + "//third_party/androidx:androidx_test_runner_java", + "//third_party/junit", + "//third_party/mockito:mockito_java", + "//url/mojom:url_mojom_gurl_java", + "//url/mojom:url_mojom_origin_java", + ] + annotation_processor_deps = [ "//base/android/jni_generator:jni_processor" ] + } + + # See https://bugs.chromium.org/p/chromium/issues/detail?id=908819 for why we + # can't put 'java' in the name here. 
+ generate_jni("j_test_jni_headers") { + testonly = true + sources = [ + "android/javatests/src/org/chromium/url/GURLJavaTestHelper.java", + "android/javatests/src/org/chromium/url/OriginJavaTestHelper.java", + ] + } + + robolectric_library("gurl_junit_tests") { + sources = [ "android/junit/src/org/chromium/url/ShadowGURLTest.java" ] + deps = [ + ":gurl_java", + ":gurl_junit_shadows", + ":gurl_junit_test_support", + "//base:base_java_test_support", + "//base:base_junit_test_support", + "//base/test:test_support_java", + "//third_party/junit", + ] + } +} + +if (is_robolectric) { + # Use this in robolectric_binary() targets if you just need GURL and //base + # functionality. Otherwise, define a custom shared_library(). + shared_library("libgurl_robolectric") { + sources = [ "android/robolectric_test_main.cc" ] + deps = [ + "//base", + "//url:gurl_android", + ] + + # Make jni.h available. + configs += [ "//third_party/jdk" ] + } +} diff --git a/DEPS b/DEPS new file mode 100644 index 00000000000..166f6a26c1e --- /dev/null +++ b/DEPS @@ -0,0 +1,18 @@ +include_rules = [ + # Limit files that can depend on icu. + "-base/i18n", + "-third_party/icu", +] + +specific_include_rules = { + "gurl_fuzzer.cc": [ + "+base/i18n", + ], + "url_(canon|idna)_icu(\.cc|_unittest\.cc)": [ + "+base/i18n", + "+third_party/icu", + ], + "run_all_unittests\.cc": [ + "+mojo/core/embedder", + ], +} diff --git a/DIR_METADATA b/DIR_METADATA new file mode 100644 index 00000000000..16c80be74ea --- /dev/null +++ b/DIR_METADATA @@ -0,0 +1,11 @@ +# Metadata information for this directory. 
+# +# For more information on DIR_METADATA files, see: +# https://source.chromium.org/chromium/infra/infra/+/main:go/src/infra/tools/dirmd/README.md +# +# For the schema of this file, see Metadata message: +# https://source.chromium.org/chromium/infra/infra/+/main:go/src/infra/tools/dirmd/proto/dir_metadata.proto + +monorail { + component: "Blink>Network" +} \ No newline at end of file diff --git a/OWNERS b/OWNERS new file mode 100644 index 00000000000..58f2e5fcef3 --- /dev/null +++ b/OWNERS @@ -0,0 +1,9 @@ +set noparent +# NOTE: keep this in sync with lsc-owners-override@chromium.org owners +# by emailing lsc-policy@chromium.org when this list changes. +csharrison@chromium.org +dcheng@chromium.org +mkwst@chromium.org +timothygu@chromium.org +# NOTE: keep this in sync with lsc-owners-override@chromium.org owners +# by emailing lsc-policy@chromium.org when this list changes. diff --git a/README.md b/README.md new file mode 100644 index 00000000000..8d63fda613d --- /dev/null +++ b/README.md @@ -0,0 +1,75 @@ +# Chrome's URL library + +## Layers + +There are several conceptual layers in this directory. Going from the lowest +level up, they are: + +### Parsing + +The `url_parse.*` files are the parser. This code does no string +transformations. Its only job is to take an input string and split out the +components of the URL as best as it can deduce them, for a given type of URL. +Parsing can never fail, it will take its best guess. This layer does not +have logic for determining the type of URL parsing to apply, that needs to +be applied at a higher layer (the "util" layer below). + +Because the parser code is derived (_very_ distantly) from some code in +Mozilla, some of the parser files are in `url/third_party/mozilla/`. + +The main header to include for calling the parser is +`url/third_party/mozilla/url_parse.h`. + +### Canonicalization + +The `url_canon*` files are the canonicalizer. 
This code will transform specific +URL components or specific types of URLs into a standard form. For some +dangerous or invalid data, the canonicalizer will report that a URL is invalid, +although it will always try its best to produce output (so the calling code +can, for example, show the user an error that the URL is invalid). The +canonicalizer attempts to provide as consistent a representation as possible +without changing the meaning of a URL. + +The canonicalizer layer is designed to be independent of the string type of +the embedder, so all string output is done through a `CanonOutput` wrapper +object. An implementation for `std::string` output is provided in +`url_canon_stdstring.h`. + +The main header to include for calling the canonicalizer is +`url/url_canon.h`. + +### Utility + +The `url_util*` files provide a higher-level wrapper around the parser and +canonicalizer. While it can be called directly, it is designed to be the +foundation for writing URL wrapper objects (the GURL layer and Blink's KURL +object use the Utility layer to implement the low-level logic). + +The Utility code makes decisions about URL types and calls the correct parsing +and canonicalization functions for those types. It provides an interface to +register application-specific schemes that have specific requirements. +Sharing this logic between KURL and GURL is important so that URLs are +handled consistently across the application. + +The main header to include is `url/url_util.h`. + +### Google URL (GURL) and Origin + +At the highest layer, a C++ object for representing URLs is provided. This +object uses STL. Most uses need only this layer. Include `url/gurl.h`. + +Also at this layer is the Origin object, which exists to make security +decisions on the web. Include `url/origin.h`. 
+ +## Historical background + +This code was originally a separate library that was designed to be embedded +into both Chrome (which uses STL) and WebKit (which didn't use any STL at the +time). As a result, the parsing, canonicalization, and utility code could +not use STL, or any other common code in Chromium like base. + +When WebKit was forked into the Chromium repo and renamed Blink, this +restriction has been relaxed somewhat. Blink still provides its own URL object +using its own string type, so the insulation that the Utility layer provides is +still useful. But some STL strings and calls to base functions have gradually +been added in places where doing so is possible. diff --git a/android/OWNERS b/android/OWNERS new file mode 100644 index 00000000000..c19374d6fb8 --- /dev/null +++ b/android/OWNERS @@ -0,0 +1 @@ +mthiesse@chromium.org diff --git a/android/gurl_android.cc b/android/gurl_android.cc new file mode 100644 index 00000000000..bf398a13218 --- /dev/null +++ b/android/gurl_android.cc @@ -0,0 +1,160 @@ +// Copyright 2019 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#include "url/android/gurl_android.h" + +#include + +#include +#include +#include + +#include "base/android/jni_android.h" +#include "base/android/jni_string.h" +#include "base/functional/bind.h" +#include "base/functional/callback.h" +#include "base/memory/ptr_util.h" +#include "url/android/parsed_android.h" +#include "url/gurl_jni_headers/GURL_jni.h" +#include "url/third_party/mozilla/url_parse.h" + +using base::android::AttachCurrentThread; +using base::android::JavaParamRef; +using base::android::JavaRef; +using base::android::ScopedJavaLocalRef; + +namespace url { + +namespace { + +static GURL FromJString(JNIEnv* env, const JavaRef& uri) { + if (!uri) + return GURL(); + return GURL(base::android::ConvertJavaStringToUTF16(env, uri)); +} + +static std::unique_ptr FromJavaGURL(JNIEnv* env, + const JavaRef& j_spec, + bool is_valid, + jlong parsed_ptr) { + Parsed* parsed = reinterpret_cast(parsed_ptr); + const std::string& spec = ConvertJavaStringToUTF8(env, j_spec); + std::unique_ptr gurl = + std::make_unique(spec.data(), parsed->Length(), *parsed, is_valid); + delete parsed; + return gurl; +} + +static void InitFromGURL(JNIEnv* env, + const GURL& gurl, + const JavaRef& target) { + Java_GURL_init( + env, target, + base::android::ConvertUTF8ToJavaString(env, gurl.possibly_invalid_spec()), + gurl.is_valid(), + ParsedAndroid::InitFromParsed(env, + gurl.parsed_for_possibly_invalid_spec())); +} + +// As |GetArrayLength| makes no guarantees about the returned value (e.g., it +// may be -1 if |array| is not a valid Java array), provide a safe wrapper +// that always returns a valid, non-negative size. 
+template +size_t SafeGetArrayLength(JNIEnv* env, const JavaRef& jarray) { + DCHECK(jarray); + jsize length = env->GetArrayLength(jarray.obj()); + DCHECK_GE(length, 0) << "Invalid array length: " << length; + return static_cast(std::max(0, length)); +} + +} // namespace + +// static +std::unique_ptr GURLAndroid::ToNativeGURL( + JNIEnv* env, + const base::android::JavaRef& j_gurl) { + return base::WrapUnique( + reinterpret_cast(Java_GURL_toNativeGURL(env, j_gurl))); +} + +void GURLAndroid::JavaGURLArrayToGURLVector( + JNIEnv* env, + const base::android::JavaRef& array, + std::vector* out) { + DCHECK(out); + DCHECK(out->empty()); + if (!array) + return; + size_t len = SafeGetArrayLength(env, array); + for (size_t i = 0; i < len; ++i) { + ScopedJavaLocalRef j_gurl( + env, static_cast(env->GetObjectArrayElement(array.obj(), i))); + // ToNativeGURL() owns the heap GURL, so it is freed once copied. + out->emplace_back(*ToNativeGURL(env, j_gurl)); + } +} + +// static +ScopedJavaLocalRef GURLAndroid::FromNativeGURL(JNIEnv* env, + const GURL& gurl) { + ScopedJavaLocalRef j_gurl = Java_GURL_Constructor(env); + InitFromGURL(env, gurl, j_gurl); + return j_gurl; +} + +// static +ScopedJavaLocalRef GURLAndroid::EmptyGURL(JNIEnv* env) { + return Java_GURL_emptyGURL(env); +} + +// static +ScopedJavaLocalRef GURLAndroid::ToJavaArrayOfGURLs( + JNIEnv* env, + base::span> v) { + jclass clazz = org_chromium_url_GURL_clazz(env); + DCHECK(clazz); + jobjectArray joa = env->NewObjectArray(v.size(), clazz, nullptr); + base::android::CheckException(env); + + for (size_t i = 0; i < v.size(); ++i) { + env->SetObjectArrayElement(joa, i, v[i].obj()); + } + return ScopedJavaLocalRef(env, joa); +} + +static void JNI_GURL_GetOrigin(JNIEnv* env, + const JavaParamRef& j_spec, + jboolean is_valid, + jlong parsed_ptr, + const JavaParamRef& target) { + std::unique_ptr gurl = FromJavaGURL(env, j_spec, is_valid, parsed_ptr); + InitFromGURL(env, gurl->DeprecatedGetOriginAsURL(), target); +} + +static jboolean 
JNI_GURL_DomainIs(JNIEnv* env, + 
const JavaParamRef& j_spec, + jboolean is_valid, + jlong parsed_ptr, + const JavaParamRef& j_domain) { + std::unique_ptr gurl = FromJavaGURL(env, j_spec, is_valid, parsed_ptr); + const std::string& domain = ConvertJavaStringToUTF8(env, j_domain); + return gurl->DomainIs(domain); +} + +static void JNI_GURL_Init(JNIEnv* env, + const base::android::JavaParamRef& uri, + const base::android::JavaParamRef& target) { + const GURL& gurl = FromJString(env, uri); + InitFromGURL(env, gurl, target); +} + +static jlong JNI_GURL_CreateNative(JNIEnv* env, + const JavaParamRef& j_spec, + jboolean is_valid, + jlong parsed_ptr) { + return reinterpret_cast( + FromJavaGURL(env, j_spec, is_valid, parsed_ptr).release()); +} + +} // namespace url diff --git a/android/gurl_android.h b/android/gurl_android.h new file mode 100644 index 00000000000..8b356070da6 --- /dev/null +++ b/android/gurl_android.h @@ -0,0 +1,36 @@ +// Copyright 2019 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#ifndef URL_ANDROID_GURL_ANDROID_H_ +#define URL_ANDROID_GURL_ANDROID_H_ + +#include + +#include "base/android/scoped_java_ref.h" +#include "base/containers/span.h" +#include "url/gurl.h" + +namespace url { + +class GURLAndroid { + public: + static std::unique_ptr ToNativeGURL( + JNIEnv* env, + const base::android::JavaRef& j_gurl); + static base::android::ScopedJavaLocalRef FromNativeGURL( + JNIEnv* env, + const GURL& gurl); + static base::android::ScopedJavaLocalRef EmptyGURL(JNIEnv* env); + static base::android::ScopedJavaLocalRef ToJavaArrayOfGURLs( + JNIEnv* env, + base::span> v); + static void JavaGURLArrayToGURLVector( + JNIEnv* env, + const base::android::JavaRef& gurl_array, + std::vector* out); +}; + +} // namespace url + +#endif // URL_ANDROID_GURL_ANDROID_H_ diff --git a/android/gurl_java_test_helper.cc b/android/gurl_java_test_helper.cc new file mode 100644 index 00000000000..5ad1b44a6ac --- /dev/null +++ b/android/gurl_java_test_helper.cc @@ -0,0 +1,72 @@ +// Copyright 2019 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include + +#include "base/android/jni_android.h" +#include "base/android/jni_string.h" +#include "base/test/icu_test_util.h" +#include "url/android/gurl_android.h" +#include "url/gurl.h" +#include "url/j_test_jni_headers/GURLJavaTestHelper_jni.h" + +using base::android::AttachCurrentThread; + +namespace url { + +static void JNI_GURLJavaTestHelper_InitializeICU(JNIEnv* env) { + base::test::InitializeICUForTesting(); +} + +static void JNI_GURLJavaTestHelper_TestGURLEquivalence(JNIEnv* env) { + const char* cases[] = { + // Common Standard URLs. 
+ "https://www.google.com", + "https://www.google.com/", + "https://www.google.com/maps.htm", + "https://www.google.com/maps/", + "https://www.google.com/index.html", + "https://www.google.com/index.html?q=maps", + "https://www.google.com/index.html#maps/", + "https://foo:bar@www.google.com/maps.htm", + "https://www.google.com/maps/au/index.html", + "https://www.google.com/maps/au/north", + "https://www.google.com/maps/au/north/", + "https://www.google.com/maps/au/index.html?q=maps#fragment/", + "http://www.google.com:8000/maps/au/index.html?q=maps#fragment/", + "https://www.google.com/maps/au/north/?q=maps#fragment", + "https://www.google.com/maps/au/north?q=maps#fragment", + // Less common standard URLs. + "filesystem:http://www.google.com/temporary/bar.html?baz=22", + "file:///temporary/bar.html?baz=22", + "ftp://foo/test/index.html", + "gopher://foo/test/index.html", + "ws://foo/test/index.html", + // Non-standard, + "chrome://foo/bar.html", + "httpa://foo/test/index.html", + "blob:https://foo.bar/test/index.html", + "about:blank", + "data:foobar", + "scheme:opaque_data", + // Invalid URLs. 
+ "foobar", + }; + for (const char* uri : cases) { + GURL gurl(uri); + base::android::ScopedJavaLocalRef j_gurl = + Java_GURLJavaTestHelper_createGURL( + env, base::android::ConvertUTF8ToJavaString(env, uri)); + std::unique_ptr gurl2 = GURLAndroid::ToNativeGURL(env, j_gurl); + if (gurl != *gurl2) { + std::stringstream ss; + ss << "GURL not equivalent: " << gurl << ", " << *gurl2; + env->ThrowNew(env->FindClass("java/lang/AssertionError"), + ss.str().data()); + return; + } + } +} + +} // namespace url diff --git a/android/java/src/org/chromium/url/GURL.java b/android/java/src/org/chromium/url/GURL.java new file mode 100644 index 00000000000..34bd924951a --- /dev/null +++ b/android/java/src/org/chromium/url/GURL.java @@ -0,0 +1,413 @@ +// Copyright 2019 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +package org.chromium.url; + +import android.os.SystemClock; +import android.text.TextUtils; + +import androidx.annotation.Nullable; +import androidx.annotation.VisibleForTesting; + +import com.google.errorprone.annotations.DoNotMock; + +import org.chromium.base.Log; +import org.chromium.base.ThreadUtils; +import org.chromium.base.annotations.CalledByNative; +import org.chromium.base.annotations.JNINamespace; +import org.chromium.base.annotations.NativeMethods; +import org.chromium.base.library_loader.LibraryLoader; +import org.chromium.base.metrics.RecordHistogram; +import org.chromium.base.task.PostTask; +import org.chromium.base.task.TaskTraits; +import org.chromium.build.annotations.MainDex; +import org.chromium.url.mojom.Url; +import org.chromium.url.mojom.UrlConstants; + +import java.util.Random; + +/** + * An immutable Java wrapper for GURL, Chromium's URL parsing library. + * + * This class is safe to use during startup, but will block on the native library being sufficiently + * loaded to use native GURL (and will not wait for content initialization). 
In practice it's very + * unlikely that this will actually block startup unless used extremely early, in which case you + * should probably seek an alternative solution to using GURL. + * + * The design of this class avoids destruction/finalization by caching all values necessary to + * reconstruct a GURL in Java, allowing it to be much faster in the common case and easier to use. + */ +@JNINamespace("url") +@MainDex +@DoNotMock("Create a real instance instead. For Robolectric, see JUnitTestGURLs.java") +public class GURL { + private static final String TAG = "GURL"; + /* package */ static final int SERIALIZER_VERSION = 1; + /* package */ static final char SERIALIZER_DELIMITER = '\0'; + + @FunctionalInterface + public interface ReportDebugThrowableCallback { + void run(Throwable throwable); + } + + /** + * Exception signalling that a GURL failed to parse due to an unexpected version marker in the + * serialized input. + */ + public static class BadSerializerVersionException extends RuntimeException {} + + // Right now this is only collecting reports on Canary which has a relatively small population. + private static final int DEBUG_REPORT_PERCENTAGE = 10; + private static ReportDebugThrowableCallback sReportCallback; + + // TODO(https://crbug.com/1039841): Right now we return a new String with each request for a + // GURL component other than the spec itself. Should we cache return Strings (as + // WeakReference?) so that callers can share String memory? + private String mSpec; + private boolean mIsValid; + private Parsed mParsed; + + private static class Holder { private static GURL sEmptyGURL = new GURL(""); } + + @CalledByNative + public static GURL emptyGURL() { + return Holder.sEmptyGURL; + } + + /** + * Create a new GURL. + * + * @param uri The string URI representation to parse into a GURL. + */ + public GURL(String uri) { + // Avoid a jni hop (and initializing the native library) for empty GURLs. 
+ if (TextUtils.isEmpty(uri)) { + mSpec = ""; + mParsed = Parsed.createEmpty(); + return; + } + ensureNativeInitializedForGURL(); + getNatives().init(uri, this); + } + + @CalledByNative + protected GURL() {} + + /** + * Enables debug stack trace gathering for GURL. + */ + public static void setReportDebugThrowableCallback(ReportDebugThrowableCallback callback) { + sReportCallback = callback; + } + + /** + * Ensures that the native library is sufficiently loaded for GURL usage. + * + * This function is public so that GURL-related usage like the UrlFormatter also counts towards + * the "Startup.Android.GURLEnsureMainDexInitialized" histogram. + */ + public static void ensureNativeInitializedForGURL() { + if (LibraryLoader.getInstance().isInitialized()) return; + long time = SystemClock.elapsedRealtime(); + LibraryLoader.getInstance().ensureMainDexInitialized(); + // Record metrics only for the UI thread where the delay in loading the library is relevant. + if (ThreadUtils.runningOnUiThread()) { + // "MainDex" in name of histogram is a dated reference to when we used to have 2 + // sections of the native library, main dex and non-main dex. Maintaining name for + // consistency in metrics. + RecordHistogram.recordTimesHistogram("Startup.Android.GURLEnsureMainDexInitialized", + SystemClock.elapsedRealtime() - time); + if (sReportCallback != null && new Random().nextInt(100) < DEBUG_REPORT_PERCENTAGE) { + final Throwable throwable = + new Throwable("This is not a crash, please ignore. See crbug.com/1065377."); + // This isn't an assert, because by design this is possible, but we would prefer + // this path does not get hit more than necessary and getting stack traces from the + // wild will help find issues. + PostTask.postTask(TaskTraits.BEST_EFFORT_MAY_BLOCK, + () -> { sReportCallback.run(throwable); }); + } + } + } + + /** @return true if the GURL is null, empty, or invalid. 
*/ + public static boolean isEmptyOrInvalid(@Nullable GURL gurl) { + return gurl == null || gurl.isEmpty() || !gurl.isValid(); + } + + @CalledByNative + private void init(String spec, boolean isValid, Parsed parsed) { + mSpec = spec; + // Ensure that the spec only contains US-ASCII or the parsed indices will be wrong. + assert mSpec.matches("\\A\\p{ASCII}*\\z"); + mIsValid = isValid; + mParsed = parsed; + } + + @CalledByNative + private long toNativeGURL() { + return getNatives().createNative(mSpec, mIsValid, mParsed.toNativeParsed()); + } + + /** + * See native GURL::is_valid(). + */ + public boolean isValid() { + return mIsValid; + } + + /** + * See native GURL::spec(). + */ + public String getSpec() { + if (isValid() || mSpec.isEmpty()) return mSpec; + assert false : "Trying to get the spec of an invalid URL!"; + return ""; + } + + /** + * @return Either a valid Spec (see {@link #getSpec}), or an empty string. + */ + public String getValidSpecOrEmpty() { + if (isValid()) return mSpec; + return ""; + } + + /** + * See native GURL::possibly_invalid_spec(). + */ + public String getPossiblyInvalidSpec() { + return mSpec; + } + + private String getComponent(int begin, int length) { + if (length <= 0) return ""; + return mSpec.substring(begin, begin + length); + } + + /** + * See native GURL::scheme(). + */ + public String getScheme() { + return getComponent(mParsed.mSchemeBegin, mParsed.mSchemeLength); + } + + /** + * See native GURL::username(). + */ + public String getUsername() { + return getComponent(mParsed.mUsernameBegin, mParsed.mUsernameLength); + } + + /** + * See native GURL::password(). + */ + public String getPassword() { + return getComponent(mParsed.mPasswordBegin, mParsed.mPasswordLength); + } + + /** + * See native GURL::host(). + */ + public String getHost() { + return getComponent(mParsed.mHostBegin, mParsed.mHostLength); + } + + /** + * See native GURL::port(). + * + * Note: Do not convert this to an integer yourself. See native GURL::IntPort(). 
+ */ + public String getPort() { + return getComponent(mParsed.mPortBegin, mParsed.mPortLength); + } + + /** + * See native GURL::path(). + */ + public String getPath() { + return getComponent(mParsed.mPathBegin, mParsed.mPathLength); + } + + /** + * See native GURL::query(). + */ + public String getQuery() { + return getComponent(mParsed.mQueryBegin, mParsed.mQueryLength); + } + + /** + * See native GURL::ref(). + */ + public String getRef() { + return getComponent(mParsed.mRefBegin, mParsed.mRefLength); + } + + /** + * @return Whether the GURL is the empty String. + */ + public boolean isEmpty() { + return mSpec.isEmpty(); + } + + /** + * See native GURL::GetOrigin(). + */ + public GURL getOrigin() { + GURL target = new GURL(); + getOriginInternal(target); + return target; + } + + protected void getOriginInternal(GURL target) { + getNatives().getOrigin(mSpec, mIsValid, mParsed.toNativeParsed(), target); + } + + /** + * See native GURL::DomainIs(). + */ + public boolean domainIs(String domain) { + return getNatives().domainIs(mSpec, mIsValid, mParsed.toNativeParsed(), domain); + } + + @Override + public final int hashCode() { + return mSpec.hashCode(); + } + + @Override + public final boolean equals(Object other) { + if (other == this) return true; + if (!(other instanceof GURL)) return false; + return mSpec.equals(((GURL) other).mSpec); + } + + /** + * Serialize a GURL to a String, to be used with {@link GURL#deserialize(String)}. + * + * Note that a serialized GURL should only be used internally to Chrome, and should *never* be + * used if coming from an untrusted source. + * + * @return A serialized GURL. 
+ */ + public final String serialize() { + StringBuilder builder = new StringBuilder(); + builder.append(SERIALIZER_VERSION).append(SERIALIZER_DELIMITER); + builder.append(mIsValid).append(SERIALIZER_DELIMITER); + builder.append(mParsed.serialize()).append(SERIALIZER_DELIMITER); + builder.append(mSpec); + String serialization = builder.toString(); + return Integer.toString(serialization.length()) + SERIALIZER_DELIMITER + serialization; + } + + /** + * Deserialize a GURL serialized with {@link GURL#serialize()}. This will re-parse in case of + * version mismatch, which may trigger undesired native loading. See + * {@link #deserializeLatestVersionOnly} if you want to fail in case of version mismatch. + * + * This function should *never* be used on a String coming from an untrusted source. + * + * @return The deserialized GURL (or the empty GURL if the input is empty or malformed). + */ + public static GURL deserialize(@Nullable String gurl) { + try { + return deserializeLatestVersionOnly(gurl); + } catch (BadSerializerVersionException be) { + // Just re-parse the GURL on version changes. + String[] tokens = gurl.split(Character.toString(SERIALIZER_DELIMITER)); + return new GURL(getSpecFromTokens(gurl, tokens)); + } catch (Exception e) { + // This is unexpected, maybe the storage got corrupted somehow? + Log.w(TAG, "Exception while deserializing a GURL: " + gurl, e); + return emptyGURL(); + } + } + + /** + * Deserialize a GURL serialized with {@link #serialize()}, throwing {@code + * BadSerializerVersionException} if the serialized input has a version other than the latest. This + * function should never be used on a String coming from an untrusted source. + */ + public static GURL deserializeLatestVersionOnly(@Nullable String gurl) { + if (TextUtils.isEmpty(gurl)) return emptyGURL(); + String[] tokens = gurl.split(Character.toString(SERIALIZER_DELIMITER)); + + // First token MUST always be the length of the serialized data. 
+ String length = tokens[0]; + if (gurl.length() != Integer.parseInt(length) + length.length() + 1) { + throw new IllegalArgumentException("Serialized GURL had the wrong length."); + } + + String spec = getSpecFromTokens(gurl, tokens); + // Second token MUST always be the version number. + int version = Integer.parseInt(tokens[1]); + if (version != SERIALIZER_VERSION) { + throw new BadSerializerVersionException(); + } + + boolean isValid = Boolean.parseBoolean(tokens[2]); + Parsed parsed = Parsed.deserialize(tokens, 3); + GURL result = new GURL(); + result.init(spec, isValid, parsed); + return result; + } + + private static String getSpecFromTokens(String gurl, String[] tokens) { + // Last token MUST always be the original spec. + // Special case for empty spec - it won't get its own token. + return gurl.endsWith(Character.toString(SERIALIZER_DELIMITER)) ? "" + : tokens[tokens.length - 1]; + } + + /** + * Returns the instance of {@link Natives}. The Robolectric Shadow intercepts invocations of + * this method. + * + *

Unlike {@code GURLJni.TEST_HOOKS.setInstanceForTesting}, shadowing this method doesn't + * rely on tests correctly cleaning up global state. + */ + private static Natives getNatives() { + return GURLJni.get(); + } + + /** Inits this GURL with the internal state of another GURL. */ + @VisibleForTesting + /* package */ void initForTesting(GURL gurl) { + init(gurl.mSpec, gurl.mIsValid, gurl.mParsed); + } + + /** @return A Mojom representation of this URL. */ + public Url toMojom() { + Url url = new Url(); + // See url/mojom/url_gurl_mojom_traits.cc. + url.url = TextUtils.isEmpty(getPossiblyInvalidSpec()) + || getPossiblyInvalidSpec().length() > UrlConstants.MAX_URL_CHARS + || !isValid() + ? "" + : getPossiblyInvalidSpec(); + return url; + } + + @NativeMethods + interface Natives { + /** + * Initializes the provided |target| by parsing the provided |uri|. + */ + void init(String uri, GURL target); + + /** + * Reconstructs the native GURL for this Java GURL and initializes |target| with its Origin. + */ + void getOrigin(String spec, boolean isValid, long nativeParsed, GURL target); + + /** + * Reconstructs the native GURL for this Java GURL, and calls GURL.DomainIs. + */ + boolean domainIs(String spec, boolean isValid, long nativeParsed, String domain); + + /** + * Reconstructs the native GURL for this Java GURL, returning its native pointer. + */ + long createNative(String spec, boolean isValid, long nativeParsed); + } +} diff --git a/android/java/src/org/chromium/url/IDNStringUtil.java b/android/java/src/org/chromium/url/IDNStringUtil.java new file mode 100644 index 00000000000..10957b673f3 --- /dev/null +++ b/android/java/src/org/chromium/url/IDNStringUtil.java @@ -0,0 +1,33 @@ +// Copyright 2014 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +package org.chromium.url; + +import org.chromium.base.annotations.CalledByNative; +import org.chromium.base.annotations.JNINamespace; + +import java.net.IDN; + +/** + * This class is used to convert unicode IDN domain names to ASCII, when not + * building with ICU. + */ +@JNINamespace("url::android") +public class IDNStringUtil { + /** + * Attempts to convert a Unicode string to an ASCII string using IDN rules. + * As of May 2014, the underlying Java function implements IDNA2003. + * @param src String to convert. + * @return String containing only ASCII characters on success, null on + * failure. + */ + @CalledByNative + private static String idnToASCII(String src) { + try { + return IDN.toASCII(src, IDN.USE_STD3_ASCII_RULES); + } catch (Exception e) { + return null; + } + } +} \ No newline at end of file diff --git a/android/java/src/org/chromium/url/Origin.java b/android/java/src/org/chromium/url/Origin.java new file mode 100644 index 00000000000..87ce87066e1 --- /dev/null +++ b/android/java/src/org/chromium/url/Origin.java @@ -0,0 +1,114 @@ +// Copyright 2019 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +package org.chromium.url; + +import org.chromium.base.annotations.CalledByNative; +import org.chromium.base.annotations.JNINamespace; +import org.chromium.base.annotations.NativeMethods; + +/** An origin is either a (scheme, host, port) tuple or is opaque. */ +@JNINamespace("url") +public class Origin { + private final String mScheme; + private final String mHost; + private final short mPort; + + private final boolean mIsOpaque; + + // Serialization of the Unguessable Token. Do not use directly. + private final long mTokenHighBits; + private final long mTokenLowBits; + + /** + * Constructs an opaque origin. + */ + public static Origin createOpaqueOrigin() { + return OriginJni.get().createOpaque(); + } + + /** + * See origin.h for many warnings about this method. 
+ * + * Constructs an Origin from a GURL. + */ + public static Origin create(GURL gurl) { + return OriginJni.get().createFromGURL(gurl); + } + + /** + * Parses a mojo Origin into a Java analogue of the c++ Origin class. + * + * `org.chromium.url.internal.mojom.Origin`s, are provided by Mojo-generated code but not + * intended for direct use (see crbug.com/1156866). + * + * @return A Java equivalent of the c++ Origin represented by the provided mojo Origin. + */ + public Origin(org.chromium.url.internal.mojom.Origin mojoOrigin) { + mScheme = mojoOrigin.scheme; + mHost = mojoOrigin.host; + mPort = mojoOrigin.port; + if (mojoOrigin.nonceIfOpaque != null) { + mIsOpaque = true; + mTokenHighBits = mojoOrigin.nonceIfOpaque.high; + mTokenLowBits = mojoOrigin.nonceIfOpaque.low; + } else { + mIsOpaque = false; + mTokenHighBits = 0; + mTokenLowBits = 0; + } + } + + @CalledByNative + private Origin(String scheme, String host, short port, boolean isOpaque, long tokenHighBits, + long tokenLowBits) { + mScheme = scheme; + mHost = host; + mPort = port; + mIsOpaque = isOpaque; + mTokenHighBits = tokenHighBits; + mTokenLowBits = tokenLowBits; + } + + /** @return The scheme of the origin. Returns an empty string for an opaque origin. */ + public String getScheme() { + return !isOpaque() ? mScheme : ""; + } + + /** @return The host of the origin. Returns an empty string for an opaque origin. */ + public String getHost() { + return !isOpaque() ? mHost : ""; + } + + /** @return The port of the origin. Returns 0 for an opaque origin. */ + public int getPort() { + return !isOpaque() ? Short.toUnsignedInt(mPort) : 0; + } + + /** @return Whether the origin is opaque. */ + public boolean isOpaque() { + return mIsOpaque; + } + + @CalledByNative + private long toNativeOrigin() { + return OriginJni.get().createNative( + mScheme, mHost, mPort, mIsOpaque, mTokenHighBits, mTokenLowBits); + } + + @NativeMethods + interface Natives { + /** Constructs a new Opaque origin. 
*/ + Origin createOpaque(); + + /** Constructs an Origin from a GURL. */ + Origin createFromGURL(GURL gurl); + + /** + * Reconstructs the native Origin for this Java Origin, returning its native pointer. + */ + long createNative(String scheme, String host, short port, boolean isOpaque, + long tokenHighBits, long tokenLowBits); + } +} diff --git a/android/java/src/org/chromium/url/Parsed.java b/android/java/src/org/chromium/url/Parsed.java new file mode 100644 index 00000000000..ca41cfb1f4f --- /dev/null +++ b/android/java/src/org/chromium/url/Parsed.java @@ -0,0 +1,141 @@ +// Copyright 2019 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +package org.chromium.url; + +import org.chromium.base.annotations.CalledByNative; +import org.chromium.base.annotations.JNINamespace; +import org.chromium.base.annotations.NativeMethods; +import org.chromium.build.annotations.MainDex; + +/** + * A java wrapper for Parsed, GURL's internal parsed URI representation. 
+ */ +@MainDex +@JNINamespace("url") +/* package */ class Parsed { + /* package */ final int mSchemeBegin; + /* package */ final int mSchemeLength; + /* package */ final int mUsernameBegin; + /* package */ final int mUsernameLength; + /* package */ final int mPasswordBegin; + /* package */ final int mPasswordLength; + /* package */ final int mHostBegin; + /* package */ final int mHostLength; + /* package */ final int mPortBegin; + /* package */ final int mPortLength; + /* package */ final int mPathBegin; + /* package */ final int mPathLength; + /* package */ final int mQueryBegin; + /* package */ final int mQueryLength; + /* package */ final int mRefBegin; + /* package */ final int mRefLength; + private final Parsed mInnerUrl; + private final boolean mPotentiallyDanglingMarkup; + + /* package */ static Parsed createEmpty() { + return new Parsed(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, false, null); + } + + @CalledByNative + private Parsed(int schemeBegin, int schemeLength, int usernameBegin, int usernameLength, + int passwordBegin, int passwordLength, int hostBegin, int hostLength, int portBegin, + int portLength, int pathBegin, int pathLength, int queryBegin, int queryLength, + int refBegin, int refLength, boolean potentiallyDanglingMarkup, Parsed innerUrl) { + mSchemeBegin = schemeBegin; + mSchemeLength = schemeLength; + mUsernameBegin = usernameBegin; + mUsernameLength = usernameLength; + mPasswordBegin = passwordBegin; + mPasswordLength = passwordLength; + mHostBegin = hostBegin; + mHostLength = hostLength; + mPortBegin = portBegin; + mPortLength = portLength; + mPathBegin = pathBegin; + mPathLength = pathLength; + mQueryBegin = queryBegin; + mQueryLength = queryLength; + mRefBegin = refBegin; + mRefLength = refLength; + mPotentiallyDanglingMarkup = potentiallyDanglingMarkup; + mInnerUrl = innerUrl; + } + + /* package */ long toNativeParsed() { + long inner = 0; + if (mInnerUrl != null) { + inner = mInnerUrl.toNativeParsed(); + } + return 
ParsedJni.get().createNative(mSchemeBegin, mSchemeLength, mUsernameBegin, + mUsernameLength, mPasswordBegin, mPasswordLength, mHostBegin, mHostLength, + mPortBegin, mPortLength, mPathBegin, mPathLength, mQueryBegin, mQueryLength, + mRefBegin, mRefLength, mPotentiallyDanglingMarkup, inner); + } + + /* package */ String serialize() { + StringBuilder builder = new StringBuilder(); + builder.append(mSchemeBegin).append(GURL.SERIALIZER_DELIMITER); + builder.append(mSchemeLength).append(GURL.SERIALIZER_DELIMITER); + builder.append(mUsernameBegin).append(GURL.SERIALIZER_DELIMITER); + builder.append(mUsernameLength).append(GURL.SERIALIZER_DELIMITER); + builder.append(mPasswordBegin).append(GURL.SERIALIZER_DELIMITER); + builder.append(mPasswordLength).append(GURL.SERIALIZER_DELIMITER); + builder.append(mHostBegin).append(GURL.SERIALIZER_DELIMITER); + builder.append(mHostLength).append(GURL.SERIALIZER_DELIMITER); + builder.append(mPortBegin).append(GURL.SERIALIZER_DELIMITER); + builder.append(mPortLength).append(GURL.SERIALIZER_DELIMITER); + builder.append(mPathBegin).append(GURL.SERIALIZER_DELIMITER); + builder.append(mPathLength).append(GURL.SERIALIZER_DELIMITER); + builder.append(mQueryBegin).append(GURL.SERIALIZER_DELIMITER); + builder.append(mQueryLength).append(GURL.SERIALIZER_DELIMITER); + builder.append(mRefBegin).append(GURL.SERIALIZER_DELIMITER); + builder.append(mRefLength).append(GURL.SERIALIZER_DELIMITER); + builder.append(mPotentiallyDanglingMarkup).append(GURL.SERIALIZER_DELIMITER); + builder.append(mInnerUrl != null); + if (mInnerUrl != null) { + builder.append(GURL.SERIALIZER_DELIMITER).append(mInnerUrl.serialize()); + } + return builder.toString(); + } + + /* package */ static Parsed deserialize(String[] tokens, int startIndex) { + int schemeBegin = Integer.parseInt(tokens[startIndex++]); + int schemeLength = Integer.parseInt(tokens[startIndex++]); + int usernameBegin = Integer.parseInt(tokens[startIndex++]); + int usernameLength = 
Integer.parseInt(tokens[startIndex++]); + int passwordBegin = Integer.parseInt(tokens[startIndex++]); + int passwordLength = Integer.parseInt(tokens[startIndex++]); + int hostBegin = Integer.parseInt(tokens[startIndex++]); + int hostLength = Integer.parseInt(tokens[startIndex++]); + int portBegin = Integer.parseInt(tokens[startIndex++]); + int portLength = Integer.parseInt(tokens[startIndex++]); + int pathBegin = Integer.parseInt(tokens[startIndex++]); + int pathLength = Integer.parseInt(tokens[startIndex++]); + int queryBegin = Integer.parseInt(tokens[startIndex++]); + int queryLength = Integer.parseInt(tokens[startIndex++]); + int refBegin = Integer.parseInt(tokens[startIndex++]); + int refLength = Integer.parseInt(tokens[startIndex++]); + boolean potentiallyDanglingMarkup = Boolean.parseBoolean(tokens[startIndex++]); + Parsed innerParsed = null; + if (Boolean.parseBoolean(tokens[startIndex++])) { + innerParsed = Parsed.deserialize(tokens, startIndex); + } + return new Parsed(schemeBegin, schemeLength, usernameBegin, usernameLength, passwordBegin, + passwordLength, hostBegin, hostLength, portBegin, portLength, pathBegin, pathLength, + queryBegin, queryLength, refBegin, refLength, potentiallyDanglingMarkup, + innerParsed); + } + + @NativeMethods + interface Natives { + /** + * Create and return the pointer to a native Parsed. 
+ */ + long createNative(int schemeBegin, int schemeLength, int usernameBegin, int usernameLength, + int passwordBegin, int passwordLength, int hostBegin, int hostLength, int portBegin, + int portLength, int pathBegin, int pathLength, int queryBegin, int queryLength, + int refBegin, int refLength, boolean potentiallyDanglingMarkup, long innerUrl); + } +} diff --git a/android/java/src/org/chromium/url/URI.java b/android/java/src/org/chromium/url/URI.java new file mode 100644 index 00000000000..e83d6157791 --- /dev/null +++ b/android/java/src/org/chromium/url/URI.java @@ -0,0 +1,61 @@ +// Copyright 2019 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +package org.chromium.url; + +import java.net.URISyntaxException; + +/** + * An API shim around GURL that mostly matches the java.net.URI API. + * + * @deprecated Please use GURL directly in new code. + */ +@Deprecated +public class URI extends GURL { + /** + * Create a new GURL with a java.net.URI API shim. + */ + public URI(String uri) throws URISyntaxException { + super(uri); + if (!isValid()) { + throw new URISyntaxException(uri, "Uri could not be parsed as a valid GURL"); + } + } + + private URI() {} + + /** + * This function is a convenience wrapper around {@link URI#URI(String)}, that wraps the thrown + * URISyntaxException in an IllegalArgumentException and throws that instead. 
+ */ + public static URI create(String str) { + try { + return new URI(str); + } catch (URISyntaxException e) { + throw new IllegalArgumentException(e); + } + } + + @Override + public URI getOrigin() { + URI target = new URI(); + getOriginInternal(target); + return target; + } + + /** See {@link GURL#getRef()} */ + public String getFragment() { + return getRef(); + } + + /** See {@link java.net.URI#isAbsolute()} */ + public boolean isAbsolute() { + return !getScheme().isEmpty(); + } + + @Override + public String toString() { + return getPossiblyInvalidSpec(); + } +} diff --git a/android/javatests/DEPS b/android/javatests/DEPS new file mode 100644 index 00000000000..aa935913119 --- /dev/null +++ b/android/javatests/DEPS @@ -0,0 +1,3 @@ +include_rules = [ + "+content/public/test/android", +] diff --git a/android/javatests/src/org/chromium/url/GURLJavaTest.java b/android/javatests/src/org/chromium/url/GURLJavaTest.java new file mode 100644 index 00000000000..e684e510338 --- /dev/null +++ b/android/javatests/src/org/chromium/url/GURLJavaTest.java @@ -0,0 +1,314 @@ +// Copyright 2019 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +package org.chromium.url; + +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.Mockito.doThrow; + +import androidx.test.filters.SmallTest; + +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.mockito.Mock; +import org.mockito.MockitoAnnotations; + +import org.chromium.base.test.BaseJUnit4ClassRunner; +import org.chromium.base.test.util.Batch; +import org.chromium.content_public.browser.test.NativeLibraryTestUtils; + +import java.net.URISyntaxException; + +/** + * Tests for {@link GURL}. GURL relies heavily on the native implementation, and the lion's share of + * the logic is tested there. This test is primarily to make sure everything is plumbed through + * correctly. 
+ */ +@RunWith(BaseJUnit4ClassRunner.class) +@Batch(Batch.UNIT_TESTS) +public class GURLJavaTest { + @Mock + GURL.Natives mGURLMocks; + + @Before + public void setUp() { + MockitoAnnotations.initMocks(this); + + NativeLibraryTestUtils.loadNativeLibraryNoBrowserProcess(); + GURLJavaTestHelper.nativeInitializeICU(); + } + + /* package */ static void deepAssertEquals(GURL expected, GURL actual) { + Assert.assertEquals(expected, actual); + Assert.assertEquals(expected.getScheme(), actual.getScheme()); + Assert.assertEquals(expected.getUsername(), actual.getUsername()); + Assert.assertEquals(expected.getPassword(), actual.getPassword()); + Assert.assertEquals(expected.getHost(), actual.getHost()); + Assert.assertEquals(expected.getPort(), actual.getPort()); + Assert.assertEquals(expected.getPath(), actual.getPath()); + Assert.assertEquals(expected.getQuery(), actual.getQuery()); + Assert.assertEquals(expected.getRef(), actual.getRef()); + } + + private String prependLengthToSerialization(String serialization) { + return Integer.toString(serialization.length()) + GURL.SERIALIZER_DELIMITER + serialization; + } + + @SmallTest + @Test + public void testGURLEquivalence() { + GURLJavaTestHelper.nativeTestGURLEquivalence(); + } + + // Equivalent of GURLTest.Components + @SmallTest + @Test + @SuppressWarnings(value = "AuthLeak") + public void testComponents() { + GURL empty = new GURL(""); + Assert.assertTrue(empty.isEmpty()); + Assert.assertFalse(empty.isValid()); + + GURL url = new GURL("http://user:pass@google.com:99/foo;bar?q=a#ref"); + Assert.assertFalse(url.isEmpty()); + Assert.assertTrue(url.isValid()); + Assert.assertTrue(url.getScheme().equals("http")); + + Assert.assertEquals("http://user:pass@google.com:99/foo;bar?q=a#ref", url.getSpec()); + + Assert.assertEquals("http", url.getScheme()); + Assert.assertEquals("user", url.getUsername()); + Assert.assertEquals("pass", url.getPassword()); + Assert.assertEquals("google.com", url.getHost()); + Assert.assertEquals("99", 
url.getPort()); + Assert.assertEquals("/foo;bar", url.getPath()); + Assert.assertEquals("q=a", url.getQuery()); + Assert.assertEquals("ref", url.getRef()); + + // Test parsing userinfo with special characters. + GURL urlSpecialPass = new GURL("http://user:%40!$&'()*+,;=:@google.com:12345"); + Assert.assertTrue(urlSpecialPass.isValid()); + // GURL canonicalizes some delimiters. + Assert.assertEquals("%40!$&%27()*+,%3B%3D%3A", urlSpecialPass.getPassword()); + Assert.assertEquals("google.com", urlSpecialPass.getHost()); + Assert.assertEquals("12345", urlSpecialPass.getPort()); + } + + // Equivalent of GURLTest.Empty + @SmallTest + @Test + public void testEmpty() { + GURLJni.TEST_HOOKS.setInstanceForTesting(mGURLMocks); + doThrow(new RuntimeException("Should not need to parse empty URL")) + .when(mGURLMocks) + .init(any(), any()); + GURL url = new GURL(""); + Assert.assertFalse(url.isValid()); + Assert.assertEquals("", url.getSpec()); + + Assert.assertEquals("", url.getScheme()); + Assert.assertEquals("", url.getUsername()); + Assert.assertEquals("", url.getPassword()); + Assert.assertEquals("", url.getHost()); + Assert.assertEquals("", url.getPort()); + Assert.assertEquals("", url.getPath()); + Assert.assertEquals("", url.getQuery()); + Assert.assertEquals("", url.getRef()); + GURLJni.TEST_HOOKS.setInstanceForTesting(null); + } + + // Test that GURL and URI return the correct Origin. 
+ @SmallTest + @Test + @SuppressWarnings(value = "AuthLeak") + public void testOrigin() throws URISyntaxException { + final String kExpectedOrigin1 = "http://google.com:21/"; + final String kExpectedOrigin2 = ""; + GURL url1 = new GURL("filesystem:http://user:pass@google.com:21/blah#baz"); + GURL url2 = new GURL("javascript:window.alert(\"hello,world\");"); + URI uri = new URI("filesystem:http://user:pass@google.com:21/blah#baz"); + + Assert.assertEquals(kExpectedOrigin1, url1.getOrigin().getSpec()); + Assert.assertEquals(kExpectedOrigin2, url2.getOrigin().getSpec()); + URI origin = uri.getOrigin(); + Assert.assertEquals(kExpectedOrigin1, origin.getSpec()); + } + + @SmallTest + @Test + public void testWideInput() throws URISyntaxException { + final String kExpectedSpec = "http://xn--1xa.com/"; + + GURL url = new GURL("http://\u03C0.com"); + Assert.assertEquals(kExpectedSpec, url.getSpec()); + Assert.assertEquals("http", url.getScheme()); + Assert.assertEquals("", url.getUsername()); + Assert.assertEquals("", url.getPassword()); + Assert.assertEquals("xn--1xa.com", url.getHost()); + Assert.assertEquals("", url.getPort()); + Assert.assertEquals("/", url.getPath()); + Assert.assertEquals("", url.getQuery()); + Assert.assertEquals("", url.getRef()); + } + + @SmallTest + @Test + @SuppressWarnings(value = "AuthLeak") + public void testSerialization() { + GURL cases[] = { + // Common Standard URLs. 
+ new GURL("https://www.google.com"), + new GURL("https://www.google.com/"), + new GURL("https://www.google.com/maps.htm"), + new GURL("https://www.google.com/maps/"), + new GURL("https://www.google.com/index.html"), + new GURL("https://www.google.com/index.html?q=maps"), + new GURL("https://www.google.com/index.html#maps/"), + new GURL("https://foo:bar@www.google.com/maps.htm"), + new GURL("https://www.google.com/maps/au/index.html"), + new GURL("https://www.google.com/maps/au/north"), + new GURL("https://www.google.com/maps/au/north/"), + new GURL("https://www.google.com/maps/au/index.html?q=maps#fragment/"), + new GURL("http://www.google.com:8000/maps/au/index.html?q=maps#fragment/"), + new GURL("https://www.google.com/maps/au/north/?q=maps#fragment"), + new GURL("https://www.google.com/maps/au/north?q=maps#fragment"), + // Less common standard URLs. + new GURL("filesystem:http://www.google.com/temporary/bar.html?baz=22"), + new GURL("file:///temporary/bar.html?baz=22"), + new GURL("ftp://foo/test/index.html"), + new GURL("gopher://foo/test/index.html"), + new GURL("ws://foo/test/index.html"), + // Non-standard, + new GURL("chrome://foo/bar.html"), + new GURL("httpa://foo/test/index.html"), + new GURL("blob:https://foo.bar/test/index.html"), + new GURL("about:blank"), + new GURL("data:foobar"), + new GURL("scheme:opaque_data"), + // Invalid URLs. 
+ new GURL("foobar"), + // URLs containing the delimiter + new GURL("https://www.google.ca/" + GURL.SERIALIZER_DELIMITER + ",foo"), + new GURL("https://www.foo" + GURL.SERIALIZER_DELIMITER + "bar.com"), + }; + + GURLJni.TEST_HOOKS.setInstanceForTesting(mGURLMocks); + doThrow(new RuntimeException("Should not re-initialize for deserialization when the " + + "version hasn't changed.")) + .when(mGURLMocks) + .init(any(), any()); + for (GURL url : cases) { + GURL out = GURL.deserialize(url.serialize()); + deepAssertEquals(url, out); + } + GURLJni.TEST_HOOKS.setInstanceForTesting(null); + } + + /** + * Tests that we re-parse the URL from the spec, which must always be the last token in the + * serialization, if the serialization version differs. + */ + @SmallTest + @Test + public void testSerializationWithVersionSkew() { + GURL url = new GURL("https://www.google.com"); + String serialization = (GURL.SERIALIZER_VERSION + 1) + + ",0,0,0,0,foo,https://url.bad,blah,0,".replace(',', GURL.SERIALIZER_DELIMITER) + + url.getSpec(); + serialization = prependLengthToSerialization(serialization); + GURL out = GURL.deserialize(serialization); + deepAssertEquals(url, out); + } + + /** + * Tests that fields that aren't visible to java code are correctly serialized. + */ + @SmallTest + @Test + public void testSerializationOfPrivateFields() { + String serialization = GURL.SERIALIZER_VERSION + + ",true," + // Outer Parsed. + + "1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,false,true," + // Inner Parsed. + + "17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,true,false," + + "chrome://foo/bar.html"; + serialization = serialization.replace(',', GURL.SERIALIZER_DELIMITER); + serialization = prependLengthToSerialization(serialization); + GURL url = GURL.deserialize(serialization); + Assert.assertEquals(url.serialize(), serialization); + } + + /** + * Tests serialized GURL truncated by storage. 
+ */ + @SmallTest + @Test + public void testTruncatedDeserialization() { + String serialization = "123,1,true,1,2,3,4,5,6,7,8,9,10"; + serialization = serialization.replace(',', GURL.SERIALIZER_DELIMITER); + GURL url = GURL.deserialize(serialization); + Assert.assertEquals(url, GURL.emptyGURL()); + } + + /** + * Tests that corrupted serializations deserialize to the empty GURL. + */ + @SmallTest + @Test + public void testCorruptedSerializations() { + String serialization = new GURL("https://www.google.ca").serialize(); + // Replace the scheme length (5) with an extra delimiter. + String corruptedParsed = serialization.replace('5', GURL.SERIALIZER_DELIMITER); + GURL url = GURL.deserialize(corruptedParsed); + Assert.assertEquals(GURL.emptyGURL(), url); + + String corruptedVersion = + serialization.replaceFirst(Integer.toString(GURL.SERIALIZER_VERSION), "x"); + url = GURL.deserialize(corruptedVersion); + Assert.assertEquals(GURL.emptyGURL(), url); + } + + // Test that domainIs is hooked up correctly. + @SmallTest + @Test + public void testDomainIs() { + GURL url1 = new GURL("https://www.google.com"); + GURL url2 = new GURL("https://www.notgoogle.com"); + + Assert.assertTrue(url1.domainIs("com")); + Assert.assertTrue(url2.domainIs("com")); + Assert.assertTrue(url1.domainIs("google.com")); + Assert.assertFalse(url2.domainIs("google.com")); + + Assert.assertTrue(url1.domainIs("www.google.com")); + Assert.assertFalse(url1.domainIs("images.google.com")); + } + + // Tests Mojom conversion. + @SmallTest + @Test + public void testMojomConvertion() { + // Valid: + Assert.assertEquals( + "https://www.google.com/", new GURL("https://www.google.com/").toMojom().url); + + // Null: + Assert.assertEquals("", new GURL(null).toMojom().url); + + // Empty: + Assert.assertEquals("", new GURL("").toMojom().url); + + // Invalid: + Assert.assertEquals("", new GURL(new String(new byte[] {1, 1, 1})).toMojom().url); + + // Too long. 
+ Assert.assertEquals("", + new GURL("https://www.google.com/".concat("a".repeat(2 * 1024 * 1024))) + .toMojom() + .url); + } +} diff --git a/android/javatests/src/org/chromium/url/GURLJavaTestHelper.java b/android/javatests/src/org/chromium/url/GURLJavaTestHelper.java new file mode 100644 index 00000000000..975b009dc33 --- /dev/null +++ b/android/javatests/src/org/chromium/url/GURLJavaTestHelper.java @@ -0,0 +1,34 @@ +// Copyright 2020 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +package org.chromium.url; + +import org.chromium.base.annotations.CalledByNative; +import org.chromium.base.annotations.JNINamespace; +import org.chromium.base.annotations.NativeMethods; + +/** + * Helpers for GURLJavaTest that need to call into native code. + */ +@JNINamespace("url") +public class GURLJavaTestHelper { + @CalledByNative + public static GURL createGURL(String uri) { + return new GURL(uri); + } + + public static void nativeInitializeICU() { + GURLJavaTestHelperJni.get().initializeICU(); + } + + public static void nativeTestGURLEquivalence() { + GURLJavaTestHelperJni.get().testGURLEquivalence(); + } + + @NativeMethods + interface Natives { + void initializeICU(); + void testGURLEquivalence(); + } +} diff --git a/android/javatests/src/org/chromium/url/JUnitTestGURLsTest.java b/android/javatests/src/org/chromium/url/JUnitTestGURLsTest.java new file mode 100644 index 00000000000..a23967c2496 --- /dev/null +++ b/android/javatests/src/org/chromium/url/JUnitTestGURLsTest.java @@ -0,0 +1,73 @@ +// Copyright 2020 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +package org.chromium.url; + +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.Mockito.doThrow; + +import androidx.test.filters.SmallTest; + +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.mockito.Mock; +import org.mockito.MockitoAnnotations; + +import org.chromium.base.Log; +import org.chromium.base.test.BaseJUnit4ClassRunner; +import org.chromium.base.test.util.Batch; + +import java.util.Map; + +/** + * Tests for JUnitTestGURLs. + */ +@RunWith(BaseJUnit4ClassRunner.class) +@Batch(Batch.UNIT_TESTS) +public class JUnitTestGURLsTest { + private static final String TAG = "JUnitTestGURLs"; + + @Mock + GURL.Natives mGURLMocks; + + @Before + public void setUp() { + MockitoAnnotations.initMocks(this); + } + + private RuntimeException getErrorForGURL(GURL gurl) { + String serialized = gurl.serialize(); + Assert.assertEquals(-1, serialized.indexOf(",")); + serialized = serialized.replace(GURL.SERIALIZER_DELIMITER, ','); + + return new RuntimeException("Please update the serialization in JUnitTestGURLs.java for " + + gurl.getPossiblyInvalidSpec() + " to: '" + serialized + "'"); + } + + @SmallTest + @Test + public void testGURLEquivalence() throws Throwable { + doThrow(new RuntimeException("Deserialization required re-initialization.")) + .when(mGURLMocks) + .init(any(), any()); + + Throwable exception = null; + for (Map.Entry entry : JUnitTestGURLs.sGURLMap.entrySet()) { + GURL gurl = new GURL(entry.getKey()); + try { + GURLJni.TEST_HOOKS.setInstanceForTesting(mGURLMocks); + GURL deserialized = JUnitTestGURLs.getGURL(entry.getKey()); + GURLJni.TEST_HOOKS.setInstanceForTesting(null); + GURLJavaTest.deepAssertEquals(deserialized, gurl); + } catch (Throwable e) { + GURLJni.TEST_HOOKS.setInstanceForTesting(null); + exception = getErrorForGURL(gurl); + Log.e(TAG, "Error: ", exception); + } + } + if (exception != null) throw exception; + } +} diff --git 
a/android/javatests/src/org/chromium/url/OriginJavaTest.java b/android/javatests/src/org/chromium/url/OriginJavaTest.java new file mode 100644 index 00000000000..3a4665af7b5 --- /dev/null +++ b/android/javatests/src/org/chromium/url/OriginJavaTest.java @@ -0,0 +1,99 @@ +// Copyright 2022 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +package org.chromium.url; + +import androidx.test.filters.SmallTest; + +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; + +import org.chromium.base.test.BaseJUnit4ClassRunner; +import org.chromium.base.test.util.Batch; +import org.chromium.content_public.browser.test.NativeLibraryTestUtils; +import org.chromium.mojo_base.mojom.UnguessableToken; + +/** + * Tests for {@link Origin}. Origin relies heavily on the native implementation, and the lion's + * share of the logic is tested there. This test is primarily to make sure everything is plumbed + * through correctly. 
+ */ +@RunWith(BaseJUnit4ClassRunner.class) +@Batch(Batch.UNIT_TESTS) +public class OriginJavaTest { + @Before + public void setUp() { + NativeLibraryTestUtils.loadNativeLibraryNoBrowserProcess(); + } + + @SmallTest + @Test + public void testOriginEquivalence() { + OriginJavaTestHelper.testOriginEquivalence(); + } + + @SmallTest + @Test + public void testCreateOpaqueOrigin() { + Origin opaque = Origin.createOpaqueOrigin(); + Assert.assertTrue(opaque.isOpaque()); + Assert.assertEquals("", opaque.getScheme()); + Assert.assertEquals("", opaque.getHost()); + Assert.assertEquals(0, opaque.getPort()); + } + + @SmallTest + @Test + public void testNonOpaqueMojomConstructor() { + String scheme = "http"; + String host = "host.name"; + short port = 42; + org.chromium.url.internal.mojom.Origin mojom = new org.chromium.url.internal.mojom.Origin(); + mojom.scheme = scheme; + mojom.host = host; + mojom.port = port; + Origin origin = new Origin(mojom); + + Assert.assertEquals(scheme, origin.getScheme()); + Assert.assertEquals(host, origin.getHost()); + Assert.assertEquals(port, origin.getPort()); + Assert.assertFalse(origin.isOpaque()); + } + + @SmallTest + @Test + public void testOpaqueMojomConstructor() { + String scheme = "http"; + String host = "host.name"; + short port = 42; + org.chromium.url.internal.mojom.Origin mojom = new org.chromium.url.internal.mojom.Origin(); + mojom.scheme = scheme; + mojom.host = host; + mojom.port = port; + UnguessableToken token = new UnguessableToken(); + token.high = 3; + token.low = 4; + mojom.nonceIfOpaque = token; + + Origin origin = new Origin(mojom); + + Assert.assertEquals("", origin.getScheme()); + Assert.assertEquals("", origin.getHost()); + Assert.assertEquals(0, origin.getPort()); + Assert.assertTrue(origin.isOpaque()); + } + + @SmallTest + @Test + public void testCreateFromGURL() { + GURL gurl = new GURL("https://host.name:61234/path"); + Origin opaque = Origin.create(gurl); + Assert.assertFalse(opaque.isOpaque()); + 
Assert.assertEquals("https", opaque.getScheme()); + Assert.assertEquals("host.name", opaque.getHost()); + Assert.assertEquals(61234, opaque.getPort()); + } +} diff --git a/android/javatests/src/org/chromium/url/OriginJavaTestHelper.java b/android/javatests/src/org/chromium/url/OriginJavaTestHelper.java new file mode 100644 index 00000000000..2eb9550ba7a --- /dev/null +++ b/android/javatests/src/org/chromium/url/OriginJavaTestHelper.java @@ -0,0 +1,23 @@ +// Copyright 2022 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +package org.chromium.url; + +import org.chromium.base.annotations.JNINamespace; +import org.chromium.base.annotations.NativeMethods; + +/** + * Helpers for OriginJavaTest that need to call into native code. + */ +@JNINamespace("url") +public class OriginJavaTestHelper { + public static void testOriginEquivalence() { + OriginJavaTestHelperJni.get().testOriginEquivalence(); + } + + @NativeMethods + interface Natives { + void testOriginEquivalence(); + } +} diff --git a/android/junit/src/org/chromium/url/ShadowGURLTest.java b/android/junit/src/org/chromium/url/ShadowGURLTest.java new file mode 100644 index 00000000000..a491de1a396 --- /dev/null +++ b/android/junit/src/org/chromium/url/ShadowGURLTest.java @@ -0,0 +1,70 @@ +// Copyright 2021 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +package org.chromium.url; + +import org.junit.Assert; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.robolectric.annotation.Config; + +import org.chromium.base.test.BaseRobolectricTestRunner; + +/** Tests of {@link ShadowGURL}. 
*/ +@RunWith(BaseRobolectricTestRunner.class) +@Config(shadows = {ShadowGURL.class}) +public class ShadowGURLTest { + /* package */ static void deepAssertEquals(GURL expected, GURL actual) { + Assert.assertEquals(expected, actual); + Assert.assertEquals(expected.getScheme(), actual.getScheme()); + Assert.assertEquals(expected.getUsername(), actual.getUsername()); + Assert.assertEquals(expected.getPassword(), actual.getPassword()); + Assert.assertEquals(expected.getHost(), actual.getHost()); + Assert.assertEquals(expected.getPort(), actual.getPort()); + Assert.assertEquals(expected.getPath(), actual.getPath()); + Assert.assertEquals(expected.getQuery(), actual.getQuery()); + Assert.assertEquals(expected.getRef(), actual.getRef()); + } + + @Test + public void testComponents() { + GURL url = new GURL(JUnitTestGURLs.SEARCH_URL); + Assert.assertFalse(url.isEmpty()); + Assert.assertTrue(url.isValid()); + + Assert.assertEquals(JUnitTestGURLs.SEARCH_URL, url.getSpec()); + Assert.assertEquals("https", url.getScheme()); + Assert.assertEquals("", url.getUsername()); + Assert.assertEquals("", url.getPassword()); + Assert.assertEquals("www.google.com", url.getHost()); + Assert.assertEquals("", url.getPort()); + Assert.assertEquals("/search", url.getPath()); + Assert.assertEquals("q=test", url.getQuery()); + Assert.assertEquals("", url.getRef()); + } + + @Test + public void testEmpty() { + GURL url = new GURL(""); + Assert.assertFalse(url.isValid()); + + Assert.assertEquals("", url.getSpec()); + Assert.assertEquals("", url.getScheme()); + Assert.assertEquals("", url.getUsername()); + Assert.assertEquals("", url.getPassword()); + Assert.assertEquals("", url.getHost()); + Assert.assertEquals("", url.getPort()); + Assert.assertEquals("", url.getPath()); + Assert.assertEquals("", url.getQuery()); + Assert.assertEquals("", url.getRef()); + } + + @Test + public void testSerialization() { + GURL gurl = new GURL(JUnitTestGURLs.URL_1_WITH_PATH); + GURL deserialized = 
GURL.deserialize(gurl.serialize()); + + deepAssertEquals(deserialized, gurl); + } +} diff --git a/android/origin_android.cc b/android/origin_android.cc new file mode 100644 index 00000000000..a0dd271b5ad --- /dev/null +++ b/android/origin_android.cc @@ -0,0 +1,87 @@ +// Copyright 2019 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "url/origin.h" + +#include + +#include "base/android/jni_android.h" +#include "base/android/jni_string.h" +#include "base/android/scoped_java_ref.h" +#include "base/memory/ptr_util.h" +#include "url/android/gurl_android.h" +#include "url/url_jni_headers/Origin_jni.h" + +namespace url { + +base::android::ScopedJavaLocalRef Origin::CreateJavaObject() const { + JNIEnv* env = base::android::AttachCurrentThread(); + const base::UnguessableToken* token = Origin::GetNonceForSerialization(); + return Java_Origin_Constructor( + env, base::android::ConvertUTF8ToJavaString(env, tuple_.scheme()), + base::android::ConvertUTF8ToJavaString(env, tuple_.host()), tuple_.port(), + opaque(), token ? token->GetHighForSerialization() : 0, + token ? 
token->GetLowForSerialization() : 0); +} + +// static +Origin Origin::FromJavaObject( + const base::android::JavaRef& java_origin) { + JNIEnv* env = base::android::AttachCurrentThread(); + std::unique_ptr origin = base::WrapUnique( + reinterpret_cast(Java_Origin_toNativeOrigin(env, java_origin))); + return std::move(*origin); +} + +// static +jlong Origin::CreateNative(JNIEnv* env, + const base::android::JavaRef& java_scheme, + const base::android::JavaRef& java_host, + uint16_t port, + bool is_opaque, + uint64_t token_high_bits, + uint64_t token_low_bits) { + const std::string& scheme = ConvertJavaStringToUTF8(env, java_scheme); + const std::string& host = ConvertJavaStringToUTF8(env, java_host); + + absl::optional nonce_token = + base::UnguessableToken::Deserialize(token_high_bits, token_low_bits); + bool has_nonce = nonce_token.has_value(); + CHECK(has_nonce == is_opaque); + Origin::Nonce nonce; + if (has_nonce) { + nonce = Origin::Nonce(nonce_token.value()); + } + Origin origin = is_opaque + ? 
Origin::CreateOpaqueFromNormalizedPrecursorTuple( + scheme, host, port, nonce) + : Origin::CreateFromNormalizedTuple(scheme, host, port); + return reinterpret_cast(new Origin(origin)); +} + +static base::android::ScopedJavaLocalRef JNI_Origin_CreateOpaque( + JNIEnv* env) { + return Origin().CreateJavaObject(); +} + +static base::android::ScopedJavaLocalRef JNI_Origin_CreateFromGURL( + JNIEnv* env, + const base::android::JavaParamRef& j_gurl) { + return Origin::Create(*GURLAndroid::ToNativeGURL(env, j_gurl)) + .CreateJavaObject(); +} + +static jlong JNI_Origin_CreateNative( + JNIEnv* env, + const base::android::JavaParamRef& java_scheme, + const base::android::JavaParamRef& java_host, + jshort port, + jboolean is_opaque, + jlong token_high_bits, + jlong token_low_bits) { + return Origin::CreateNative(env, java_scheme, java_host, port, is_opaque, + token_high_bits, token_low_bits); +} + +} // namespace url diff --git a/android/origin_java_test_helper.cc b/android/origin_java_test_helper.cc new file mode 100644 index 00000000000..62554d87de1 --- /dev/null +++ b/android/origin_java_test_helper.cc @@ -0,0 +1,37 @@ +// Copyright 2022 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#include + +#include "base/android/jni_android.h" +#include "base/android/jni_string.h" +#include "url/gurl.h" +#include "url/j_test_jni_headers/OriginJavaTestHelper_jni.h" +#include "url/origin.h" + +namespace url { + +static void JNI_OriginJavaTestHelper_TestOriginEquivalence(JNIEnv* env) { + Origin cases[] = { + Origin(), + Origin::Create(GURL("http://a.com")), + Origin::Create(GURL("http://a.com:8000")), + Origin::Create(GURL("scheme:host")), + Origin::Create(GURL("http://a.com:8000")).DeriveNewOpaqueOrigin(), + }; + for (const Origin& origin : cases) { + base::android::ScopedJavaLocalRef j_origin = + origin.CreateJavaObject(); + Origin sameOrigin = Origin::FromJavaObject(j_origin); + if (origin != sameOrigin) { + std::stringstream ss; + ss << "Origin not equivalent: " << origin << ", " << sameOrigin; + env->ThrowNew(env->FindClass("java/lang/AssertionError"), + ss.str().data()); + return; + } + } +} + +} // namespace url diff --git a/android/parsed_android.cc b/android/parsed_android.cc new file mode 100644 index 00000000000..36d8aa255ef --- /dev/null +++ b/android/parsed_android.cc @@ -0,0 +1,96 @@ +// Copyright 2019 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#include "url/android/parsed_android.h" + +#include + +#include "base/android/jni_android.h" +#include "url/gurl_jni_headers/Parsed_jni.h" + +using base::android::AttachCurrentThread; +using base::android::JavaRef; +using base::android::ScopedJavaLocalRef; + +namespace url { + +namespace { + +ScopedJavaLocalRef CreateJavaParsed(JNIEnv* env, + const Parsed& parsed, + const JavaRef& inner) { + static constexpr bool is_signed = + std::is_signed::value; + static constexpr size_t offset_size = sizeof(parsed.scheme.begin); + static_assert((is_signed && sizeof(jint) >= offset_size) || + (!is_signed && sizeof(jint) > offset_size), + "Java size offsets for Parsed Components must be large enough " + "to store the full C++ offset."); + return Java_Parsed_Constructor( + env, parsed.scheme.begin, parsed.scheme.len, parsed.username.begin, + parsed.username.len, parsed.password.begin, parsed.password.len, + parsed.host.begin, parsed.host.len, parsed.port.begin, parsed.port.len, + parsed.path.begin, parsed.path.len, parsed.query.begin, parsed.query.len, + parsed.ref.begin, parsed.ref.len, parsed.potentially_dangling_markup, + inner); +} + +} // namespace + +// static +ScopedJavaLocalRef ParsedAndroid::InitFromParsed( + JNIEnv* env, + const Parsed& parsed) { + ScopedJavaLocalRef inner; + if (parsed.inner_parsed()) + inner = CreateJavaParsed(env, *parsed.inner_parsed(), nullptr); + return CreateJavaParsed(env, parsed, inner); +} + +static jlong JNI_Parsed_CreateNative(JNIEnv* env, + jint scheme_begin, + jint scheme_length, + jint username_begin, + jint username_length, + jint password_begin, + jint password_length, + jint host_begin, + jint host_length, + jint port_begin, + jint port_length, + jint path_begin, + jint path_length, + jint query_begin, + jint query_length, + jint ref_begin, + jint ref_length, + jboolean potentially_dangling_markup, + jlong inner_parsed) { + Parsed* parsed = new Parsed(); + parsed->scheme.begin = scheme_begin; + parsed->scheme.len = scheme_length; + 
parsed->username.begin = username_begin; + parsed->username.len = username_length; + parsed->password.begin = password_begin; + parsed->password.len = password_length; + parsed->host.begin = host_begin; + parsed->host.len = host_length; + parsed->port.begin = port_begin; + parsed->port.len = port_length; + parsed->path.begin = path_begin; + parsed->path.len = path_length; + parsed->query.begin = query_begin; + parsed->query.len = query_length; + parsed->ref.begin = ref_begin; + parsed->ref.len = ref_length; + parsed->potentially_dangling_markup = potentially_dangling_markup; + Parsed* inner = reinterpret_cast(inner_parsed); + if (inner) { + parsed->set_inner_parsed(*inner); + delete inner; + } + return reinterpret_cast(parsed); +} + +} // namespace url diff --git a/android/parsed_android.h b/android/parsed_android.h new file mode 100644 index 00000000000..244ada55f4b --- /dev/null +++ b/android/parsed_android.h @@ -0,0 +1,22 @@ +// Copyright 2019 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef URL_ANDROID_PARSED_ANDROID_H_ +#define URL_ANDROID_PARSED_ANDROID_H_ + +#include "base/android/scoped_java_ref.h" +#include "url/third_party/mozilla/url_parse.h" + +namespace url { + +class ParsedAndroid { + public: + static base::android::ScopedJavaLocalRef InitFromParsed( + JNIEnv* env, + const Parsed& parsed); +}; + +} // namespace url + +#endif // URL_ANDROID_PARSED_ANDROID_H_ diff --git a/android/robolectric_test_main.cc b/android/robolectric_test_main.cc new file mode 100644 index 00000000000..28fb4d2410d --- /dev/null +++ b/android/robolectric_test_main.cc @@ -0,0 +1,15 @@ +// Copyright 2022 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+#include + +#include "base/android/base_jni_onload.h" +#include "base/android/jni_android.h" + +extern "C" JNI_EXPORT jint JNI_OnLoad(JavaVM* vm, void* reserved) { + base::android::InitVM(vm); + base::android::OnJNIOnLoadInit(); + // TODO(1223993): Initialize GURL schemes, like in + // content::RegisterContentSchemes(). + return JNI_VERSION_1_4; +} diff --git a/android/test/java/src/org/chromium/url/JUnitTestGURLs.java b/android/test/java/src/org/chromium/url/JUnitTestGURLs.java new file mode 100644 index 00000000000..9f19c6c9079 --- /dev/null +++ b/android/test/java/src/org/chromium/url/JUnitTestGURLs.java @@ -0,0 +1,174 @@ +// Copyright 2020 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +package org.chromium.url; + +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; + +/** + * A Helper class for JUnit tests to be able to use GURLs without requiring native initialization. + * This should be used sparingly, when converting junit tests to Batched Instrumentation tests is + * not feasible. + * + * If any more complex GURL behaviour is tested, like comparing Origins, the test should be written + * as an Instrumentation test instead - you should never mock GURL. + */ +public class JUnitTestGURLs { + // In order to add a test URL: + // 1. Add the URL String as a constant here. + // 2. Add the constant to the map below, with a placeholder string for the GURL serialization. + // 3. Run JUnitTestGURLsTest (eg. './tools/autotest.py -C out/Debug JUnitTestGURLsTest'). + // 4. Check logcat output or test exception for the correct serialization String, and place it + // in the map. 
+ public static final String EXAMPLE_URL = "https://www.example.com/"; + public static final String HTTP_URL = "http://www.example.com/"; + public static final String URL_1 = "https://www.one.com/"; + public static final String URL_1_NUMERAL = "https://www.1.com/"; + public static final String URL_1_WITH_PATH = "https://www.one.com/some_path.html"; + public static final String URL_2 = "https://www.two.com/"; + public static final String URL_3 = "https://www.three.com/"; + public static final String MAPS_URL = "https://maps.google.com/"; + public static final String SEARCH_URL = "https://www.google.com/search?q=test"; + public static final String SEARCH_2_URL = "https://www.google.com/search?q=query"; + public static final String INITIAL_URL = "https://initial.com"; + public static final String SPECULATED_URL = "https://speculated.com"; + public static final String NTP_URL = "chrome://newtab/"; + public static final String NTP_NATIVE_URL = "chrome-native://newtab/"; + public static final String DOM_DISILLER_URL = "chrome-distiller://url"; + public static final String RED_1 = "https://www.red.com/page1"; + public static final String RED_2 = "https://www.red.com/page2"; + public static final String RED_3 = "https://www.red.com/page3"; + public static final String BLUE_1 = "https://www.blue.com/page1"; + public static final String BLUE_2 = "https://www.blue.com/page2"; + public static final String BLUE_3 = "https://www.blue.com/page3"; + public static final String AMP_URL = + "https://www.google.com/amp/www.nyt.com/ampthml/blogs.html"; + public static final String AMP_CACHE_URL = + "https://www.google.com/amp/s/www.nyt.com/ampthml/blogs.html"; + public static final String TEXT_FRAGMENT_URL = "https://www.example.com/#:~:text=selector"; + public static final String MULTI_TEXT_FRAGMENT_URL = + "https://www.example.com/#:~:text=selector1&text=selector2&text=selector3"; + public static final String INVALID_URL = "http://0x100.0/"; + public static final String GOOGLE_URL = 
"http://www.google.com/"; + public static final String GOOGLE_URL_DOGS = "http://www.google.com/dogs"; + public static final String GOOGLE_URL_DOGS_FUN = "http://www.google.com/dogs-are-fun"; + public static final String GOOGLE_URL_DOG = "http://www.google.com/dog"; + public static final String GOOGLE_URL_CAT = "http://www.google.com/cat"; + public static final String GOOGLE_URL_PIG = "http://www.google.com/pig"; + public static final String ABOUT_BLANK = "about:blank"; + public static final String CHROME_ABOUT = "chrome://about"; + + // Map of URL string to GURL serialization. + /* package */ static final Map sGURLMap; + static { + Map map = new HashMap<>(); + map.put(EXAMPLE_URL, + "82,1,true,0,5,0,-1,0,-1,8,15,0,-1,23,1,0,-1,0,-1," + + "false,false,https://www.example.com/"); + map.put(HTTP_URL, + "81,1,true,0,4,0,-1,0,-1,7,15,0,-1,22,1,0,-1,0,-1," + + "false,false,http://www.example.com/"); + map.put(URL_1, + "78,1,true,0,5,0,-1,0,-1,8,11,0,-1,19,1,0,-1,0,-1," + + "false,false,https://www.one.com/"); + map.put(URL_1_NUMERAL, + "75,1,true,0,5,0,-1,0,-1,8,9,0,-1,17,1,0,-1,0,-1," + + "false,false,https://www.1.com/"); + map.put(URL_1_WITH_PATH, + "93,1,true,0,5,0,-1,0,-1,8,11,0,-1,19,15,0,-1,0,-1," + + "false,false,https://www.one.com/some_path.html"); + map.put(URL_2, + "78,1,true,0,5,0,-1,0,-1,8,11,0,-1,19,1,0,-1,0,-1," + + "false,false,https://www.two.com/"); + map.put(URL_3, + "80,1,true,0,5,0,-1,0,-1,8,13,0,-1,21,1,0,-1,0,-1,false,false,https://www.three.com/"); + map.put(RED_1, + "83,1,true,0,5,0,-1,0,-1,8,11,0,-1,19,6,0,-1,0,-1," + + "false,false,https://www.red.com/page1"); + map.put(RED_2, + "83,1,true,0,5,0,-1,0,-1,8,11,0,-1,19,6,0,-1,0,-1," + + "false,false,https://www.red.com/page2"); + map.put(RED_3, + "83,1,true,0,5,0,-1,0,-1,8,11,0,-1,19,6,0,-1,0,-1," + + "false,false,https://www.red.com/page3"); + map.put(BLUE_1, + "84,1,true,0,5,0,-1,0,-1,8,12,0,-1,20,6,0,-1,0,-1," + + "false,false,https://www.blue.com/page1"); + map.put(BLUE_2, + 
"84,1,true,0,5,0,-1,0,-1,8,12,0,-1,20,6,0,-1,0,-1," + + "false,false,https://www.blue.com/page2"); + map.put(BLUE_3, + "84,1,true,0,5,0,-1,0,-1,8,12,0,-1,20,6,0,-1,0,-1," + + "false,false,https://www.blue.com/page3"); + map.put(SEARCH_URL, + "94,1,true,0,5,0,-1,0,-1,8,14,0,-1,22,7,30,6,0,-1," + + "false,false,https://www.google.com/search?q=test"); + map.put(SEARCH_2_URL, + "95,1,true,0,5,0,-1,0,-1,8,14,0,-1,22,7,30,7,0,-1," + + "false,false,https://www.google.com/search?q=query"); + map.put(INITIAL_URL, + "78,1,true,0,5,0,-1,0,-1,8,11,0,-1,19,1,0,-1,0,-1," + + "false,false,https://initial.com/"); + map.put(SPECULATED_URL, + "81,1,true,0,5,0,-1,0,-1,8,14,0,-1,22,1,0,-1,0,-1," + + "false,false,https://speculated.com/"); + map.put(NTP_URL, + "73,1,true,0,6,0,-1,0,-1,9,6,0,-1,15,1,0,-1,0,-1," + + "false,false,chrome://newtab/"); + map.put(NTP_NATIVE_URL, + "82,1,true,0,13,0,-1,0,-1,16,6,0,-1,22,1,0,-1,0,-1,false,false," + + "chrome-native://newtab/"); + map.put(DOM_DISILLER_URL, + "82,1,true,0,16,0,-1,0,-1,19,3,0,-1,22,1,0,-1,0,-1,false,false," + + "chrome-distiller://url/"); + map.put(MAPS_URL, + "82,1,true,0,5,0,-1,0,-1,8,15,0,-1,23,1,0,-1,0,-1,false,false,https://maps.google.com/"); + map.put(AMP_URL, + "116,1,true,0,5,0,-1,0,-1,8,14,0,-1,22,35,0,-1,0,-1,false,false,https://www.google.com/amp/www.nyt.com/ampthml/blogs.html"); + map.put(AMP_CACHE_URL, + "118,1,true,0,5,0,-1,0,-1,8,14,0,-1,22,37,0,-1,0,-1,false,false,https://www.google.com/amp/s/www.nyt.com/ampthml/blogs.html"); + map.put(TEXT_FRAGMENT_URL, + "100,1,true,0,5,0,-1,0,-1,8,15,0,-1,23,1,0,-1,25,16,false,false,https://www.example.com/#:~:text=selector"); + map.put(MULTI_TEXT_FRAGMENT_URL, + "131,1,true,0,5,0,-1,0,-1,8,15,0,-1,23,1,0,-1,25,47,false,false,https://www.example.com/#:~:text=selector1&text=selector2&text=selector3"); + map.put(INVALID_URL, + "73,1,false,0,4,0,-1,0,-1,7,7,0,-1,14,1,0,-1,0,-1,false,false,http://0x100.0/"); + map.put(GOOGLE_URL, + 
"80,1,true,0,4,0,-1,0,-1,7,14,0,-1,21,1,0,-1,0,-1,false,false,http://www.google.com/"); + map.put(GOOGLE_URL_DOGS, + "84,1,true,0,4,0,-1,0,-1,7,14,0,-1,21,5,0,-1,0,-1,false,false,http://www.google.com/dogs"); + map.put(GOOGLE_URL_DOGS_FUN, + "93,1,true,0,4,0,-1,0,-1,7,14,0,-1,21,13,0,-1,0,-1,false,false,http://www.google.com/dogs-are-fun"); + map.put(GOOGLE_URL_DOG, + "83,1,true,0,4,0,-1,0,-1,7,14,0,-1,21,4,0,-1,0,-1,false,false,http://www.google.com/dog"); + map.put(GOOGLE_URL_CAT, + "83,1,true,0,4,0,-1,0,-1,7,14,0,-1,21,4,0,-1,0,-1,false,false,http://www.google.com/cat"); + map.put(GOOGLE_URL_PIG, + "83,1,true,0,4,0,-1,0,-1,7,14,0,-1,21,4,0,-1,0,-1,false,false,http://www.google.com/pig"); + map.put(ABOUT_BLANK, + "68,1,true,0,5,0,-1,0,-1,0,-1,0,-1,6,5,0,-1,0,-1,false,false,about:blank"); + map.put(CHROME_ABOUT, + "72,1,true,0,6,0,-1,0,-1,9,5,0,-1,14,1,0,-1,0,-1,false,false,chrome://about/"); + sGURLMap = Collections.unmodifiableMap(map); + } + + /** + * @return the GURL resulting from parsing the provided url. Must be registered in |sGURLMap|. + */ + public static GURL getGURL(String url) { + String serialized = sGURLMap.get(url); + if (serialized == null) { + throw new IllegalArgumentException("URL " + url + " not found"); + } + serialized = serialized.replace(',', GURL.SERIALIZER_DELIMITER); + GURL gurl = GURL.deserialize(serialized); + // If you're here looking to use an empty GURL, just use GURL.emptyGURL() directly. + if (gurl.isEmpty()) { + throw new RuntimeException("Could not deserialize: " + serialized); + } + return gurl; + } +} diff --git a/android/test/java/src/org/chromium/url/ShadowGURL.java b/android/test/java/src/org/chromium/url/ShadowGURL.java new file mode 100644 index 00000000000..53e1da192b1 --- /dev/null +++ b/android/test/java/src/org/chromium/url/ShadowGURL.java @@ -0,0 +1,62 @@ +// Copyright 2021 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +package org.chromium.url; + +import org.robolectric.annotation.Implementation; +import org.robolectric.annotation.Implements; + +import org.chromium.url.GURL.Natives; + +/** + * Shadow of {@link GURL}. Lets Robolectric tests use {@code GURL} without the native libraries + * loaded. + * + *

This shadow can create only GURLs listed in {@link JUnitTestGURLs}. + */ +@Implements(GURL.class) +public class ShadowGURL { + /** + * The {@link GURL.Natives} implementation used by a shadowed {@link GURL}. + */ + private static class NativesImpl implements GURL.Natives { + @Override + public void init(String url, GURL target) { + target.initForTesting(JUnitTestGURLs.getGURL(url)); + } + + @Override + public void getOrigin(String spec, boolean isValid, long nativeParsed, GURL target) { + throw new UnsupportedOperationException( + "ShadowGURL.NativesImpl#getOrigin is not implemented"); + } + + @Override + public boolean domainIs(String spec, boolean isValid, long nativeParsed, String domain) { + throw new UnsupportedOperationException( + "ShadowGURL.NativesImpl#domainIs is not implemented"); + } + + @Override + public long createNative(String spec, boolean isValid, long nativeParsed) { + throw new UnsupportedOperationException( + "ShadowGURL.NativesImpl#createNative is not implemented"); + } + } + private static final NativesImpl sNativesInstance = new NativesImpl(); + + /** + * We could instead shadow {@code GURLJni#get}, but that would require tests using this to load + * both shadows. + */ + @Implementation + protected static Natives getNatives() { + return sNativesInstance; + } + + @Implementation + protected static void ensureNativeInitializedForGURL() { + // Skip native initialization. + } +} diff --git a/features.gni b/features.gni new file mode 100644 index 00000000000..482d8498a82 --- /dev/null +++ b/features.gni @@ -0,0 +1,16 @@ +# Copyright 2016 The Chromium Authors +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. + +# Features used by targets inside and outside of |url|. +# For details see declare_args() in build/config/BUILDCONFIG.gn. +declare_args() { + # Enables the use of ICU alternatives in lieu of ICU for the target toolchain. 
+ # The flag is used for Cronet to reduce the size of the Cronet binary. + use_platform_icu_alternatives = false +} + +# Never use platform icu for host toolchain. +# E.g. Don't apply this for host binaries when target_os = "android". +use_platform_icu_alternatives = + use_platform_icu_alternatives && current_toolchain == default_toolchain diff --git a/gurl.cc b/gurl.cc new file mode 100644 index 00000000000..6930f73b6d5 --- /dev/null +++ b/gurl.cc @@ -0,0 +1,578 @@ +// Copyright 2013 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "url/gurl.h" + +#include + +#include +#include +#include +#include + +#include "base/check_op.h" +#include "base/no_destructor.h" +#include "base/strings/string_piece.h" +#include "base/strings/string_util.h" +#include "base/trace_event/base_tracing.h" +#include "base/trace_event/memory_usage_estimator.h" +#include "url/url_canon_stdstring.h" +#include "url/url_util.h" + +GURL::GURL() : is_valid_(false) { +} + +GURL::GURL(const GURL& other) + : spec_(other.spec_), + is_valid_(other.is_valid_), + parsed_(other.parsed_) { + if (other.inner_url_) + inner_url_ = std::make_unique(*other.inner_url_); + // Valid filesystem urls should always have an inner_url_. 
+ DCHECK(!is_valid_ || !SchemeIsFileSystem() || inner_url_); +} + +GURL::GURL(GURL&& other) noexcept + : spec_(std::move(other.spec_)), + is_valid_(other.is_valid_), + parsed_(other.parsed_), + inner_url_(std::move(other.inner_url_)) { + other.is_valid_ = false; + other.parsed_ = url::Parsed(); +} + +GURL::GURL(base::StringPiece url_string) { + InitCanonical(url_string, true); +} + +GURL::GURL(base::StringPiece16 url_string) { + InitCanonical(url_string, true); +} + +GURL::GURL(const std::string& url_string, RetainWhiteSpaceSelector) { + InitCanonical(url_string, false); +} + +GURL::GURL(const char* canonical_spec, + size_t canonical_spec_len, + const url::Parsed& parsed, + bool is_valid) + : spec_(canonical_spec, canonical_spec_len), + is_valid_(is_valid), + parsed_(parsed) { + InitializeFromCanonicalSpec(); +} + +GURL::GURL(std::string canonical_spec, const url::Parsed& parsed, bool is_valid) + : spec_(std::move(canonical_spec)), is_valid_(is_valid), parsed_(parsed) { + InitializeFromCanonicalSpec(); +} + +template +void GURL::InitCanonical(T input_spec, bool trim_path_end) { + url::StdStringCanonOutput output(&spec_); + is_valid_ = url::Canonicalize( + input_spec.data(), static_cast(input_spec.length()), trim_path_end, + NULL, &output, &parsed_); + + output.Complete(); // Must be done before using string. + if (is_valid_ && SchemeIsFileSystem()) { + inner_url_ = std::make_unique(spec_.data(), parsed_.Length(), + *parsed_.inner_parsed(), true); + } + // Valid URLs always have non-empty specs. + DCHECK(!is_valid_ || !spec_.empty()); +} + +void GURL::InitializeFromCanonicalSpec() { + if (is_valid_ && SchemeIsFileSystem()) { + inner_url_ = std::make_unique(spec_.data(), parsed_.Length(), + *parsed_.inner_parsed(), true); + } + +#ifndef NDEBUG + // For testing purposes, check that the parsed canonical URL is identical to + // what we would have produced. Skip the check for invalid URLs: they have no + // meaning and we can't always canonicalize them reproducibly.
+ if (is_valid_) { + DCHECK(!spec_.empty()); + url::Component scheme; + // We can't do this check on the inner_url of a filesystem URL, as + // canonical_spec actually points to the start of the outer URL, so we'd + // end up with infinite recursion in this constructor. + if (!url::FindAndCompareScheme(spec_.data(), spec_.length(), + url::kFileSystemScheme, &scheme) || + scheme.begin == parsed_.scheme.begin) { + // We need to retain trailing whitespace on path URLs, as the |parsed_| + // spec we originally received may legitimately contain trailing white- + // space on the path or components e.g. if the #ref has been + // removed from a "foo:hello #ref" URL (see http://crbug.com/291747). + GURL test_url(spec_, RETAIN_TRAILING_PATH_WHITEPACE); + + DCHECK_EQ(test_url.is_valid_, is_valid_); + DCHECK_EQ(test_url.spec_, spec_); + + DCHECK_EQ(test_url.parsed_.scheme, parsed_.scheme); + DCHECK_EQ(test_url.parsed_.username, parsed_.username); + DCHECK_EQ(test_url.parsed_.password, parsed_.password); + DCHECK_EQ(test_url.parsed_.host, parsed_.host); + DCHECK_EQ(test_url.parsed_.port, parsed_.port); + DCHECK_EQ(test_url.parsed_.path, parsed_.path); + DCHECK_EQ(test_url.parsed_.query, parsed_.query); + DCHECK_EQ(test_url.parsed_.ref, parsed_.ref); + } + } +#endif +} + +GURL::~GURL() = default; + +GURL& GURL::operator=(const GURL& other) { + spec_ = other.spec_; + is_valid_ = other.is_valid_; + parsed_ = other.parsed_; + + if (!other.inner_url_) + inner_url_.reset(); + else if (inner_url_) + *inner_url_ = *other.inner_url_; + else + inner_url_ = std::make_unique(*other.inner_url_); + + return *this; +} + +GURL& GURL::operator=(GURL&& other) noexcept { + spec_ = std::move(other.spec_); + is_valid_ = other.is_valid_; + parsed_ = other.parsed_; + inner_url_ = std::move(other.inner_url_); + + other.is_valid_ = false; + other.parsed_ = url::Parsed(); + return *this; +} + +const std::string& GURL::spec() const { + if (is_valid_ || spec_.empty()) + return spec_; + + DCHECK(false) << 
"Trying to get the spec of an invalid URL!"; + return base::EmptyString(); +} + +bool GURL::operator<(const GURL& other) const { + return spec_ < other.spec_; +} + +bool GURL::operator>(const GURL& other) const { + return spec_ > other.spec_; +} + +// Note: code duplicated below (it's inconvenient to use a template here). +GURL GURL::Resolve(base::StringPiece relative) const { + // Not allowed for invalid URLs. + if (!is_valid_) + return GURL(); + + GURL result; + url::StdStringCanonOutput output(&result.spec_); + if (!url::ResolveRelative(spec_.data(), static_cast(spec_.length()), + parsed_, relative.data(), + static_cast(relative.length()), + nullptr, &output, &result.parsed_)) { + // Error resolving, return an empty URL. + return GURL(); + } + + output.Complete(); + result.is_valid_ = true; + if (result.SchemeIsFileSystem()) { + result.inner_url_ = + std::make_unique(result.spec_.data(), result.parsed_.Length(), + *result.parsed_.inner_parsed(), true); + } + return result; +} + +// Note: code duplicated above (it's inconvenient to use a template here). +GURL GURL::Resolve(base::StringPiece16 relative) const { + // Not allowed for invalid URLs. + if (!is_valid_) + return GURL(); + + GURL result; + url::StdStringCanonOutput output(&result.spec_); + if (!url::ResolveRelative(spec_.data(), static_cast(spec_.length()), + parsed_, relative.data(), + static_cast(relative.length()), + nullptr, &output, &result.parsed_)) { + // Error resolving, return an empty URL. + return GURL(); + } + + output.Complete(); + result.is_valid_ = true; + if (result.SchemeIsFileSystem()) { + result.inner_url_ = + std::make_unique(result.spec_.data(), result.parsed_.Length(), + *result.parsed_.inner_parsed(), true); + } + return result; +} + +// Note: code duplicated below (it's inconvenient to use a template here). +GURL GURL::ReplaceComponents(const Replacements& replacements) const { + GURL result; + + // Not allowed for invalid URLs. 
+ if (!is_valid_) + return GURL(); + + url::StdStringCanonOutput output(&result.spec_); + result.is_valid_ = url::ReplaceComponents( + spec_.data(), static_cast(spec_.length()), parsed_, replacements, + NULL, &output, &result.parsed_); + + output.Complete(); + + result.ProcessFileSystemURLAfterReplaceComponents(); + return result; +} + +// Note: code duplicated above (it's inconvenient to use a template here). +GURL GURL::ReplaceComponents(const ReplacementsW& replacements) const { + GURL result; + + // Not allowed for invalid URLs. + if (!is_valid_) + return GURL(); + + url::StdStringCanonOutput output(&result.spec_); + result.is_valid_ = url::ReplaceComponents( + spec_.data(), static_cast(spec_.length()), parsed_, replacements, + NULL, &output, &result.parsed_); + + output.Complete(); + + result.ProcessFileSystemURLAfterReplaceComponents(); + + return result; +} + +void GURL::ProcessFileSystemURLAfterReplaceComponents() { + if (!is_valid_) + return; + if (SchemeIsFileSystem()) { + inner_url_ = std::make_unique(spec_.data(), parsed_.Length(), + *parsed_.inner_parsed(), true); + } +} + +GURL GURL::DeprecatedGetOriginAsURL() const { + // This doesn't make sense for invalid or nonstandard URLs, so return + // the empty URL. 
+ if (!is_valid_ || !IsStandard()) + return GURL(); + + if (SchemeIsFileSystem()) + return inner_url_->DeprecatedGetOriginAsURL(); + + Replacements replacements; + replacements.ClearUsername(); + replacements.ClearPassword(); + replacements.ClearPath(); + replacements.ClearQuery(); + replacements.ClearRef(); + + return ReplaceComponents(replacements); +} + +GURL GURL::GetAsReferrer() const { + if (!is_valid() || !IsReferrerScheme(spec_.data(), parsed_.scheme)) + return GURL(); + + if (!has_ref() && !has_username() && !has_password()) + return GURL(*this); + + Replacements replacements; + replacements.ClearRef(); + replacements.ClearUsername(); + replacements.ClearPassword(); + return ReplaceComponents(replacements); +} + +GURL GURL::GetWithEmptyPath() const { + // This doesn't make sense for invalid or nonstandard URLs, so return + // the empty URL. + if (!is_valid_ || !IsStandard()) + return GURL(); + + // We could optimize this since we know that the URL is canonical, and we are + // appending a canonical path, so avoiding re-parsing. + GURL other(*this); + if (parsed_.path.len == 0) + return other; + + // Clear everything after the path. + other.parsed_.query.reset(); + other.parsed_.ref.reset(); + + // Set the path, since the path is longer than one, we can just set the + // first character and resize. 
+ other.spec_[other.parsed_.path.begin] = '/'; + other.parsed_.path.len = 1; + other.spec_.resize(other.parsed_.path.begin + 1); + return other; +} + +GURL GURL::GetWithoutFilename() const { + return Resolve("."); +} + +GURL GURL::GetWithoutRef() const { + if (!has_ref()) + return GURL(*this); + + Replacements replacements; + replacements.ClearRef(); + return ReplaceComponents(replacements); +} + +bool GURL::IsStandard() const { + return url::IsStandard(spec_.data(), parsed_.scheme); +} + +bool GURL::IsAboutBlank() const { + return IsAboutUrl(url::kAboutBlankPath); +} + +bool GURL::IsAboutSrcdoc() const { + return IsAboutUrl(url::kAboutSrcdocPath); +} + +bool GURL::SchemeIs(base::StringPiece lower_ascii_scheme) const { + DCHECK(base::IsStringASCII(lower_ascii_scheme)); + DCHECK(base::ToLowerASCII(lower_ascii_scheme) == lower_ascii_scheme); + + if (!has_scheme()) + return lower_ascii_scheme.empty(); + return scheme_piece() == lower_ascii_scheme; +} + +bool GURL::SchemeIsHTTPOrHTTPS() const { + return SchemeIs(url::kHttpsScheme) || SchemeIs(url::kHttpScheme); +} + +bool GURL::SchemeIsWSOrWSS() const { + return SchemeIs(url::kWsScheme) || SchemeIs(url::kWssScheme); +} + +bool GURL::SchemeIsCryptographic() const { + if (!has_scheme()) + return false; + return SchemeIsCryptographic(scheme_piece()); +} + +bool GURL::SchemeIsCryptographic(base::StringPiece lower_ascii_scheme) { + DCHECK(base::IsStringASCII(lower_ascii_scheme)); + DCHECK(base::ToLowerASCII(lower_ascii_scheme) == lower_ascii_scheme); + + return lower_ascii_scheme == url::kHttpsScheme || + lower_ascii_scheme == url::kWssScheme; +} + +bool GURL::SchemeIsLocal() const { + // The `filesystem:` scheme is not in the Fetch spec, but Chromium still + // supports it in large part. It should be treated as a local scheme too. 
+ return SchemeIs(url::kAboutScheme) || SchemeIs(url::kBlobScheme) || + SchemeIs(url::kDataScheme) || SchemeIs(url::kFileSystemScheme); +} + +int GURL::IntPort() const { + if (parsed_.port.is_nonempty()) + return url::ParsePort(spec_.data(), parsed_.port); + return url::PORT_UNSPECIFIED; +} + +int GURL::EffectiveIntPort() const { + int int_port = IntPort(); + if (int_port == url::PORT_UNSPECIFIED && IsStandard()) + return url::DefaultPortForScheme(spec_.data() + parsed_.scheme.begin, + parsed_.scheme.len); + return int_port; +} + +std::string GURL::ExtractFileName() const { + url::Component file_component; + url::ExtractFileName(spec_.data(), parsed_.path, &file_component); + return ComponentString(file_component); +} + +base::StringPiece GURL::PathForRequestPiece() const { + DCHECK(parsed_.path.is_nonempty()) + << "Canonical path for requests should be non-empty"; + if (parsed_.ref.is_valid()) { + // Clip off the reference when it exists. The reference starts after the + // #-sign, so we have to subtract one to also remove it. + return base::StringPiece(spec_).substr( + parsed_.path.begin, parsed_.ref.begin - parsed_.path.begin - 1); + } + // Compute the actual path length, rather than depending on the spec's + // terminator. If we're an inner_url, our spec continues on into our outer + // URL's path/query/ref. + int path_len = parsed_.path.len; + if (parsed_.query.is_valid()) + path_len = parsed_.query.end() - parsed_.path.begin; + + return base::StringPiece(spec_).substr(parsed_.path.begin, path_len); +} + +std::string GURL::PathForRequest() const { + return std::string(PathForRequestPiece()); +} + +std::string GURL::HostNoBrackets() const { + return std::string(HostNoBracketsPiece()); +} + +base::StringPiece GURL::HostNoBracketsPiece() const { + // If host looks like an IPv6 literal, strip the square brackets. 
+ url::Component h(parsed_.host); + if (h.len >= 2 && spec_[h.begin] == '[' && spec_[h.end() - 1] == ']') { + h.begin++; + h.len -= 2; + } + return ComponentStringPiece(h); +} + +std::string GURL::GetContent() const { + return std::string(GetContentPiece()); +} + +base::StringPiece GURL::GetContentPiece() const { + if (!is_valid_) + return base::StringPiece(); + url::Component content_component = parsed_.GetContent(); + if (!SchemeIs(url::kJavaScriptScheme) && parsed_.ref.is_valid()) + content_component.len -= parsed_.ref.len + 1; + return ComponentStringPiece(content_component); +} + +bool GURL::HostIsIPAddress() const { + return is_valid_ && url::HostIsIPAddress(host_piece()); +} + +const GURL& GURL::EmptyGURL() { + static base::NoDestructor empty_gurl; + return *empty_gurl; +} + +bool GURL::DomainIs(base::StringPiece canonical_domain) const { + if (!is_valid_) + return false; + + // FileSystem URLs have empty host_piece, so check this first. + if (inner_url_ && SchemeIsFileSystem()) + return inner_url_->DomainIs(canonical_domain); + return url::DomainIs(host_piece(), canonical_domain); +} + +bool GURL::EqualsIgnoringRef(const GURL& other) const { + int ref_position = parsed_.CountCharactersBefore(url::Parsed::REF, true); + int ref_position_other = + other.parsed_.CountCharactersBefore(url::Parsed::REF, true); + return base::StringPiece(spec_).substr(0, ref_position) == + base::StringPiece(other.spec_).substr(0, ref_position_other); +} + +void GURL::Swap(GURL* other) { + spec_.swap(other->spec_); + std::swap(is_valid_, other->is_valid_); + std::swap(parsed_, other->parsed_); + inner_url_.swap(other->inner_url_); +} + +size_t GURL::EstimateMemoryUsage() const { + return base::trace_event::EstimateMemoryUsage(spec_) + + base::trace_event::EstimateMemoryUsage(inner_url_) + + (parsed_.inner_parsed() ? 
sizeof(url::Parsed) : 0); +} + +bool GURL::IsAboutUrl(base::StringPiece allowed_path) const { + if (!SchemeIs(url::kAboutScheme)) + return false; + + if (has_host() || has_username() || has_password() || has_port()) + return false; + + return IsAboutPath(path_piece(), allowed_path); +} + +// static +bool GURL::IsAboutPath(base::StringPiece actual_path, + base::StringPiece allowed_path) { + if (!base::StartsWith(actual_path, allowed_path)) + return false; + + if (actual_path.size() == allowed_path.size()) { + DCHECK_EQ(actual_path, allowed_path); + return true; + } + + if ((actual_path.size() == allowed_path.size() + 1) && + actual_path.back() == '/') { + DCHECK_EQ(actual_path, std::string(allowed_path) + '/'); + return true; + } + + return false; +} + +void GURL::WriteIntoTrace(perfetto::TracedValue context) const { + std::move(context).WriteString(possibly_invalid_spec()); +} + +std::ostream& operator<<(std::ostream& out, const GURL& url) { + return out << url.possibly_invalid_spec(); +} + +bool operator==(const GURL& x, const GURL& y) { + return x.possibly_invalid_spec() == y.possibly_invalid_spec(); +} + +bool operator!=(const GURL& x, const GURL& y) { + return !(x == y); +} + +bool operator==(const GURL& x, const base::StringPiece& spec) { + DCHECK_EQ(GURL(spec).possibly_invalid_spec(), spec) + << "Comparisons of GURLs and strings must ensure as a precondition that " + "the string is fully canonicalized."; + return x.possibly_invalid_spec() == spec; +} + +bool operator==(const base::StringPiece& spec, const GURL& x) { + return x == spec; +} + +bool operator!=(const GURL& x, const base::StringPiece& spec) { + return !(x == spec); +} + +bool operator!=(const base::StringPiece& spec, const GURL& x) { + return !(x == spec); +} + +namespace url::debug { + +ScopedUrlCrashKey::ScopedUrlCrashKey(base::debug::CrashKeyString* crash_key, + const GURL& url) + : scoped_string_value_( + crash_key, + url.is_empty() ? 
"" : url.possibly_invalid_spec()) {} + +ScopedUrlCrashKey::~ScopedUrlCrashKey() = default; + +} // namespace url::debug diff --git a/gurl.h b/gurl.h new file mode 100644 index 00000000000..688a1018a9b --- /dev/null +++ b/gurl.h @@ -0,0 +1,534 @@ +// Copyright 2013 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef URL_GURL_H_ +#define URL_GURL_H_ + +#include + +#include +#include +#include + +#include "base/component_export.h" +#include "base/debug/alias.h" +#include "base/debug/crash_logging.h" +#include "base/strings/string_piece.h" +#include "base/trace_event/base_tracing_forward.h" +#include "url/third_party/mozilla/url_parse.h" +#include "url/url_canon.h" +#include "url/url_canon_stdstring.h" +#include "url/url_constants.h" + +// Represents a URL. GURL is Google's URL parsing library. +// +// A parsed canonicalized URL is guaranteed to be UTF-8. Any non-ASCII input +// characters are UTF-8 encoded and % escaped to ASCII. +// +// The string representation of a URL is called the spec(). Getting the +// spec will assert if the URL is invalid to help protect against malicious +// URLs. If you want the "best effort" canonicalization of an invalid URL, you +// can use possibly_invalid_spec(). Test validity with is_valid(). Data and +// javascript URLs use GetContent() to extract the data. +// +// This class has existence checkers and getters for the various components of +// a URL. Existence is different than being nonempty. "http://www.google.com/?" +// has a query that just happens to be empty, and has_query() will return true +// while the query getters will return the empty string. +// +// Prefer not to modify a URL using string operations (though sometimes this is +// unavoidable). Instead, use ReplaceComponents which can replace or delete +// multiple parts of a URL in one step, doesn't re-canonicalize unchanged +// sections, and avoids some screw-ups. 
An example is creating a URL with a +// path that contains a literal '#'. Using string concatenation will generate a +// URL with a truncated path and a reference fragment, while ReplaceComponents +// will know to escape this and produce the desired result. +class COMPONENT_EXPORT(URL) GURL { + public: + typedef url::StringPieceReplacements Replacements; + typedef url::StringPieceReplacements ReplacementsW; + + // Creates an empty, invalid URL. + GURL(); + + // Copy construction is relatively inexpensive, with most of the time going + // to reallocating the string. It does not re-parse. + GURL(const GURL& other); + GURL(GURL&& other) noexcept; + + // The strings to this constructor should be UTF-8 / UTF-16. + explicit GURL(base::StringPiece url_string); + explicit GURL(base::StringPiece16 url_string); + + // Constructor for URLs that have already been parsed and canonicalized. This + // is used for conversions from KURL, for example. The caller must supply all + // information associated with the URL, which must be correct and consistent. + GURL(const char* canonical_spec, + size_t canonical_spec_len, + const url::Parsed& parsed, + bool is_valid); + // Notice that we take the canonical_spec by value so that we can convert + // from WebURL without copying the string. When we call this constructor + // we pass in a temporary std::string, which lets the compiler skip the + // copy and just move the std::string into the function argument. In the + // implementation, we use std::move to move the data into the GURL itself, + // which means we end up with zero copies. + GURL(std::string canonical_spec, const url::Parsed& parsed, bool is_valid); + + ~GURL(); + + GURL& operator=(const GURL& other); + GURL& operator=(GURL&& other) noexcept; + + // Returns true when this object represents a valid parsed URL. When not + // valid, other functions will still succeed, but you will not get canonical + // data out in the format you may be expecting.
Instead, we keep something + // "reasonable looking" so that the user can see how it's busted if + // displayed to them. + bool is_valid() const { + return is_valid_; + } + + // Returns true if the URL is zero-length. Note that empty URLs are also + // invalid, and is_valid() will return false for them. This is provided + // because some users may want to treat the empty case differently. + bool is_empty() const { + return spec_.empty(); + } + + // Returns the raw spec, i.e., the full text of the URL, in canonical UTF-8, + // if the URL is valid. If the URL is not valid, this will assert and return + // the empty string (for safety in release builds, to keep them from being + // misused which might be a security problem). + // + // The URL will be ASCII (non-ASCII characters will be %-escaped UTF-8). + // + // The exception is is_empty() URLs (which are also !is_valid()): this will + // return the empty string without asserting. + // + // Use possibly_invalid_spec() below to get the unusable spec of an invalid + // URL. This separation is designed to prevent errors that may cause security + // problems that could result from the mistaken use of an invalid URL. + const std::string& spec() const; + + // Returns the potentially invalid spec for the URL. This spec MUST NOT be + // modified or sent over the network. It is designed to be displayed in error + // messages to the user, as the appearance of the spec may explain the error. + // If the spec is valid, the valid spec will be returned. + // + // The returned string is guaranteed to be valid UTF-8. + const std::string& possibly_invalid_spec() const { + return spec_; + } + + // Getter for the raw parsed structure. This allows callers to locate parts + // of the URL within the spec themselves. Most callers should consider using + // the individual component getters below. + // + // The returned parsed structure will reference into the raw spec, which may + // or may not be valid.
If you are using this to index into the spec, BE + // SURE YOU ARE USING possibly_invalid_spec() to get the spec, and that you + // don't do anything "important" with invalid specs. + const url::Parsed& parsed_for_possibly_invalid_spec() const { + return parsed_; + } + + // Allows GURL to be used as a key in STL (e.g. a std::set or std::map). + bool operator<(const GURL& other) const; + bool operator>(const GURL& other) const; + + // Resolves a URL that's possibly relative to this object's URL, and returns + // it. Absolute URLs are also handled according to the rules of URLs on web + // pages. + // + // It may be impossible to resolve the URLs properly. If the input is not + // "standard" (IsStandard() == false) and the input looks relative, we can't + // resolve it. In these cases, the result will be an empty, invalid GURL. + // + // The result may also be a nonempty, invalid URL if the input has some kind + // of encoding error. In these cases, we will try to construct a "good" URL + // that may have meaning to the user, but it will be marked invalid. + // + // It is an error to resolve a URL relative to an invalid URL. The result + // will be the empty URL. + GURL Resolve(base::StringPiece relative) const; + GURL Resolve(base::StringPiece16 relative) const; + + // Creates a new GURL by replacing the current URL's components with the + // supplied versions. See the Replacements class in url_canon.h for more. + // + // These are not particularly quick, so avoid doing mutations when possible. + // Prefer the 8-bit version when possible. + // + // It is an error to replace components of an invalid URL. The result will + // be the empty URL. + // + // Note that this intentionally disallows direct use of url::Replacements, + // which is harder to use correctly.
+ GURL ReplaceComponents(const Replacements& replacements) const; + GURL ReplaceComponents(const ReplacementsW& replacements) const; + + // A helper function that is equivalent to replacing the path with a slash + // and clearing out everything after that. We sometimes need to know just the + // scheme and the authority. If this URL is not a standard URL (it doesn't + // have the regular authority and path sections), then the result will be + // an empty, invalid GURL. Note that this *does* work for file: URLs, which + // some callers may want to filter out before calling this. + // + // It is an error to get an empty path on an invalid URL. The result + // will be the empty URL. + GURL GetWithEmptyPath() const; + + // A helper function to return a GURL without the filename, query values, and + // fragment. For example, + // GURL("https://www.foo.com/index.html?q=test").GetWithoutFilename().spec() + // will return "https://www.foo.com/". + // GURL("https://www.foo.com/bar/").GetWithoutFilename().spec() + // will return "https://www.foo.com/bar/". If the GURL is invalid or missing a + // scheme, authority or path, it will return an empty, invalid GURL. + GURL GetWithoutFilename() const; + + // A helper function to return a GURL without the Ref (also named Fragment + // Identifier). For example, + // GURL("https://www.foo.com/index.html#test").GetWithoutRef().spec() + // will return "https://www.foo.com/index.html". + // If the GURL is invalid or missing a + // scheme, authority or path, it will return an empty, invalid GURL. + GURL GetWithoutRef() const; + + // A helper function to return a GURL containing just the scheme, host, + // and port from a URL. Equivalent to clearing any username and password, + // replacing the path with a slash, and clearing everything after that. If + // this URL is not a standard URL, then the result will be an empty, + // invalid GURL. If the URL has neither username nor password, this + // degenerates to GetWithEmptyPath(). 
+ // + // It is an error to get the origin of an invalid URL. The result + // will be the empty URL. + // + // WARNING: Please avoid converting urls into origins if at all possible! + // //docs/security/origin-vs-url.md is a list of gotchas that can result. Such + // conversions will likely return a wrong result for about:blank and/or + // in the presence of iframe.sandbox attribute. Prefer to get origins directly + // from the source (e.g. RenderFrameHost::GetLastCommittedOrigin). + GURL DeprecatedGetOriginAsURL() const; + + // A helper function to return a GURL stripped from the elements that are not + // supposed to be sent as HTTP referrer: username, password and ref fragment. + // For invalid URLs or URLs that are not valid referrers, an empty URL will be + // returned. + GURL GetAsReferrer() const; + + // Returns true if the scheme for the current URL is a known "standard-format" + // scheme. A standard-format scheme adheres to what RFC 3986 calls "generic + // URI syntax" (https://tools.ietf.org/html/rfc3986#section-3). This includes + // file: and filesystem:, which some callers may want to filter out explicitly + // by calling SchemeIsFile[System]. + bool IsStandard() const; + + // Returns true when the url is of the form about:blank, about:blank?foo or + // about:blank/#foo. + bool IsAboutBlank() const; + + // Returns true when the url is of the form about:srcdoc, about:srcdoc?foo or + // about:srcdoc/#foo. + bool IsAboutSrcdoc() const; + + // Returns true if the given parameter (should be lower-case ASCII to match + // the canonicalized scheme) is the scheme for this URL. Do not include a + // colon. + bool SchemeIs(base::StringPiece lower_ascii_scheme) const; + + // Returns true if the scheme is "http" or "https". + bool SchemeIsHTTPOrHTTPS() const; + + // Returns true if the scheme is "ws" or "wss". + bool SchemeIsWSOrWSS() const; + + // We often need to know if this is a file URL.
File URLs are "standard", but + // are often treated separately by some programs. + bool SchemeIsFile() const { + return SchemeIs(url::kFileScheme); + } + + // FileSystem URLs need to be treated differently in some cases. + bool SchemeIsFileSystem() const { + return SchemeIs(url::kFileSystemScheme); + } + + // Returns true if the scheme indicates a network connection that uses TLS or + // some other cryptographic protocol (e.g. QUIC) for security. + // + // This function is not a complete test of whether or not an origin's code + // is minimally trustworthy. For that, see Chromium's |IsOriginSecure| for + // higher-level and more complete semantics. See that function's documentation + // for more detail. + bool SchemeIsCryptographic() const; + + // As above, but static. Parameter should be lower-case ASCII. + static bool SchemeIsCryptographic(base::StringPiece lower_ascii_scheme); + + // Returns true if the scheme is "blob". + bool SchemeIsBlob() const { + return SchemeIs(url::kBlobScheme); + } + + // Returns true if the scheme is a local scheme, as defined in Fetch: + // https://fetch.spec.whatwg.org/#local-scheme + bool SchemeIsLocal() const; + + // For most URLs, the "content" is everything after the scheme (skipping the + // scheme delimiting colon) and before the fragment (skipping the fragment + // delimiting octothorpe). For javascript URLs the "content" also includes the + // fragment delimiter and fragment. + // + // It is an error to get the content of an invalid URL: the result will be an + // empty string. + std::string GetContent() const; + base::StringPiece GetContentPiece() const; + + // Returns true if the hostname is an IP address. Note: this function isn't + // as cheap as a simple getter because it re-parses the hostname to verify. + bool HostIsIPAddress() const; + + // Not including the colon. If you are comparing schemes, prefer SchemeIs.
+ bool has_scheme() const { return parsed_.scheme.is_valid(); } + std::string scheme() const { + return ComponentString(parsed_.scheme); + } + base::StringPiece scheme_piece() const { + return ComponentStringPiece(parsed_.scheme); + } + + bool has_username() const { return parsed_.username.is_valid(); } + std::string username() const { + return ComponentString(parsed_.username); + } + base::StringPiece username_piece() const { + return ComponentStringPiece(parsed_.username); + } + + bool has_password() const { return parsed_.password.is_valid(); } + std::string password() const { + return ComponentString(parsed_.password); + } + base::StringPiece password_piece() const { + return ComponentStringPiece(parsed_.password); + } + + // The host may be a hostname, an IPv4 address, or an IPv6 literal surrounded + // by square brackets, like "[2001:db8::1]". To exclude these brackets, use + // HostNoBrackets() below. + bool has_host() const { + // Note that hosts are special, absence of host means length 0. + return parsed_.host.is_nonempty(); + } + std::string host() const { + return ComponentString(parsed_.host); + } + base::StringPiece host_piece() const { + return ComponentStringPiece(parsed_.host); + } + + // The port if one is explicitly specified. Most callers will want IntPort() + // or EffectiveIntPort() instead of these. The getters will not include the + // ':'. + bool has_port() const { return parsed_.port.is_valid(); } + std::string port() const { + return ComponentString(parsed_.port); + } + base::StringPiece port_piece() const { + return ComponentStringPiece(parsed_.port); + } + + // Including first slash following host, up to the query. The URL + // "http://www.google.com/" has a path of "/". + bool has_path() const { return parsed_.path.is_valid(); } + std::string path() const { + return ComponentString(parsed_.path); + } + base::StringPiece path_piece() const { + return ComponentStringPiece(parsed_.path); + } + + // Stuff following '?' up to the ref. 
The getters will not include the '?'. + bool has_query() const { return parsed_.query.is_valid(); } + std::string query() const { + return ComponentString(parsed_.query); + } + base::StringPiece query_piece() const { + return ComponentStringPiece(parsed_.query); + } + + // Stuff following '#' to the end of the string. This will be %-escaped UTF-8. + // The getters will not include the '#'. + bool has_ref() const { return parsed_.ref.is_valid(); } + std::string ref() const { + return ComponentString(parsed_.ref); + } + base::StringPiece ref_piece() const { + return ComponentStringPiece(parsed_.ref); + } + + // Returns a parsed version of the port. Can also be any of the special + // values defined in Parsed for ExtractPort. + int IntPort() const; + + // Returns the port number of the URL, or the default port number. + // If the scheme has no concept of port (or unknown default) returns + // PORT_UNSPECIFIED. + int EffectiveIntPort() const; + + // Extracts the filename portion of the path and returns it. The filename + // is everything after the last slash in the path. This may be empty. + std::string ExtractFileName() const; + + // Returns the path that should be sent to the server. This is the path, + // parameter, and query portions of the URL. It is guaranteed to be ASCII. + std::string PathForRequest() const; + + // Returns the same characters as PathForRequest(), avoiding a copy. + base::StringPiece PathForRequestPiece() const; + + // Returns the host, excluding the square brackets surrounding IPv6 address + // literals. This can be useful for passing to getaddrinfo(). + std::string HostNoBrackets() const; + + // Returns the same characters as HostNoBrackets(), avoiding a copy. + base::StringPiece HostNoBracketsPiece() const; + + // Returns true if this URL's host matches or is in the same domain as + // the given input string. 
For example, if the hostname of the URL is + // "www.google.com", this will return true for "com", "google.com", and + // "www.google.com". + // + // The input domain should match host canonicalization rules. i.e. the input + // should be lowercase except for escape chars. + // + // This call is more efficient than getting the host and checking whether the + // host has the specific domain or not because no copies or object + // constructions are done. + bool DomainIs(base::StringPiece canonical_domain) const; + + // Checks whether or not two URLs differ only in the ref (the part after + // the # character). + bool EqualsIgnoringRef(const GURL& other) const; + + // Swaps the contents of this GURL object with |other|, without doing + // any memory allocations. + void Swap(GURL* other); + + // Returns a reference to a singleton empty GURL. This object is for callers + // who return references but don't have anything to return in some cases. + // If you just want an empty URL for normal use, prefer GURL(). This function + // may be called from any thread. + static const GURL& EmptyGURL(); + + // Returns the inner URL of a nested URL (currently only non-null for + // filesystem URLs). + // + // TODO(mmenke): inner_url().spec() currently returns the same value as + // calling spec() on the GURL itself. This should be fixed. + // See https://crbug.com/619596 + const GURL* inner_url() const { + return inner_url_.get(); + } + + // Estimates dynamic memory usage. + // See base/trace_event/memory_usage_estimator.h for more info. + size_t EstimateMemoryUsage() const; + + // Helper used by GURL::IsAboutUrl and KURL::IsAboutURL. 
+ static bool IsAboutPath(base::StringPiece actual_path, + base::StringPiece allowed_path); + + void WriteIntoTrace(perfetto::TracedValue context) const; + + private: + // Variant of the string parsing constructor that allows the caller to elect to + // retain trailing whitespace, if any, on the passed URL spec, but only if + // the scheme is one that allows trailing whitespace. The primary use-case is + // for data: URLs. In most cases, you want to use the single parameter + // constructor above. + enum RetainWhiteSpaceSelector { RETAIN_TRAILING_PATH_WHITEPACE }; + GURL(const std::string& url_string, RetainWhiteSpaceSelector); + + template + void InitCanonical(T input_spec, bool trim_path_end); + + void InitializeFromCanonicalSpec(); + + // Helper used by IsAboutBlank and IsAboutSrcdoc. + bool IsAboutUrl(base::StringPiece allowed_path) const; + + // Returns the substring of the input identified by the given component. + std::string ComponentString(const url::Component& comp) const { + return std::string(ComponentStringPiece(comp)); + } + base::StringPiece ComponentStringPiece(const url::Component& comp) const { + if (comp.is_empty()) + return base::StringPiece(); + return base::StringPiece(spec_).substr(static_cast(comp.begin), + static_cast(comp.len)); + } + + void ProcessFileSystemURLAfterReplaceComponents(); + + // The actual text of the URL, in canonical ASCII form. + std::string spec_; + + // Set when the given URL is valid. Otherwise, we may still have a spec and + // components, but they may not identify valid resources (for example, an + // invalid port number, invalid characters in the scheme, etc.). + bool is_valid_; + + // Identified components of the canonical spec. + url::Parsed parsed_; + + // Used for nested schemes [currently only filesystem:]. + std::unique_ptr inner_url_; +}; + +// Stream operator so GURL can be used in assertion statements. 
+COMPONENT_EXPORT(URL) +std::ostream& operator<<(std::ostream& out, const GURL& url); + +COMPONENT_EXPORT(URL) bool operator==(const GURL& x, const GURL& y); +COMPONENT_EXPORT(URL) bool operator!=(const GURL& x, const GURL& y); + +// Equality operator for comparing raw spec_. This should be used in place of +// url == GURL(spec) where |spec| is known (i.e. constants). This is to prevent +// needlessly re-parsing |spec| into a temporary GURL. +COMPONENT_EXPORT(URL) +bool operator==(const GURL& x, const base::StringPiece& spec); +COMPONENT_EXPORT(URL) +bool operator==(const base::StringPiece& spec, const GURL& x); +COMPONENT_EXPORT(URL) +bool operator!=(const GURL& x, const base::StringPiece& spec); +COMPONENT_EXPORT(URL) +bool operator!=(const base::StringPiece& spec, const GURL& x); + +// DEBUG_ALIAS_FOR_GURL(var_name, url) copies |url| into a new stack-allocated +// variable named |var_name|. This helps ensure that the value of |url| gets +// preserved in crash dumps. +#define DEBUG_ALIAS_FOR_GURL(var_name, url) \ + DEBUG_ALIAS_FOR_CSTR(var_name, (url).possibly_invalid_spec().c_str(), 128) + +namespace url::debug { + +class COMPONENT_EXPORT(URL) ScopedUrlCrashKey { + public: + ScopedUrlCrashKey(base::debug::CrashKeyString* crash_key, const GURL& value); + ~ScopedUrlCrashKey(); + + ScopedUrlCrashKey(const ScopedUrlCrashKey&) = delete; + ScopedUrlCrashKey& operator=(const ScopedUrlCrashKey&) = delete; + + private: + base::debug::ScopedCrashKeyString scoped_string_value_; +}; + +} // namespace url::debug + +#endif // URL_GURL_H_ diff --git a/gurl_abstract_tests.h b/gurl_abstract_tests.h new file mode 100644 index 00000000000..3cde8420567 --- /dev/null +++ b/gurl_abstract_tests.h @@ -0,0 +1,119 @@ +// Copyright 2021 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#ifndef URL_GURL_ABSTRACT_TESTS_H_ +#define URL_GURL_ABSTRACT_TESTS_H_ + +// Test suite for tests that cover both GURL and blink::KURL. +// +// AbstractUrlTest below abstracts away differences between GURL and blink::KURL +// by parametrizing the tests with a class that has to expose the following +// members: +// using UrlType = ...; +// static UrlType CreateUrlFromString(base::StringPiece s); +// static bool IsAboutBlank(const UrlType& url); +// static bool IsAboutSrcdoc(const UrlType& url); +template +class AbstractUrlTest : public testing::Test { + protected: + // Wrappers that help elide away TUrlTraits. + // + // Note that calling the wrappers needs to be prefixed with `this->...` to + // avoid hitting: explicit qualification required to use member 'IsAboutBlank' + // from dependent base class. + using UrlType = typename TUrlTraits::UrlType; + UrlType CreateUrlFromString(base::StringPiece s) { + return TUrlTraits::CreateUrlFromString(s); + } + bool IsAboutBlank(const UrlType& url) { + return TUrlTraits::IsAboutBlank(url); + } + bool IsAboutSrcdoc(const UrlType& url) { + return TUrlTraits::IsAboutSrcdoc(url); + } +}; + +TYPED_TEST_SUITE_P(AbstractUrlTest); + +TYPED_TEST_P(AbstractUrlTest, IsAboutBlankTest) { + // See https://tools.ietf.org/html/rfc6694 which explicitly allows + // `about-query` and `about-fragment` parts in about: URLs. 
+ const std::string kAboutBlankUrls[] = {"about:blank", "about:blank?foo", + "about:blank/#foo", + "about:blank?foo#foo"}; + for (const auto& input : kAboutBlankUrls) { + SCOPED_TRACE(testing::Message() << "Test input: " << input); + auto url = this->CreateUrlFromString(input); + EXPECT_TRUE(this->IsAboutBlank(url)); + } + + const std::string kNotAboutBlankUrls[] = {"", + "about", + "about:", + "about:blanky", + "about:blan", + "about:about:blank:", + "data:blank", + "http:blank", + "about://blank", + "about:blank/foo", + "about://:8000/blank", + "about://foo:foo@/blank", + "foo@about:blank", + "foo:bar@about:blank", + "about:blank:8000", + "about:blANk"}; + for (const auto& input : kNotAboutBlankUrls) { + SCOPED_TRACE(testing::Message() << "Test input: " << input); + auto url = this->CreateUrlFromString(input); + EXPECT_FALSE(this->IsAboutBlank(url)); + } +} + +TYPED_TEST_P(AbstractUrlTest, IsAboutSrcdocTest) { + // See https://tools.ietf.org/html/rfc6694 which explicitly allows + // `about-query` and `about-fragment` parts in about: URLs. + // + // `about:srcdoc` is defined in + // https://html.spec.whatwg.org/multipage/urls-and-fetching.html#about:srcdoc + // which refers to rfc6694 for details. 
+ const std::string kAboutSrcdocUrls[] = { + "about:srcdoc", "about:srcdoc/", "about:srcdoc?foo", "about:srcdoc/#foo", + "about:srcdoc?foo#foo"}; + for (const auto& input : kAboutSrcdocUrls) { + SCOPED_TRACE(testing::Message() << "Test input: " << input); + auto url = this->CreateUrlFromString(input); + EXPECT_TRUE(this->IsAboutSrcdoc(url)); + } + + const std::string kNotAboutSrcdocUrls[] = {"", + "about", + "about:", + "about:srcdocx", + "about:srcdo", + "about:about:srcdoc:", + "data:srcdoc", + "http:srcdoc", + "about:srcdo", + "about://srcdoc", + "about://srcdoc\\", + "about:srcdoc/foo", + "about://:8000/srcdoc", + "about://foo:foo@/srcdoc", + "foo@about:srcdoc", + "foo:bar@about:srcdoc", + "about:srcdoc:8000", + "about:srCDOc"}; + for (const auto& input : kNotAboutSrcdocUrls) { + SCOPED_TRACE(testing::Message() << "Test input: " << input); + auto url = this->CreateUrlFromString(input); + EXPECT_FALSE(this->IsAboutSrcdoc(url)); + } +} + +REGISTER_TYPED_TEST_SUITE_P(AbstractUrlTest, + IsAboutBlankTest, + IsAboutSrcdocTest); + +#endif // URL_GURL_ABSTRACT_TESTS_H_ diff --git a/gurl_fuzzer.cc b/gurl_fuzzer.cc new file mode 100644 index 00000000000..029a387e4c2 --- /dev/null +++ b/gurl_fuzzer.cc @@ -0,0 +1,89 @@ +// Copyright 2015 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "base/at_exit.h" +#include "base/check_op.h" +#include "base/i18n/icu_util.h" +#include "base/no_destructor.h" +#include "url/gurl.h" + +struct TestCase { + TestCase() { CHECK(base::i18n::InitializeICU()); } + + // used by ICU integration. + base::AtExitManager at_exit_manager; +}; + +TestCase* test_case = new TestCase(); + +// Checks that GURL's canonicalization is idempotent. This can help discover +// issues like https://crbug.com/1128999. 
+void CheckIdempotency(const GURL& url) { + if (!url.is_valid()) + return; + const std::string& spec = url.spec(); + GURL recanonicalized(spec); + CHECK(recanonicalized.is_valid()); + CHECK_EQ(spec, recanonicalized.spec()); +} + +// Checks that |url.spec()| is preserved across a call to ReplaceComponents with +// zero replacements, which is effectively a copy. This can help discover issues +// like https://crbug.com/1075515. +void CheckReplaceComponentsPreservesSpec(const GURL& url) { + static const base::NoDestructor no_op; + GURL copy = url.ReplaceComponents(*no_op); + CHECK_EQ(url.is_valid(), copy.is_valid()); + if (url.is_valid()) { + CHECK_EQ(url.spec(), copy.spec()); + } +} + +// Entry point for LibFuzzer. +extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { + if (size < 1) + return 0; + { + base::StringPiece string_piece_input(reinterpret_cast(data), + size); + const GURL url_from_string_piece(string_piece_input); + CheckIdempotency(url_from_string_piece); + CheckReplaceComponentsPreservesSpec(url_from_string_piece); + } + // Test for StringPiece16 if size is even. + if (size % sizeof(char16_t) == 0) { + base::StringPiece16 string_piece_input16( + reinterpret_cast(data), size / sizeof(char16_t)); + const GURL url_from_string_piece16(string_piece_input16); + CheckIdempotency(url_from_string_piece16); + CheckReplaceComponentsPreservesSpec(url_from_string_piece16); + } + // Resolve relative url tests. 
+ { + size_t size_t_bytes = sizeof(size_t); + if (size < size_t_bytes + 1) { + return 0; + } + size_t relative_size = + *reinterpret_cast(data) % (size - size_t_bytes); + std::string relative_string( + reinterpret_cast(data + size_t_bytes), relative_size); + base::StringPiece string_piece_part_input( + reinterpret_cast(data + size_t_bytes + relative_size), + size - relative_size - size_t_bytes); + const GURL url_from_string_piece_part(string_piece_part_input); + CheckIdempotency(url_from_string_piece_part); + CheckReplaceComponentsPreservesSpec(url_from_string_piece_part); + + url_from_string_piece_part.Resolve(relative_string); + + if (relative_size % sizeof(char16_t) == 0) { + std::u16string relative_string16( + reinterpret_cast(data + size_t_bytes), + relative_size / sizeof(char16_t)); + url_from_string_piece_part.Resolve(relative_string16); + } + } + return 0; +} diff --git a/gurl_fuzzer.dict b/gurl_fuzzer.dict new file mode 100644 index 00000000000..fcf7e035dd6 --- /dev/null +++ b/gurl_fuzzer.dict @@ -0,0 +1,432 @@ +# Copyright 2016 The Chromium Authors +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. + +# This block has been generated with testing/libfuzzer/dictionary_generator.py +# using url_parse_fuzzer binary and RFC 3986. +"DNS" +"text" +"TCP" +"\"%D3%81%87%A4%95%81@%C2%85%81%83%88\"." +"[RFC2234]" +"F.," +"FORCE" +"SOCIETY" +"implementation" +"TASK" +"cache" +"WINS," +"D.1." +"to" +"only" +"HTML" +"SPONSORED" +"[RFC1630]." +"D.," +"[RFC1123]" +"resources" +"(STD" +"[RFC1808]," +"string" +"returning" +"==" +"H" +"HEREIN" +"[BCP35]" +"SP)" +"SCTP)" +"(NUL)" +"THE" +"(URI):" +"REPRESENTS" +"resource" +"A.," +"EXPRESS" +"list" +"(%2E)," +"WILL" +"J." +"INCLUDING" +"segment." +"[RFC2732]" +"(URL)\"," +"set" +"HTTP" +"IANA" +"INFORMATION" +"(%41-%5A" +"[RFC2518]" +"M." +"direct" +"(IDNA)\"," +"Only" +"Version" +"are" +"allowed." +"\"X\"" +"(SP)." 
+"2DIGIT" +"section" +"BUT" +"\"UTF-8," +"3" +"version" +"[RFC1034]" +"probably" +"[RFC2732]." +"metadata" +"Y.," +"C" +"WWW\"" +"FOR" +"0X" +"S" +"address" +"INPUT" +"[" +"P." +"WWW:" +"AND" +"WWW" +"[BCP35]." +"MA" +"\"AS" +"\"%\"" +"NOT" +"ANY" +"[RFC1808]" +"WARRANTY" +"useful" +"[RFC1737]." +"[STD63]," +"\"HTTP\"" +"(MIME)" +"TELNET" +"[RFC1630]" +"S." +"D.2." +"B.," +"[RFC2234]." +"[RFC2234]," +"BCP" +"[STD63];" +"use" +"LATIN" +"from" +"C." +"0" +"WARRANTIES" +"(MHTML)\"," +"ENGINEERING" +"URI;" +"few" +"(DNS)." +"expected" +"USENET" +"type" +"empty" +"XML" +"URL?\"," +"W3C/MIT" +"F" +"CA" +"STD:" +"SMTP" +"[RFC2141]," +"N" +"A)," +"NOTE:" +"CR" +"MHTML" +"must" +"ANY)," +"ALL" +"[STD63]" +"RIGHTS" +"HE/SHE" +"SP" +"[BCP19]" +"value" +"INFRINGE" +"while" +"KATAKANA" +"US-ASCII" +"W3C/IETF" +"loop" +"J.," +"2E:" +"L." +"have" +"%61-%7A)," +"is" +"PARTICULAR" +"thus" +"URI," +"parse" +"STEP" +"MIME" +"UTF-8" +"in" +"failed" +"LF" +"binary" +"ISO/IEC" +"\"A" +"(%5F)," +")" +"HTTP," +"get" +"\"A\"," +"[RFC2141]" +"BUFFER" +"ABNF" +"[RFC2557]." +"I." +"WARRANTIES," +"URN" +"EBCDIC" +"A" +"used" +"http" +"may" +"IP" +"IS" +"after" +"L" +"Q" +"'A'" +"running" +"HEXDIG" +"such" +"EBCDIC," +"data" +"[ASCII]" +"a" +"P" +"[ASCII]." +"M.," +"Names" +"the" +"[RFC0952]." +"[RFC3490]" +"US-ASCII." +"2C:" +"THAT" +"E.," +"(%2D)," +"\"URL:\"" +"WITH" +"BY" +"[UCS]," +"tables" +"[UCS]" +"TO" +"BNF" +"internal" +"P.," +"ORGANIZATION" +"\"HTTP" +"URI." +"it," +"D" +"format" +"URL" +"(0" +"URI\"" +"URI" +"K." +"URI:" +"T" +"D.W." +"not" +"R." +"LIMITED" +"\"%3A\")" +"name" +"OF" +"B." +"[RFC1736]" +"(R)," +"IPR" +"[RFC1738];" +"OUTPUT" +"LALR" +"OR" +"STD" +"[RFC3513]" +"because" +"bytes" +"DNS," +"back" +"(URI)" +"*DIGIT" +"[RFC2046]" +"[RFC3305]" +"W3C" +"E." +"for" +"space" +"ABNF\"," +"[RFC1535]." 
+"DQUOTE" +"I" +"does" +"'F'" +"[RFC2396]" +"be" +"K.," +"DISCLAIM" +"G" +"(UTF-16)," +"This" +"M" +"INTERNET" +"RFC" +"X3.4," +"base" +"(T):" +"IMPLIED," +"by" +"\"URL\"" +"on" +"DIGIT" +"(ABNF)" +"WEBDAV\"," +"of" +"could" +"R.," +"(ABNF:" +"S.," +"1*4HEXDIG" +"CAPITAL" +"number" +"one" +"ISO" +"FITNESS" +"\"%7E\"" +"open" +"ANSI" +"[BCP19]," +"\"%C3%80\"," +"IETF" +"support" +"\"URN" +"[RFC1123]." +"long" +"[RFC0952]" +":" +"was" +"[RFC3513]." +"[RFC2718]" +"B" +"N." +"that" +"IDNA" +"OCTET" +"but" +"R" +"POSIX" +"LETTER" +"CONTRIBUTOR," +"[RFC1738]" +"(C)" +"with" +"\"URI\"" +"16" +"default" +"double" +"\"URN\"" +"[RFC2557]" +"up" +"TCP," +"PURPOSE." +"MERCHANTABILITY" +"1)" +"IS\"" +"\"IANA" +"(URN)" +"and" +"USE" +"false" +"(IF" +"USA" +"URL," +"an" +"To" +"as" +"(%7E)" +"at" +"file" +"need" +"any" +"\"%E3%82%A2\"." +"physical" +"1*HEXDIG" +"no" +"[RFC1737]" +"-" +"invalid" +"A." +"application" +"valid" +"take" +"which" +"test" +"[RFC2732]," +"you" +"=" +"GRAVE" +"" +"[RFC2396]," +"2B:" +"period," +"UDP," +"[RFC1535]" +"T." +"(UCS)\"," +"U" +"A-F." +"T.," +"]" +"[RFC2718]." +"D." +"persistent" +"traditional" +"L.," +"As" +"IMPLIED" +"(URL)" +"ALPHA" +"[RFC3305]." +"H.," +"\"MIME" + +# This comes from https://crbug.com/1075515. +"FilEsysteM:htTp:E=/." + +# This comes from https://crbug.com/1128999. +"file:///.//" +"file:////" + +# string declared from url/url_constants.cc +"://" +"about" +"about:blank" +"about:srcdoc" +"blank" +"blob" +"cid" +"content" +"data" +"file" +"filesystem" +"ftp" +"http" +"https" +"javascript" +"mailto" +"quic-transport" +"srcdoc" +"tel" +"ws" +"wss" diff --git a/gurl_unittest.cc b/gurl_unittest.cc new file mode 100644 index 00000000000..af8421d9742 --- /dev/null +++ b/gurl_unittest.cc @@ -0,0 +1,1180 @@ +// Copyright 2013 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#include "url/gurl.h" + +#include + +#include "base/strings/string_number_conversions.h" +#include "base/strings/utf_string_conversions.h" +#include "testing/gtest/include/gtest/gtest.h" +#include "url/gurl_abstract_tests.h" +#include "url/origin.h" +#include "url/url_canon.h" +#include "url/url_test_utils.h" + +namespace url { + +namespace { + +// Returns the canonicalized string for the given URL string for the +// GURLTest.Types test. +std::string TypesTestCase(const char* src) { + GURL gurl(src); + return gurl.possibly_invalid_spec(); +} + +} // namespace + +// Different types of URLs should be handled differently, and handed off to +// different canonicalizers. +TEST(GURLTest, Types) { + // URLs with unknown schemes should be treated as path URLs, even when they + // have things like "://". + EXPECT_EQ("something:///HOSTNAME.com/", + TypesTestCase("something:///HOSTNAME.com/")); + + // Conversely, URLs with known schemes should always trigger standard URL + // handling. + EXPECT_EQ("http://hostname.com/", TypesTestCase("http:HOSTNAME.com")); + EXPECT_EQ("http://hostname.com/", TypesTestCase("http:/HOSTNAME.com")); + EXPECT_EQ("http://hostname.com/", TypesTestCase("http://HOSTNAME.com")); + EXPECT_EQ("http://hostname.com/", TypesTestCase("http:///HOSTNAME.com")); + +#ifdef WIN32 + // URLs that look like Windows absolute path specs. + EXPECT_EQ("file:///C:/foo.txt", TypesTestCase("c:\\foo.txt")); + EXPECT_EQ("file:///Z:/foo.txt", TypesTestCase("Z|foo.txt")); + EXPECT_EQ("file://server/foo.txt", TypesTestCase("\\\\server\\foo.txt")); + EXPECT_EQ("file://server/foo.txt", TypesTestCase("//server/foo.txt")); +#endif +} + +// Test the basic creation and querying of components in a GURL. We assume that +// the parser is already tested and works, so we are mostly interested if the +// object does the right thing with the results. 
+TEST(GURLTest, Components) { + GURL empty_url(u""); + EXPECT_TRUE(empty_url.is_empty()); + EXPECT_FALSE(empty_url.is_valid()); + + GURL url(u"http://user:pass@google.com:99/foo;bar?q=a#ref"); + EXPECT_FALSE(url.is_empty()); + EXPECT_TRUE(url.is_valid()); + EXPECT_TRUE(url.SchemeIs("http")); + EXPECT_FALSE(url.SchemeIsFile()); + + // This is the narrow version of the URL, which should match the wide input. + EXPECT_EQ("http://user:pass@google.com:99/foo;bar?q=a#ref", url.spec()); + + EXPECT_EQ("http", url.scheme()); + EXPECT_EQ("user", url.username()); + EXPECT_EQ("pass", url.password()); + EXPECT_EQ("google.com", url.host()); + EXPECT_EQ("99", url.port()); + EXPECT_EQ(99, url.IntPort()); + EXPECT_EQ("/foo;bar", url.path()); + EXPECT_EQ("q=a", url.query()); + EXPECT_EQ("ref", url.ref()); + + // Test parsing userinfo with special characters. + GURL url_special_pass("http://user:%40!$&'()*+,;=:@google.com:12345"); + EXPECT_TRUE(url_special_pass.is_valid()); + // GURL canonicalizes some delimiters. 
+ EXPECT_EQ("%40!$&%27()*+,%3B%3D%3A", url_special_pass.password()); + EXPECT_EQ("google.com", url_special_pass.host()); + EXPECT_EQ("12345", url_special_pass.port()); +} + +TEST(GURLTest, Empty) { + GURL url; + EXPECT_FALSE(url.is_valid()); + EXPECT_EQ("", url.spec()); + + EXPECT_EQ("", url.scheme()); + EXPECT_EQ("", url.username()); + EXPECT_EQ("", url.password()); + EXPECT_EQ("", url.host()); + EXPECT_EQ("", url.port()); + EXPECT_EQ(PORT_UNSPECIFIED, url.IntPort()); + EXPECT_EQ("", url.path()); + EXPECT_EQ("", url.query()); + EXPECT_EQ("", url.ref()); +} + +TEST(GURLTest, Copy) { + GURL url(u"http://user:pass@google.com:99/foo;bar?q=a#ref"); + + GURL url2(url); + EXPECT_TRUE(url2.is_valid()); + + EXPECT_EQ("http://user:pass@google.com:99/foo;bar?q=a#ref", url2.spec()); + EXPECT_EQ("http", url2.scheme()); + EXPECT_EQ("user", url2.username()); + EXPECT_EQ("pass", url2.password()); + EXPECT_EQ("google.com", url2.host()); + EXPECT_EQ("99", url2.port()); + EXPECT_EQ(99, url2.IntPort()); + EXPECT_EQ("/foo;bar", url2.path()); + EXPECT_EQ("q=a", url2.query()); + EXPECT_EQ("ref", url2.ref()); + + // Copying of invalid URL should be invalid + GURL invalid; + GURL invalid2(invalid); + EXPECT_FALSE(invalid2.is_valid()); + EXPECT_EQ("", invalid2.spec()); + EXPECT_EQ("", invalid2.scheme()); + EXPECT_EQ("", invalid2.username()); + EXPECT_EQ("", invalid2.password()); + EXPECT_EQ("", invalid2.host()); + EXPECT_EQ("", invalid2.port()); + EXPECT_EQ(PORT_UNSPECIFIED, invalid2.IntPort()); + EXPECT_EQ("", invalid2.path()); + EXPECT_EQ("", invalid2.query()); + EXPECT_EQ("", invalid2.ref()); +} + +TEST(GURLTest, Assign) { + GURL url(u"http://user:pass@google.com:99/foo;bar?q=a#ref"); + + GURL url2; + url2 = url; + EXPECT_TRUE(url2.is_valid()); + + EXPECT_EQ("http://user:pass@google.com:99/foo;bar?q=a#ref", url2.spec()); + EXPECT_EQ("http", url2.scheme()); + EXPECT_EQ("user", url2.username()); + EXPECT_EQ("pass", url2.password()); + EXPECT_EQ("google.com", url2.host()); + 
EXPECT_EQ("99", url2.port()); + EXPECT_EQ(99, url2.IntPort()); + EXPECT_EQ("/foo;bar", url2.path()); + EXPECT_EQ("q=a", url2.query()); + EXPECT_EQ("ref", url2.ref()); + + // Assignment of invalid URL should be invalid + GURL invalid; + GURL invalid2; + invalid2 = invalid; + EXPECT_FALSE(invalid2.is_valid()); + EXPECT_EQ("", invalid2.spec()); + EXPECT_EQ("", invalid2.scheme()); + EXPECT_EQ("", invalid2.username()); + EXPECT_EQ("", invalid2.password()); + EXPECT_EQ("", invalid2.host()); + EXPECT_EQ("", invalid2.port()); + EXPECT_EQ(PORT_UNSPECIFIED, invalid2.IntPort()); + EXPECT_EQ("", invalid2.path()); + EXPECT_EQ("", invalid2.query()); + EXPECT_EQ("", invalid2.ref()); +} + +// This is a regression test for http://crbug.com/309975. +TEST(GURLTest, SelfAssign) { + GURL a("filesystem:http://example.com/temporary/"); + // This should not crash. + a = *&a; // The *& defeats Clang's -Wself-assign warning. +} + +TEST(GURLTest, CopyFileSystem) { + GURL url(u"filesystem:https://user:pass@google.com:99/t/foo;bar?q=a#ref"); + + GURL url2(url); + EXPECT_TRUE(url2.is_valid()); + + EXPECT_EQ("filesystem:https://google.com:99/t/foo;bar?q=a#ref", url2.spec()); + EXPECT_EQ("filesystem", url2.scheme()); + EXPECT_EQ("", url2.username()); + EXPECT_EQ("", url2.password()); + EXPECT_EQ("", url2.host()); + EXPECT_EQ("", url2.port()); + EXPECT_EQ(PORT_UNSPECIFIED, url2.IntPort()); + EXPECT_EQ("/foo;bar", url2.path()); + EXPECT_EQ("q=a", url2.query()); + EXPECT_EQ("ref", url2.ref()); + + const GURL* inner = url2.inner_url(); + ASSERT_TRUE(inner); + EXPECT_EQ("https", inner->scheme()); + EXPECT_EQ("", inner->username()); + EXPECT_EQ("", inner->password()); + EXPECT_EQ("google.com", inner->host()); + EXPECT_EQ("99", inner->port()); + EXPECT_EQ(99, inner->IntPort()); + EXPECT_EQ("/t", inner->path()); + EXPECT_EQ("", inner->query()); + EXPECT_EQ("", inner->ref()); +} + +TEST(GURLTest, IsValid) { + const char* valid_cases[] = { + "http://google.com", + "unknown://google.com", + 
"http://user:pass@google.com", + "http://google.com:12345", + "http://google.com:0", // 0 is a valid port + "http://google.com/path", + "http://google.com//path", + "http://google.com?k=v#fragment", + "http://user:pass@google.com:12345/path?k=v#fragment", + "http:/path", + "http:path", + }; + for (size_t i = 0; i < std::size(valid_cases); i++) { + EXPECT_TRUE(GURL(valid_cases[i]).is_valid()) + << "Case: " << valid_cases[i]; + } + + const char* invalid_cases[] = { + "http://?k=v", + "http:://google.com", + "http//google.com", + "http://google.com:12three45", + "file://server:123", // file: URLs cannot have a port + "file://server:0", + "://google.com", + "path", + }; + for (size_t i = 0; i < std::size(invalid_cases); i++) { + EXPECT_FALSE(GURL(invalid_cases[i]).is_valid()) + << "Case: " << invalid_cases[i]; + } +} + +TEST(GURLTest, ExtraSlashesBeforeAuthority) { + // According to RFC3986, the hierarchical part for URI with an authority + // must use only two slashes; GURL intentionally just ignores extra slashes + // if there are more than 2, and parses the following part as an authority. + GURL url("http:///host"); + EXPECT_EQ("host", url.host()); + EXPECT_EQ("/", url.path()); +} + +// Given invalid URLs, we should still get most of the components. +TEST(GURLTest, ComponentGettersWorkEvenForInvalidURL) { + constexpr struct InvalidURLTestExpectations { + const char* url; + const char* spec; + const char* scheme; + const char* host; + const char* port; + const char* path; + // Extend as needed... 
+ } expectations[] = { + { + "http:google.com:foo", + "http://google.com:foo/", + "http", + "google.com", + "foo", + "/", + }, + { + "https:google.com:foo", + "https://google.com:foo/", + "https", + "google.com", + "foo", + "/", + }, + }; + + for (const auto& e : expectations) { + const GURL url(e.url); + EXPECT_FALSE(url.is_valid()); + EXPECT_EQ(e.spec, url.possibly_invalid_spec()); + EXPECT_EQ(e.scheme, url.scheme()); + EXPECT_EQ("", url.username()); + EXPECT_EQ("", url.password()); + EXPECT_EQ(e.host, url.host()); + EXPECT_EQ(e.port, url.port()); + EXPECT_EQ(PORT_INVALID, url.IntPort()); + EXPECT_EQ(e.path, url.path()); + EXPECT_EQ("", url.query()); + EXPECT_EQ("", url.ref()); + } +} + +TEST(GURLTest, Resolve) { + // The tricky cases for relative URL resolving are tested in the + // canonicalizer unit test. Here, we just test that the GURL integration + // works properly. + struct ResolveCase { + const char* base; + const char* relative; + bool expected_valid; + const char* expected; + } resolve_cases[] = { + {"http://www.google.com/", "foo.html", true, + "http://www.google.com/foo.html"}, + {"http://www.google.com/foo/", "bar", true, + "http://www.google.com/foo/bar"}, + {"http://www.google.com/foo/", "/bar", true, "http://www.google.com/bar"}, + {"http://www.google.com/foo", "bar", true, "http://www.google.com/bar"}, + {"http://www.google.com/", "http://images.google.com/foo.html", true, + "http://images.google.com/foo.html"}, + {"http://www.google.com/", "http://images.\tgoogle.\ncom/\rfoo.html", + true, "http://images.google.com/foo.html"}, + {"http://www.google.com/blah/bloo?c#d", "../../../hello/./world.html?a#b", + true, "http://www.google.com/hello/world.html?a#b"}, + {"http://www.google.com/foo#bar", "#com", true, + "http://www.google.com/foo#com"}, + {"http://www.google.com/", "Https:images.google.com", true, + "https://images.google.com/"}, + // A non-standard base can be replaced with a standard absolute URL. 
+ {"data:blahblah", "http://google.com/", true, "http://google.com/"}, + {"data:blahblah", "http:google.com", true, "http://google.com/"}, + {"data:blahblah", "https:google.com", true, "https://google.com/"}, + // Filesystem URLs have different paths to test. + {"filesystem:http://www.google.com/type/", "foo.html", true, + "filesystem:http://www.google.com/type/foo.html"}, + {"filesystem:http://www.google.com/type/", "../foo.html", true, + "filesystem:http://www.google.com/type/foo.html"}, + // https://crbug.com/530123 - scheme validation (e.g. are "10.0.0.7:" + // or "x1:" valid schemes) when deciding if |relative| is an absolute url. + {"file:///some/dir/ip-relative.html", "10.0.0.7:8080/foo.html", true, + "file:///some/dir/10.0.0.7:8080/foo.html"}, + {"file:///some/dir/", "1://host", true, "file:///some/dir/1://host"}, + {"file:///some/dir/", "x1://host", true, "x1://host"}, + {"file:///some/dir/", "X1://host", true, "x1://host"}, + {"file:///some/dir/", "x.://host", true, "x.://host"}, + {"file:///some/dir/", "x+://host", true, "x+://host"}, + {"file:///some/dir/", "x-://host", true, "x-://host"}, + {"file:///some/dir/", "x!://host", true, "file:///some/dir/x!://host"}, + {"file:///some/dir/", "://host", true, "file:///some/dir/://host"}, + }; + + for (size_t i = 0; i < std::size(resolve_cases); i++) { + // 8-bit code path. + GURL input(resolve_cases[i].base); + GURL output = input.Resolve(resolve_cases[i].relative); + EXPECT_EQ(resolve_cases[i].expected_valid, output.is_valid()) << i; + EXPECT_EQ(resolve_cases[i].expected, output.spec()) << i; + EXPECT_EQ(output.SchemeIsFileSystem(), output.inner_url() != NULL); + + // Wide code path. 
+ GURL inputw(base::UTF8ToUTF16(resolve_cases[i].base)); + GURL outputw = + input.Resolve(base::UTF8ToUTF16(resolve_cases[i].relative)); + EXPECT_EQ(resolve_cases[i].expected_valid, outputw.is_valid()) << i; + EXPECT_EQ(resolve_cases[i].expected, outputw.spec()) << i; + EXPECT_EQ(outputw.SchemeIsFileSystem(), outputw.inner_url() != NULL); + } +} + +TEST(GURLTest, GetOrigin) { + struct TestCase { + const char* input; + const char* expected; + } cases[] = { + {"http://www.google.com", "http://www.google.com/"}, + {"javascript:window.alert(\"hello,world\");", ""}, + {"http://user:pass@www.google.com:21/blah#baz", + "http://www.google.com:21/"}, + {"http://user@www.google.com", "http://www.google.com/"}, + {"http://:pass@www.google.com", "http://www.google.com/"}, + {"http://:@www.google.com", "http://www.google.com/"}, + {"filesystem:http://www.google.com/temp/foo?q#b", + "http://www.google.com/"}, + {"filesystem:http://user:pass@google.com:21/blah#baz", + "http://google.com:21/"}, + {"blob:null/guid-goes-here", ""}, + {"blob:http://origin/guid-goes-here", "" /* should be http://origin/ */}, + }; + for (size_t i = 0; i < std::size(cases); i++) { + GURL url(cases[i].input); + GURL origin = url.DeprecatedGetOriginAsURL(); + EXPECT_EQ(cases[i].expected, origin.spec()); + } +} + +TEST(GURLTest, GetAsReferrer) { + struct TestCase { + const char* input; + const char* expected; + } cases[] = { + {"http://www.google.com", "http://www.google.com/"}, + {"http://user:pass@www.google.com:21/blah#baz", "http://www.google.com:21/blah"}, + {"http://user@www.google.com", "http://www.google.com/"}, + {"http://:pass@www.google.com", "http://www.google.com/"}, + {"http://:@www.google.com", "http://www.google.com/"}, + {"http://www.google.com/temp/foo?q#b", "http://www.google.com/temp/foo?q"}, + {"not a url", ""}, + {"unknown-scheme://foo.html", ""}, + {"file:///tmp/test.html", ""}, + {"https://www.google.com", "https://www.google.com/"}, + }; + for (size_t i = 0; i < std::size(cases); 
i++) { + GURL url(cases[i].input); + GURL origin = url.GetAsReferrer(); + EXPECT_EQ(cases[i].expected, origin.spec()); + } +} + +TEST(GURLTest, GetWithEmptyPath) { + struct TestCase { + const char* input; + const char* expected; + } cases[] = { + {"http://www.google.com", "http://www.google.com/"}, + {"javascript:window.alert(\"hello, world\");", ""}, + {"http://www.google.com/foo/bar.html?baz=22", "http://www.google.com/"}, + {"filesystem:http://www.google.com/temporary/bar.html?baz=22", "filesystem:http://www.google.com/temporary/"}, + {"filesystem:file:///temporary/bar.html?baz=22", "filesystem:file:///temporary/"}, + }; + + for (size_t i = 0; i < std::size(cases); i++) { + GURL url(cases[i].input); + GURL empty_path = url.GetWithEmptyPath(); + EXPECT_EQ(cases[i].expected, empty_path.spec()); + } +} + +TEST(GURLTest, GetWithoutFilename) { + struct TestCase { + const char* input; + const char* expected; + } cases[] = { + // Common Standard URLs. + {"https://www.google.com", "https://www.google.com/"}, + {"https://www.google.com/", "https://www.google.com/"}, + {"https://www.google.com/maps.htm", "https://www.google.com/"}, + {"https://www.google.com/maps/", "https://www.google.com/maps/"}, + {"https://www.google.com/index.html", "https://www.google.com/"}, + {"https://www.google.com/index.html?q=maps", "https://www.google.com/"}, + {"https://www.google.com/index.html#maps/", "https://www.google.com/"}, + {"https://foo:bar@www.google.com/maps.htm", "https://foo:bar@www.google.com/"}, + {"https://www.google.com/maps/au/index.html", "https://www.google.com/maps/au/"}, + {"https://www.google.com/maps/au/north", "https://www.google.com/maps/au/"}, + {"https://www.google.com/maps/au/north/", "https://www.google.com/maps/au/north/"}, + {"https://www.google.com/maps/au/index.html?q=maps#fragment/", "https://www.google.com/maps/au/"}, + {"http://www.google.com:8000/maps/au/index.html?q=maps#fragment/", "http://www.google.com:8000/maps/au/"}, + 
{"https://www.google.com/maps/au/north/?q=maps#fragment", "https://www.google.com/maps/au/north/"}, + {"https://www.google.com/maps/au/north?q=maps#fragment", "https://www.google.com/maps/au/"}, + // Less common standard URLs. + {"filesystem:http://www.google.com/temporary/bar.html?baz=22", "filesystem:http://www.google.com/temporary/"}, + {"file:///temporary/bar.html?baz=22","file:///temporary/"}, + {"ftp://foo/test/index.html", "ftp://foo/test/"}, + {"gopher://foo/test/index.html", "gopher://foo/test/"}, + {"ws://foo/test/index.html", "ws://foo/test/"}, + // Non-standard, hierarchical URLs. + {"chrome://foo/bar.html", "chrome://foo/"}, + {"httpa://foo/test/index.html", "httpa://foo/test/"}, + // Non-standard, non-hierarchical URLs. + {"blob:https://foo.bar/test/index.html", ""}, + {"about:blank", ""}, + {"data:foobar", ""}, + {"scheme:opaque_data", ""}, + // Invalid URLs. + {"foobar", ""}, + }; + + for (size_t i = 0; i < std::size(cases); i++) { + GURL url(cases[i].input); + GURL without_filename = url.GetWithoutFilename(); + EXPECT_EQ(cases[i].expected, without_filename.spec()) << i; + } +} + +TEST(GURLTest, GetWithoutRef) { + struct TestCase { + const char* input; + const char* expected; + } cases[] = { + // Common Standard URLs. 
+ {"https://www.google.com/index.html", + "https://www.google.com/index.html"}, + {"https://www.google.com/index.html#maps/", + "https://www.google.com/index.html"}, + + {"https://foo:bar@www.google.com/maps.htm", + "https://foo:bar@www.google.com/maps.htm"}, + {"https://foo:bar@www.google.com/maps.htm#fragment", + "https://foo:bar@www.google.com/maps.htm"}, + + {"https://www.google.com/maps/au/index.html?q=maps", + "https://www.google.com/maps/au/index.html?q=maps"}, + {"https://www.google.com/maps/au/index.html?q=maps#fragment/", + "https://www.google.com/maps/au/index.html?q=maps"}, + + {"http://www.google.com:8000/maps/au/index.html?q=maps", + "http://www.google.com:8000/maps/au/index.html?q=maps"}, + {"http://www.google.com:8000/maps/au/index.html?q=maps#fragment/", + "http://www.google.com:8000/maps/au/index.html?q=maps"}, + + {"https://www.google.com/maps/au/north/?q=maps", + "https://www.google.com/maps/au/north/?q=maps"}, + {"https://www.google.com/maps/au/north?q=maps#fragment", + "https://www.google.com/maps/au/north?q=maps"}, + + // Less common standard URLs. + {"filesystem:http://www.google.com/temporary/bar.html?baz=22", + "filesystem:http://www.google.com/temporary/bar.html?baz=22"}, + {"file:///temporary/bar.html?baz=22#fragment", + "file:///temporary/bar.html?baz=22"}, + + {"ftp://foo/test/index.html", "ftp://foo/test/index.html"}, + {"ftp://foo/test/index.html#fragment", "ftp://foo/test/index.html"}, + + {"gopher://foo/test/index.html", "gopher://foo/test/index.html"}, + {"gopher://foo/test/index.html#fragment", "gopher://foo/test/index.html"}, + + {"ws://foo/test/index.html", "ws://foo/test/index.html"}, + {"ws://foo/test/index.html#fragment", "ws://foo/test/index.html"}, + + // Non-standard, hierarchical URLs. 
+ {"chrome://foo/bar.html", "chrome://foo/bar.html"}, + {"chrome://foo/bar.html#fragment", "chrome://foo/bar.html"}, + + {"httpa://foo/test/index.html", "httpa://foo/test/index.html"}, + {"httpa://foo/test/index.html#fragment", "httpa://foo/test/index.html"}, + + // Non-standard, non-hierarchical URLs. + {"blob:https://foo.bar/test/index.html", + "blob:https://foo.bar/test/index.html"}, + {"blob:https://foo.bar/test/index.html#fragment", + "blob:https://foo.bar/test/index.html"}, + + {"about:blank", "about:blank"}, + {"about:blank#ref", "about:blank"}, + + {"data:foobar", "data:foobar"}, + {"scheme:opaque_data", "scheme:opaque_data"}, + // Invalid URLs. + {"foobar", ""}, + }; + + for (size_t i = 0; i < std::size(cases); i++) { + GURL url(cases[i].input); + GURL without_ref = url.GetWithoutRef(); + EXPECT_EQ(cases[i].expected, without_ref.spec()); + } +} + +TEST(GURLTest, Replacements) { + // The URL canonicalizer replacement test will handle most of these case. + // The most important thing to do here is to check that the proper + // canonicalizer gets called based on the scheme of the input. 
+ struct ReplaceCase { + using ApplyReplacementsFunc = GURL(const GURL&); + + const char* base; + ApplyReplacementsFunc* apply_replacements; + const char* expected; + } replace_cases[] = { + {.base = "http://www.google.com/foo/bar.html?foo#bar", + .apply_replacements = + +[](const GURL& url) { + GURL::Replacements replacements; + replacements.SetPathStr("/"); + replacements.ClearQuery(); + replacements.ClearRef(); + return url.ReplaceComponents(replacements); + }, + .expected = "http://www.google.com/"}, + {.base = "http://www.google.com/foo/bar.html?foo#bar", + .apply_replacements = + +[](const GURL& url) { + GURL::Replacements replacements; + replacements.SetSchemeStr("javascript"); + replacements.ClearUsername(); + replacements.ClearPassword(); + replacements.ClearHost(); + replacements.ClearPort(); + replacements.SetPathStr("window.open('foo');"); + replacements.ClearQuery(); + replacements.ClearRef(); + return url.ReplaceComponents(replacements); + }, + .expected = "javascript:window.open('foo');"}, + {.base = "file:///C:/foo/bar.txt", + .apply_replacements = + +[](const GURL& url) { + GURL::Replacements replacements; + replacements.SetSchemeStr("http"); + replacements.SetHostStr("www.google.com"); + replacements.SetPortStr("99"); + replacements.SetPathStr("/foo"); + replacements.SetQueryStr("search"); + replacements.SetRefStr("ref"); + return url.ReplaceComponents(replacements); + }, + .expected = "http://www.google.com:99/foo?search#ref"}, +#ifdef WIN32 + {.base = "http://www.google.com/foo/bar.html?foo#bar", + .apply_replacements = + +[](const GURL& url) { + GURL::Replacements replacements; + replacements.SetSchemeStr("file"); + replacements.ClearUsername(); + replacements.ClearPassword(); + replacements.ClearHost(); + replacements.ClearPort(); + replacements.SetPathStr("c:\\"); + replacements.ClearQuery(); + replacements.ClearRef(); + return url.ReplaceComponents(replacements); + }, + .expected = "file:///C:/"}, +#endif + {.base = 
"filesystem:http://www.google.com/foo/bar.html?foo#bar", + .apply_replacements = + +[](const GURL& url) { + GURL::Replacements replacements; + replacements.SetPathStr("/"); + replacements.ClearQuery(); + replacements.ClearRef(); + return url.ReplaceComponents(replacements); + }, + .expected = "filesystem:http://www.google.com/foo/"}, + // Lengthen the URL instead of shortening it, to test creation of + // inner_url. + {.base = "filesystem:http://www.google.com/foo/", + .apply_replacements = + +[](const GURL& url) { + GURL::Replacements replacements; + replacements.SetPathStr("bar.html"); + replacements.SetQueryStr("foo"); + replacements.SetRefStr("bar"); + return url.ReplaceComponents(replacements); + }, + .expected = "filesystem:http://www.google.com/foo/bar.html?foo#bar"}, + }; + + for (const ReplaceCase& c : replace_cases) { + GURL output = c.apply_replacements(GURL(c.base)); + + EXPECT_EQ(c.expected, output.spec()); + + EXPECT_EQ(output.SchemeIsFileSystem(), output.inner_url() != NULL); + if (output.SchemeIsFileSystem()) { + // TODO(mmenke): inner_url()->spec() is currently the same as the spec() + // for the GURL itself. This should be fixed. + // See https://crbug.com/619596 + EXPECT_EQ(c.expected, output.inner_url()->spec()); + } + } +} + +TEST(GURLTest, ClearFragmentOnDataUrl) { + // http://crbug.com/291747 - a data URL may legitimately have trailing + // whitespace in the spec after the ref is cleared. Test this does not trigger + // the Parsed importing validation DCHECK in GURL. + GURL url(" data: one # two "); + EXPECT_TRUE(url.is_valid()); + + // By default the trailing whitespace will have been stripped. + EXPECT_EQ("data: one #%20two", url.spec()); + + // Clear the URL's ref and observe the trailing whitespace. 
+ GURL::Replacements repl; + repl.ClearRef(); + GURL url_no_ref = url.ReplaceComponents(repl); + EXPECT_TRUE(url_no_ref.is_valid()); + EXPECT_EQ("data: one ", url_no_ref.spec()); + + // Importing a parsed URL via this constructor overload will retain trailing + // whitespace. + GURL import_url(url_no_ref.spec(), + url_no_ref.parsed_for_possibly_invalid_spec(), + url_no_ref.is_valid()); + EXPECT_TRUE(import_url.is_valid()); + EXPECT_EQ(url_no_ref, import_url); + EXPECT_EQ("data: one ", import_url.spec()); + EXPECT_EQ(" one ", import_url.path()); + + // For completeness, test that re-parsing the same URL rather than importing + // it trims the trailing whitespace. + GURL reparsed_url(url_no_ref.spec()); + EXPECT_TRUE(reparsed_url.is_valid()); + EXPECT_EQ("data: one", reparsed_url.spec()); +} + +TEST(GURLTest, PathForRequest) { + struct TestCase { + const char* input; + const char* expected; + const char* inner_expected; + } cases[] = { + {"http://www.google.com", "/", nullptr}, + {"http://www.google.com/", "/", nullptr}, + {"http://www.google.com/foo/bar.html?baz=22", "/foo/bar.html?baz=22", + nullptr}, + {"http://www.google.com/foo/bar.html#ref", "/foo/bar.html", nullptr}, + {"http://www.google.com/foo/bar.html?query#ref", "/foo/bar.html?query", + nullptr}, + {"filesystem:http://www.google.com/temporary/foo/bar.html?query#ref", + "/foo/bar.html?query", "/temporary"}, + {"filesystem:http://www.google.com/temporary/foo/bar.html?query", + "/foo/bar.html?query", "/temporary"}, + }; + + for (size_t i = 0; i < std::size(cases); i++) { + GURL url(cases[i].input); + EXPECT_EQ(cases[i].expected, url.PathForRequest()); + EXPECT_EQ(cases[i].expected, url.PathForRequestPiece()); + EXPECT_EQ(cases[i].inner_expected == NULL, url.inner_url() == NULL); + if (url.inner_url() && cases[i].inner_expected) { + EXPECT_EQ(cases[i].inner_expected, url.inner_url()->PathForRequest()); + EXPECT_EQ(cases[i].inner_expected, + url.inner_url()->PathForRequestPiece()); + } + } +} + +TEST(GURLTest, 
EffectiveIntPort) { + struct PortTest { + const char* spec; + int expected_int_port; + } port_tests[] = { + // http + {"http://www.google.com/", 80}, + {"http://www.google.com:80/", 80}, + {"http://www.google.com:443/", 443}, + + // https + {"https://www.google.com/", 443}, + {"https://www.google.com:443/", 443}, + {"https://www.google.com:80/", 80}, + + // ftp + {"ftp://www.google.com/", 21}, + {"ftp://www.google.com:21/", 21}, + {"ftp://www.google.com:80/", 80}, + + // file - no port + {"file://www.google.com/", PORT_UNSPECIFIED}, + {"file://www.google.com:443/", PORT_UNSPECIFIED}, + + // data - no port + {"data:www.google.com:90", PORT_UNSPECIFIED}, + {"data:www.google.com", PORT_UNSPECIFIED}, + + // filesystem - no port + {"filesystem:http://www.google.com:90/t/foo", PORT_UNSPECIFIED}, + {"filesystem:file:///t/foo", PORT_UNSPECIFIED}, + }; + + for (size_t i = 0; i < std::size(port_tests); i++) { + GURL url(port_tests[i].spec); + EXPECT_EQ(port_tests[i].expected_int_port, url.EffectiveIntPort()); + } +} + +TEST(GURLTest, IPAddress) { + struct IPTest { + const char* spec; + bool expected_ip; + } ip_tests[] = { + {"http://www.google.com/", false}, + {"http://192.168.9.1/", true}, + {"http://192.168.9.1.2/", false}, + {"http://192.168.m.1/", false}, + {"http://2001:db8::1/", false}, + {"http://[2001:db8::1]/", true}, + {"", false}, + {"some random input!", false}, + }; + + for (size_t i = 0; i < std::size(ip_tests); i++) { + GURL url(ip_tests[i].spec); + EXPECT_EQ(ip_tests[i].expected_ip, url.HostIsIPAddress()); + } +} + +TEST(GURLTest, HostNoBrackets) { + struct TestCase { + const char* input; + const char* expected_host; + const char* expected_plainhost; + } cases[] = { + {"http://www.google.com", "www.google.com", "www.google.com"}, + {"http://[2001:db8::1]/", "[2001:db8::1]", "2001:db8::1"}, + {"http://[::]/", "[::]", "::"}, + + // Don't require a valid URL, but don't crash either. 
+ {"http://[]/", "[]", ""}, + {"http://[x]/", "[x]", "x"}, + {"http://[x/", "[x", "[x"}, + {"http://x]/", "x]", "x]"}, + {"http://[/", "[", "["}, + {"http://]/", "]", "]"}, + {"", "", ""}, + }; + for (size_t i = 0; i < std::size(cases); i++) { + GURL url(cases[i].input); + EXPECT_EQ(cases[i].expected_host, url.host()); + EXPECT_EQ(cases[i].expected_plainhost, url.HostNoBrackets()); + EXPECT_EQ(cases[i].expected_plainhost, url.HostNoBracketsPiece()); + } +} + +TEST(GURLTest, DomainIs) { + GURL url_1("http://google.com/foo"); + EXPECT_TRUE(url_1.DomainIs("google.com")); + + // Subdomain and port are ignored. + GURL url_2("http://www.google.com:99/foo"); + EXPECT_TRUE(url_2.DomainIs("google.com")); + + // Different top-level domain. + GURL url_3("http://www.google.com.cn/foo"); + EXPECT_FALSE(url_3.DomainIs("google.com")); + + // Different host name. + GURL url_4("http://www.iamnotgoogle.com/foo"); + EXPECT_FALSE(url_4.DomainIs("google.com")); + + // The input must be lower-cased otherwise DomainIs returns false. + GURL url_5("http://www.google.com/foo"); + EXPECT_FALSE(url_5.DomainIs("Google.com")); + + // If the URL is invalid, DomainIs returns false. + GURL invalid_url("google.com"); + EXPECT_FALSE(invalid_url.is_valid()); + EXPECT_FALSE(invalid_url.DomainIs("google.com")); + + GURL url_with_escape_chars("https://www.,.test"); + EXPECT_TRUE(url_with_escape_chars.is_valid()); + EXPECT_EQ(url_with_escape_chars.host(), "www.%2C.test"); + EXPECT_TRUE(url_with_escape_chars.DomainIs("%2C.test")); +} + +TEST(GURLTest, DomainIsTerminatingDotBehavior) { + // If the host part ends with a dot, it matches input domains + // with or without a dot. 
+ GURL url_with_dot("http://www.google.com./foo"); + EXPECT_TRUE(url_with_dot.DomainIs("google.com")); + EXPECT_TRUE(url_with_dot.DomainIs("google.com.")); + EXPECT_TRUE(url_with_dot.DomainIs(".com")); + EXPECT_TRUE(url_with_dot.DomainIs(".com.")); + + // But, if the host name doesn't end with a dot and the input + // domain does, then it's considered to not match. + GURL url_without_dot("http://google.com/foo"); + EXPECT_FALSE(url_without_dot.DomainIs("google.com.")); + + // If the URL ends with two dots, it doesn't match. + GURL url_with_two_dots("http://www.google.com../foo"); + EXPECT_FALSE(url_with_two_dots.DomainIs("google.com")); +} + +TEST(GURLTest, DomainIsWithFilesystemScheme) { + GURL url_1("filesystem:http://www.google.com:99/foo/"); + EXPECT_TRUE(url_1.DomainIs("google.com")); + + GURL url_2("filesystem:http://www.iamnotgoogle.com/foo/"); + EXPECT_FALSE(url_2.DomainIs("google.com")); +} + +// Newlines should be stripped from inputs. +TEST(GURLTest, Newlines) { + // Constructor. + GURL url_1(" \t ht\ntp://\twww.goo\rgle.com/as\ndf \n "); + EXPECT_EQ("http://www.google.com/asdf", url_1.spec()); + EXPECT_FALSE( + url_1.parsed_for_possibly_invalid_spec().potentially_dangling_markup); + + // Relative path resolver. + GURL url_2 = url_1.Resolve(" \n /fo\to\r "); + EXPECT_EQ("http://www.google.com/foo", url_2.spec()); + EXPECT_FALSE( + url_2.parsed_for_possibly_invalid_spec().potentially_dangling_markup); + + // Constructor. + GURL url_3(" \t ht\ntp://\twww.goo\rgle.com/as\ndf< \n "); + EXPECT_EQ("http://www.google.com/asdf%3C", url_3.spec()); + EXPECT_TRUE( + url_3.parsed_for_possibly_invalid_spec().potentially_dangling_markup); + + // Relative path resolver. + GURL url_4 = url_1.Resolve(" \n /fo\to<\r "); + EXPECT_EQ("http://www.google.com/foo%3C", url_4.spec()); + EXPECT_TRUE( + url_4.parsed_for_possibly_invalid_spec().potentially_dangling_markup); + + // Note that newlines are NOT stripped from ReplaceComponents. 
+} + +TEST(GURLTest, IsStandard) { + GURL a("http:foo/bar"); + EXPECT_TRUE(a.IsStandard()); + + GURL b("foo:bar/baz"); + EXPECT_FALSE(b.IsStandard()); + + GURL c("foo://bar/baz"); + EXPECT_FALSE(c.IsStandard()); + + GURL d("cid:bar@baz"); + EXPECT_FALSE(d.IsStandard()); +} + +TEST(GURLTest, SchemeIsHTTPOrHTTPS) { + EXPECT_TRUE(GURL("http://bar/").SchemeIsHTTPOrHTTPS()); + EXPECT_TRUE(GURL("HTTPS://BAR").SchemeIsHTTPOrHTTPS()); + EXPECT_FALSE(GURL("ftp://bar/").SchemeIsHTTPOrHTTPS()); +} + +TEST(GURLTest, SchemeIsWSOrWSS) { + EXPECT_TRUE(GURL("WS://BAR/").SchemeIsWSOrWSS()); + EXPECT_TRUE(GURL("wss://bar/").SchemeIsWSOrWSS()); + EXPECT_FALSE(GURL("http://bar/").SchemeIsWSOrWSS()); +} + +TEST(GURLTest, SchemeIsCryptographic) { + EXPECT_TRUE(GURL("https://foo.bar.com/").SchemeIsCryptographic()); + EXPECT_TRUE(GURL("HTTPS://foo.bar.com/").SchemeIsCryptographic()); + EXPECT_TRUE(GURL("HtTpS://foo.bar.com/").SchemeIsCryptographic()); + + EXPECT_TRUE(GURL("wss://foo.bar.com/").SchemeIsCryptographic()); + EXPECT_TRUE(GURL("WSS://foo.bar.com/").SchemeIsCryptographic()); + EXPECT_TRUE(GURL("WsS://foo.bar.com/").SchemeIsCryptographic()); + + EXPECT_FALSE(GURL("http://foo.bar.com/").SchemeIsCryptographic()); + EXPECT_FALSE(GURL("ws://foo.bar.com/").SchemeIsCryptographic()); +} + +TEST(GURLTest, SchemeIsCryptographicStatic) { + EXPECT_TRUE(GURL::SchemeIsCryptographic("https")); + EXPECT_TRUE(GURL::SchemeIsCryptographic("wss")); + EXPECT_FALSE(GURL::SchemeIsCryptographic("http")); + EXPECT_FALSE(GURL::SchemeIsCryptographic("ws")); + EXPECT_FALSE(GURL::SchemeIsCryptographic("ftp")); +} + +TEST(GURLTest, SchemeIsBlob) { + EXPECT_TRUE(GURL("BLOB://BAR/").SchemeIsBlob()); + EXPECT_TRUE(GURL("blob://bar/").SchemeIsBlob()); + EXPECT_FALSE(GURL("http://bar/").SchemeIsBlob()); +} + +TEST(GURLTest, SchemeIsLocal) { + EXPECT_TRUE(GURL("BLOB://BAR/").SchemeIsLocal()); + EXPECT_TRUE(GURL("blob://bar/").SchemeIsLocal()); + EXPECT_TRUE(GURL("DATA:TEXT/HTML,BAR").SchemeIsLocal()); + 
EXPECT_TRUE(GURL("data:text/html,bar").SchemeIsLocal()); + EXPECT_TRUE(GURL("ABOUT:BAR").SchemeIsLocal()); + EXPECT_TRUE(GURL("about:bar").SchemeIsLocal()); + EXPECT_TRUE(GURL("FILESYSTEM:HTTP://FOO.EXAMPLE/BAR").SchemeIsLocal()); + EXPECT_TRUE(GURL("filesystem:http://foo.example/bar").SchemeIsLocal()); + + EXPECT_FALSE(GURL("http://bar/").SchemeIsLocal()); + EXPECT_FALSE(GURL("file:///bar").SchemeIsLocal()); +} + +// Tests that the 'content' of the URL is properly extracted. This can be +// complex in cases such as multiple schemes (view-source:http:) or for +// javascript URLs. See GURL::GetContent for more details. +TEST(GURLTest, ContentForNonStandardURLs) { + struct TestCase { + const char* url; + const char* expected; + } cases[] = { + {"null", ""}, + {"not-a-standard-scheme:this is arbitrary content", + "this is arbitrary content"}, + + // When there are multiple schemes, only the first is excluded from the + // content. Note also that for e.g. 'http://', the '//' is part of the + // content not the scheme. + {"view-source:http://example.com/path", "http://example.com/path"}, + {"blob:http://example.com/GUID", "http://example.com/GUID"}, + {"blob://http://example.com/GUID", "//http://example.com/GUID"}, + {"blob:http://user:password@example.com/GUID", + "http://user:password@example.com/GUID"}, + + // The octothorpe character ('#') marks the end of the URL content, and + // the start of the fragment. It should not be included in the content. + {"http://www.example.com/GUID#ref", "www.example.com/GUID"}, + {"http://me:secret@example.com/GUID/#ref", "me:secret@example.com/GUID/"}, + {"data:text/html,Question?

idea
", + "text/html,Question?%3Cdiv%20style=%22color:%20"}, + + // TODO(mkwst): This seems like a bug. https://crbug.com/513600 + {"filesystem:http://example.com/path", "/"}, + + // Javascript URLs include '#' symbols in their content. + {"javascript:#", "#"}, + {"javascript:alert('#');", "alert('#');"}, + }; + + for (const auto& test : cases) { + GURL url(test.url); + EXPECT_EQ(test.expected, url.GetContent()) << test.url; + EXPECT_EQ(test.expected, url.GetContentPiece()) << test.url; + } +} + +// Tests that the URL path is properly extracted for unusual URLs. This can be +// complex in cases such as multiple schemes (view-source:http:) or when +// octothorpes ('#') are involved. +TEST(GURLTest, PathForNonStandardURLs) { + struct TestCase { + const char* url; + const char* expected; + } cases[] = { + {"null", ""}, + {"not-a-standard-scheme:this is arbitrary content", + "this is arbitrary content"}, + {"view-source:http://example.com/path", "http://example.com/path"}, + {"blob:http://example.com/GUID", "http://example.com/GUID"}, + {"blob://http://example.com/GUID", "//http://example.com/GUID"}, + {"blob:http://user:password@example.com/GUID", + "http://user:password@example.com/GUID"}, + + {"http://www.example.com/GUID#ref", "/GUID"}, + {"http://me:secret@example.com/GUID/#ref", "/GUID/"}, + {"data:text/html,Question?
idea
", + "text/html,Question"}, + + // TODO(mkwst): This seems like a bug. https://crbug.com/513600 + {"filesystem:http://example.com/path", "/"}, + }; + + for (const auto& test : cases) { + GURL url(test.url); + EXPECT_EQ(test.expected, url.path()) << test.url; + } +} + +TEST(GURLTest, EqualsIgnoringRef) { + const struct { + const char* url_a; + const char* url_b; + bool are_equals; + } kTestCases[] = { + // No ref. + {"http://a.com", "http://a.com", true}, + {"http://a.com", "http://b.com", false}, + + // Same Ref. + {"http://a.com#foo", "http://a.com#foo", true}, + {"http://a.com#foo", "http://b.com#foo", false}, + + // Different Refs. + {"http://a.com#foo", "http://a.com#bar", true}, + {"http://a.com#foo", "http://b.com#bar", false}, + + // One has a ref, the other doesn't. + {"http://a.com#foo", "http://a.com", true}, + {"http://a.com#foo", "http://b.com", false}, + + // Empty refs. + {"http://a.com#", "http://a.com#", true}, + {"http://a.com#", "http://a.com", true}, + + // URLs that differ only by their last character. + {"http://aaa", "http://aab", false}, + {"http://aaa#foo", "http://aab#foo", false}, + + // Different size of the part before the ref. 
+ {"http://123#a", "http://123456#a", false}, + + // Blob URLs + {"blob:http://a.com#foo", "blob:http://a.com#foo", true}, + {"blob:http://a.com#foo", "blob:http://a.com#bar", true}, + {"blob:http://a.com#foo", "blob:http://b.com#bar", false}, + + // Filesystem URLs + {"filesystem:http://a.com#foo", "filesystem:http://a.com#foo", true}, + {"filesystem:http://a.com#foo", "filesystem:http://a.com#bar", true}, + {"filesystem:http://a.com#foo", "filesystem:http://b.com#bar", false}, + + // Data URLs + {"data:text/html,a#foo", "data:text/html,a#bar", true}, + {"data:text/html,a#foo", "data:text/html,a#foo", true}, + {"data:text/html,a#foo", "data:text/html,b#foo", false}, + }; + + for (const auto& test_case : kTestCases) { + SCOPED_TRACE(testing::Message() + << std::endl + << "url_a = " << test_case.url_a << std::endl + << "url_b = " << test_case.url_b << std::endl); + // A versus B. + EXPECT_EQ(test_case.are_equals, + GURL(test_case.url_a).EqualsIgnoringRef(GURL(test_case.url_b))); + // B versus A. + EXPECT_EQ(test_case.are_equals, + GURL(test_case.url_b).EqualsIgnoringRef(GURL(test_case.url_a))); + } +} + +TEST(GURLTest, DebugAlias) { + GURL url("https://foo.com/bar"); + DEBUG_ALIAS_FOR_GURL(url_debug_alias, url); + EXPECT_STREQ("https://foo.com/bar", url_debug_alias); +} + +TEST(GURLTest, InvalidHost) { + // This contains an invalid percent escape (%T%) and also a valid + // percent escape that's not 7-bit ascii (%ae), so that the unescaped + // host contains both an invalid percent escape and invalid UTF-8. + GURL url("http://%T%Ae"); + + EXPECT_FALSE(url.is_valid()); + EXPECT_TRUE(url.SchemeIs(url::kHttpScheme)); + + // The invalid percent escape becomes an escaped percent sign (%25), and the + // invalid UTF-8 character becomes REPLACEMENT CHARACTER' (U+FFFD) encoded as + // UTF-8. 
+ EXPECT_EQ(url.host_piece(), "%25t%EF%BF%BD"); +} + +TEST(GURLTest, PortZero) { + GURL port_zero_url("http://127.0.0.1:0/blah"); + + // https://url.spec.whatwg.org/#port-state says that the port 1) consists of + // ASCII digits (this excludes negative numbers) and 2) cannot be greater than + // 2^16-1. This means that port=0 should be valid. + EXPECT_TRUE(port_zero_url.is_valid()); + EXPECT_EQ("0", port_zero_url.port()); + EXPECT_EQ("127.0.0.1", port_zero_url.host()); + EXPECT_EQ("http", port_zero_url.scheme()); + + // https://crbug.com/1065532: SchemeHostPort would previously incorrectly + // consider port=0 to be invalid. + SchemeHostPort scheme_host_port(port_zero_url); + EXPECT_TRUE(scheme_host_port.IsValid()); + EXPECT_EQ(port_zero_url.scheme(), scheme_host_port.scheme()); + EXPECT_EQ(port_zero_url.host(), scheme_host_port.host()); + EXPECT_EQ(port_zero_url.port(), + base::NumberToString(scheme_host_port.port())); + + // https://crbug.com/1065532: The SchemeHostPort problem above would lead to + // bizarre results below - resolved origin would incorrectly be returned as an + // opaque origin derived from |another_origin|. + url::Origin another_origin = url::Origin::Create(GURL("http://other.com")); + url::Origin resolved_origin = + url::Origin::Resolve(port_zero_url, another_origin); + EXPECT_FALSE(resolved_origin.opaque()); + EXPECT_EQ(port_zero_url.scheme(), resolved_origin.scheme()); + EXPECT_EQ(port_zero_url.host(), resolved_origin.host()); + EXPECT_EQ(port_zero_url.port(), base::NumberToString(resolved_origin.port())); + + // port=0 and default HTTP port are different. 
+ GURL default_port("http://127.0.0.1/foo"); + EXPECT_EQ(0, SchemeHostPort(port_zero_url).port()); + EXPECT_EQ(80, SchemeHostPort(default_port).port()); + url::Origin default_port_origin = url::Origin::Create(default_port); + EXPECT_FALSE(default_port_origin.IsSameOriginWith(resolved_origin)); +} + +class GURLTestTraits { + public: + using UrlType = GURL; + + static UrlType CreateUrlFromString(base::StringPiece s) { return GURL(s); } + static bool IsAboutBlank(const UrlType& url) { return url.IsAboutBlank(); } + static bool IsAboutSrcdoc(const UrlType& url) { return url.IsAboutSrcdoc(); } + + // Only static members. + GURLTestTraits() = delete; +}; + +INSTANTIATE_TYPED_TEST_SUITE_P(GURL, AbstractUrlTest, GURLTestTraits); + +} // namespace url diff --git a/ipc/BUILD.gn b/ipc/BUILD.gn new file mode 100644 index 00000000000..d7801af54d7 --- /dev/null +++ b/ipc/BUILD.gn @@ -0,0 +1,38 @@ +# Copyright 2016 The Chromium Authors +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. + +import("//testing/test.gni") + +component("url_ipc") { + sources = [ + "url_ipc_export.h", + "url_param_traits.cc", + "url_param_traits.h", + ] + + defines = [ "URL_IPC_IMPLEMENTATION" ] + + public_deps = [ + "//ipc", + "//url", + ] + deps = [ "//base" ] +} + +# IPC unit tests aren't build on iOS. 
+if (!is_ios) { + source_set("url_ipc_unittests") { + testonly = true + + sources = [ "url_param_traits_unittest.cc" ] + + deps = [ + ":url_ipc", + "//base", + "//ipc:test_support", + "//testing/gtest", + "//url:url", + ] + } +} diff --git a/ipc/OWNERS b/ipc/OWNERS new file mode 100644 index 00000000000..146c3c3cd62 --- /dev/null +++ b/ipc/OWNERS @@ -0,0 +1,2 @@ +per-file *_param_traits*.*=set noparent +per-file *_param_traits*.*=file://ipc/SECURITY_OWNERS diff --git a/ipc/url_ipc_export.h b/ipc/url_ipc_export.h new file mode 100644 index 00000000000..ca500ab0c90 --- /dev/null +++ b/ipc/url_ipc_export.h @@ -0,0 +1,29 @@ +// Copyright 2016 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef URL_IPC_URL_IPC_EXPORT_H_ +#define URL_IPC_URL_IPC_EXPORT_H_ + +#if defined(COMPONENT_BUILD) +#if defined(WIN32) + +#if defined(URL_IPC_IMPLEMENTATION) +#define URL_IPC_EXPORT __declspec(dllexport) +#else +#define URL_IPC_EXPORT __declspec(dllimport) +#endif // defined(URL_IPC_IMPLEMENTATION) + +#else // defined(WIN32) +#if defined(URL_IPC_IMPLEMENTATION) +#define URL_IPC_EXPORT __attribute__((visibility("default"))) +#else +#define URL_IPC_EXPORT +#endif +#endif + +#else // defined(COMPONENT_BUILD) +#define URL_IPC_EXPORT +#endif + +#endif // URL_IPC_URL_IPC_EXPORT_H_ diff --git a/ipc/url_param_traits.cc b/ipc/url_param_traits.cc new file mode 100644 index 00000000000..6999c82d586 --- /dev/null +++ b/ipc/url_param_traits.cc @@ -0,0 +1,56 @@ +// Copyright 2016 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#include "url/ipc/url_param_traits.h" + +#include + +#include "base/pickle.h" +#include "url/gurl.h" +#include "url/url_constants.h" + +namespace IPC { + +void ParamTraits::Write(base::Pickle* m, const GURL& p) { + if (p.possibly_invalid_spec().length() > url::kMaxURLChars) { + m->WriteString(std::string()); + return; + } + + // Beware of print-parse inconsistency which would change an invalid + // URL into a valid one. Ideally, the message would contain this flag + // so that the read side could make the check, but performing it here + // avoids changing the on-the-wire representation of such a fundamental + // type as GURL. See https://crbug.com/166486 for additional work in + // this area. + if (!p.is_valid()) { + m->WriteString(std::string()); + return; + } + + m->WriteString(p.possibly_invalid_spec()); + // TODO(brettw) bug 684583: Add encoding for query params. +} + +bool ParamTraits::Read(const base::Pickle* m, + base::PickleIterator* iter, + GURL* p) { + std::string s; + if (!iter->ReadString(&s) || s.length() > url::kMaxURLChars) { + *p = GURL(); + return false; + } + *p = GURL(s); + if (!s.empty() && !p->is_valid()) { + *p = GURL(); + return false; + } + return true; +} + +void ParamTraits::Log(const GURL& p, std::string* l) { + l->append(p.spec()); +} + +} // namespace IPC diff --git a/ipc/url_param_traits.h b/ipc/url_param_traits.h new file mode 100644 index 00000000000..cdb57242402 --- /dev/null +++ b/ipc/url_param_traits.h @@ -0,0 +1,33 @@ +// Copyright 2016 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#ifndef URL_IPC_URL_PARAM_TRAITS_H_ +#define URL_IPC_URL_PARAM_TRAITS_H_ + +#include <string> + +#include "ipc/ipc_param_traits.h" +#include "url/gurl.h" +#include "url/ipc/url_ipc_export.h" + +namespace base { +class Pickle; +class PickleIterator; +} // namespace base + +namespace IPC { + +template <> +struct URL_IPC_EXPORT ParamTraits<GURL> { + typedef GURL param_type; + static void Write(base::Pickle* m, const param_type& p); + static bool Read(const base::Pickle* m, + base::PickleIterator* iter, + param_type* p); + static void Log(const param_type& p, std::string* l); +}; + +} // namespace IPC + +#endif // URL_IPC_URL_PARAM_TRAITS_H_ diff --git a/ipc/url_param_traits_unittest.cc b/ipc/url_param_traits_unittest.cc new file mode 100644 index 00000000000..78bed35ed23 --- /dev/null +++ b/ipc/url_param_traits_unittest.cc @@ -0,0 +1,159 @@ +// Copyright 2016 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include <string> + +#include "ipc/ipc_message.h" +#include "ipc/ipc_message_utils.h" +#include "testing/gtest/include/gtest/gtest.h" +#include "url/gurl.h" +#include "url/ipc/url_param_traits.h" + +namespace { + +GURL BounceUrl(const GURL& input) { + IPC::Message msg(1, 2, IPC::Message::PRIORITY_NORMAL); + IPC::ParamTraits<GURL>::Write(&msg, input); + + GURL output; + base::PickleIterator iter(msg); + EXPECT_TRUE(IPC::ParamTraits<GURL>::Read(&msg, &iter, &output)); + + return output; +} + +void ExpectSerializationRoundtrips(const GURL& input) { + SCOPED_TRACE(testing::Message() + << "Input GURL: " << input.possibly_invalid_spec()); + GURL output = BounceUrl(input); + + // We want to test each component individually to make sure its range was + // correctly serialized and deserialized, not just the spec.
+ EXPECT_EQ(input.possibly_invalid_spec(), output.possibly_invalid_spec()); + EXPECT_EQ(input.is_valid(), output.is_valid()); + EXPECT_EQ(input.scheme(), output.scheme()); + EXPECT_EQ(input.username(), output.username()); + EXPECT_EQ(input.password(), output.password()); + EXPECT_EQ(input.host(), output.host()); + EXPECT_EQ(input.port(), output.port()); + EXPECT_EQ(input.path(), output.path()); + EXPECT_EQ(input.query(), output.query()); + EXPECT_EQ(input.ref(), output.ref()); +} + +} // namespace + +// Tests that serialize/deserialize correctly understand each other. +TEST(IPCMessageTest, SerializeGurl_Basic) { + const char* serialize_cases[] = { + "http://www.google.com/", + "http://user:pass@host.com:888/foo;bar?baz#nop", + }; + + for (const char* test_input : serialize_cases) { + SCOPED_TRACE(testing::Message() << "Test input: " << test_input); + GURL input(test_input); + ExpectSerializationRoundtrips(input); + } +} + +// Test of an excessively long GURL. +TEST(IPCMessageTest, SerializeGurl_ExcessivelyLong) { + const std::string url = + std::string("http://example.org/").append(url::kMaxURLChars + 1, 'a'); + GURL input(url.c_str()); + GURL output = BounceUrl(input); + EXPECT_TRUE(output.is_empty()); +} + +// Test of an invalid GURL. +TEST(IPCMessageTest, SerializeGurl_InvalidUrl) { + IPC::Message msg; + msg.WriteString("#inva://idurl/"); + GURL output; + base::PickleIterator iter(msg); + EXPECT_FALSE(IPC::ParamTraits<GURL>::Read(&msg, &iter, &output)); +} + +// Test of a corrupt deserialization input. +TEST(IPCMessageTest, SerializeGurl_CorruptPayload) { + IPC::Message msg(1, 2, IPC::Message::PRIORITY_NORMAL); + msg.WriteInt(99); + GURL output; + base::PickleIterator iter(msg); + EXPECT_FALSE(IPC::ParamTraits<GURL>::Read(&msg, &iter, &output)); +} + +// Test for the GURL testcase based on https://crbug.com/1214098 (which in turn +// was based on ContentSecurityPolicyBrowserTest.FileURLs).
+TEST(IPCMessageTest, SerializeGurl_WindowsDriveInPathReplacement) { + { + // #1: Try creating a file URL with a non-empty hostname. + GURL url_without_windows_drive_letter("file://hostname/"); + EXPECT_EQ("/", url_without_windows_drive_letter.path()); + EXPECT_EQ("hostname", url_without_windows_drive_letter.host()); + ExpectSerializationRoundtrips(url_without_windows_drive_letter); + } + + { + // #2: Use GURL::Replacement to create a GURL with 1) a path that starts + // with a Windows drive letter and 2) has a non-empty hostname (inherited + // from `url_without_windows_drive_letter` above). This used to not go + // through the DoParseUNC path that normally strips the hostname (for more + // details, see https://crbug.com/1214098#c4). + GURL::Replacements repl; + const std::string kNewPath = "/C:/dir/file.txt"; + repl.SetPathStr(kNewPath); + GURL url_made_with_replace_components = + GURL("file://hostname/").ReplaceComponents(repl); + + EXPECT_EQ(kNewPath, url_made_with_replace_components.path()); + EXPECT_EQ("hostname", url_made_with_replace_components.host()); + EXPECT_EQ("file://hostname/C:/dir/file.txt", + url_made_with_replace_components.spec()); + // This is the MAIN VERIFICATION in this test. This used to fail on Windows, + // see https://crbug.com/1214098. + ExpectSerializationRoundtrips(url_made_with_replace_components); + } + + { + // #3: Try to create a URL with a Windows drive letter and a non-empty + // hostname directly. + GURL url_created_directly("file://hostname/C:/dir/file.txt"); + EXPECT_EQ("/C:/dir/file.txt", url_created_directly.path()); + EXPECT_EQ("hostname", url_created_directly.host()); + EXPECT_EQ("file://hostname/C:/dir/file.txt", url_created_directly.spec()); + ExpectSerializationRoundtrips(url_created_directly); + + // The URL created directly and the URL created through ReplaceComponents + // should be the same. 
+ GURL::Replacements repl; + const std::string kNewPath = "/C:/dir/file.txt"; + repl.SetPathStr(kNewPath); + GURL url_made_with_replace_components = + GURL("file://hostname/").ReplaceComponents(repl); + EXPECT_EQ(url_created_directly.spec(), + url_made_with_replace_components.spec()); + } + + { + // #4: Try to create a URL with a Windows drive letter and "localhost" as + // hostname directly. + GURL url_created_directly("file://localhost/C:/dir/file.txt"); + EXPECT_EQ("/C:/dir/file.txt", url_created_directly.path()); + EXPECT_EQ("", url_created_directly.host()); + EXPECT_EQ("file:///C:/dir/file.txt", url_created_directly.spec()); + ExpectSerializationRoundtrips(url_created_directly); + + // The URL created directly and the URL created through ReplaceComponents + // should be the same. + GURL::Replacements repl; + const std::string kNewPath = "/C:/dir/file.txt"; + repl.SetPathStr(kNewPath); + GURL url_made_with_replace_components = + GURL("file://localhost/").ReplaceComponents(repl); + EXPECT_EQ(url_created_directly.spec(), + url_made_with_replace_components.spec()); + } +} diff --git a/mojom/BUILD.gn b/mojom/BUILD.gn new file mode 100644 index 00000000000..a936a432bb3 --- /dev/null +++ b/mojom/BUILD.gn @@ -0,0 +1,141 @@ +# Copyright 2016 The Chromium Authors +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. 
+ +import("//mojo/public/tools/bindings/mojom.gni") + +mojom("url_mojom_gurl") { + generate_java = true + sources = [ "url.mojom" ] + + cpp_typemaps = [ + { + types = [ + { + mojom = "url.mojom.Url" + cpp = "::GURL" + }, + ] + traits_headers = [ "//url/mojom/url_gurl_mojom_traits.h" ] + traits_public_deps = [ + ":mojom_traits", + "//url", + ] + }, + ] + + blink_cpp_typemaps = [ + { + types = [ + { + mojom = "url.mojom.Url" + cpp = "::blink::KURL" + force_serialize = true + }, + ] + traits_headers = [ + "//third_party/blink/renderer/platform/mojo/kurl_mojom_traits.h", + "//third_party/blink/renderer/platform/weborigin/kurl_hash.h", + ] + traits_public_deps = [ "//url" ] + }, + ] + + webui_module_path = "chrome://resources/mojo/url/mojom" +} + +mojom("url_mojom_origin") { + generate_java = true + sources = [ "origin.mojom" ] + + public_deps = [ + ":url_mojom_gurl", + "//mojo/public/mojom/base", + ] + + check_includes_blink = false + + cpp_typemaps = [ + { + types = [ + { + mojom = "url.mojom.Origin" + cpp = "::url::Origin" + }, + ] + traits_headers = [ "//url/mojom/origin_mojom_traits.h" ] + traits_public_deps = [ + ":mojom_traits", + "//url", + ] + }, + ] + + blink_cpp_typemaps = [ + { + types = [ + { + mojom = "url.mojom.Origin" + cpp = "::scoped_refptr<const ::blink::SecurityOrigin>" + nullable_is_same_type = true + }, + ] + traits_headers = [ "//third_party/blink/renderer/platform/mojo/security_origin_mojom_traits.h" ] + traits_public_deps = [ "//url" ] + }, + ] + + webui_module_path = "chrome://resources/mojo/url/mojom" +} + +mojom("url_mojom_scheme_host_port") { + generate_java = true + sources = [ "scheme_host_port.mojom" ] + + cpp_typemaps = [ + { + types = [ + { + mojom = "url.mojom.SchemeHostPort" + cpp = "::url::SchemeHostPort" + }, + ] + traits_headers = [ "//url/mojom/scheme_host_port_mojom_traits.h" ] + traits_public_deps = [ "//url" ] + }, + ] +} + +mojom("test_url_mojom_gurl") { + sources = [ "url_test.mojom" ] + + public_deps = [ + ":url_mojom_gurl", + ":url_mojom_origin",
":url_mojom_scheme_host_port", + ] +} + +component("mojom_traits") { + output_name = "url_mojom_traits" + + sources = [ + "origin_mojom_traits.cc", + "origin_mojom_traits.h", + "scheme_host_port_mojom_traits.cc", + "scheme_host_port_mojom_traits.h", + "url_gurl_mojom_traits.cc", + "url_gurl_mojom_traits.h", + ] + + defines = [ "IS_URL_MOJOM_TRAITS_IMPL" ] + + public_deps = [ + ":url_mojom_gurl_shared", + ":url_mojom_origin_shared", + ":url_mojom_scheme_host_port", + "//base", + "//mojo/public/cpp/base:shared_typemap_traits", + "//url", + ] +} diff --git a/mojom/DEPS b/mojom/DEPS new file mode 100644 index 00000000000..093b1d9fde5 --- /dev/null +++ b/mojom/DEPS @@ -0,0 +1,3 @@ +include_rules = [ + "+mojo/public/cpp", +] diff --git a/mojom/DIR_METADATA b/mojom/DIR_METADATA new file mode 100644 index 00000000000..c080aa16485 --- /dev/null +++ b/mojom/DIR_METADATA @@ -0,0 +1,11 @@ +# Metadata information for this directory. +# +# For more information on DIR_METADATA files, see: +# https://source.chromium.org/chromium/infra/infra/+/main:go/src/infra/tools/dirmd/README.md +# +# For the schema of this file, see Metadata message: +# https://source.chromium.org/chromium/infra/infra/+/main:go/src/infra/tools/dirmd/proto/dir_metadata.proto + +monorail { + component: "Internals>Mojo" +} \ No newline at end of file diff --git a/mojom/OWNERS b/mojom/OWNERS new file mode 100644 index 00000000000..1feb5149750 --- /dev/null +++ b/mojom/OWNERS @@ -0,0 +1,4 @@ +per-file *.mojom=set noparent +per-file *.mojom=file://ipc/SECURITY_OWNERS +per-file *_mojom_traits*.*=set noparent +per-file *_mojom_traits*.*=file://ipc/SECURITY_OWNERS diff --git a/mojom/origin.mojom b/mojom/origin.mojom new file mode 100644 index 00000000000..94b6e2d587c --- /dev/null +++ b/mojom/origin.mojom @@ -0,0 +1,19 @@ +// Copyright 2016 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +[JavaPackage="org.chromium.url.internal.mojom"] +module url.mojom; + +import "mojo/public/mojom/base/unguessable_token.mojom"; + +struct Origin { + string scheme; + string host; + uint16 port; + + // When a nonce is provided, this origin is opaque. The scheme/host/port do + // not need to be valid, but if they are, they identify the tuple origin + // from which this opaque origin is derived. + mojo_base.mojom.UnguessableToken? nonce_if_opaque; +}; diff --git a/mojom/origin_mojom_traits.cc b/mojom/origin_mojom_traits.cc new file mode 100644 index 00000000000..9e8475ac53e --- /dev/null +++ b/mojom/origin_mojom_traits.cc @@ -0,0 +1,34 @@ +// Copyright 2020 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "url/mojom/origin_mojom_traits.h" + +#include "base/strings/string_piece.h" + +namespace mojo { + +// static +bool StructTraits<url::mojom::OriginDataView, url::Origin>::Read( + url::mojom::OriginDataView data, + url::Origin* out) { + base::StringPiece scheme, host; + absl::optional<base::UnguessableToken> nonce_if_opaque; + if (!data.ReadScheme(&scheme) || !data.ReadHost(&host) || + !data.ReadNonceIfOpaque(&nonce_if_opaque)) + return false; + + absl::optional<url::Origin> creation_result = + nonce_if_opaque + ? url::Origin::UnsafelyCreateOpaqueOriginWithoutNormalization( + scheme, host, data.port(), url::Origin::Nonce(*nonce_if_opaque)) + : url::Origin::UnsafelyCreateTupleOriginWithoutNormalization( + scheme, host, data.port()); + if (!creation_result) + return false; + + *out = std::move(creation_result.value()); + return true; +} + +} // namespace mojo diff --git a/mojom/origin_mojom_traits.h b/mojom/origin_mojom_traits.h new file mode 100644 index 00000000000..0d3cbb276cb --- /dev/null +++ b/mojom/origin_mojom_traits.h @@ -0,0 +1,39 @@ +// Copyright 2016 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file.
+ +#ifndef URL_MOJOM_ORIGIN_MOJOM_TRAITS_H_ +#define URL_MOJOM_ORIGIN_MOJOM_TRAITS_H_ + +#include "base/component_export.h" +#include "base/unguessable_token.h" +#include "mojo/public/cpp/base/unguessable_token_mojom_traits.h" +#include "mojo/public/cpp/bindings/optional_as_pointer.h" +#include "third_party/abseil-cpp/absl/types/optional.h" +#include "url/mojom/origin.mojom-shared.h" +#include "url/origin.h" + +namespace mojo { + +template <> +struct COMPONENT_EXPORT(URL_MOJOM_TRAITS) + StructTraits<url::mojom::OriginDataView, url::Origin> { + static const std::string& scheme(const url::Origin& r) { + return r.GetTupleOrPrecursorTupleIfOpaque().scheme(); + } + static const std::string& host(const url::Origin& r) { + return r.GetTupleOrPrecursorTupleIfOpaque().host(); + } + static uint16_t port(const url::Origin& r) { + return r.GetTupleOrPrecursorTupleIfOpaque().port(); + } + static mojo::OptionalAsPointer<const base::UnguessableToken> nonce_if_opaque( + const url::Origin& r) { + return mojo::MakeOptionalAsPointer(r.GetNonceForSerialization()); + } + static bool Read(url::mojom::OriginDataView data, url::Origin* out); +}; + +} // namespace mojo + +#endif // URL_MOJOM_ORIGIN_MOJOM_TRAITS_H_ diff --git a/mojom/scheme_host_port.mojom b/mojom/scheme_host_port.mojom new file mode 100644 index 00000000000..79f37221d84 --- /dev/null +++ b/mojom/scheme_host_port.mojom @@ -0,0 +1,13 @@ +// Copyright 2021 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +module url.mojom; + +// See url::SchemeHostPort for details, and what differentiates this from an +// origin.
+struct SchemeHostPort { + string scheme; + string host; + uint16 port; +}; diff --git a/mojom/scheme_host_port_mojom_traits.cc b/mojom/scheme_host_port_mojom_traits.cc new file mode 100644 index 00000000000..63f6af4c101 --- /dev/null +++ b/mojom/scheme_host_port_mojom_traits.cc @@ -0,0 +1,27 @@ +// Copyright 2021 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "url/mojom/scheme_host_port_mojom_traits.h" + +#include "base/strings/string_piece.h" +#include "url/mojom/scheme_host_port.mojom-shared.h" +#include "url/scheme_host_port.h" + +namespace mojo { + +// static +bool StructTraits<url::mojom::SchemeHostPortDataView, url::SchemeHostPort>:: + Read(url::mojom::SchemeHostPortDataView data, url::SchemeHostPort* out) { + base::StringPiece scheme, host; + if (!data.ReadScheme(&scheme) || !data.ReadHost(&host)) + return false; + + *out = url::SchemeHostPort(scheme, host, data.port()); + + // Consider it an error if the output SchemeHostPort is not valid, but + // non-empty values were received over Mojo. + return out->IsValid() || (scheme.empty() && host.empty() && data.port() == 0); +} + +} // namespace mojo diff --git a/mojom/scheme_host_port_mojom_traits.h b/mojom/scheme_host_port_mojom_traits.h new file mode 100644 index 00000000000..e91ae3a0c29 --- /dev/null +++ b/mojom/scheme_host_port_mojom_traits.h @@ -0,0 +1,30 @@ +// Copyright 2021 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file.
+ +#ifndef URL_MOJOM_SCHEME_HOST_PORT_MOJOM_TRAITS_H_ +#define URL_MOJOM_SCHEME_HOST_PORT_MOJOM_TRAITS_H_ + +#include "base/component_export.h" +#include "url/mojom/scheme_host_port.mojom-shared.h" +#include "url/scheme_host_port.h" + +namespace mojo { + +template <> +struct COMPONENT_EXPORT(URL_MOJOM_TRAITS) + StructTraits<url::mojom::SchemeHostPortDataView, url::SchemeHostPort> { + static const std::string& scheme(const url::SchemeHostPort& r) { + return r.scheme(); + } + static const std::string& host(const url::SchemeHostPort& r) { + return r.host(); + } + static uint16_t port(const url::SchemeHostPort& r) { return r.port(); } + static bool Read(url::mojom::SchemeHostPortDataView data, + url::SchemeHostPort* out); +}; + +} // namespace mojo + +#endif // URL_MOJOM_SCHEME_HOST_PORT_MOJOM_TRAITS_H_ diff --git a/mojom/scheme_host_port_mojom_traits_unittest.cc b/mojom/scheme_host_port_mojom_traits_unittest.cc new file mode 100644 index 00000000000..7efd5d7786e --- /dev/null +++ b/mojom/scheme_host_port_mojom_traits_unittest.cc @@ -0,0 +1,36 @@ +// Copyright 2021 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file.
+ +#include "url/mojom/scheme_host_port_mojom_traits.h" + +#include <string> + +#include "mojo/public/cpp/test_support/test_utils.h" +#include "testing/gtest/include/gtest/gtest.h" +#include "url/mojom/scheme_host_port.mojom.h" +#include "url/scheme_host_port.h" + +namespace url { + +namespace { + +void TestRoundTrip(const url::SchemeHostPort& in) { + url::SchemeHostPort result; + ASSERT_TRUE( + mojo::test::SerializeAndDeserialize<mojom::SchemeHostPort>(in, result)) + << in.Serialize(); + EXPECT_EQ(in, result) << "Expected " << in.Serialize() << ", but got " + << result.Serialize(); +} + +} // namespace + +TEST(SchemeHostPortMojomTraitsTest, RoundTrip) { + TestRoundTrip(url::SchemeHostPort()); + TestRoundTrip(url::SchemeHostPort("http", "test", 80)); + TestRoundTrip(url::SchemeHostPort("https", "foo.test", 443)); + TestRoundTrip(url::SchemeHostPort("file", "", 0)); +} + +} // namespace url diff --git a/mojom/url.mojom b/mojom/url.mojom new file mode 100644 index 00000000000..e5fbee620b8 --- /dev/null +++ b/mojom/url.mojom @@ -0,0 +1,13 @@ +// Copyright 2016 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +module url.mojom; + +// 2 * 1024 * 1024 +const uint32 kMaxURLChars = 2097152; + +[Stable] +struct Url { + string url; +}; diff --git a/mojom/url_gurl_mojom_traits.cc b/mojom/url_gurl_mojom_traits.cc new file mode 100644 index 00000000000..97b301a0ab2 --- /dev/null +++ b/mojom/url_gurl_mojom_traits.cc @@ -0,0 +1,40 @@ +// Copyright 2020 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file.
+ +#include "url/mojom/url_gurl_mojom_traits.h" + +#include "url/url_constants.h" + +namespace mojo { + +// static +base::StringPiece StructTraits<url::mojom::UrlDataView, GURL>::url( + const GURL& r) { + if (r.possibly_invalid_spec().length() > url::kMaxURLChars || !r.is_valid()) { + return base::StringPiece(); + } + + return base::StringPiece(r.possibly_invalid_spec().c_str(), + r.possibly_invalid_spec().length()); +} + +// static +bool StructTraits<url::mojom::UrlDataView, GURL>::Read( + url::mojom::UrlDataView data, + GURL* out) { + base::StringPiece url_string; + if (!data.ReadUrl(&url_string)) + return false; + + if (url_string.length() > url::kMaxURLChars) + return false; + + *out = GURL(url_string); + if (!url_string.empty() && !out->is_valid()) + return false; + + return true; +} + +} // namespace mojo diff --git a/mojom/url_gurl_mojom_traits.h b/mojom/url_gurl_mojom_traits.h new file mode 100644 index 00000000000..19ac049c8a4 --- /dev/null +++ b/mojom/url_gurl_mojom_traits.h @@ -0,0 +1,25 @@ +// Copyright 2016 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file.
+ +#ifndef URL_MOJOM_URL_GURL_MOJOM_TRAITS_H_ +#define URL_MOJOM_URL_GURL_MOJOM_TRAITS_H_ + +#include "base/component_export.h" +#include "base/strings/string_piece.h" +#include "mojo/public/cpp/bindings/struct_traits.h" +#include "url/gurl.h" +#include "url/mojom/url.mojom-shared.h" + +namespace mojo { + +template <> +struct COMPONENT_EXPORT(URL_MOJOM_TRAITS) + StructTraits<url::mojom::UrlDataView, GURL> { + static base::StringPiece url(const GURL& r); + static bool Read(url::mojom::UrlDataView data, GURL* out); +}; + +} // namespace mojo + +#endif // URL_MOJOM_URL_GURL_MOJOM_TRAITS_H_ diff --git a/mojom/url_gurl_mojom_traits_unittest.cc b/mojom/url_gurl_mojom_traits_unittest.cc new file mode 100644 index 00000000000..48968d24bfd --- /dev/null +++ b/mojom/url_gurl_mojom_traits_unittest.cc @@ -0,0 +1,209 @@ +// Copyright 2016 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include <utility> + +#include "base/test/task_environment.h" +#include "mojo/public/cpp/bindings/pending_receiver.h" +#include "mojo/public/cpp/bindings/receiver.h" +#include "mojo/public/cpp/bindings/remote.h" +#include "testing/gtest/include/gtest/gtest.h" +#include "url/mojom/url_test.mojom.h" + +namespace url { + +class UrlTestImpl : public mojom::UrlTest { + public: + explicit UrlTestImpl(mojo::PendingReceiver<mojom::UrlTest> receiver) + : receiver_(this, std::move(receiver)) {} + + // UrlTest: + void BounceUrl(const GURL& in, BounceUrlCallback callback) override { + std::move(callback).Run(in); + } + + void BounceOrigin(const Origin& in, BounceOriginCallback callback) override { + std::move(callback).Run(in); + } + + private: + mojo::Receiver<mojom::UrlTest> receiver_; +}; + +class MojoGURLStructTraitsTest : public ::testing::Test { + public: + MojoGURLStructTraitsTest() + : url_test_impl_(url_test_remote_.BindNewPipeAndPassReceiver()) {} + + GURL BounceUrl(const GURL& input) { + GURL output; + EXPECT_TRUE(url_test_remote_->BounceUrl(input, &output)); + return output; + } + + void
ExpectSerializationRoundtrips(const GURL& input) { + SCOPED_TRACE(testing::Message() + << "Input GURL: " << input.possibly_invalid_spec()); + GURL output = BounceUrl(input); + + // We want to test each component individually to make sure its range was + // correctly serialized and deserialized, not just the spec. + EXPECT_EQ(input.possibly_invalid_spec(), output.possibly_invalid_spec()); + EXPECT_EQ(input.is_valid(), output.is_valid()); + EXPECT_EQ(input.scheme(), output.scheme()); + EXPECT_EQ(input.username(), output.username()); + EXPECT_EQ(input.password(), output.password()); + EXPECT_EQ(input.host(), output.host()); + EXPECT_EQ(input.port(), output.port()); + EXPECT_EQ(input.path(), output.path()); + EXPECT_EQ(input.query(), output.query()); + EXPECT_EQ(input.ref(), output.ref()); + } + + Origin BounceOrigin(const Origin& input) { + Origin output; + EXPECT_TRUE(url_test_remote_->BounceOrigin(input, &output)); + return output; + } + + private: + base::test::SingleThreadTaskEnvironment task_environment; + mojo::Remote<mojom::UrlTest> url_test_remote_; + UrlTestImpl url_test_impl_; +}; + +// Mojo version of chrome IPC test in url/ipc/url_param_traits_unittest.cc. +TEST_F(MojoGURLStructTraitsTest, Basic) { + const char* serialize_cases[] = { + "http://www.google.com/", + "http://user:pass@host.com:888/foo;bar?baz#nop", + }; + + for (const char* test_input : serialize_cases) { + SCOPED_TRACE(testing::Message() << "Test input: " << test_input); + GURL input(test_input); + ExpectSerializationRoundtrips(input); + } +} + +// Test of an excessively long GURL. +TEST_F(MojoGURLStructTraitsTest, ExcessivelyLongUrl) { + const std::string url = + std::string("http://example.org/").append(kMaxURLChars + 1, 'a'); + GURL input(url.c_str()); + GURL output = BounceUrl(input); + EXPECT_TRUE(output.is_empty()); +} + +// Test for the GURL testcase based on https://crbug.com/1214098 (which in turn +// was based on ContentSecurityPolicyBrowserTest.FileURLs).
+TEST_F(MojoGURLStructTraitsTest, WindowsDriveInPathReplacement) { + { + // #1: Try creating a file URL with a non-empty hostname. + GURL url_without_windows_drive_letter("file://hostname/"); + EXPECT_EQ("/", url_without_windows_drive_letter.path()); + EXPECT_EQ("hostname", url_without_windows_drive_letter.host()); + ExpectSerializationRoundtrips(url_without_windows_drive_letter); + } + + { + // #2: Use GURL::Replacement to create a GURL with 1) a path that starts + // with a Windows drive letter and 2) has a non-empty hostname (inherited + // from `url_without_windows_drive_letter` above). This used to not go + // through the DoParseUNC path that normally strips the hostname (for more + // details, see https://crbug.com/1214098#c4). + GURL::Replacements repl; + const std::string kNewPath = "/C:/dir/file.txt"; + repl.SetPathStr(kNewPath); + GURL url_made_with_replace_components = + GURL("file://hostname/").ReplaceComponents(repl); + + EXPECT_EQ(kNewPath, url_made_with_replace_components.path()); + EXPECT_EQ("hostname", url_made_with_replace_components.host()); + EXPECT_EQ("file://hostname/C:/dir/file.txt", + url_made_with_replace_components.spec()); + // This is the MAIN VERIFICATION in this test. This used to fail on Windows, + // see https://crbug.com/1214098. + ExpectSerializationRoundtrips(url_made_with_replace_components); + } + + { + // #3: Try to create a URL with a Windows drive letter and a non-empty + // hostname directly. + GURL url_created_directly("file://hostname/C:/dir/file.txt"); + EXPECT_EQ("/C:/dir/file.txt", url_created_directly.path()); + EXPECT_EQ("hostname", url_created_directly.host()); + EXPECT_EQ("file://hostname/C:/dir/file.txt", url_created_directly.spec()); + ExpectSerializationRoundtrips(url_created_directly); + + // The URL created directly and the URL created through ReplaceComponents + // should be the same. 
+ GURL::Replacements repl; + const std::string kNewPath = "/C:/dir/file.txt"; + repl.SetPathStr(kNewPath); + GURL url_made_with_replace_components = + GURL("file://hostname/").ReplaceComponents(repl); + EXPECT_EQ(url_created_directly.spec(), + url_made_with_replace_components.spec()); + } + + { + // #4: Try to create a URL with a Windows drive letter and "localhost" as + // hostname directly. + GURL url_created_directly("file://localhost/C:/dir/file.txt"); + EXPECT_EQ("/C:/dir/file.txt", url_created_directly.path()); + EXPECT_EQ("", url_created_directly.host()); + EXPECT_EQ("file:///C:/dir/file.txt", url_created_directly.spec()); + ExpectSerializationRoundtrips(url_created_directly); + + // The URL created directly and the URL created through ReplaceComponents + // should be the same. + GURL::Replacements repl; + const std::string kNewPath = "/C:/dir/file.txt"; + repl.SetPathStr(kNewPath); + GURL url_made_with_replace_components = + GURL("file://localhost/").ReplaceComponents(repl); + EXPECT_EQ(url_created_directly.spec(), + url_made_with_replace_components.spec()); + } +} + +// Test of basic Origin serialization. 
+TEST_F(MojoGURLStructTraitsTest, OriginSerialization) { + Origin non_unique = Origin::UnsafelyCreateTupleOriginWithoutNormalization( + "http", "www.google.com", 80) + .value(); + Origin output = BounceOrigin(non_unique); + EXPECT_EQ(non_unique, output); + EXPECT_FALSE(output.opaque()); + + Origin unique1; + Origin unique2 = non_unique.DeriveNewOpaqueOrigin(); + EXPECT_NE(unique1, unique2); + EXPECT_NE(unique2, unique1); + EXPECT_NE(unique2, non_unique); + output = BounceOrigin(unique1); + EXPECT_TRUE(output.opaque()); + EXPECT_EQ(unique1, output); + Origin output2 = BounceOrigin(unique2); + EXPECT_EQ(unique2, output2); + EXPECT_NE(unique2, output); + EXPECT_NE(unique1, output2); + + Origin normalized = + Origin::CreateFromNormalizedTuple("http", "www.google.com", 80); + EXPECT_EQ(normalized, non_unique); + output = BounceOrigin(normalized); + EXPECT_EQ(normalized, output); + EXPECT_EQ(non_unique, output); + EXPECT_FALSE(output.opaque()); +} + +// Test that the "kMaxURLChars" values are the same in url.mojom and +// url_constants.cc. +TEST_F(MojoGURLStructTraitsTest, TestMaxURLChars) { + EXPECT_EQ(kMaxURLChars, mojom::kMaxURLChars); +} + +} // namespace url diff --git a/mojom/url_test.mojom b/mojom/url_test.mojom new file mode 100644 index 00000000000..4dc00deed6b --- /dev/null +++ b/mojom/url_test.mojom @@ -0,0 +1,16 @@ +// Copyright 2016 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +module url.mojom; + +import "url/mojom/origin.mojom"; +import "url/mojom/url.mojom"; + +interface UrlTest { + [Sync] + BounceUrl(Url in) => (Url out); + + [Sync] + BounceOrigin(Origin in) => (Origin out); +}; diff --git a/origin.cc b/origin.cc new file mode 100644 index 00000000000..38be245a472 --- /dev/null +++ b/origin.cc @@ -0,0 +1,482 @@ +// Copyright 2015 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#include "url/origin.h" + +#include + +#include +#include +#include +#include +#include + +#include "base/base64.h" +#include "base/check.h" +#include "base/check_op.h" +#include "base/containers/contains.h" +#include "base/containers/span.h" +#include "base/debug/crash_logging.h" +#include "base/pickle.h" +#include "base/strings/strcat.h" +#include "base/strings/string_piece.h" +#include "base/trace_event/base_tracing.h" +#include "base/unguessable_token.h" +#include "url/gurl.h" +#include "url/scheme_host_port.h" +#include "url/url_constants.h" +#include "url/url_util.h" + +namespace url { + +Origin::Origin() : nonce_(Nonce()) {} + +Origin Origin::Create(const GURL& url) { + if (!url.is_valid()) + return Origin(); + + SchemeHostPort tuple; + + if (url.SchemeIsFileSystem()) { + tuple = SchemeHostPort(*url.inner_url()); + } else if (url.SchemeIsBlob()) { + // If we're dealing with a 'blob:' URL, https://url.spec.whatwg.org/#origin + // defines the origin as the origin of the URL which results from parsing + // the "path", which boils down to everything after the scheme. GURL's + // 'GetContent()' gives us exactly that. + tuple = SchemeHostPort(GURL(url.GetContent())); + } else { + tuple = SchemeHostPort(url); + + // It's SchemeHostPort's responsibility to filter out unrecognized schemes; + // sanity check that this is happening. 
+ DCHECK(!tuple.IsValid() || url.IsStandard() || + base::Contains(GetLocalSchemes(), url.scheme_piece()) || + AllowNonStandardSchemesForAndroidWebView()); + } + + if (!tuple.IsValid()) + return Origin(); + return Origin(std::move(tuple)); +} + +Origin Origin::Resolve(const GURL& url, const Origin& base_origin) { + if (url.SchemeIs(kAboutScheme) || url.is_empty()) + return base_origin; + Origin result = Origin::Create(url); + if (!result.opaque()) + return result; + return base_origin.DeriveNewOpaqueOrigin(); +} + +Origin::Origin(const Origin&) = default; +Origin& Origin::operator=(const Origin&) = default; +Origin::Origin(Origin&&) noexcept = default; +Origin& Origin::operator=(Origin&&) noexcept = default; +Origin::~Origin() = default; + +// static +absl::optional<Origin> Origin::UnsafelyCreateTupleOriginWithoutNormalization( + base::StringPiece scheme, + base::StringPiece host, + uint16_t port) { + SchemeHostPort tuple(std::string(scheme), std::string(host), port, + SchemeHostPort::CHECK_CANONICALIZATION); + if (!tuple.IsValid()) + return absl::nullopt; + return Origin(std::move(tuple)); +} + +// static +absl::optional<Origin> Origin::UnsafelyCreateOpaqueOriginWithoutNormalization( + base::StringPiece precursor_scheme, + base::StringPiece precursor_host, + uint16_t precursor_port, + const Origin::Nonce& nonce) { + SchemeHostPort precursor(std::string(precursor_scheme), + std::string(precursor_host), precursor_port, + SchemeHostPort::CHECK_CANONICALIZATION); + // For opaque origins, it is okay for the SchemeHostPort to be invalid; + // however, this should only arise when the arguments indicate the + // canonical representation of the invalid SchemeHostPort.
+ if (!precursor.IsValid() && + !(precursor_scheme.empty() && precursor_host.empty() && + precursor_port == 0)) { + return absl::nullopt; + } + return Origin(std::move(nonce), std::move(precursor)); +} + +// static +Origin Origin::CreateFromNormalizedTuple(std::string scheme, + std::string host, + uint16_t port) { + SchemeHostPort tuple(std::move(scheme), std::move(host), port, + SchemeHostPort::ALREADY_CANONICALIZED); + if (!tuple.IsValid()) + return Origin(); + return Origin(std::move(tuple)); +} + +// static +Origin Origin::CreateOpaqueFromNormalizedPrecursorTuple( + std::string precursor_scheme, + std::string precursor_host, + uint16_t precursor_port, + const Origin::Nonce& nonce) { + SchemeHostPort precursor(std::move(precursor_scheme), + std::move(precursor_host), precursor_port, + SchemeHostPort::ALREADY_CANONICALIZED); + // For opaque origins, it is okay for the SchemeHostPort to be invalid. + return Origin(std::move(nonce), std::move(precursor)); +} + +std::string Origin::Serialize() const { + if (opaque()) + return "null"; + + if (scheme() == kFileScheme) + return "file://"; + + return tuple_.Serialize(); +} + +GURL Origin::GetURL() const { + if (opaque()) + return GURL(); + + if (scheme() == kFileScheme) + return GURL("file:///"); + + return tuple_.GetURL(); +} + +const base::UnguessableToken* Origin::GetNonceForSerialization() const { + return nonce_ ? &nonce_->token() : nullptr; +} + +bool Origin::IsSameOriginWith(const Origin& other) const { + // scheme/host/port must match, even for opaque origins where |tuple_| holds + // the precursor origin. 
+ return std::tie(tuple_, nonce_) == std::tie(other.tuple_, other.nonce_); +} + +bool Origin::IsSameOriginWith(const GURL& url) const { + if (opaque()) + return false; + + // The `url::Origin::Create` call here preserves how IsSameOriginWith was used + // historically, even though in some scenarios it is not clearly correct: + // - Origin of about:blank and about:srcdoc cannot be correctly + // computed/recovered. + // - Ideally passing an invalid `url` would be a caller error (e.g. a DCHECK). + // - The caller intent is not always clear wrt handling the outer-vs-inner + // origins/URLs in blob: and filesystem: schemes. + return IsSameOriginWith(url::Origin::Create(url)); +} + +bool Origin::CanBeDerivedFrom(const GURL& url) const { + DCHECK(url.is_valid()); + + // For "no access" schemes, blink's SecurityOrigin will always create an + // opaque unique one. However, about: scheme is also registered as such but + // does not behave this way, therefore exclude it from this check. + if (base::Contains(url::GetNoAccessSchemes(), url.scheme()) && + !url.SchemeIs(kAboutScheme)) { + // If |this| is not opaque, definitely return false as the expectation + // is for opaque origin. + if (!opaque()) + return false; + + // And if it is unique opaque origin, it definitely is fine. But if there + // is a precursor stored, we should fall through to compare the tuples. + if (!tuple_.IsValid()) + return true; + } + + SchemeHostPort url_tuple; + + // Optimization for the common, success case: Scheme/Host/Port match on the + // precursor, and the URL is standard. Opaqueness does not matter as a tuple + // origin can always create an opaque tuple origin. + if (url.IsStandard()) { + // Note: if extra copies of the scheme and host are undesirable, this check + // can be implemented using StringPiece comparisons, but it has to account + // explicitly checks on port numbers. 
+ if (url.SchemeIsFileSystem()) { + url_tuple = SchemeHostPort(*url.inner_url()); + } else { + url_tuple = SchemeHostPort(url); + } + return url_tuple == tuple_; + + // Blob URLs still contain an inner origin, however it is not accessible + // through inner_url(), therefore it requires specific case to handle it. + } else if (url.SchemeIsBlob()) { + // If |this| doesn't contain any precursor information, it is an unique + // opaque origin. It is valid case, as any browser-initiated navigation + // to about:blank or data: URL will result in a document with such + // origin and it is valid for it to create blob: URLs. + if (!tuple_.IsValid()) + return true; + + url_tuple = SchemeHostPort(GURL(url.GetContent())); + return url_tuple == tuple_; + } + + // At this point, the URL has non-standard scheme. + DCHECK(!url.IsStandard()); + + // All about: URLs (about:blank, about:srcdoc) inherit their origin from + // the context which navigated them, which means that they can be in any + // type of origin. + if (url.SchemeIs(kAboutScheme)) + return true; + + // All data: URLs commit in opaque origins, therefore |this| must be opaque + // if |url| has data: scheme. + if (url.SchemeIs(kDataScheme)) + return opaque(); + + // If |this| does not have valid precursor tuple, it is unique opaque origin, + // which is what we expect non-standard schemes to get. + if (!tuple_.IsValid()) + return true; + + // However, when there is precursor present, the schemes must match. 
+ return url.scheme() == tuple_.scheme(); +} + +bool Origin::DomainIs(base::StringPiece canonical_domain) const { + return !opaque() && url::DomainIs(tuple_.host(), canonical_domain); +} + +bool Origin::operator<(const Origin& other) const { + return std::tie(tuple_, nonce_) < std::tie(other.tuple_, other.nonce_); +} + +Origin Origin::DeriveNewOpaqueOrigin() const { + return Origin(Nonce(), tuple_); +} + +std::string Origin::GetDebugString(bool include_nonce) const { + // Handle non-opaque origins first, as they are simpler. + if (!opaque()) { + std::string out = Serialize(); + if (scheme() == kFileScheme) + base::StrAppend(&out, {" [internally: ", tuple_.Serialize(), "]"}); + return out; + } + + // For opaque origins, log the nonce and precursor as well. Without this, + // EXPECT_EQ failures between opaque origins are nearly impossible to + // understand. + std::string out = base::StrCat({Serialize(), " [internally:"}); + if (include_nonce) { + out += " ("; + if (nonce_->raw_token().is_empty()) + out += "nonce TBD"; + else + out += nonce_->raw_token().ToString(); + out += ")"; + } + if (!tuple_.IsValid()) + base::StrAppend(&out, {" anonymous]"}); + else + base::StrAppend(&out, {" derived from ", tuple_.Serialize(), "]"}); + return out; +} + +Origin::Origin(SchemeHostPort tuple) : tuple_(std::move(tuple)) { + DCHECK(!opaque()); + DCHECK(tuple_.IsValid()); +} + +// Constructs an opaque origin derived from |precursor|. +Origin::Origin(const Nonce& nonce, SchemeHostPort precursor) + : tuple_(std::move(precursor)), nonce_(std::move(nonce)) { + DCHECK(opaque()); + // |precursor| is retained, but not accessible via scheme()/host()/port(). 
+ DCHECK_EQ("", scheme()); + DCHECK_EQ("", host()); + DCHECK_EQ(0U, port()); +} + +absl::optional Origin::SerializeWithNonce() const { + return SerializeWithNonceImpl(); +} + +absl::optional Origin::SerializeWithNonceAndInitIfNeeded() { + GetNonceForSerialization(); + return SerializeWithNonceImpl(); +} + +// The pickle is saved in the following format, in order: +// string - tuple_.GetURL().spec(). +// uint64_t (if opaque) - high bits of nonce if opaque. 0 if not initialized. +// uint64_t (if opaque) - low bits of nonce if opaque. 0 if not initialized. +absl::optional Origin::SerializeWithNonceImpl() const { + if (!opaque() && !tuple_.IsValid()) + return absl::nullopt; + + base::Pickle pickle; + pickle.WriteString(tuple_.Serialize()); + if (opaque() && !nonce_->raw_token().is_empty()) { + pickle.WriteUInt64(nonce_->token().GetHighForSerialization()); + pickle.WriteUInt64(nonce_->token().GetLowForSerialization()); + } else if (opaque()) { + // Nonce hasn't been initialized. + pickle.WriteUInt64(0); + pickle.WriteUInt64(0); + } + + base::span data(static_cast(pickle.data()), + pickle.size()); + // Base64 encode the data to make it nicer to play with. + return base::Base64Encode(data); +} + +// static +absl::optional Origin::Deserialize(const std::string& value) { + std::string data; + if (!base::Base64Decode(value, &data)) + return absl::nullopt; + base::Pickle pickle(reinterpret_cast(&data[0]), data.size()); + base::PickleIterator reader(pickle); + + std::string pickled_url; + if (!reader.ReadString(&pickled_url)) + return absl::nullopt; + GURL url(pickled_url); + + // If only a tuple was serialized, then this origin is not opaque. For opaque + // origins, we expect two uint64's to be left in the pickle. + bool is_opaque = !reader.ReachedEnd(); + + // Opaque origins without a tuple are ok. + if (!is_opaque && !url.is_valid()) + return absl::nullopt; + SchemeHostPort tuple(url); + + // Possible successful early return if the pickled Origin was not opaque. 
+ if (!is_opaque) { + Origin origin(tuple); + if (origin.opaque()) + return absl::nullopt; // Something went horribly wrong. + return origin; + } + + uint64_t nonce_high = 0; + if (!reader.ReadUInt64(&nonce_high)) + return absl::nullopt; + + uint64_t nonce_low = 0; + if (!reader.ReadUInt64(&nonce_low)) + return absl::nullopt; + + absl::optional nonce_token = + base::UnguessableToken::Deserialize(nonce_high, nonce_low); + + Origin::Nonce nonce; + if (nonce_token.has_value()) { + // The serialized nonce wasn't empty, so copy it here. + nonce = Origin::Nonce(nonce_token.value()); + } + Origin origin; + origin.nonce_ = std::move(nonce); + origin.tuple_ = tuple; + return origin; +} + +void Origin::WriteIntoTrace(perfetto::TracedValue context) const { + std::move(context).WriteString(GetDebugString()); +} + +std::ostream& operator<<(std::ostream& out, const url::Origin& origin) { + out << origin.GetDebugString(); + return out; +} + +std::ostream& operator<<(std::ostream& out, const url::Origin::Nonce& nonce) { + // Subtle: don't let logging trigger lazy-generation of the token value. + if (nonce.raw_token().is_empty()) + return (out << "(nonce TBD)"); + else + return (out << nonce.raw_token()); +} + +bool IsSameOriginWith(const GURL& a, const GURL& b) { + return Origin::Create(a).IsSameOriginWith(Origin::Create(b)); +} + +Origin::Nonce::Nonce() = default; +Origin::Nonce::Nonce(const base::UnguessableToken& token) : token_(token) { + CHECK(!token_.is_empty()); +} + +const base::UnguessableToken& Origin::Nonce::token() const { + // Inspecting the value of a nonce triggers lazy-generation. + // TODO(dcheng): UnguessableToken::is_empty should go away -- what sentinel + // value to use instead? + if (token_.is_empty()) + token_ = base::UnguessableToken::Create(); + return token_; +} + +const base::UnguessableToken& Origin::Nonce::raw_token() const { + return token_; +} + +// Copying a Nonce triggers lazy-generation of the token. 
+Origin::Nonce::Nonce(const Origin::Nonce& other) : token_(other.token()) {}
+
+Origin::Nonce& Origin::Nonce::operator=(const Origin::Nonce& other) {
+  // Copying a Nonce triggers lazy-generation of the token.
+  token_ = other.token();
+  return *this;
+}
+
+// Moving a nonce does NOT trigger lazy-generation of the token.
+Origin::Nonce::Nonce(Origin::Nonce&& other) noexcept : token_(other.token_) {
+  other.token_ = base::UnguessableToken();  // Reset |other|.
+}
+
+Origin::Nonce& Origin::Nonce::operator=(Origin::Nonce&& other) noexcept {
+  token_ = other.token_;  // NOTE: on self-move the reset below empties it.
+  other.token_ = base::UnguessableToken();  // Reset |other|.
+  return *this;
+}
+
+bool Origin::Nonce::operator<(const Origin::Nonce& other) const {
+  // When comparing, lazy-generation is required of both tokens, so that an
+  // ordering is established.
+  return token() < other.token();
+}
+
+bool Origin::Nonce::operator==(const Origin::Nonce& other) const {
+  // Equality testing doesn't actually require that the tokens be generated.
+  // If the tokens are both zero, equality only holds if they're the same
+  // object.
+  return (other.token_ == token_) && !(token_.is_empty() && (&other != this));
+}
+
+bool Origin::Nonce::operator!=(const Origin::Nonce& other) const {
+  return !(*this == other);
+}
+
+namespace debug {
+
+ScopedOriginCrashKey::ScopedOriginCrashKey(
+    base::debug::CrashKeyString* crash_key,
+    const url::Origin* value)
+    : scoped_string_value_(
+          crash_key,
+          value ? value->GetDebugString(false /* include_nonce */)
+                : "nullptr") {}  // A null |value| yields the text "nullptr".
+
+ScopedOriginCrashKey::~ScopedOriginCrashKey() = default;
+
+}  // namespace debug
+
+}  // namespace url
diff --git a/origin.h b/origin.h
new file mode 100644
index 00000000000..a0575338ab8
--- /dev/null
+++ b/origin.h
@@ -0,0 +1,496 @@
+// Copyright 2015 The Chromium Authors
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+ +#ifndef URL_ORIGIN_H_ +#define URL_ORIGIN_H_ + +#include + +#include +#include + +#include "base/component_export.h" +#include "base/debug/alias.h" +#include "base/debug/crash_logging.h" +#include "base/gtest_prod_util.h" +#include "base/strings/string_piece_forward.h" +#include "base/strings/string_util.h" +#include "base/trace_event/base_tracing_forward.h" +#include "base/unguessable_token.h" +#include "build/build_config.h" +#include "build/buildflag.h" +#include "third_party/abseil-cpp/absl/types/optional.h" +#include "url/scheme_host_port.h" + +#if BUILDFLAG(IS_ANDROID) +#include + +namespace base { +namespace android { +template +class ScopedJavaLocalRef; +template +class JavaRef; +} // namespace android +} // namespace base +#endif // BUILDFLAG(IS_ANDROID) + +class GURL; + +namespace blink { +class SecurityOrigin; +class SecurityOriginTest; +class StorageKey; +class StorageKeyTest; +} // namespace blink + +namespace IPC { +template +struct ParamTraits; +} // namespace IPC + +namespace ipc_fuzzer { +template +struct FuzzTraits; +} // namespace ipc_fuzzer + +namespace mojo { +template +struct StructTraits; +struct UrlOriginAdapter; +} // namespace mojo + +namespace net { +class SchemefulSite; +} // namespace net + +namespace url { + +namespace mojom { +class OriginDataView; +} // namespace mojom + +// Per https://html.spec.whatwg.org/multipage/origin.html#origin, an origin is +// either: +// - a tuple origin of (scheme, host, port) as described in RFC 6454. +// - an opaque origin with an internal value, and a memory of the tuple origin +// from which it was derived. +// +// TL;DR: If you need to make a security-relevant decision, use 'url::Origin'. +// If you only need to extract the bits of a URL which are relevant for a +// network connection, use 'url::SchemeHostPort'. +// +// STL;SDR: If you aren't making actual network connections, use 'url::Origin'. 
+// +// This class ought to be used when code needs to determine if two resources +// are "same-origin", and when a canonical serialization of an origin is +// required. Note that the canonical serialization of an origin *must not* be +// used to determine if two resources are same-origin. +// +// A tuple origin, like 'SchemeHostPort', is composed of a tuple of (scheme, +// host, port), but contains a number of additional concepts which make it +// appropriate for use as a security boundary and access control mechanism +// between contexts. Two tuple origins are same-origin if the tuples are equal. +// A tuple origin may also be re-created from its serialization. +// +// An opaque origin has an internal globally unique identifier. When creating a +// new opaque origin from a URL, a fresh globally unique identifier is +// generated. However, if an opaque origin is copied or moved, the internal +// globally unique identifier is preserved. Two opaque origins are same-origin +// iff the globally unique identifiers match. Unlike tuple origins, an opaque +// origin cannot be re-created from its serialization, which is always the +// string "null". +// +// IMPORTANT: Since opaque origins always serialize as the string "null", it is +// *never* safe to use the serialization for security checks! +// +// A tuple origin and an opaque origin are never same-origin. +// +// There are a few subtleties to note: +// +// * A default constructed Origin is opaque, with no precursor origin. +// +// * Invalid and non-standard GURLs are parsed as opaque origins. This includes +// non-hierarchical URLs like 'data:text/html,...' and 'javascript:alert(1)'. +// +// * GURLs with schemes of 'filesystem' or 'blob' parse the origin out of the +// internals of the URL. That is, 'filesystem:https://example.com/temporary/f' +// is parsed as ('https', 'example.com', 443). +// +// * GURLs with a 'file' scheme are tricky. 
They are parsed as ('file', '', 0), +// but their behavior may differ from embedder to embedder. +// TODO(dcheng): This behavior is not consistent with Blink's notion of file +// URLs, which always creates an opaque origin. +// +// * The host component of an IPv6 address includes brackets, just like the URL +// representation. +// +// * Constructing origins from GURLs (or from SchemeHostPort) is typically a red +// flag (this is true for `url::Origin::Create` but also to some extent for +// `url::Origin::Resolve`). See docs/security/origin-vs-url.md for more. +// +// * To answer the question "Are |this| and |that| "same-origin" with each +// other?", use |Origin::IsSameOriginWith|: +// +// if (this.IsSameOriginWith(that)) { +// // Amazingness goes here. +// } +class COMPONENT_EXPORT(URL) Origin { + public: + // Creates an opaque Origin with a nonce that is different from all previously + // existing origins. + Origin(); + + // WARNING: Converting an URL into an Origin is usually a red flag. See + // //docs/security/origin-vs-url.md for more details. Some discussion about + // deprecating the Create method can be found in https://crbug.com/1270878. + // + // Creates an Origin from `url`, as described at + // https://url.spec.whatwg.org/#origin, with the following additions: + // 1. If `url` is invalid or non-standard, an opaque Origin is constructed. + // 2. 'filesystem' URLs behave as 'blob' URLs (that is, the origin is parsed + // out of everything in the URL which follows the scheme). + // 3. 'file' URLs all parse as ("file", "", 0). 
+ // + // WARNING: `url::Origin::Create(url)` can give unexpected results if: + // 1) `url` is "about:blank", or "about:srcdoc" (returning unique, opaque + // origin rather than the real origin of the frame) + // 2) `url` comes from a sandboxed frame (potentially returning a non-opaque + // origin, when an opaque one is needed; see also + // https://www.html5rocks.com/en/tutorials/security/sandboxed-iframes/) + // 3) Wrong `url` is used - e.g. in some navigations `base_url_for_data_url` + // might need to be used instead of relying on + // `content::NavigationHandle::GetURL`. + // + // WARNING: The returned Origin may have a different scheme and host from + // `url` (e.g. in case of blob URLs - see OriginTest.ConstructFromGURL). + // + // WARNING: data: URLs will be correctly be translated into opaque origins, + // but the precursor origin will be lost (unlike with `url::Origin::Resolve`). + static Origin Create(const GURL& url); + + // Creates an Origin for the resource `url` as if it were requested + // from the context of `base_origin`. If `url` is standard + // (in the sense that it embeds a complete origin, like http/https), + // this returns the same value as would Create(). + // + // If `url` is "about:blank" or "about:srcdoc", this returns a copy of + // `base_origin`. + // + // Otherwise, returns a new opaque origin derived from `base_origin`. + // In this case, the resulting opaque origin will inherit the tuple + // (or precursor tuple) of `base_origin`, but will not be same origin + // with `base_origin`, even if `base_origin` is already opaque. + static Origin Resolve(const GURL& url, const Origin& base_origin); + + // Copyable and movable. + Origin(const Origin&); + Origin& operator=(const Origin&); + Origin(Origin&&) noexcept; + Origin& operator=(Origin&&) noexcept; + + // Creates an Origin from a |scheme|, |host|, and |port|. All the parameters + // must be valid and canonicalized. 
Returns nullopt if any parameter is not + // canonical, or if all the parameters are empty. + // + // This constructor should be used in order to pass 'Origin' objects back and + // forth over IPC (as transitioning through GURL would risk potentially + // dangerous recanonicalization); other potential callers should prefer the + // 'GURL'-based constructor. + static absl::optional UnsafelyCreateTupleOriginWithoutNormalization( + base::StringPiece scheme, + base::StringPiece host, + uint16_t port); + + // Creates an origin without sanity checking that the host is canonicalized. + // This should only be used when converting between already normalized types, + // and should NOT be used for IPC. Method takes std::strings for use with move + // operators to avoid copies. + static Origin CreateFromNormalizedTuple(std::string scheme, + std::string host, + uint16_t port); + + ~Origin(); + + // For opaque origins, these return ("", "", 0). + const std::string& scheme() const { + return !opaque() ? tuple_.scheme() : base::EmptyString(); + } + const std::string& host() const { + return !opaque() ? tuple_.host() : base::EmptyString(); + } + uint16_t port() const { return !opaque() ? tuple_.port() : 0; } + + bool opaque() const { return nonce_.has_value(); } + + // An ASCII serialization of the Origin as per Section 6.2 of RFC 6454, with + // the addition that all Origins with a 'file' scheme serialize to "file://". + std::string Serialize() const; + + // Two non-opaque Origins are "same-origin" if their schemes, hosts, and ports + // are exact matches. Two opaque origins are same-origin only if their + // internal nonce values match. A non-opaque origin is never same-origin with + // an opaque origin. 
+ bool IsSameOriginWith(const Origin& other) const; + bool operator==(const Origin& other) const { return IsSameOriginWith(other); } + bool operator!=(const Origin& other) const { + return !IsSameOriginWith(other); + } + + // Non-opaque origin is "same-origin" with `url` if their schemes, hosts, and + // ports are exact matches. Opaque origin is never "same-origin" with any + // `url`. about:blank, about:srcdoc, and invalid GURLs are never + // "same-origin" with any origin. This method is a shorthand for + // `origin.IsSameOriginWith(url::Origin::Create(url))`. + // + // See also CanBeDerivedFrom. + bool IsSameOriginWith(const GURL& url) const; + + // This method returns true for any |url| which if navigated to could result + // in an origin compatible with |this|. + bool CanBeDerivedFrom(const GURL& url) const; + + // Get the scheme, host, and port from which this origin derives. For + // a tuple Origin, this gives the same values as calling scheme(), host() + // and port(). For an opaque Origin that was created by calling + // Origin::DeriveNewOpaqueOrigin() on a precursor or Origin::Resolve(), + // this returns the tuple inherited from the precursor. + // + // If this Origin is opaque and was created via the default constructor or + // Origin::Create(), the precursor origin is unknown. + // + // Use with great caution: opaque origins should generally not inherit + // privileges from the origins they derive from. However, in some cases + // (such as restrictions on process placement, or determining the http lock + // icon) this information may be relevant to ensure that entering an + // opaque origin does not grant privileges initially denied to the original + // non-opaque origin. + // + // This method has a deliberately obnoxious name to prompt caution in its use. + const SchemeHostPort& GetTupleOrPrecursorTupleIfOpaque() const { + return tuple_; + } + + // Efficiently returns what GURL(Serialize()) would without re-parsing the + // URL. 
This can be used for the (rare) times a GURL representation is needed + // for an Origin. + // Note: The returned URL will not necessarily be serialized to the same value + // as the Origin would. The GURL will have an added "/" path for Origins with + // valid SchemeHostPorts and file Origins. + // + // Try not to use this method under normal circumstances, as it loses type + // information. Downstream consumers can mistake the returned GURL with a full + // URL (e.g. with a path component). + GURL GetURL() const; + + // Same as GURL::DomainIs. If |this| origin is opaque, then returns false. + bool DomainIs(base::StringPiece canonical_domain) const; + + // Allows Origin to be used as a key in STL (for example, a std::set or + // std::map). + bool operator<(const Origin& other) const; + + // Creates a new opaque origin that is guaranteed to be cross-origin to all + // currently existing origins. An origin created by this method retains its + // identity across copies. Copies are guaranteed to be same-origin to each + // other, e.g. + // + // url::Origin page = Origin::Create(GURL("http://example.com")) + // url::Origin a = page.DeriveNewOpaqueOrigin(); + // url::Origin b = page.DeriveNewOpaqueOrigin(); + // url::Origin c = a; + // url::Origin d = b; + // + // |a| and |c| are same-origin, since |c| was copied from |a|. |b| and |d| are + // same-origin as well, since |d| was copied from |b|. All other combinations + // of origins are considered cross-origin, e.g. |a| is cross-origin to |b| and + // |d|, |b| is cross-origin to |a| and |c|, |c| is cross-origin to |b| and + // |d|, and |d| is cross-origin to |a| and |c|. + Origin DeriveNewOpaqueOrigin() const; + + // Creates a string representation of the object that can be used for logging + // and debugging. It serializes the internal state, such as the nonce value + // and precursor information. 
+ std::string GetDebugString(bool include_nonce = true) const; + +#if BUILDFLAG(IS_ANDROID) + base::android::ScopedJavaLocalRef CreateJavaObject() const; + static Origin FromJavaObject( + const base::android::JavaRef& java_origin); + static jlong CreateNative(JNIEnv* env, + const base::android::JavaRef& java_scheme, + const base::android::JavaRef& java_host, + uint16_t port, + bool is_opaque, + uint64_t tokenHighBits, + uint64_t tokenLowBits); +#endif // BUILDFLAG(IS_ANDROID) + + void WriteIntoTrace(perfetto::TracedValue context) const; + + private: + friend class blink::SecurityOrigin; + friend class blink::SecurityOriginTest; + friend class blink::StorageKey; + // SchemefulSite needs access to the serialization/deserialization logic which + // includes the nonce. + friend class net::SchemefulSite; + friend class OriginTest; + friend struct mojo::UrlOriginAdapter; + friend struct ipc_fuzzer::FuzzTraits; + friend struct mojo::StructTraits; + friend IPC::ParamTraits; + friend COMPONENT_EXPORT(URL) std::ostream& operator<<(std::ostream& out, + const Origin& origin); + friend class blink::StorageKeyTest; + + // Origin::Nonce is a wrapper around base::UnguessableToken that generates + // the random value only when the value is first accessed. The lazy generation + // allows Origin to be default-constructed quickly, without spending time + // in random number generation. + // + // TODO(nick): Should this optimization move into UnguessableToken, once it no + // longer treats the Null case specially? + class COMPONENT_EXPORT(URL) Nonce { + public: + // Creates a nonce to hold a newly-generated UnguessableToken. The actual + // token value will be generated lazily. + Nonce(); + + // Creates a nonce to hold an already-generated UnguessableToken value. This + // constructor should only be used for IPC serialization and testing -- + // regular code should never need to touch the UnguessableTokens directly, + // and the default constructor is faster. 
+ explicit Nonce(const base::UnguessableToken& token); + + // Accessor, which lazily initializes the underlying |token_| member. + const base::UnguessableToken& token() const; + + // Do not use in cases where lazy initialization is expected! This + // accessor does not initialize the |token_| member. + const base::UnguessableToken& raw_token() const; + + // Copyable and movable. Copying a Nonce triggers lazy-initialization, + // moving it does not. + Nonce(const Nonce&); + Nonce& operator=(const Nonce&); + Nonce(Nonce&&) noexcept; + Nonce& operator=(Nonce&&) noexcept; + + // Note that operator<, used by maps type containers, will trigger |token_| + // lazy-initialization. Equality comparisons do not. + bool operator<(const Nonce& other) const; + bool operator==(const Nonce& other) const; + bool operator!=(const Nonce& other) const; + + private: + friend class OriginTest; + + // mutable to support lazy generation. + mutable base::UnguessableToken token_; + }; + + // This needs to be friended within Origin as well, since Nonce is a private + // nested class of Origin. + friend COMPONENT_EXPORT(URL) std::ostream& operator<<(std::ostream& out, + const Nonce& nonce); + + // Creates an origin without sanity checking that the host is canonicalized. + // This should only be used when converting between already normalized types, + // and should NOT be used for IPC. Method takes std::strings for use with move + // operators to avoid copies. + static Origin CreateOpaqueFromNormalizedPrecursorTuple( + std::string precursor_scheme, + std::string precursor_host, + uint16_t precursor_port, + const Nonce& nonce); + + // Creates an opaque Origin with the identity given by |nonce|, and an + // optional precursor origin given by |precursor_scheme|, |precursor_host| and + // |precursor_port|. Returns nullopt if any parameter is not canonical. When + // the precursor is unknown, the precursor parameters should be ("", "", 0). 
+ // + // This factory method should be used in order to pass opaque Origin objects + // back and forth over IPC (as transitioning through GURL would risk + // potentially dangerous recanonicalization). + static absl::optional UnsafelyCreateOpaqueOriginWithoutNormalization( + base::StringPiece precursor_scheme, + base::StringPiece precursor_host, + uint16_t precursor_port, + const Nonce& nonce); + + // Constructs a non-opaque tuple origin. |tuple| must be valid. + explicit Origin(SchemeHostPort tuple); + + // Constructs an opaque origin derived from the |precursor| tuple, with the + // given |nonce|. + Origin(const Nonce& nonce, SchemeHostPort precursor); + + // Get the nonce associated with this origin, if it is opaque, or nullptr + // otherwise. This should be used only when trying to send an Origin across an + // IPC pipe. + const base::UnguessableToken* GetNonceForSerialization() const; + + // Serializes this Origin, including its nonce if it is opaque. If an opaque + // origin's |tuple_| is invalid nullopt is returned. If the nonce is not + // initialized, a nonce of 0 is used. Use of this method should be limited as + // an opaque origin will never be matchable in future browser sessions. + absl::optional SerializeWithNonce() const; + + // Like SerializeWithNonce(), but forces |nonce_| to be initialized prior to + // serializing. + absl::optional SerializeWithNonceAndInitIfNeeded(); + + absl::optional SerializeWithNonceImpl() const; + + // Deserializes an origin from |ToValueWithNonce|. Returns nullopt if the + // value was invalid in any way. + static absl::optional Deserialize(const std::string& value); + + // The tuple is used for both tuple origins (e.g. https://example.com:80), as + // well as for opaque origins, where it tracks the tuple origin from which + // the opaque origin was initially derived (we call this the "precursor" + // origin). + SchemeHostPort tuple_; + + // The nonce is used for maintaining identity of an opaque origin. 
This + // nonce is preserved when an opaque origin is copied or moved. An Origin + // is considered opaque if and only if |nonce_| holds a value. + absl::optional nonce_; +}; + +// Pretty-printers for logging. These expose the internal state of the nonce. +COMPONENT_EXPORT(URL) +std::ostream& operator<<(std::ostream& out, const Origin& origin); +COMPONENT_EXPORT(URL) +std::ostream& operator<<(std::ostream& out, const Origin::Nonce& origin); + +COMPONENT_EXPORT(URL) bool IsSameOriginWith(const GURL& a, const GURL& b); + +// DEBUG_ALIAS_FOR_ORIGIN(var_name, origin) copies `origin` into a new +// stack-allocated variable named ``. This helps ensure that the +// value of `origin` gets preserved in crash dumps. +#define DEBUG_ALIAS_FOR_ORIGIN(var_name, origin) \ + DEBUG_ALIAS_FOR_CSTR(var_name, (origin).Serialize().c_str(), 128) + +namespace debug { + +class COMPONENT_EXPORT(URL) ScopedOriginCrashKey { + public: + ScopedOriginCrashKey(base::debug::CrashKeyString* crash_key, + const url::Origin* value); + ~ScopedOriginCrashKey(); + + ScopedOriginCrashKey(const ScopedOriginCrashKey&) = delete; + ScopedOriginCrashKey& operator=(const ScopedOriginCrashKey&) = delete; + + private: + base::debug::ScopedCrashKeyString scoped_string_value_; +}; + +} // namespace debug + +} // namespace url + +#endif // URL_ORIGIN_H_ diff --git a/origin_abstract_tests.cc b/origin_abstract_tests.cc new file mode 100644 index 00000000000..1bc032e4eb0 --- /dev/null +++ b/origin_abstract_tests.cc @@ -0,0 +1,104 @@ +// Copyright 2021 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+
+#include "url/origin_abstract_tests.h"
+
+namespace url {
+
+void ExpectParsedUrlsEqual(const GURL& a, const GURL& b) {
+  EXPECT_EQ(a, b);  // Whole URLs first, then each parsed component range.
+  const Parsed& a_parsed = a.parsed_for_possibly_invalid_spec();
+  const Parsed& b_parsed = b.parsed_for_possibly_invalid_spec();
+  EXPECT_EQ(a_parsed.scheme.begin, b_parsed.scheme.begin);
+  EXPECT_EQ(a_parsed.scheme.len, b_parsed.scheme.len);
+  EXPECT_EQ(a_parsed.username.begin, b_parsed.username.begin);
+  EXPECT_EQ(a_parsed.username.len, b_parsed.username.len);
+  EXPECT_EQ(a_parsed.password.begin, b_parsed.password.begin);
+  EXPECT_EQ(a_parsed.password.len, b_parsed.password.len);
+  EXPECT_EQ(a_parsed.host.begin, b_parsed.host.begin);
+  EXPECT_EQ(a_parsed.host.len, b_parsed.host.len);
+  EXPECT_EQ(a_parsed.port.begin, b_parsed.port.begin);
+  EXPECT_EQ(a_parsed.port.len, b_parsed.port.len);
+  EXPECT_EQ(a_parsed.path.begin, b_parsed.path.begin);
+  EXPECT_EQ(a_parsed.path.len, b_parsed.path.len);
+  EXPECT_EQ(a_parsed.query.begin, b_parsed.query.begin);
+  EXPECT_EQ(a_parsed.query.len, b_parsed.query.len);
+  EXPECT_EQ(a_parsed.ref.begin, b_parsed.ref.begin);
+  EXPECT_EQ(a_parsed.ref.len, b_parsed.ref.len);
+}
+
+// static
+Origin UrlOriginTestTraits::CreateOriginFromString(base::StringPiece s) {
+  return Origin::Create(GURL(s));
+}
+
+// static
+Origin UrlOriginTestTraits::CreateUniqueOpaqueOrigin() {
+  return Origin();
+}
+
+// static
+Origin UrlOriginTestTraits::CreateWithReferenceOrigin(
+    base::StringPiece url,
+    const Origin& reference_origin) {
+  return Origin::Resolve(GURL(url), reference_origin);
+}
+
+// static
+Origin UrlOriginTestTraits::DeriveNewOpaqueOrigin(
+    const Origin& reference_origin) {
+  return reference_origin.DeriveNewOpaqueOrigin();
+}
+
+// static
+bool UrlOriginTestTraits::IsOpaque(const Origin& origin) {
+  return origin.opaque();
+}
+
+// static
+std::string UrlOriginTestTraits::GetScheme(const Origin& origin) {
+  return origin.scheme();
+}
+
+// static
+std::string UrlOriginTestTraits::GetHost(const Origin& origin) {
+  return origin.host();
+}
+
+// static
+uint16_t UrlOriginTestTraits::GetPort(const Origin& origin) {
+  return origin.port();
+}
+
+// static
+SchemeHostPort UrlOriginTestTraits::GetTupleOrPrecursorTupleIfOpaque(
+    const Origin& origin) {
+  return origin.GetTupleOrPrecursorTupleIfOpaque();
+}
+
+// static
+bool UrlOriginTestTraits::IsSameOrigin(const Origin& a, const Origin& b) {
+  return a.IsSameOriginWith(b);
+}
+
+// static
+std::string UrlOriginTestTraits::Serialize(const Origin& origin) {
+  std::string serialized = origin.Serialize();
+
+  // Extra test assertion for GetURL (which doesn't have an equivalent in
+  // blink::SecurityOrigin).
+  ExpectParsedUrlsEqual(GURL(serialized), origin.GetURL());
+
+  return serialized;
+}
+
+// static
+bool UrlOriginTestTraits::IsValidUrl(base::StringPiece str) {
+  return GURL(str).is_valid();
+}
+
+// This is an abstract test suite which is instantiated by each implementation.
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AbstractOriginTest);
+
+}  // namespace url
diff --git a/origin_abstract_tests.h b/origin_abstract_tests.h
new file mode 100644
index 00000000000..63dded619f3
--- /dev/null
+++ b/origin_abstract_tests.h
@@ -0,0 +1,527 @@
+// Copyright 2020 The Chromium Authors
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+ +#ifndef URL_ORIGIN_ABSTRACT_TESTS_H_ +#define URL_ORIGIN_ABSTRACT_TESTS_H_ + +#include +#include + +#include "base/containers/contains.h" +#include "base/strings/string_piece.h" +#include "testing/gtest/include/gtest/gtest.h" +#include "url/gurl.h" +#include "url/origin.h" +#include "url/scheme_host_port.h" +#include "url/url_util.h" + +namespace url { + +void ExpectParsedUrlsEqual(const GURL& a, const GURL& b); + +// AbstractOriginTest below abstracts away differences between url::Origin and +// blink::SecurityOrigin by parametrizing the tests with a class that has to +// expose the same public members as UrlOriginTestTraits below. +class UrlOriginTestTraits { + public: + using OriginType = Origin; + + // Constructing an origin. + static OriginType CreateOriginFromString(base::StringPiece s); + static OriginType CreateUniqueOpaqueOrigin(); + static OriginType CreateWithReferenceOrigin( + base::StringPiece url, + const OriginType& reference_origin); + static OriginType DeriveNewOpaqueOrigin(const OriginType& reference_origin); + + // Accessors for origin properties. + static bool IsOpaque(const OriginType& origin); + static std::string GetScheme(const OriginType& origin); + static std::string GetHost(const OriginType& origin); + static uint16_t GetPort(const OriginType& origin); + static SchemeHostPort GetTupleOrPrecursorTupleIfOpaque( + const OriginType& origin); + + // Wrappers for other instance methods of OriginType. + static bool IsSameOrigin(const OriginType& a, const OriginType& b); + static std::string Serialize(const OriginType& origin); + + // "Accessors" of URL properties. + // + // TODO(lukasza): Consider merging together OriginTraitsBase here and + // UrlTraitsBase in //url/gurl_abstract_tests.h. + static bool IsValidUrl(base::StringPiece str); + + // Only static members = no constructors are needed. + UrlOriginTestTraits() = delete; +}; + +// Test suite for tests that cover both url::Origin and blink::SecurityOrigin. 
+template +class AbstractOriginTest : public testing::Test { + public: + void SetUp() override { + const char* kSchemesToRegister[] = { + "noaccess", + "std-with-host", + "noaccess-std-with-host", + "local", + "local-noaccess", + "local-std-with-host", + "local-noaccess-std-with-host", + "also-local", + "sec", + "sec-std-with-host", + "sec-noaccess", + }; + for (const char* kScheme : kSchemesToRegister) { + std::string scheme(kScheme); + if (base::Contains(scheme, "noaccess")) + AddNoAccessScheme(kScheme); + if (base::Contains(scheme, "std-with-host")) + AddStandardScheme(kScheme, SchemeType::SCHEME_WITH_HOST); + if (base::Contains(scheme, "local")) + AddLocalScheme(kScheme); + if (base::Contains(scheme, "sec")) + AddSecureScheme(kScheme); + } + } + + protected: + // Wrappers that help elide away TOriginTraits. + // + // Note that calling the wrappers needs to be prefixed with `this->...` to + // avoid hitting: explicit qualification required to use member 'IsOpaque' + // from dependent base class. 
+ using OriginType = typename TOriginTraits::OriginType; + OriginType CreateOriginFromString(base::StringPiece s) { + return TOriginTraits::CreateOriginFromString(s); + } + OriginType CreateUniqueOpaqueOrigin() { + return TOriginTraits::CreateUniqueOpaqueOrigin(); + } + OriginType CreateWithReferenceOrigin(base::StringPiece url, + const OriginType& reference_origin) { + return TOriginTraits::CreateWithReferenceOrigin(url, reference_origin); + } + OriginType DeriveNewOpaqueOrigin(const OriginType& reference_origin) { + return TOriginTraits::DeriveNewOpaqueOrigin(reference_origin); + } + bool IsOpaque(const OriginType& origin) { + return TOriginTraits::IsOpaque(origin); + } + std::string GetScheme(const OriginType& origin) { + return TOriginTraits::GetScheme(origin); + } + std::string GetHost(const OriginType& origin) { + return TOriginTraits::GetHost(origin); + } + uint16_t GetPort(const OriginType& origin) { + return TOriginTraits::GetPort(origin); + } + SchemeHostPort GetTupleOrPrecursorTupleIfOpaque(const OriginType& origin) { + return TOriginTraits::GetTupleOrPrecursorTupleIfOpaque(origin); + } + bool IsSameOrigin(const OriginType& a, const OriginType& b) { + bool is_a_same_with_b = TOriginTraits::IsSameOrigin(a, b); + bool is_b_same_with_a = TOriginTraits::IsSameOrigin(b, a); + EXPECT_EQ(is_a_same_with_b, is_b_same_with_a); + return is_a_same_with_b; + } + std::string Serialize(const OriginType& origin) { + return TOriginTraits::Serialize(origin); + } + bool IsValidUrl(base::StringPiece str) { + return TOriginTraits::IsValidUrl(str); + } + +#define EXPECT_SAME_ORIGIN(a, b) \ + EXPECT_TRUE(this->IsSameOrigin((a), (b))) \ + << "When checking if \"" << this->Serialize(a) << "\" is " \ + << "same-origin with \"" << this->Serialize(b) << "\"" + +#define EXPECT_CROSS_ORIGIN(a, b) \ + EXPECT_FALSE(this->IsSameOrigin((a), (b))) \ + << "When checking if \"" << this->Serialize(a) << "\" is " \ + << "cross-origin from \"" << this->Serialize(b) << "\"" + + void 
VerifyOriginInvariants(const OriginType& origin) { + // An origin is always same-origin with itself. + EXPECT_SAME_ORIGIN(origin, origin); + + // A copy of |origin| should be same-origin as well. + auto origin_copy = origin; + EXPECT_EQ(this->GetScheme(origin), this->GetScheme(origin_copy)); + EXPECT_EQ(this->GetHost(origin), this->GetHost(origin_copy)); + EXPECT_EQ(this->GetPort(origin), this->GetPort(origin_copy)); + EXPECT_EQ(this->IsOpaque(origin), this->IsOpaque(origin_copy)); + EXPECT_SAME_ORIGIN(origin, origin_copy); + + // An origin is always cross-origin from another, unique, opaque origin. + EXPECT_CROSS_ORIGIN(origin, this->CreateUniqueOpaqueOrigin()); + + // An origin is always cross-origin from another tuple origin. + auto different_tuple_origin = + this->CreateOriginFromString("https://not-in-the-list.test/"); + EXPECT_CROSS_ORIGIN(origin, different_tuple_origin); + + // Deriving an origin for "about:blank". + auto about_blank_origin1 = + this->CreateWithReferenceOrigin("about:blank", origin); + auto about_blank_origin2 = + this->CreateWithReferenceOrigin("about:blank?bar#foo", origin); + EXPECT_SAME_ORIGIN(origin, about_blank_origin1); + EXPECT_SAME_ORIGIN(origin, about_blank_origin2); + + // Derived opaque origins. 
+ std::vector derived_origins = { + this->DeriveNewOpaqueOrigin(origin), + this->CreateWithReferenceOrigin("data:text/html,baz", origin), + this->DeriveNewOpaqueOrigin(about_blank_origin1), + }; + for (size_t i = 0; i < derived_origins.size(); i++) { + SCOPED_TRACE(testing::Message() << "Derived origin #" << i); + const OriginType& derived_origin = derived_origins[i]; + EXPECT_TRUE(this->IsOpaque(derived_origin)); + EXPECT_SAME_ORIGIN(derived_origin, derived_origin); + EXPECT_CROSS_ORIGIN(origin, derived_origin); + EXPECT_EQ(this->GetTupleOrPrecursorTupleIfOpaque(origin), + this->GetTupleOrPrecursorTupleIfOpaque(derived_origin)); + } + } + + void VerifyUniqueOpaqueOriginInvariants(const OriginType& origin) { + if (!this->IsOpaque(origin)) { + ADD_FAILURE() << "Got unexpectedly non-opaque origin: " + << this->Serialize(origin); + return; // Skip other test assertions. + } + + // Opaque origins should have an "empty" scheme, host and port. + EXPECT_EQ("", this->GetScheme(origin)); + EXPECT_EQ("", this->GetHost(origin)); + EXPECT_EQ(0, this->GetPort(origin)); + + // Unique opaque origins should have an empty precursor tuple. + EXPECT_EQ(SchemeHostPort(), this->GetTupleOrPrecursorTupleIfOpaque(origin)); + + // Serialization test. + EXPECT_EQ("null", this->Serialize(origin)); + + // Invariants that should hold for any origin. + VerifyOriginInvariants(origin); + } + + void TestUniqueOpaqueOrigin(base::StringPiece test_input) { + auto origin = this->CreateOriginFromString(test_input); + this->VerifyUniqueOpaqueOriginInvariants(origin); + + // Re-creating from the URL should be cross-origin. + auto origin_recreated_from_same_input = + this->CreateOriginFromString(test_input); + EXPECT_CROSS_ORIGIN(origin, origin_recreated_from_same_input); + } + + void VerifyTupleOriginInvariants(const OriginType& origin, + const SchemeHostPort& expected_tuple) { + if (this->IsOpaque(origin)) { + ADD_FAILURE() << "Got unexpectedly opaque origin"; + return; // Skip other test assertions. 
+ } + SCOPED_TRACE(testing::Message() + << "Actual origin: " << this->Serialize(origin)); + + // Compare `origin` against the `expected_tuple`. + EXPECT_EQ(expected_tuple.scheme(), this->GetScheme(origin)); + EXPECT_EQ(expected_tuple.host(), this->GetHost(origin)); + EXPECT_EQ(expected_tuple.port(), this->GetPort(origin)); + EXPECT_EQ(expected_tuple, this->GetTupleOrPrecursorTupleIfOpaque(origin)); + + // Serialization test. + // + // TODO(lukasza): Consider preserving the hostname when serializing file: + // URLs. Dropping the hostname seems incompatible with section 6 of + // rfc6454. Even though section 4 says that "the implementation MAY + // return an implementation-defined value", it seems that Chromium + // implementation *does* include the hostname in the origin SchemeHostPort + // tuple. + if (expected_tuple.scheme() != kFileScheme || expected_tuple.host() == "") { + EXPECT_SAME_ORIGIN(origin, + this->CreateOriginFromString(this->Serialize(origin))); + } + + // Invariants that should hold for any origin. + VerifyOriginInvariants(origin); + } + + private: + ScopedSchemeRegistryForTests scoped_scheme_registry_; +}; + +TYPED_TEST_SUITE_P(AbstractOriginTest); + +TYPED_TEST_P(AbstractOriginTest, NonStandardSchemeWithAndroidWebViewHack) { + EnableNonStandardSchemesForAndroidWebView(); + + // Regression test for https://crbug.com/896059. + auto origin = this->CreateOriginFromString("unknown-scheme://"); + EXPECT_FALSE(this->IsOpaque(origin)); + EXPECT_EQ("unknown-scheme", this->GetScheme(origin)); + EXPECT_EQ("", this->GetHost(origin)); + EXPECT_EQ(0, this->GetPort(origin)); + + // about:blank translates into an opaque origin, even in presence of + // EnableNonStandardSchemesForAndroidWebView. + origin = this->CreateOriginFromString("about:blank"); + EXPECT_TRUE(this->IsOpaque(origin)); +} + +TYPED_TEST_P(AbstractOriginTest, OpaqueOriginsFromValidUrls) { + const char* kTestCases[] = { + // Built-in noaccess schemes. 
+ "data:text/html,Hello!", + "javascript:alert(1)", + "about:blank", + + // Opaque blob URLs. + "blob:null/foo", // blob:null (actually a valid URL) + "blob:data:foo", // blob + data (which is nonstandard) + "blob:about://blank/", // blob + about (which is nonstandard) + "blob:about:blank/", // blob + about (which is nonstandard) + "blob:blob:http://www.example.com/guid-goes-here", + "blob:filesystem:ws:b/.", + "blob:filesystem:ftp://a/b", + "blob:blob:file://localhost/foo/bar", + }; + + for (const char* test_input : kTestCases) { + SCOPED_TRACE(testing::Message() << "Test input: " << test_input); + + // Verify that `origin` is opaque not just because `test_input` results is + // an invalid URL (because of a typo in the scheme name, or because of a + // technicality like having no host in a noaccess-std-with-host: scheme). + EXPECT_TRUE(this->IsValidUrl(test_input)); + + this->TestUniqueOpaqueOrigin(test_input); + } +} + +TYPED_TEST_P(AbstractOriginTest, OpaqueOriginsFromInvalidUrls) { + // TODO(lukasza): Consider moving those to GURL/KURL tests that verify what + // inputs are parsed as an invalid URL. + + const char* kTestCases[] = { + // Invalid file: URLs. + "file://example.com:443/etc/passwd", // No port expected. + + // Invalid HTTP URLs. + "http", + "http:", + "http:/", + "http://", + "http://:", + "http://:1", + "http::///invalid.example.com/", + "http://example.com:65536/", // Port out of range. + "http://example.com:-1/", // Port out of range. + "http://example.com:18446744073709551616/", // Port = 2^64. + "http://example.com:18446744073709551616999/", // Lots of port digits. + + // Invalid filesystem URLs. + "filesystem:http://example.com/", // Missing /type/. + "filesystem:local:baz./type/", + "filesystem:local://hostname/type/", + "filesystem:unknown-scheme://hostname/type/", + "filesystem:filesystem:http://example.org:88/foo/bar", + + // Invalid IP addresses + "http://[]/", + "http://[2001:0db8:0000:0000:0000:0000:0000:0000:0001]/", // 9 groups. 
+ + // Unknown scheme without a colon character (":") gives an invalid URL. + "unknown-scheme", + + // Standard schemes require a hostname (and result in an opaque origin if + // the hostname is missing). + "local-std-with-host:", + "noaccess-std-with-host:", + }; + + for (const char* test_input : kTestCases) { + SCOPED_TRACE(testing::Message() << "Test input: " << test_input); + + // All test cases here are expected to be parsed as invalid URLs (e.g. + // because of a malformed host or port, or because of a + // technicality like having no host in a noaccess-std-with-host: scheme). + EXPECT_FALSE(this->IsValidUrl(test_input)); + + // Invalid URLs should always result in an opaque origin. + this->TestUniqueOpaqueOrigin(test_input); + } +} + +TYPED_TEST_P(AbstractOriginTest, TupleOrigins) { + struct TestCase { + const char* input; + SchemeHostPort expected_tuple; + } kTestCases[] = { + // file: URLs + {"file:///etc/passwd", {"file", "", 0}}, + {"file://example.com/etc/passwd", {"file", "example.com", 0}}, + {"file:///", {"file", "", 0}}, + {"file://hostname/C:/dir/file.txt", {"file", "hostname", 0}}, + + // HTTP URLs + {"http://example.com/", {"http", "example.com", 80}}, + {"http://example.com:80/", {"http", "example.com", 80}}, + {"http://example.com:123/", {"http", "example.com", 123}}, + {"http://example.com:0/", {"http", "example.com", 0}}, + {"http://example.com:65535/", {"http", "example.com", 65535}}, + {"https://example.com/", {"https", "example.com", 443}}, + {"https://example.com:443/", {"https", "example.com", 443}}, + {"https://example.com:123/", {"https", "example.com", 123}}, + {"https://example.com:0/", {"https", "example.com", 0}}, + {"https://example.com:65535/", {"https", "example.com", 65535}}, + {"http://user:pass@example.com/", {"http", "example.com", 80}}, + {"http://example.com:123/?query", {"http", "example.com", 123}}, + {"https://example.com/#1234", {"https", "example.com", 443}}, + {"https://u:p@example.com:123/?query#1234", + 
{"https", "example.com", 123}}, + {"http://example/", {"http", "example", 80}}, + + // Blob URLs. + {"blob:http://example.com/guid-goes-here", {"http", "example.com", 80}}, + {"blob:http://example.com:123/guid-goes-here", + {"http", "example.com", 123}}, + {"blob:https://example.com/guid-goes-here", + {"https", "example.com", 443}}, + {"blob:http://u:p@example.com/guid-goes-here", + {"http", "example.com", 80}}, + + // Filesystem URLs. + {"filesystem:http://example.com/type/", {"http", "example.com", 80}}, + {"filesystem:http://example.com:123/type/", {"http", "example.com", 123}}, + {"filesystem:https://example.com/type/", {"https", "example.com", 443}}, + {"filesystem:https://example.com:123/type/", + {"https", "example.com", 123}}, + {"filesystem:local-std-with-host:baz./type/", + {"local-std-with-host", "baz.", 0}}, + + // IP Addresses + {"http://192.168.9.1/", {"http", "192.168.9.1", 80}}, + {"http://[2001:db8::1]/", {"http", "[2001:db8::1]", 80}}, + {"http://[2001:0db8:0000:0000:0000:0000:0000:0001]/", + {"http", "[2001:db8::1]", 80}}, + {"http://1/", {"http", "0.0.0.1", 80}}, + {"http://1:1/", {"http", "0.0.0.1", 1}}, + {"http://3232237825/", {"http", "192.168.9.1", 80}}, + + // Punycode + {"http://☃.net/", {"http", "xn--n3h.net", 80}}, + {"blob:http://☃.net/", {"http", "xn--n3h.net", 80}}, + {"local-std-with-host:↑↑↓↓←→←→ba.↑↑↓↓←→←→ba.0.bg", + {"local-std-with-host", "xn--ba-rzuadaibfa.xn--ba-rzuadaibfa.0.bg", 0}}, + + // Registered URLs + {"ftp://example.com/", {"ftp", "example.com", 21}}, + {"ws://example.com/", {"ws", "example.com", 80}}, + {"wss://example.com/", {"wss", "example.com", 443}}, + {"wss://user:pass@example.com/", {"wss", "example.com", 443}}, + }; + + for (const TestCase& test : kTestCases) { + SCOPED_TRACE(testing::Message() << "Test input: " << test.input); + + // Only valid URLs should translate into valid, non-opaque origins. 
+ EXPECT_TRUE(this->IsValidUrl(test.input)); + + auto origin = this->CreateOriginFromString(test.input); + this->VerifyTupleOriginInvariants(origin, test.expected_tuple); + } +} + +TYPED_TEST_P(AbstractOriginTest, CustomSchemes_OpaqueOrigins) { + const char* kTestCases[] = { + // Unknown scheme + "unknown-scheme:foo", + "unknown-scheme://bar", + + // Unknown scheme that is a prefix or suffix of a registered scheme. + "loca:foo", + "ocal:foo", + "local-suffix:foo", + "prefix-local:foo", + + // Custom no-access schemes translate into an opaque origin (just like the + // built-in no-access schemes such as about:blank or data:). + "noaccess-std-with-host:foo", + "noaccess-std-with-host://bar", + "noaccess://host", + "local-noaccess://host", + "local-noaccess-std-with-host://host", + }; + + for (const char* test_input : kTestCases) { + SCOPED_TRACE(testing::Message() << "Test input: " << test_input); + + // Verify that `origin` is opaque not just because `test_input` results in + // an invalid URL (because of a typo in the scheme name, or because of a + // technicality like having no host in a noaccess-std-with-host: scheme). + EXPECT_TRUE(this->IsValidUrl(test_input)); + + this->TestUniqueOpaqueOrigin(test_input); + } +} + +TYPED_TEST_P(AbstractOriginTest, CustomSchemes_TupleOrigins) { + struct TestCase { + const char* input; + SchemeHostPort expected_tuple; + } kTestCases[] = { + // Scheme (registered in SetUp()) that's both local and standard. + // TODO: Is it really appropriate to do network-host canonicalization of + // schemes without ports? 
+ {"local-std-with-host:20", {"local-std-with-host", "0.0.0.20", 0}}, + {"local-std-with-host:20.", {"local-std-with-host", "0.0.0.20", 0}}, + {"local-std-with-host:foo", {"local-std-with-host", "foo", 0}}, + {"local-std-with-host://bar:20", {"local-std-with-host", "bar", 0}}, + {"local-std-with-host:baz.", {"local-std-with-host", "baz.", 0}}, + {"local-std-with-host:baz..", {"local-std-with-host", "baz..", 0}}, + {"local-std-with-host:baz..bar", {"local-std-with-host", "baz..bar", 0}}, + {"local-std-with-host:baz...", {"local-std-with-host", "baz...", 0}}, + + // Scheme (registered in SetUp()) that's local but nonstandard. These + // always have empty hostnames, but are allowed to be url::Origins. + {"local:", {"local", "", 0}}, + {"local:foo", {"local", "", 0}}, + {"local://bar", {"local", "", 0}}, + {"also-local://bar", {"also-local", "", 0}}, + + {"std-with-host://host", {"std-with-host", "host", 0}}, + {"local://host", {"local", "", 0}}, + {"local-std-with-host://host", {"local-std-with-host", "host", 0}}, + }; + + for (const TestCase& test : kTestCases) { + SCOPED_TRACE(testing::Message() << "Test input: " << test.input); + + // Only valid URLs should translate into valid, non-opaque origins. 
+ EXPECT_TRUE(this->IsValidUrl(test.input)); + + auto origin = this->CreateOriginFromString(test.input); + this->VerifyTupleOriginInvariants(origin, test.expected_tuple); + } +} + +REGISTER_TYPED_TEST_SUITE_P(AbstractOriginTest, + NonStandardSchemeWithAndroidWebViewHack, + OpaqueOriginsFromValidUrls, + OpaqueOriginsFromInvalidUrls, + TupleOrigins, + CustomSchemes_OpaqueOrigins, + CustomSchemes_TupleOrigins); + +} // namespace url + +#endif // URL_ORIGIN_ABSTRACT_TESTS_H_ diff --git a/origin_unittest.cc b/origin_unittest.cc new file mode 100644 index 00000000000..47cca812a65 --- /dev/null +++ b/origin_unittest.cc @@ -0,0 +1,777 @@ +// Copyright 2015 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include +#include + +#include "base/memory/raw_ptr.h" +#include "testing/gmock/include/gmock/gmock.h" +#include "testing/gtest/include/gtest/gtest.h" +#include "url/gurl.h" +#include "url/origin.h" +#include "url/origin_abstract_tests.h" +#include "url/url_util.h" + +namespace url { + +class OriginTest : public ::testing::Test { + public: + void SetUp() override { + // Add two schemes which are local but nonstandard. + AddLocalScheme("local-but-nonstandard"); + AddLocalScheme("also-local-but-nonstandard"); + + // Add a scheme that's both local and standard. + AddStandardScheme("local-and-standard", SchemeType::SCHEME_WITH_HOST); + AddLocalScheme("local-and-standard"); + + // Add a scheme that's standard but no-access. We still want these to + // form valid SchemeHostPorts, even though they always commit as opaque + // origins, so that they can represent the source of the resource even if + // it's not committable as a non-opaque origin. 
+ AddStandardScheme("standard-but-noaccess", SchemeType::SCHEME_WITH_HOST); + AddNoAccessScheme("standard-but-noaccess"); + } + + ::testing::AssertionResult DoEqualityComparisons(const url::Origin& a, + const url::Origin& b, + bool should_compare_equal) { + ::testing::AssertionResult failure = ::testing::AssertionFailure(); + failure << "DoEqualityComparisons failure. Expecting " + << (should_compare_equal ? "equality" : "inequality") + << " between:\n a\n Which is: " << a + << "\n b\n Which is: " << b << "\nThe following check failed: "; + if (a.IsSameOriginWith(b) != should_compare_equal) + return failure << "a.IsSameOriginWith(b)"; + if (b.IsSameOriginWith(a) != should_compare_equal) + return failure << "b.IsSameOriginWith(a)"; + if ((a == b) != should_compare_equal) + return failure << "(a == b)"; + if ((b == a) != should_compare_equal) + return failure << "(b == a)"; + if ((b != a) != !should_compare_equal) + return failure << "(b != a)"; + if ((a != b) != !should_compare_equal) + return failure << "(a != b)"; + return ::testing::AssertionSuccess(); + } + + bool HasNonceTokenBeenInitialized(const url::Origin& origin) { + EXPECT_TRUE(origin.opaque()); + // Avoid calling nonce_.token() here, to not trigger lazy initialization. + return !origin.nonce_->token_.is_empty(); + } + + Origin::Nonce CreateNonce() { return Origin::Nonce(); } + + Origin::Nonce CreateNonce(base::UnguessableToken nonce) { + return Origin::Nonce(nonce); + } + + const base::UnguessableToken* GetNonce(const Origin& origin) { + return origin.GetNonceForSerialization(); + } + + // Wrappers around url::Origin methods to expose it to tests. 
+ + absl::optional UnsafelyCreateOpaqueOriginWithoutNormalization( + base::StringPiece precursor_scheme, + base::StringPiece precursor_host, + uint16_t precursor_port, + const Origin::Nonce& nonce) { + return Origin::UnsafelyCreateOpaqueOriginWithoutNormalization( + precursor_scheme, precursor_host, precursor_port, nonce); + } + + absl::optional SerializeWithNonce(const Origin& origin) { + return origin.SerializeWithNonce(); + } + + absl::optional SerializeWithNonceAndInitIfNeeded( + Origin& origin) { + return origin.SerializeWithNonceAndInitIfNeeded(); + } + + absl::optional Deserialize(const std::string& value) { + return Origin::Deserialize(value); + } + + private: + ScopedSchemeRegistryForTests scoped_registry_; +}; + +TEST_F(OriginTest, OpaqueOriginComparison) { + // A default-constructed Origin should be cross origin to everything + // but itself. + url::Origin opaque_a, opaque_b; + EXPECT_TRUE(opaque_a.opaque()); + EXPECT_EQ("", opaque_a.scheme()); + EXPECT_EQ("", opaque_a.host()); + EXPECT_EQ(0, opaque_a.port()); + EXPECT_EQ(SchemeHostPort(), opaque_a.GetTupleOrPrecursorTupleIfOpaque()); + EXPECT_FALSE(opaque_a.GetTupleOrPrecursorTupleIfOpaque().IsValid()); + + EXPECT_TRUE(opaque_b.opaque()); + EXPECT_EQ("", opaque_b.scheme()); + EXPECT_EQ("", opaque_b.host()); + EXPECT_EQ(0, opaque_b.port()); + EXPECT_EQ(SchemeHostPort(), opaque_b.GetTupleOrPrecursorTupleIfOpaque()); + EXPECT_FALSE(opaque_b.GetTupleOrPrecursorTupleIfOpaque().IsValid()); + + // Two default-constructed Origins should always be cross origin to each + // other. + EXPECT_TRUE(DoEqualityComparisons(opaque_a, opaque_b, false)); + EXPECT_TRUE(DoEqualityComparisons(opaque_b, opaque_b, true)); + EXPECT_TRUE(DoEqualityComparisons(opaque_a, opaque_a, true)); + + // The streaming operator should not trigger lazy initialization to the token. 
+ std::ostringstream stream; + stream << opaque_a; + EXPECT_STREQ("null [internally: (nonce TBD) anonymous]", + stream.str().c_str()); + EXPECT_FALSE(HasNonceTokenBeenInitialized(opaque_a)); + + // None of the operations thus far should have triggered lazy-generation of + // the UnguessableToken. Copying an origin, however, should trigger this. + EXPECT_FALSE(HasNonceTokenBeenInitialized(opaque_a)); + EXPECT_FALSE(HasNonceTokenBeenInitialized(opaque_b)); + opaque_b = opaque_a; + + EXPECT_TRUE(HasNonceTokenBeenInitialized(opaque_a)); + EXPECT_TRUE(HasNonceTokenBeenInitialized(opaque_b)); + EXPECT_TRUE(DoEqualityComparisons(opaque_a, opaque_b, true)); + EXPECT_TRUE(DoEqualityComparisons(opaque_b, opaque_b, true)); + EXPECT_TRUE(DoEqualityComparisons(opaque_a, opaque_a, true)); + + // Move-initializing to a fresh Origin should restore the lazy initialization. + opaque_a = url::Origin(); + EXPECT_FALSE(HasNonceTokenBeenInitialized(opaque_a)); + EXPECT_TRUE(HasNonceTokenBeenInitialized(opaque_b)); + EXPECT_TRUE(DoEqualityComparisons(opaque_a, opaque_b, false)); + EXPECT_TRUE(DoEqualityComparisons(opaque_b, opaque_b, true)); + EXPECT_TRUE(DoEqualityComparisons(opaque_a, opaque_a, true)); + + // Comparing two opaque Origins with matching SchemeHostPorts should trigger + // lazy initialization. 
+ EXPECT_FALSE(HasNonceTokenBeenInitialized(opaque_a)); + EXPECT_TRUE(HasNonceTokenBeenInitialized(opaque_b)); + bool should_swap = opaque_b < opaque_a; + EXPECT_TRUE(HasNonceTokenBeenInitialized(opaque_a)); + EXPECT_TRUE(HasNonceTokenBeenInitialized(opaque_b)); + + if (should_swap) + std::swap(opaque_a, opaque_b); + EXPECT_LT(opaque_a, opaque_b); + EXPECT_FALSE(opaque_b < opaque_a); + + EXPECT_TRUE(DoEqualityComparisons(opaque_a, opaque_b, false)); + EXPECT_TRUE(DoEqualityComparisons(opaque_b, opaque_b, true)); + EXPECT_TRUE(DoEqualityComparisons(opaque_a, opaque_a, true)); + + EXPECT_LT(opaque_a, url::Origin::Create(GURL("http://www.google.com"))); + EXPECT_LT(opaque_b, url::Origin::Create(GURL("http://www.google.com"))); + + EXPECT_EQ(opaque_b, url::Origin::Resolve(GURL(), opaque_b)); + EXPECT_EQ(opaque_b, url::Origin::Resolve(GURL("about:blank"), opaque_b)); + EXPECT_EQ(opaque_b, url::Origin::Resolve(GURL("about:srcdoc"), opaque_b)); + EXPECT_EQ(opaque_b, + url::Origin::Resolve(GURL("about:blank?hello#whee"), opaque_b)); +} + +TEST_F(OriginTest, ConstructFromTuple) { + struct TestCases { + const char* const scheme; + const char* const host; + const uint16_t port; + } cases[] = { + {"http", "example.com", 80}, + {"http", "example.com", 123}, + {"https", "example.com", 443}, + }; + + for (const auto& test_case : cases) { + testing::Message scope_message; + scope_message << test_case.scheme << "://" << test_case.host << ":" + << test_case.port; + SCOPED_TRACE(scope_message); + Origin origin = Origin::CreateFromNormalizedTuple( + test_case.scheme, test_case.host, test_case.port); + + EXPECT_EQ(test_case.scheme, origin.scheme()); + EXPECT_EQ(test_case.host, origin.host()); + EXPECT_EQ(test_case.port, origin.port()); + } +} + +TEST_F(OriginTest, Serialization) { + struct TestCases { + const char* const url; + const char* const expected; + const char* const expected_log; + } cases[] = { + {"http://192.168.9.1/", "http://192.168.9.1"}, + {"http://[2001:db8::1]/", 
"http://[2001:db8::1]"}, + {"http://☃.net/", "http://xn--n3h.net"}, + {"http://example.com/", "http://example.com"}, + {"http://example.com:123/", "http://example.com:123"}, + {"https://example.com/", "https://example.com"}, + {"https://example.com:123/", "https://example.com:123"}, + {"file:///etc/passwd", "file://", "file:// [internally: file://]"}, + {"file://example.com/etc/passwd", "file://", + "file:// [internally: file://example.com]"}, + {"data:,", "null", "null [internally: (nonce TBD) anonymous]"}, + }; + + for (const auto& test_case : cases) { + SCOPED_TRACE(test_case.url); + GURL url(test_case.url); + EXPECT_TRUE(url.is_valid()); + Origin origin = Origin::Create(url); + std::string serialized = origin.Serialize(); + ExpectParsedUrlsEqual(GURL(serialized), origin.GetURL()); + + EXPECT_EQ(test_case.expected, serialized); + + // The '<<' operator sometimes produces additional information. + std::stringstream out; + out << origin; + if (test_case.expected_log) + EXPECT_EQ(test_case.expected_log, out.str()); + else + EXPECT_EQ(test_case.expected, out.str()); + } +} + +TEST_F(OriginTest, Comparison) { + // These URLs are arranged in increasing order: + const char* const urls[] = { + "data:uniqueness", "http://a:80", "http://b:80", + "https://a:80", "https://b:80", "http://a:81", + "http://b:81", "https://a:81", "https://b:81", + }; + // Validate the comparison logic still works when creating a canonical origin, + // when any created opaque origins contain a nonce. + { + // Pre-create the origins, as the internal nonce for unique origins changes + // with each freshly-constructed Origin (that's not copied). 
+ std::vector origins; + for (const auto* test_url : urls) + origins.push_back(Origin::Create(GURL(test_url))); + for (size_t i = 0; i < origins.size(); i++) { + const Origin& current = origins[i]; + for (size_t j = i; j < origins.size(); j++) { + const Origin& to_compare = origins[j]; + EXPECT_EQ(i < j, current < to_compare) << i << " < " << j; + EXPECT_EQ(j < i, to_compare < current) << j << " < " << i; + } + } + } +} + +TEST_F(OriginTest, UnsafelyCreate) { + struct TestCase { + const char* scheme; + const char* host; + uint16_t port; + } cases[] = { + {"http", "example.com", 80}, + {"http", "example.com", 123}, + {"https", "example.com", 443}, + {"https", "example.com", 123}, + {"http", "example.com", 0}, // 0 is a valid port for http. + {"file", "", 0}, // 0 indicates "no port" for file: scheme. + {"file", "example.com", 0}, + }; + + for (const auto& test : cases) { + SCOPED_TRACE(testing::Message() + << test.scheme << "://" << test.host << ":" << test.port); + absl::optional origin = + url::Origin::UnsafelyCreateTupleOriginWithoutNormalization( + test.scheme, test.host, test.port); + ASSERT_TRUE(origin); + EXPECT_EQ(test.scheme, origin->scheme()); + EXPECT_EQ(test.host, origin->host()); + EXPECT_EQ(test.port, origin->port()); + EXPECT_FALSE(origin->opaque()); + EXPECT_TRUE(origin->IsSameOriginWith(*origin)); + + ExpectParsedUrlsEqual(GURL(origin->Serialize()), origin->GetURL()); + + base::UnguessableToken nonce = base::UnguessableToken::Create(); + absl::optional opaque_origin = + UnsafelyCreateOpaqueOriginWithoutNormalization( + test.scheme, test.host, test.port, CreateNonce(nonce)); + ASSERT_TRUE(opaque_origin); + EXPECT_TRUE(opaque_origin->opaque()); + EXPECT_FALSE(*opaque_origin == origin); + EXPECT_EQ(opaque_origin->GetTupleOrPrecursorTupleIfOpaque(), + origin->GetTupleOrPrecursorTupleIfOpaque()); + EXPECT_EQ(opaque_origin, + UnsafelyCreateOpaqueOriginWithoutNormalization( + test.scheme, test.host, test.port, CreateNonce(nonce))); + 
// Neither factory may produce an origin from a tuple that is not already in
// canonical form; the only exception is the fully empty tuple, which is
// accepted as the precursor of an opaque origin.
TEST_F(OriginTest, UnsafelyCreateUniqueOnInvalidInput) {
  url::AddStandardScheme("host-only", url::SCHEME_WITH_HOST);
  url::AddStandardScheme("host-port-only", url::SCHEME_WITH_HOST_AND_PORT);

  // Rows that omit the port fall back to 80.
  struct InvalidTuple {
    const char* scheme;
    const char* host;
    uint16_t port = 80;
  };
  const InvalidTuple kInvalidTuples[] = {
      {"", "", 33},
      {"data", "", 0},
      {"blob", "", 0},
      {"filesystem", "", 0},
      {"data", "example.com"},
      {"http", "☃.net"},
      {"http\nmore", "example.com"},
      {"http\rmore", "example.com"},
      {"http\n", "example.com"},
      {"http\r", "example.com"},
      {"http", "example.com\nnot-example.com"},
      {"http", "example.com\rnot-example.com"},
      {"http", "example.com\n"},
      {"http", "example.com\r"},
      {"unknown-scheme", "example.com"},
      {"host-only", "\r", 0},
      {"host-only", "example.com", 22},
      {"file", "", 123},  // file: shouldn't have a port.
  };

  for (const InvalidTuple& input : kInvalidTuples) {
    SCOPED_TRACE(testing::Message()
                 << input.scheme << "://" << input.host << ":" << input.port);
    EXPECT_FALSE(UnsafelyCreateOpaqueOriginWithoutNormalization(
        input.scheme, input.host, input.port, CreateNonce()));
    EXPECT_FALSE(url::Origin::UnsafelyCreateTupleOriginWithoutNormalization(
        input.scheme, input.host, input.port));
  }

  // An empty scheme/host/port tuple is not a valid tuple origin.
  EXPECT_FALSE(
      url::Origin::UnsafelyCreateTupleOriginWithoutNormalization("", "", 0));

  // Opaque origins with unknown precursors are allowed.
  base::UnguessableToken token = base::UnguessableToken::Create();
  absl::optional<Origin> anonymous_opaque =
      UnsafelyCreateOpaqueOriginWithoutNormalization("", "", 0,
                                                     CreateNonce(token));
  ASSERT_TRUE(anonymous_opaque)
      << "An invalid tuple is a valid input to "
      << "UnsafelyCreateOpaqueOriginWithoutNormalization, so long as it is "
      << "the canonical form of the invalid tuple.";
  EXPECT_TRUE(anonymous_opaque->opaque());
  EXPECT_EQ(*GetNonce(anonymous_opaque.value()), token);
  EXPECT_EQ(anonymous_opaque->GetTupleOrPrecursorTupleIfOpaque(),
            url::SchemeHostPort());
}

// Same idea as above, for schemes and hosts that carry an embedded NUL. The
// explicit lengths keep the NUL inside the StringPiece instead of letting it
// terminate the literal.
TEST_F(OriginTest, UnsafelyCreateUniqueViaEmbeddedNulls) {
  // Rows that omit the port fall back to 80.
  struct EmbeddedNulTuple {
    base::StringPiece scheme;
    base::StringPiece host;
    uint16_t port = 80;
  };
  const EmbeddedNulTuple kEmbeddedNulTuples[] = {
      {{"http\0more", 9}, {"example.com", 11}},
      {{"http\0", 5}, {"example.com", 11}},
      {{"\0http", 5}, {"example.com", 11}},
      {{"http"}, {"example.com\0not-example.com", 27}},
      {{"http"}, {"example.com\0", 12}},
      {{"http"}, {"\0example.com", 12}},
      {{""}, {"\0", 1}, 0},
      {{"\0", 1}, {""}, 0},
  };

  for (const EmbeddedNulTuple& input : kEmbeddedNulTuples) {
    SCOPED_TRACE(testing::Message()
                 << input.scheme << "://" << input.host << ":" << input.port);
    EXPECT_FALSE(url::Origin::UnsafelyCreateTupleOriginWithoutNormalization(
        input.scheme, input.host, input.port));
    EXPECT_FALSE(UnsafelyCreateOpaqueOriginWithoutNormalization(
        input.scheme, input.host, input.port, CreateNonce()));
  }
}
+ {"http://www.google.com./foo", "google.com", true}, + {"http://www.google.com./foo", "google.com.", true}, + {"http://www.google.com./foo", ".com", true}, + {"http://www.google.com./foo", ".com.", true}, + + // But, if the host doesn't end with a dot and the input domain does, then + // it's considered to not match. + {"http://google.com/foo", "google.com.", false}, + + // If the host ends with two dots, it doesn't match. + {"http://www.google.com../foo", "google.com", false}, + + // Filesystem scheme. + {"filesystem:http://www.google.com:99/foo/", "google.com", true}, + {"filesystem:http://www.iamnotgoogle.com/foo/", "google.com", false}, + + // File scheme. + {"file:///home/user/text.txt", "", false}, + {"file:///home/user/text.txt", "txt", false}, + }; + + for (const auto& test_case : kTestCases) { + SCOPED_TRACE(testing::Message() + << "(url, domain): (" << test_case.url << ", " + << test_case.lower_ascii_domain << ")"); + GURL url(test_case.url); + ASSERT_TRUE(url.is_valid()); + Origin origin = Origin::Create(url); + + EXPECT_EQ(test_case.expected_domain_is, + origin.DomainIs(test_case.lower_ascii_domain)); + EXPECT_FALSE( + origin.DeriveNewOpaqueOrigin().DomainIs(test_case.lower_ascii_domain)); + } + + // If the URL is invalid, DomainIs returns false. + GURL invalid_url("google.com"); + ASSERT_FALSE(invalid_url.is_valid()); + EXPECT_FALSE(Origin::Create(invalid_url).DomainIs("google.com")); + + // Unique origins. 
+ EXPECT_FALSE(Origin().DomainIs("")); + EXPECT_FALSE(Origin().DomainIs("com")); +} + +TEST_F(OriginTest, DebugAlias) { + Origin origin1 = Origin::Create(GURL("https://foo.com/bar")); + DEBUG_ALIAS_FOR_ORIGIN(origin1_debug_alias, origin1); + EXPECT_STREQ("https://foo.com", origin1_debug_alias); +} + +TEST_F(OriginTest, CanBeDerivedFrom) { + AddStandardScheme("new-standard", SchemeType::SCHEME_WITH_HOST); + Origin opaque_unique_origin = Origin(); + + Origin regular_origin = Origin::Create(GURL("https://a.com/")); + Origin opaque_precursor_origin = regular_origin.DeriveNewOpaqueOrigin(); + + Origin file_origin = Origin::Create(GURL("file:///foo/bar")); + Origin file_opaque_precursor_origin = file_origin.DeriveNewOpaqueOrigin(); + Origin file_host_origin = Origin::Create(GURL("file://a.com/foo/bar")); + Origin file_host_opaque_precursor_origin = + file_host_origin.DeriveNewOpaqueOrigin(); + + Origin non_standard_scheme_origin = + Origin::Create(GURL("non-standard-scheme:foo")); + Origin non_standard_opaque_precursor_origin = + non_standard_scheme_origin.DeriveNewOpaqueOrigin(); + + // Also, add new standard scheme that is local to the test. + Origin new_standard_origin = Origin::Create(GURL("new-standard://host/")); + Origin new_standard_opaque_precursor_origin = + new_standard_origin.DeriveNewOpaqueOrigin(); + + // No access schemes always get unique opaque origins. 
+ Origin no_access_origin = + Origin::Create(GURL("standard-but-noaccess://b.com")); + Origin no_access_opaque_precursor_origin = + no_access_origin.DeriveNewOpaqueOrigin(); + + Origin local_non_standard_origin = + Origin::Create(GURL("local-but-nonstandard://a.com")); + Origin local_non_standard_opaque_precursor_origin = + local_non_standard_origin.DeriveNewOpaqueOrigin(); + + // Call origin.CanBeDerivedFrom(url) for each of the following test cases + // and ensure that it returns |expected_value| + const struct { + const char* url; + raw_ptr origin; + bool expected_value; + } kTestCases[] = { + {"https://a.com", ®ular_origin, true}, + // Web URL can commit in an opaque origin with precursor information. + // Example: iframe sandbox navigated to a.com. + {"https://a.com", &opaque_precursor_origin, true}, + // URL that comes from the web can never commit in an opaque unique + // origin. It must have precursor information. + {"https://a.com", &opaque_unique_origin, false}, + + // Cross-origin URLs should never work. + {"https://b.com", ®ular_origin, false}, + {"https://b.com", &opaque_precursor_origin, false}, + + // data: URL can never commit in a regular, non-opaque origin. + {"data:text/html,foo", ®ular_origin, false}, + // This is the default case: data: URLs commit in opaque origin carrying + // precursor information for the origin that created them. + {"data:text/html,foo", &opaque_precursor_origin, true}, + // Browser-initiated navigations can result in data: URL committing in + // opaque unique origin. + {"data:text/html,foo", &opaque_unique_origin, true}, + + // about:blank can commit in regular origin (default case for iframes). + {"about:blank", ®ular_origin, true}, + // This can happen if data: URL that originated at a.com creates an + // about:blank iframe. + {"about:blank", &opaque_precursor_origin, true}, + // Browser-initiated navigations can result in about:blank URL committing + // in opaque unique origin. 
+ {"about:blank", &opaque_unique_origin, true}, + + // Default behavior of srcdoc is to inherit the origin of the parent + // document. + {"about:srcdoc", ®ular_origin, true}, + // This happens for sandboxed srcdoc iframe. + {"about:srcdoc", &opaque_precursor_origin, true}, + // This can happen with browser-initiated navigation to about:blank or + // data: URL, which in turn add srcdoc iframe. + {"about:srcdoc", &opaque_unique_origin, true}, + + // Just like srcdoc, blob: URLs can be created in all the cases. + {"blob:https://a.com/foo", ®ular_origin, true}, + {"blob:https://a.com/foo", &opaque_precursor_origin, true}, + {"blob:https://a.com/foo", &opaque_unique_origin, true}, + + {"filesystem:https://a.com/foo", ®ular_origin, true}, + {"filesystem:https://a.com/foo", &opaque_precursor_origin, true}, + // Unlike blob: URLs, filesystem: ones cannot be created in an unique + // opaque origin. + {"filesystem:https://a.com/foo", &opaque_unique_origin, false}, + + // file: URLs cannot result in regular web origins, regardless of + // opaqueness. + {"file:///etc/passwd", ®ular_origin, false}, + {"file:///etc/passwd", &opaque_precursor_origin, false}, + // However, they can result in regular file: origin and an opaque one + // containing another file: origin as precursor. + {"file:///etc/passwd", &file_origin, true}, + {"file:///etc/passwd", &file_opaque_precursor_origin, true}, + // It should not be possible to get an opaque unique origin for file: + // as it is a standard scheme and will always result in a tuple origin + // or will always be derived by other origin. + // Note: file:// URLs should become unique opaque origins at some point. + {"file:///etc/passwd", &opaque_unique_origin, false}, + + // The same set as above, but including a host. 
+ {"file://a.com/etc/passwd", ®ular_origin, false}, + {"file://a.com/etc/passwd", &opaque_precursor_origin, false}, + {"file://a.com/etc/passwd", &file_host_origin, true}, + {"file://a.com/etc/passwd", &file_host_opaque_precursor_origin, true}, + {"file://a.com/etc/passwd", &opaque_unique_origin, false}, + + // Locally registered standard scheme should behave the same way + // as built-in standard schemes. + {"new-standard://host/foo", &new_standard_origin, true}, + {"new-standard://host/foo", &new_standard_opaque_precursor_origin, true}, + {"new-standard://host/foo", &opaque_unique_origin, false}, + {"new-standard://host2/foo", &new_standard_origin, false}, + {"new-standard://host2/foo", &new_standard_opaque_precursor_origin, + false}, + + // A non-standard scheme should never commit in an standard origin or + // opaque origin with standard precursor information. + {"non-standard-scheme://a.com/foo", ®ular_origin, false}, + {"non-standard-scheme://a.com/foo", &opaque_precursor_origin, false}, + // However, it should be fine to commit in unique opaque origins or in its + // own origin. + // Note: since non-standard scheme URLs don't parse out anything + // but the scheme, using a random different hostname here would work. + {"non-standard-scheme://b.com/foo2", &opaque_unique_origin, true}, + {"non-standard-scheme://b.com/foo3", &non_standard_scheme_origin, true}, + {"non-standard-scheme://b.com/foo4", + &non_standard_opaque_precursor_origin, true}, + + // No access scheme can only commit in opaque origin. 
+ {"standard-but-noaccess://a.com/foo", ®ular_origin, false}, + {"standard-but-noaccess://a.com/foo", &opaque_precursor_origin, false}, + {"standard-but-noaccess://a.com/foo", &opaque_unique_origin, true}, + {"standard-but-noaccess://a.com/foo", &no_access_origin, true}, + {"standard-but-noaccess://a.com/foo", &no_access_opaque_precursor_origin, + true}, + {"standard-but-noaccess://b.com/foo", &no_access_origin, true}, + {"standard-but-noaccess://b.com/foo", &no_access_opaque_precursor_origin, + true}, + + // Local schemes can be non-standard, verify they also work as expected. + {"local-but-nonstandard://a.com", ®ular_origin, false}, + {"local-but-nonstandard://a.com", &opaque_precursor_origin, false}, + {"local-but-nonstandard://a.com", &opaque_unique_origin, true}, + {"local-but-nonstandard://a.com", &local_non_standard_origin, true}, + {"local-but-nonstandard://a.com", + &local_non_standard_opaque_precursor_origin, true}, + }; + + for (const auto& test_case : kTestCases) { + SCOPED_TRACE(testing::Message() << "(origin, url): (" << *test_case.origin + << ", " << test_case.url << ")"); + EXPECT_EQ(test_case.expected_value, + test_case.origin->CanBeDerivedFrom(GURL(test_case.url))); + } +} + +TEST_F(OriginTest, GetDebugString) { + Origin http_origin = Origin::Create(GURL("http://192.168.9.1")); + EXPECT_STREQ(http_origin.GetDebugString().c_str(), "http://192.168.9.1"); + + Origin http_opaque_origin = http_origin.DeriveNewOpaqueOrigin(); + EXPECT_THAT( + http_opaque_origin.GetDebugString().c_str(), + ::testing::MatchesRegex( + "null \\[internally: \\(\\w*\\) derived from http://192.168.9.1\\]")); + EXPECT_THAT( + http_opaque_origin.GetDebugString(false /* include_nonce */).c_str(), + ::testing::MatchesRegex( + "null \\[internally: derived from http://192.168.9.1\\]")); + + Origin data_origin = Origin::Create(GURL("data:")); + EXPECT_STREQ(data_origin.GetDebugString().c_str(), + "null [internally: (nonce TBD) anonymous]"); + + // The nonce of the origin will be 
initialized if a new opaque origin is + // derived. + Origin data_derived_origin = data_origin.DeriveNewOpaqueOrigin(); + EXPECT_THAT( + data_derived_origin.GetDebugString().c_str(), + ::testing::MatchesRegex("null \\[internally: \\(\\w*\\) anonymous\\]")); + EXPECT_THAT( + data_derived_origin.GetDebugString(false /* include_nonce */).c_str(), + ::testing::MatchesRegex("null \\[internally: anonymous\\]")); + + Origin file_origin = Origin::Create(GURL("file:///etc/passwd")); + EXPECT_STREQ(file_origin.GetDebugString().c_str(), + "file:// [internally: file://]"); + + Origin file_server_origin = + Origin::Create(GURL("file://example.com/etc/passwd")); + EXPECT_STREQ(file_server_origin.GetDebugString().c_str(), + "file:// [internally: file://example.com]"); +} + +TEST_F(OriginTest, Deserialize) { + std::vector valid_urls = { + GURL("https://a.com"), GURL("http://a"), + GURL("http://a:80"), GURL("file://a.com/etc/passwd"), + GURL("file:///etc/passwd"), GURL("http://192.168.1.1"), + GURL("http://[2001:db8::1]/"), + }; + for (const GURL& url : valid_urls) { + SCOPED_TRACE(url.spec()); + Origin origin = Origin::Create(url); + absl::optional serialized = SerializeWithNonce(origin); + ASSERT_TRUE(serialized); + + absl::optional deserialized = Deserialize(std::move(*serialized)); + ASSERT_TRUE(deserialized.has_value()); + + EXPECT_TRUE(DoEqualityComparisons(origin, deserialized.value(), true)); + EXPECT_EQ(origin.GetDebugString(), deserialized.value().GetDebugString()); + } +} + +TEST_F(OriginTest, DeserializeInvalid) { + EXPECT_EQ(absl::nullopt, Deserialize(std::string())); + EXPECT_EQ(absl::nullopt, Deserialize("deadbeef")); + EXPECT_EQ(absl::nullopt, Deserialize("0123456789")); + EXPECT_EQ(absl::nullopt, Deserialize("https://a.com")); + EXPECT_EQ(absl::nullopt, Deserialize("https://192.168.1.1")); +} + +TEST_F(OriginTest, SerializeTBDNonce) { + std::vector invalid_urls = { + GURL("data:uniqueness"), GURL("data:,"), + GURL("data:text/html,Hello!"), 
GURL("javascript:alert(1)"), + GURL("about:blank"), GURL("google.com"), + }; + for (const GURL& url : invalid_urls) { + SCOPED_TRACE(url.spec()); + Origin origin = Origin::Create(url); + absl::optional serialized = SerializeWithNonce(origin); + absl::optional deserialized = Deserialize(std::move(*serialized)); + ASSERT_TRUE(deserialized.has_value()); + + // Can't use DoEqualityComparisons here since empty nonces are never == + // unless they are the same object. + EXPECT_EQ(origin.GetDebugString(), deserialized.value().GetDebugString()); + } + + { + // Same basic test as above, but without a GURL to create tuple_. + Origin opaque; + absl::optional serialized = SerializeWithNonce(opaque); + ASSERT_TRUE(serialized); + + absl::optional deserialized = Deserialize(std::move(*serialized)); + ASSERT_TRUE(deserialized.has_value()); + + // Can't use DoEqualityComparisons here since empty nonces are never == + // unless they are the same object. + EXPECT_EQ(opaque.GetDebugString(), deserialized.value().GetDebugString()); + } + + // Now force initialization of the nonce prior to serialization. + for (const GURL& url : invalid_urls) { + SCOPED_TRACE(url.spec()); + Origin origin = Origin::Create(url); + absl::optional serialized = + SerializeWithNonceAndInitIfNeeded(origin); + absl::optional deserialized = Deserialize(std::move(*serialized)); + ASSERT_TRUE(deserialized.has_value()); + + // The nonce should have been initialized prior to Serialization(). 
+ EXPECT_EQ(origin, deserialized.value()); + } +} + +TEST_F(OriginTest, DeserializeValidNonce) { + Origin opaque; + GetNonce(opaque); + + absl::optional serialized = SerializeWithNonce(opaque); + ASSERT_TRUE(serialized); + + absl::optional deserialized = Deserialize(std::move(*serialized)); + ASSERT_TRUE(deserialized.has_value()); + + EXPECT_TRUE(DoEqualityComparisons(opaque, deserialized.value(), true)); + EXPECT_EQ(opaque.GetDebugString(), deserialized.value().GetDebugString()); +} + +TEST_F(OriginTest, IsSameOriginWith) { + url::Origin opaque_origin; + GURL foo_url = GURL("https://foo.com/path"); + url::Origin foo_origin = url::Origin::Create(foo_url); + GURL bar_url = GURL("https://bar.com/path"); + url::Origin bar_origin = url::Origin::Create(bar_url); + + EXPECT_FALSE(opaque_origin.IsSameOriginWith(foo_origin)); + EXPECT_FALSE(opaque_origin.IsSameOriginWith(foo_url)); + + EXPECT_TRUE(foo_origin.IsSameOriginWith(foo_origin)); + EXPECT_TRUE(foo_origin.IsSameOriginWith(foo_url)); + + EXPECT_FALSE(foo_origin.IsSameOriginWith(bar_origin)); + EXPECT_FALSE(foo_origin.IsSameOriginWith(bar_url)); + + // Documenting legacy behavior. This doesn't necessarily mean that the legacy + // behavior is correct (or desirable in the long-term). + EXPECT_FALSE(foo_origin.IsSameOriginWith(GURL("about:blank"))); + EXPECT_FALSE(foo_origin.IsSameOriginWith(GURL())); // Invalid GURL. + EXPECT_TRUE(foo_origin.IsSameOriginWith(GURL("blob:https://foo.com/guid"))); +} + +INSTANTIATE_TYPED_TEST_SUITE_P(UrlOrigin, + AbstractOriginTest, + UrlOriginTestTraits); + +} // namespace url diff --git a/run_all_perftests.cc b/run_all_perftests.cc new file mode 100644 index 00000000000..f11fd29ac00 --- /dev/null +++ b/run_all_perftests.cc @@ -0,0 +1,14 @@ +// Copyright 2019 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
// run_all_perftests.cc
//
// Entry point of the perf-test binary: wraps the command line in a
// base::PerfTestSuite and hands the suite's Run() to the test launcher.
int main(int argc, char** argv) {
  base::PerfTestSuite test_suite(argc, argv);
  // NOTE(review): the "Serially" launcher is presumably used so that tests do
  // not run concurrently and disturb each other's timings - confirm against
  // base/test/launcher.
  return base::LaunchUnitTestsSerially(
      argc, argv,
      base::BindOnce(&base::TestSuite::Run, base::Unretained(&test_suite)));
}

// run_all_unittests.cc
//
// Entry point of the unit-test binary: builds a base::TestSuite, initializes
// mojo core on every platform except iOS, then hands the suite's Run() to the
// test launcher.
int main(int argc, char** argv) {
  base::TestSuite test_suite(argc, argv);

#if !BUILDFLAG(IS_IOS)
  // Must happen before the launcher runs any test; the matching #include at
  // the top of the file sits behind the same build flag.
  mojo::core::Init();
#endif

  return base::LaunchUnitTests(
      argc, argv,
      base::BindOnce(&base::TestSuite::Run, base::Unretained(&test_suite)));
}
+ +#include "url/scheme_host_port.h" + +#include +#include + +#include +#include + +#include "base/check_op.h" +#include "base/containers/contains.h" +#include "base/notreached.h" +#include "base/numerics/safe_conversions.h" +#include "base/strings/string_number_conversions.h" +#include "base/strings/string_piece.h" +#include "url/gurl.h" +#include "url/third_party/mozilla/url_parse.h" +#include "url/url_canon.h" +#include "url/url_canon_stdstring.h" +#include "url/url_constants.h" +#include "url/url_util.h" + +namespace url { + +namespace { + +bool IsCanonicalHost(const base::StringPiece& host) { + std::string canon_host; + + // Try to canonicalize the host (copy/pasted from net/base. :( ). + const Component raw_host_component(0, + base::checked_cast(host.length())); + StdStringCanonOutput canon_host_output(&canon_host); + CanonHostInfo host_info; + CanonicalizeHostVerbose(host.data(), raw_host_component, + &canon_host_output, &host_info); + + if (host_info.out_host.is_nonempty() && + host_info.family != CanonHostInfo::BROKEN) { + // Success! Assert that there's no extra garbage. + canon_host_output.Complete(); + DCHECK_EQ(host_info.out_host.len, static_cast(canon_host.length())); + } else { + // Empty host, or canonicalization failed. + canon_host.clear(); + } + + return host == canon_host; +} + +// Note: When changing IsValidInput, consider also updating +// ShouldTreatAsOpaqueOrigin in Blink (there might be existing differences in +// behavior between these 2 layers, but we should avoid introducing new +// differences). +bool IsValidInput(const base::StringPiece& scheme, + const base::StringPiece& host, + uint16_t port, + SchemeHostPort::ConstructPolicy policy) { + // Empty schemes are never valid. + if (scheme.empty()) + return false; + + // about:blank and other no-access schemes translate into an opaque origin. + // This helps consistency with ShouldTreatAsOpaqueOrigin in Blink. 
+ if (base::Contains(GetNoAccessSchemes(), scheme)) + return false; + + SchemeType scheme_type = SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION; + bool is_standard = GetStandardSchemeType( + scheme.data(), + Component(0, base::checked_cast(scheme.length())), + &scheme_type); + if (!is_standard) { + // To be consistent with ShouldTreatAsOpaqueOrigin in Blink, local + // non-standard schemes are currently allowed to be tuple origins. + // Nonstandard schemes don't have hostnames, so their tuple is just + // ("protocol", "", 0). + // + // TODO: Migrate "content:" and "externalfile:" to be standard schemes, and + // remove this local scheme exception. + if (base::Contains(GetLocalSchemes(), scheme) && host.empty() && port == 0) + return true; + + // Otherwise, allow non-standard schemes only if the Android WebView + // workaround is enabled. + return AllowNonStandardSchemesForAndroidWebView(); + } + + switch (scheme_type) { + case SCHEME_WITH_HOST_AND_PORT: + case SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION: + // A URL with |scheme| is required to have the host and port, so return an + // invalid instance if host is not given. Note that a valid port is + // always provided by SchemeHostPort(const GURL&) constructor (a missing + // port is replaced with a default port if needed by + // GURL::EffectiveIntPort()). + if (host.empty()) + return false; + + // Don't do an expensive canonicalization if the host is already + // canonicalized. + DCHECK(policy == SchemeHostPort::CHECK_CANONICALIZATION || + IsCanonicalHost(host)); + if (policy == SchemeHostPort::CHECK_CANONICALIZATION && + !IsCanonicalHost(host)) { + return false; + } + + return true; + + case SCHEME_WITH_HOST: + if (port != 0) { + // Return an invalid object if a URL with the scheme never represents + // the port data but the given |port| is non-zero. + return false; + } + + // Don't do an expensive canonicalization if the host is already + // canonicalized. 
+ DCHECK(policy == SchemeHostPort::CHECK_CANONICALIZATION || + IsCanonicalHost(host)); + if (policy == SchemeHostPort::CHECK_CANONICALIZATION && + !IsCanonicalHost(host)) { + return false; + } + + return true; + + case SCHEME_WITHOUT_AUTHORITY: + return false; + + default: + NOTREACHED(); + return false; + } +} + +} // namespace + +SchemeHostPort::SchemeHostPort() = default; + +SchemeHostPort::SchemeHostPort(std::string scheme, + std::string host, + uint16_t port, + ConstructPolicy policy) { + if (!IsValidInput(scheme, host, port, policy)) { + DCHECK(!IsValid()); + return; + } + + scheme_ = std::move(scheme); + host_ = std::move(host); + port_ = port; + DCHECK(IsValid()) << "Scheme: " << scheme_ << " Host: " << host_ + << " Port: " << port; +} + +SchemeHostPort::SchemeHostPort(base::StringPiece scheme, + base::StringPiece host, + uint16_t port) + : SchemeHostPort(std::string(scheme), + std::string(host), + port, + ConstructPolicy::CHECK_CANONICALIZATION) {} + +SchemeHostPort::SchemeHostPort(const GURL& url) { + if (!url.is_valid()) + return; + + base::StringPiece scheme = url.scheme_piece(); + base::StringPiece host = url.host_piece(); + + // A valid GURL never returns PORT_INVALID. + int port = url.EffectiveIntPort(); + if (port == PORT_UNSPECIFIED) { + port = 0; + } else { + DCHECK_GE(port, 0); + DCHECK_LE(port, 65535); + } + + if (!IsValidInput(scheme, host, port, ALREADY_CANONICALIZED)) + return; + + scheme_ = std::string(scheme); + host_ = std::string(host); + port_ = port; +} + +SchemeHostPort::~SchemeHostPort() = default; + +bool SchemeHostPort::IsValid() const { + // It suffices to just check |scheme_| for emptiness; the other fields are + // never present without it. + DCHECK(!scheme_.empty() || host_.empty()); + DCHECK(!scheme_.empty() || port_ == 0); + return !scheme_.empty(); +} + +std::string SchemeHostPort::Serialize() const { + // Null checking for |parsed| in SerializeInternal is probably slower than + // just filling it in and discarding it here. 
+ url::Parsed parsed; + return SerializeInternal(&parsed); +} + +GURL SchemeHostPort::GetURL() const { + url::Parsed parsed; + std::string serialized = SerializeInternal(&parsed); + + if (!IsValid()) + return GURL(std::move(serialized), parsed, false); + + // SchemeHostPort does not have enough information to determine if an empty + // host is valid or not for the given scheme. Force re-parsing. + DCHECK(!scheme_.empty()); + if (host_.empty()) + return GURL(serialized); + + // If the serialized string is passed to GURL for parsing, it will append an + // empty path "/". Add that here. Note: per RFC 6454 we cannot do this for + // normal Origin serialization. + DCHECK(!parsed.path.is_valid()); + parsed.path = Component(serialized.length(), 1); + serialized.append("/"); + return GURL(std::move(serialized), parsed, true); +} + +bool SchemeHostPort::operator<(const SchemeHostPort& other) const { + return std::tie(port_, scheme_, host_) < + std::tie(other.port_, other.scheme_, other.host_); +} + +std::string SchemeHostPort::SerializeInternal(url::Parsed* parsed) const { + std::string result; + if (!IsValid()) + return result; + + // Reserve enough space for the "normal" case of scheme://host/. + result.reserve(scheme_.size() + host_.size() + 4); + + if (!scheme_.empty()) { + parsed->scheme = Component(0, scheme_.length()); + result.append(scheme_); + } + + result.append(kStandardSchemeSeparator); + + if (!host_.empty()) { + parsed->host = Component(result.length(), host_.length()); + result.append(host_); + } + + // Omit the port component if the port matches with the default port + // defined for the scheme, if any. 
+ int default_port = DefaultPortForScheme(scheme_.data(), + static_cast(scheme_.length())); + if (default_port == PORT_UNSPECIFIED) + return result; + if (port_ != default_port) { + result.push_back(':'); + std::string port(base::NumberToString(port_)); + parsed->port = Component(result.length(), port.length()); + result.append(std::move(port)); + } + + return result; +} + +std::ostream& operator<<(std::ostream& out, + const SchemeHostPort& scheme_host_port) { + return out << scheme_host_port.Serialize(); +} + +} // namespace url diff --git a/scheme_host_port.h b/scheme_host_port.h new file mode 100644 index 00000000000..a98e7affdb6 --- /dev/null +++ b/scheme_host_port.h @@ -0,0 +1,173 @@ +// Copyright 2015 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef URL_SCHEME_HOST_PORT_H_ +#define URL_SCHEME_HOST_PORT_H_ + +#include + +#include + +#include "base/component_export.h" +#include "base/strings/string_piece.h" + +class GURL; + +namespace url { + +struct Parsed; + +// This class represents a (scheme, host, port) tuple extracted from a URL. +// +// The primary purpose of this class is to represent relevant network-authority +// information for a URL. It is _not_ an Origin, as described in RFC 6454. In +// particular, it is generally NOT the right thing to use for security +// decisions. +// +// Instead, this class is a mechanism for simplifying URLs with standard schemes +// (that is, those which follow the generic syntax of RFC 3986) down to the +// uniquely identifying information necessary for network fetches. This makes it +// suitable as a cache key for a collection of active connections, for instance. +// It may, however, be inappropriate to use as a cache key for persistent +// storage associated with a host. +// +// In particular, note that: +// +// * SchemeHostPort can only represent schemes which follow the RFC 3986 syntax +// (e.g. 
class COMPONENT_EXPORT(URL) SchemeHostPort {
 public:
  // Creates an invalid (scheme, host, port) tuple, which represents an invalid
  // or non-standard URL.
  SchemeHostPort();

  // Creates a (scheme, host, port) tuple. |host| must be a canonicalized
  // A-label (that is, '☃.net' must be provided as 'xn--n3h.net'). |scheme|
  // must be a standard scheme. |port| must be 0 if |scheme| does not support
  // ports (e.g. 'file').
  //
  // Copies the data in |scheme| and |host|. Input that does not meet these
  // requirements yields an invalid tuple (see IsValid()).
  SchemeHostPort(base::StringPiece scheme,
                 base::StringPiece host,
                 uint16_t port);

  // Metadata influencing whether or not the constructor should sanity check
  // host canonicalization.
  enum ConstructPolicy { CHECK_CANONICALIZATION, ALREADY_CANONICALIZED };

  // Creates a (scheme, host, port) tuple, taking ownership of |scheme| and
  // |host|. With ALREADY_CANONICALIZED the host is not re-canonicalized (the
  // caller's promise is only DCHECKed); with CHECK_CANONICALIZATION a
  // non-canonical host yields an invalid tuple. ALREADY_CANONICALIZED should
  // only be used when converting between already normalized types, and should
  // NOT be used for IPC.
  SchemeHostPort(std::string scheme,
                 std::string host,
                 uint16_t port,
                 ConstructPolicy policy);

  // Creates a (scheme, host, port) tuple from |url|, as described at
  // https://tools.ietf.org/html/rfc6454#section-4
  //
  // If |url| is invalid or non-standard, the result will be an invalid
  // SchemeHostPort object.
  explicit SchemeHostPort(const GURL& url);

  // Copyable and movable.
  SchemeHostPort(const SchemeHostPort&) = default;
  SchemeHostPort& operator=(const SchemeHostPort&) = default;
  SchemeHostPort(SchemeHostPort&&) noexcept = default;
  SchemeHostPort& operator=(SchemeHostPort&&) noexcept = default;

  ~SchemeHostPort();

  // Returns the host component, in URL form. That is all IDN domain names will
  // be expressed as A-Labels ('☃.net' will be returned as 'xn--n3h.net'), and
  // all IPv6 addresses will be enclosed in brackets ("[2001:db8::1]").
  const std::string& host() const { return host_; }
  // Returns the scheme; empty exactly when the tuple is invalid.
  const std::string& scheme() const { return scheme_; }
  // Returns the port; 0 for an invalid tuple and for schemes without a port.
  uint16_t port() const { return port_; }
  // Returns true if this object holds a tuple, i.e. its scheme is non-empty.
  bool IsValid() const;

  // Serializes the SchemeHostPort tuple to a canonical form.
  //
  // While this string form resembles the Origin serialization specified in
  // Section 6.2 of RFC 6454, it is important to note that invalid
  // SchemeHostPort tuples serialize to the empty string, rather than being
  // serialized as would an opaque Origin.
  std::string Serialize() const;

  // Efficiently returns what GURL(Serialize()) would return, without needing to
  // re-parse the URL. Note: this still performs allocations to copy data into
  // GURL, so please avoid using this method if you only need to work on
  // schemes, hosts, or ports individually.
  // For example, see crrev.com/c/3637099/comments/782360d0_e14757be.
  GURL GetURL() const;

  // Two SchemeHostPort objects are "equal" iff their schemes, hosts, and ports
  // are exact matches.
  //
  // Note that this comparison is _not_ the same as an origin-based comparison.
  // In particular, invalid SchemeHostPort objects match each other (and
  // themselves). Opaque origins, on the other hand, would not.
  bool operator==(const SchemeHostPort& other) const {
    return port_ == other.port() && scheme_ == other.scheme() &&
           host_ == other.host();
  }
  bool operator!=(const SchemeHostPort& other) const {
    return !(*this == other);
  }
  // Allows SchemeHostPort to be used as a key in STL (for example, a std::set
  // or std::map). The ordering compares the port first, then the scheme, then
  // the host.
  bool operator<(const SchemeHostPort& other) const;

 private:
  // Shared implementation of Serialize() and GetURL(): returns the serialized
  // tuple and records the component offsets in |parsed|.
  std::string SerializeInternal(url::Parsed* parsed) const;

  // All three members keep their default values (empty, empty, 0) when the
  // tuple is invalid.
  std::string scheme_;
  std::string host_;
  uint16_t port_ = 0;
};
b_parsed.port.len); + EXPECT_EQ(a_parsed.path.begin, b_parsed.path.begin); + EXPECT_EQ(a_parsed.path.len, b_parsed.path.len); + EXPECT_EQ(a_parsed.query.begin, b_parsed.query.begin); + EXPECT_EQ(a_parsed.query.len, b_parsed.query.len); + EXPECT_EQ(a_parsed.ref.begin, b_parsed.ref.begin); + EXPECT_EQ(a_parsed.ref.len, b_parsed.ref.len); +} + +TEST_F(SchemeHostPortTest, Invalid) { + url::SchemeHostPort invalid; + EXPECT_EQ("", invalid.scheme()); + EXPECT_EQ("", invalid.host()); + EXPECT_EQ(0, invalid.port()); + EXPECT_FALSE(invalid.IsValid()); + EXPECT_EQ(invalid, invalid); + + const char* urls[] = { + // about:, data:, javascript: and other no-access schemes translate into + // an invalid SchemeHostPort + "about:blank", "about:blank#ref", "about:blank?query=123", "about:srcdoc", + "about:srcdoc#ref", "about:srcdoc?query=123", "data:text/html,Hello!", + "javascript:alert(1)", + + // GURLs where GURL::is_valid returns false translate into an invalid + // SchemeHostPort. + "file://example.com:443/etc/passwd", "#!^%!$!&*", + + // These schemes do not follow the generic URL syntax, so make sure we + // treat them as invalid (scheme, host, port) tuples (even though such + // URLs' _Origin_ might have a (scheme, host, port) tuple, they themselves + // do not). This is only *implicitly* checked in the code, by means of + // blob schemes not being standard, and filesystem schemes having type + // SCHEME_WITHOUT_AUTHORITY. If conditions change such that the implicit + // checks no longer hold, this policy should be made explicit. 
+ "blob:https://example.com/uuid-goes-here", + "filesystem:https://example.com/temporary/yay.png"}; + + for (auto* test : urls) { + SCOPED_TRACE(test); + GURL url(test); + url::SchemeHostPort tuple(url); + EXPECT_EQ("", tuple.scheme()); + EXPECT_EQ("", tuple.host()); + EXPECT_EQ(0, tuple.port()); + EXPECT_FALSE(tuple.IsValid()); + EXPECT_EQ(tuple, tuple); + EXPECT_EQ(tuple, invalid); + EXPECT_EQ(invalid, tuple); + ExpectParsedUrlsEqual(GURL(tuple.Serialize()), tuple.GetURL()); + } +} + +TEST_F(SchemeHostPortTest, ExplicitConstruction) { + struct TestCases { + const char* scheme; + const char* host; + uint16_t port; + } cases[] = { + {"http", "example.com", 80}, + {"http", "example.com", 123}, + {"http", "example.com", 0}, // 0 is a valid port for http. + {"https", "example.com", 443}, + {"https", "example.com", 123}, + {"file", "", 0}, // 0 indicates "no port" for file: scheme. + {"file", "example.com", 0}, + }; + + for (const auto& test : cases) { + SCOPED_TRACE(testing::Message() << test.scheme << "://" << test.host << ":" + << test.port); + url::SchemeHostPort tuple(test.scheme, test.host, test.port); + EXPECT_EQ(test.scheme, tuple.scheme()); + EXPECT_EQ(test.host, tuple.host()); + EXPECT_EQ(test.port, tuple.port()); + EXPECT_TRUE(tuple.IsValid()); + EXPECT_EQ(tuple, tuple); + ExpectParsedUrlsEqual(GURL(tuple.Serialize()), tuple.GetURL()); + } +} + +TEST_F(SchemeHostPortTest, InvalidConstruction) { + struct TestCases { + const char* scheme; + const char* host; + uint16_t port; + } cases[] = {{"", "", 0}, + {"data", "", 0}, + {"blob", "", 0}, + {"filesystem", "", 0}, + {"http", "", 80}, + {"data", "example.com", 80}, + {"http", "☃.net", 80}, + {"http\nmore", "example.com", 80}, + {"http\rmore", "example.com", 80}, + {"http\n", "example.com", 80}, + {"http\r", "example.com", 80}, + {"http", "example.com\nnot-example.com", 80}, + {"http", "example.com\rnot-example.com", 80}, + {"http", "example.com\n", 80}, + {"http", "example.com\r", 80}, + {"file", "", 80}}; // 
Can't have a port for file: scheme. + + for (const auto& test : cases) { + SCOPED_TRACE(testing::Message() << test.scheme << "://" << test.host << ":" + << test.port); + url::SchemeHostPort tuple(test.scheme, test.host, test.port); + EXPECT_EQ("", tuple.scheme()); + EXPECT_EQ("", tuple.host()); + EXPECT_EQ(0, tuple.port()); + EXPECT_FALSE(tuple.IsValid()); + EXPECT_EQ(tuple, tuple); + ExpectParsedUrlsEqual(GURL(tuple.Serialize()), tuple.GetURL()); + } +} + +TEST_F(SchemeHostPortTest, InvalidConstructionWithEmbeddedNulls) { + struct TestCases { + const char* scheme; + size_t scheme_length; + const char* host; + size_t host_length; + uint16_t port; + } cases[] = {{"http\0more", 9, "example.com", 11, 80}, + {"http\0", 5, "example.com", 11, 80}, + {"\0http", 5, "example.com", 11, 80}, + {"http", 4, "example.com\0not-example.com", 27, 80}, + {"http", 4, "example.com\0", 12, 80}, + {"http", 4, "\0example.com", 12, 80}}; + + for (const auto& test : cases) { + SCOPED_TRACE(testing::Message() << test.scheme << "://" << test.host << ":" + << test.port); + url::SchemeHostPort tuple(std::string(test.scheme, test.scheme_length), + std::string(test.host, test.host_length), + test.port); + EXPECT_EQ("", tuple.scheme()); + EXPECT_EQ("", tuple.host()); + EXPECT_EQ(0, tuple.port()); + EXPECT_FALSE(tuple.IsValid()); + ExpectParsedUrlsEqual(GURL(tuple.Serialize()), tuple.GetURL()); + } +} + +TEST_F(SchemeHostPortTest, GURLConstruction) { + struct TestCases { + const char* url; + const char* scheme; + const char* host; + uint16_t port; + } cases[] = { + {"http://192.168.9.1/", "http", "192.168.9.1", 80}, + {"http://[2001:db8::1]/", "http", "[2001:db8::1]", 80}, + {"http://☃.net/", "http", "xn--n3h.net", 80}, + {"http://example.com/", "http", "example.com", 80}, + {"http://example.com:123/", "http", "example.com", 123}, + {"https://example.com/", "https", "example.com", 443}, + {"https://example.com:123/", "https", "example.com", 123}, + {"file:///etc/passwd", "file", "", 0}, + 
{"file://example.com/etc/passwd", "file", "example.com", 0}, + {"http://u:p@example.com/", "http", "example.com", 80}, + {"http://u:p@example.com/path", "http", "example.com", 80}, + {"http://u:p@example.com/path?123", "http", "example.com", 80}, + {"http://u:p@example.com/path?123#hash", "http", "example.com", 80}, + }; + + for (const auto& test : cases) { + SCOPED_TRACE(test.url); + GURL url(test.url); + EXPECT_TRUE(url.is_valid()); + url::SchemeHostPort tuple(url); + EXPECT_EQ(test.scheme, tuple.scheme()); + EXPECT_EQ(test.host, tuple.host()); + EXPECT_EQ(test.port, tuple.port()); + EXPECT_TRUE(tuple.IsValid()); + EXPECT_EQ(tuple, tuple); + ExpectParsedUrlsEqual(GURL(tuple.Serialize()), tuple.GetURL()); + } +} + +TEST_F(SchemeHostPortTest, Serialization) { + struct TestCases { + const char* url; + const char* expected; + } cases[] = { + {"http://192.168.9.1/", "http://192.168.9.1"}, + {"http://[2001:db8::1]/", "http://[2001:db8::1]"}, + {"http://☃.net/", "http://xn--n3h.net"}, + {"http://example.com/", "http://example.com"}, + {"http://example.com:123/", "http://example.com:123"}, + {"https://example.com/", "https://example.com"}, + {"https://example.com:123/", "https://example.com:123"}, + {"file:///etc/passwd", "file://"}, + {"file://example.com/etc/passwd", "file://example.com"}, + {"https://example.com:0/", "https://example.com:0"}, + }; + + for (const auto& test : cases) { + SCOPED_TRACE(test.url); + GURL url(test.url); + url::SchemeHostPort tuple(url); + EXPECT_EQ(test.expected, tuple.Serialize()); + ExpectParsedUrlsEqual(GURL(tuple.Serialize()), tuple.GetURL()); + } +} + +TEST_F(SchemeHostPortTest, Comparison) { + // These tuples are arranged in increasing order: + struct SchemeHostPorts { + const char* scheme; + const char* host; + uint16_t port; + } tuples[] = { + {"http", "a", 80}, + {"http", "b", 80}, + {"https", "a", 80}, + {"https", "b", 80}, + {"http", "a", 81}, + {"http", "b", 81}, + {"https", "a", 81}, + {"https", "b", 81}, + }; + + for (size_t i 
= 0; i < std::size(tuples); i++) { + url::SchemeHostPort current(tuples[i].scheme, tuples[i].host, + tuples[i].port); + for (size_t j = i; j < std::size(tuples); j++) { + url::SchemeHostPort to_compare(tuples[j].scheme, tuples[j].host, + tuples[j].port); + EXPECT_EQ(i < j, current < to_compare) << i << " < " << j; + EXPECT_EQ(j < i, to_compare < current) << j << " < " << i; + } + } +} + +// Some schemes have optional authority. Make sure that GURL conversion from +// SchemeHostPort is not opinionated in that regard. For more info, See +// crbug.com/820194, where we considered all SchemeHostPorts with +// SCHEME_WITH_HOST (i.e., without ports) as valid with empty hosts, even though +// most are not (e.g. chrome URLs). +TEST_F(SchemeHostPortTest, EmptyHostGurlConversion) { + url::AddStandardScheme("chrome", url::SCHEME_WITH_HOST); + + GURL chrome_url("chrome:"); + EXPECT_FALSE(chrome_url.is_valid()); + + url::SchemeHostPort chrome_tuple("chrome", "", 0); + EXPECT_FALSE(chrome_tuple.GetURL().is_valid()); + ExpectParsedUrlsEqual(GURL(chrome_tuple.Serialize()), chrome_tuple.GetURL()); + ExpectParsedUrlsEqual(chrome_url, chrome_tuple.GetURL()); +} + +} // namespace url diff --git a/third_party/mozilla/LICENSE.txt b/third_party/mozilla/LICENSE.txt new file mode 100644 index 00000000000..ac40837824a --- /dev/null +++ b/third_party/mozilla/LICENSE.txt @@ -0,0 +1,65 @@ +Copyright 2007, Google Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. 
nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +------------------------------------------------------------------------------- + +The file url_parse.cc is based on nsURLParsers.cc from Mozilla. This file is +licensed separately as follows: + +The contents of this file are subject to the Mozilla Public License Version +1.1 (the "License"); you may not use this file except in compliance with +the License. You may obtain a copy of the License at +http://www.mozilla.org/MPL/ + +Software distributed under the License is distributed on an "AS IS" basis, +WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +for the specific language governing rights and limitations under the +License. + +The Original Code is mozilla.org code. + +The Initial Developer of the Original Code is +Netscape Communications Corporation. +Portions created by the Initial Developer are Copyright (C) 1998 +the Initial Developer. All Rights Reserved. 
+ +Contributor(s): + Darin Fisher (original author) + +Alternatively, the contents of this file may be used under the terms of +either the GNU General Public License Version 2 or later (the "GPL"), or +the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +in which case the provisions of the GPL or the LGPL are applicable instead +of those above. If you wish to allow use of your version of this file only +under the terms of either the GPL or the LGPL, and not to allow others to +use your version of this file under the terms of the MPL, indicate your +decision by deleting the provisions above and replace them with the notice +and other provisions required by the GPL or the LGPL. If you do not delete +the provisions above, a recipient may use your version of this file under +the terms of any one of the MPL, the GPL or the LGPL. diff --git a/third_party/mozilla/README.chromium b/third_party/mozilla/README.chromium new file mode 100644 index 00000000000..ef396d3d1dd --- /dev/null +++ b/third_party/mozilla/README.chromium @@ -0,0 +1,8 @@ +Name: url_parse +URL: http://mxr.mozilla.org/comm-central/source/mozilla/netwerk/base/src/nsURLParsers.cpp +License: BSD and MPL 1.1/GPL 2.0/LGPL 2.1 +License File: LICENSE.txt + +Description: + +The file url_parse.cc is based on nsURLParsers.cc from Mozilla. diff --git a/third_party/mozilla/url_parse.cc b/third_party/mozilla/url_parse.cc new file mode 100644 index 00000000000..61fb94e98ff --- /dev/null +++ b/third_party/mozilla/url_parse.cc @@ -0,0 +1,963 @@ +/* Based on nsURLParsers.cc from Mozilla + * ------------------------------------- + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is mozilla.org code. + * + * The Initial Developer of the Original Code is + * Netscape Communications Corporation. + * Portions created by the Initial Developer are Copyright (C) 1998 + * the Initial Developer. All Rights Reserved. + * + * Contributor(s): + * Darin Fisher (original author) + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. + * + * ***** END LICENSE BLOCK ***** */ + +#include "url/third_party/mozilla/url_parse.h" + +#include + +#include + +#include "base/check_op.h" +#include "url/url_parse_internal.h" +#include "url/url_util.h" +#include "url/url_util_internal.h" + +namespace url { + +namespace { + +// Returns true if the given character is a valid digit to use in a port. 
+inline bool IsPortDigit(char16_t ch) { + return ch >= '0' && ch <= '9'; +} + +// Returns the offset of the next authority terminator in the input starting +// from start_offset. If no terminator is found, the return value will be equal +// to spec_len. +template +int FindNextAuthorityTerminator(const CHAR* spec, + int start_offset, + int spec_len) { + for (int i = start_offset; i < spec_len; i++) { + if (IsAuthorityTerminator(spec[i])) + return i; + } + return spec_len; // Not found. +} + +template +void ParseUserInfo(const CHAR* spec, + const Component& user, + Component* username, + Component* password) { + // Find the first colon in the user section, which separates the username and + // password. + int colon_offset = 0; + while (colon_offset < user.len && spec[user.begin + colon_offset] != ':') + colon_offset++; + + if (colon_offset < user.len) { + // Found separator: : + *username = Component(user.begin, colon_offset); + *password = MakeRange(user.begin + colon_offset + 1, user.begin + user.len); + } else { + // No separator, treat everything as the username + *username = user; + *password = Component(); + } +} + +template +void ParseServerInfo(const CHAR* spec, + const Component& serverinfo, + Component* hostname, + Component* port_num) { + if (serverinfo.len == 0) { + // No server info, host name is empty. + hostname->reset(); + port_num->reset(); + return; + } + + // If the host starts with a left-bracket, assume the entire host is an + // IPv6 literal. Otherwise, assume none of the host is an IPv6 literal. + // This assumption will be overridden if we find a right-bracket. + // + // Our IPv6 address canonicalization code requires both brackets to exist, + // but the ability to locate an incomplete address can still be useful. + int ipv6_terminator = spec[serverinfo.begin] == '[' ? serverinfo.end() : -1; + int colon = -1; + + // Find the last right-bracket, and the last colon. 
+ for (int i = serverinfo.begin; i < serverinfo.end(); i++) { + switch (spec[i]) { + case ']': + ipv6_terminator = i; + break; + case ':': + colon = i; + break; + } + } + + if (colon > ipv6_terminator) { + // Found a port number: : + *hostname = MakeRange(serverinfo.begin, colon); + if (hostname->len == 0) + hostname->reset(); + *port_num = MakeRange(colon + 1, serverinfo.end()); + } else { + // No port: + *hostname = serverinfo; + port_num->reset(); + } +} + +// Given an already-identified auth section, breaks it into its constituent +// parts. The port is not converted to an integer here: |port_num| receives the +// range of the port text within |spec|, and is reset when the authority has +// no port. +template +void DoParseAuthority(const CHAR* spec, + const Component& auth, + Component* username, + Component* password, + Component* hostname, + Component* port_num) { + DCHECK(auth.is_valid()) << "We should always get an authority"; + if (auth.len == 0) { + username->reset(); + password->reset(); + hostname->reset(); + port_num->reset(); + return; + } + + // Search backwards for @, which is the separator between the user info and + // the server info. + int i = auth.begin + auth.len - 1; + while (i > auth.begin && spec[i] != '@') + i--; + + if (spec[i] == '@') { + // Found user info: @ + ParseUserInfo(spec, Component(auth.begin, i - auth.begin), username, + password); + ParseServerInfo(spec, MakeRange(i + 1, auth.begin + auth.len), hostname, + port_num); + } else { + // No user info, everything is server info. + username->reset(); + password->reset(); + ParseServerInfo(spec, auth, hostname, port_num); + } +} + +template +inline void FindQueryAndRefParts(const CHAR* spec, + const Component& path, + int* query_separator, + int* ref_separator) { + if constexpr (sizeof(*spec) == 1) { + // memchr is much faster than any scalar code we can write. 
+ const CHAR* ptr = spec + path.begin; + const CHAR* first_hash = + reinterpret_cast(memchr(ptr, '#', path.len)); + size_t len_before_fragment = + first_hash == nullptr ? path.len : first_hash - ptr; + const CHAR* first_question = + reinterpret_cast(memchr(ptr, '?', len_before_fragment)); + if (first_hash != nullptr) { + *ref_separator = first_hash - spec; + } + if (first_question != nullptr) { + *query_separator = first_question - spec; + } + } else { + int path_end = path.begin + path.len; + for (int i = path.begin; i < path_end; i++) { + switch (spec[i]) { + case '?': + // Only match the query string if it precedes the reference fragment + // and when we haven't found one already. + if (*query_separator < 0) + *query_separator = i; + break; + case '#': + // Record the first # sign only. + if (*ref_separator < 0) { + *ref_separator = i; + return; + } + break; + } + } + } +} + +template +void ParsePath(const CHAR* spec, + const Component& path, + Component* filepath, + Component* query, + Component* ref) { + // path = [/]//<...>/;?# + + // Special case when there is no path. + if (path.len == -1) { + filepath->reset(); + query->reset(); + ref->reset(); + return; + } + DCHECK(path.is_nonempty()) << "We should never have 0 length paths"; + + // Search for first occurrence of either ? or #. + int query_separator = -1; // Index of the '?' + int ref_separator = -1; // Index of the '#' + FindQueryAndRefParts(spec, path, &query_separator, &ref_separator); + + // Markers pointing to the character after each of these corresponding + // components. The code below works from the end back to the beginning, + // and will update these indices as it finds components that exist. + int file_end, query_end; + + // Ref fragment: from the # to the end of the path. 
+ int path_end = path.begin + path.len; + if (ref_separator >= 0) { + file_end = query_end = ref_separator; + *ref = MakeRange(ref_separator + 1, path_end); + } else { + file_end = query_end = path_end; + ref->reset(); + } + + // Query fragment: everything from the ? to the next boundary (either the end + // of the path or the ref fragment). + if (query_separator >= 0) { + file_end = query_separator; + *query = MakeRange(query_separator + 1, query_end); + } else { + query->reset(); + } + + // File path: treat an empty file path as no file path. + if (file_end != path.begin) + *filepath = MakeRange(path.begin, file_end); + else + filepath->reset(); +} + +template +bool DoExtractScheme(const CHAR* url, int url_len, Component* scheme) { + // Skip leading whitespace and control characters. + int begin = 0; + while (begin < url_len && ShouldTrimFromURL(url[begin])) + begin++; + if (begin == url_len) + return false; // Input is empty or all whitespace. + + // Find the first colon character. + for (int i = begin; i < url_len; i++) { + if (url[i] == ':') { + *scheme = MakeRange(begin, i); + return true; + } + } + return false; // No colon found: no scheme +} + +// Fills in all members of the Parsed structure except for the scheme. +// +// |spec| is the full spec being parsed, of length |spec_len|. +// |after_scheme| is the character immediately following the scheme (after the +// colon) where we'll begin parsing. +// +// Compatibility data points. 
I list "host", "path" extracted: +// Input IE6 Firefox Us +// ----- -------------- -------------- -------------- +// http://foo.com/ "foo.com", "/" "foo.com", "/" "foo.com", "/" +// http:foo.com/ "foo.com", "/" "foo.com", "/" "foo.com", "/" +// http:/foo.com/ fail(*) "foo.com", "/" "foo.com", "/" +// http:\foo.com/ fail(*) "\foo.com", "/"(fail) "foo.com", "/" +// http:////foo.com/ "foo.com", "/" "foo.com", "/" "foo.com", "/" +// +// (*) Interestingly, although IE fails to load these URLs, its history +// canonicalizer handles them, meaning if you've been to the corresponding +// "http://foo.com/" link, it will be colored. +template +void DoParseAfterScheme(const CHAR* spec, + int spec_len, + int after_scheme, + Parsed* parsed) { + int num_slashes = CountConsecutiveSlashes(spec, after_scheme, spec_len); + int after_slashes = after_scheme + num_slashes; + + // First split into two main parts, the authority (username, password, host, + // and port) and the full path (path, query, and reference). + Component authority; + Component full_path; + + // Found "//", looks like an authority section. Treat everything + // from there to the next slash (or end of spec) to be the authority. Note + // that we ignore the number of slashes and treat it as the authority. + int end_auth = FindNextAuthorityTerminator(spec, after_slashes, spec_len); + authority = Component(after_slashes, end_auth - after_slashes); + + if (end_auth == spec_len) // No beginning of path found. + full_path = Component(); + else // Everything starting from the slash to the end is the path. + full_path = Component(end_auth, spec_len - end_auth); + + // Now parse those two sub-parts. + DoParseAuthority(spec, authority, &parsed->username, &parsed->password, + &parsed->host, &parsed->port); + ParsePath(spec, full_path, &parsed->path, &parsed->query, &parsed->ref); +} + +// The main parsing function for standard URLs. Standard URLs have a scheme, +// host, path, etc. 
+template +void DoParseStandardURL(const CHAR* spec, int spec_len, Parsed* parsed) { + DCHECK(spec_len >= 0); + + // Strip leading & trailing spaces and control characters. + int begin = 0; + TrimURL(spec, &begin, &spec_len); + + int after_scheme; + if (DoExtractScheme(spec, spec_len, &parsed->scheme)) { + after_scheme = parsed->scheme.end() + 1; // Skip past the colon. + } else { + // Say there's no scheme when there is no colon. We could also say that + // everything is the scheme. Both would produce an invalid URL, but this way + // seems less wrong in more cases. + parsed->scheme.reset(); + after_scheme = begin; + } + DoParseAfterScheme(spec, spec_len, after_scheme, parsed); +} + +template +void DoParseFileSystemURL(const CHAR* spec, int spec_len, Parsed* parsed) { + DCHECK(spec_len >= 0); + + // Get the unused parts of the URL out of the way. + parsed->username.reset(); + parsed->password.reset(); + parsed->host.reset(); + parsed->port.reset(); + parsed->path.reset(); // May use this; reset for convenience. + parsed->ref.reset(); // May use this; reset for convenience. + parsed->query.reset(); // May use this; reset for convenience. + parsed->clear_inner_parsed(); // May use this; reset for convenience. + + // Strip leading & trailing spaces and control characters. + int begin = 0; + TrimURL(spec, &begin, &spec_len); + + // Handle empty specs or ones that contain only whitespace or control chars. + if (begin == spec_len) { + parsed->scheme.reset(); + return; + } + + int inner_start = -1; + + // Extract the scheme. We also handle the case where there is no scheme. + if (DoExtractScheme(&spec[begin], spec_len - begin, &parsed->scheme)) { + // Offset the results since we gave ExtractScheme a substring. + parsed->scheme.begin += begin; + + if (parsed->scheme.end() == spec_len - 1) + return; + + inner_start = parsed->scheme.end() + 1; + } else { + // No scheme found; that's not valid for filesystem URLs. 
+ parsed->scheme.reset(); + return; + } + + Component inner_scheme; + const CHAR* inner_spec = &spec[inner_start]; + int inner_spec_len = spec_len - inner_start; + + if (DoExtractScheme(inner_spec, inner_spec_len, &inner_scheme)) { + // Offset the results since we gave ExtractScheme a substring. + inner_scheme.begin += inner_start; + + if (inner_scheme.end() == spec_len - 1) + return; + } else { + // No scheme found; that's not valid for filesystem URLs. + // The best we can do is return "filesystem://". + return; + } + + Parsed inner_parsed; + + if (CompareSchemeComponent(spec, inner_scheme, kFileScheme)) { + // File URLs are special. + ParseFileURL(inner_spec, inner_spec_len, &inner_parsed); + } else if (CompareSchemeComponent(spec, inner_scheme, kFileSystemScheme)) { + // Filesystem URLs don't nest. + return; + } else if (IsStandard(spec, inner_scheme)) { + // All "normal" URLs. + DoParseStandardURL(inner_spec, inner_spec_len, &inner_parsed); + } else { + return; + } + + // All members of inner_parsed need to be offset by inner_start. + // If we had any scheme that supported nesting more than one level deep, + // we'd have to recurse into the inner_parsed's inner_parsed when + // adjusting by inner_start. + inner_parsed.scheme.begin += inner_start; + inner_parsed.username.begin += inner_start; + inner_parsed.password.begin += inner_start; + inner_parsed.host.begin += inner_start; + inner_parsed.port.begin += inner_start; + inner_parsed.query.begin += inner_start; + inner_parsed.ref.begin += inner_start; + inner_parsed.path.begin += inner_start; + + // Query and ref move from inner_parsed to parsed. 
+ parsed->query = inner_parsed.query; + inner_parsed.query.reset(); + parsed->ref = inner_parsed.ref; + inner_parsed.ref.reset(); + + parsed->set_inner_parsed(inner_parsed); + if (!inner_parsed.scheme.is_valid() || !inner_parsed.path.is_valid() || + inner_parsed.inner_parsed()) { + return; + } + + // The path in inner_parsed should start with a slash, then have a filesystem + // type followed by a slash. From the first slash up to but excluding the + // second should be what it keeps; the rest goes to parsed. If the path ends + // before the second slash, it's still pretty clear what the user meant, so + // we'll let that through. + if (!IsURLSlash(spec[inner_parsed.path.begin])) { + return; + } + int inner_path_end = inner_parsed.path.begin + 1; // skip the leading slash + while (inner_path_end < spec_len && !IsURLSlash(spec[inner_path_end])) + ++inner_path_end; + parsed->path.begin = inner_path_end; + int new_inner_path_length = inner_path_end - inner_parsed.path.begin; + parsed->path.len = inner_parsed.path.len - new_inner_path_length; + parsed->inner_parsed()->path.len = new_inner_path_length; +} + +// Initializes a path URL which is merely a scheme followed by a path. Examples +// include "about:foo" and "javascript:alert('bar');" +template +void DoParsePathURL(const CHAR* spec, + int spec_len, + bool trim_path_end, + Parsed* parsed) { + // Get the non-path and non-scheme parts of the URL out of the way, we never + // use them. + parsed->username.reset(); + parsed->password.reset(); + parsed->host.reset(); + parsed->port.reset(); + parsed->path.reset(); + parsed->query.reset(); + parsed->ref.reset(); + + // Strip leading & trailing spaces and control characters. + int scheme_begin = 0; + TrimURL(spec, &scheme_begin, &spec_len, trim_path_end); + + // Handle empty specs or ones that contain only whitespace or control chars. 
+ if (scheme_begin == spec_len) { + parsed->scheme.reset(); + parsed->path.reset(); + return; + } + + int path_begin; + // Extract the scheme, with the path being everything following. We also + // handle the case where there is no scheme. + if (ExtractScheme(&spec[scheme_begin], spec_len - scheme_begin, + &parsed->scheme)) { + // Offset the results since we gave ExtractScheme a substring. + parsed->scheme.begin += scheme_begin; + path_begin = parsed->scheme.end() + 1; + } else { + // No scheme case. + parsed->scheme.reset(); + path_begin = scheme_begin; + } + + if (path_begin == spec_len) + return; + DCHECK_LT(path_begin, spec_len); + + ParsePath(spec, MakeRange(path_begin, spec_len), &parsed->path, + &parsed->query, &parsed->ref); +} + +template +void DoParseMailtoURL(const CHAR* spec, int spec_len, Parsed* parsed) { + DCHECK(spec_len >= 0); + + // Get the non-path and non-scheme parts of the URL out of the way, we never + // use them. + parsed->username.reset(); + parsed->password.reset(); + parsed->host.reset(); + parsed->port.reset(); + parsed->ref.reset(); + parsed->query.reset(); // May use this; reset for convenience. + + // Strip leading & trailing spaces and control characters. + int begin = 0; + TrimURL(spec, &begin, &spec_len); + + // Handle empty specs or ones that contain only whitespace or control chars. + if (begin == spec_len) { + parsed->scheme.reset(); + parsed->path.reset(); + return; + } + + int path_begin = -1; + int path_end = -1; + + // Extract the scheme, with the path being everything following. We also + // handle the case where there is no scheme. + if (ExtractScheme(&spec[begin], spec_len - begin, &parsed->scheme)) { + // Offset the results since we gave ExtractScheme a substring. + parsed->scheme.begin += begin; + + if (parsed->scheme.end() != spec_len - 1) { + path_begin = parsed->scheme.end() + 1; + path_end = spec_len; + } + } else { + // No scheme found, just path. 
+ parsed->scheme.reset(); + path_begin = begin; + path_end = spec_len; + } + + // Split [path_begin, path_end) into a path + query. + for (int i = path_begin; i < path_end; ++i) { + if (spec[i] == '?') { + parsed->query = MakeRange(i + 1, path_end); + path_end = i; + break; + } + } + + // For compatibility with the standard URL parser, treat no path as + // -1, rather than having a length of 0. + if (path_begin == path_end) { + parsed->path.reset(); + } else { + parsed->path = MakeRange(path_begin, path_end); + } +} + +// Converts a port number in a string to an integer. We'd like to just call +// sscanf but our input is not NULL-terminated, which sscanf requires. Instead, +// we copy the digits to a small stack buffer (since we know the maximum number +// of digits in a valid port number) that we can NULL terminate. +// +// Returns PORT_UNSPECIFIED for an empty component, 0 when every character is +// '0', PORT_INVALID when more than kMaxDigits characters remain after the +// leading zeros, when one of them fails IsPortDigit(), or when the value +// exceeds 65535, and the port number otherwise. +template +int DoParsePort(const CHAR* spec, const Component& component) { + // Maximum number of digits in a valid port number. + const int kMaxDigits = 5; + + // Easy success case when there is no port. + if (component.is_empty()) + return PORT_UNSPECIFIED; + + // Skip over any leading 0s. + Component digits_comp(component.end(), 0); + for (int i = 0; i < component.len; i++) { + if (spec[component.begin + i] != '0') { + digits_comp = MakeRange(component.begin + i, component.end()); + break; + } + } + if (digits_comp.len == 0) + return 0; // All digits were 0. + + // Verify we don't have too many digits (we'll be copying to our buffer so + // we need to double-check). + if (digits_comp.len > kMaxDigits) + return PORT_INVALID; + + // Copy valid digits to the buffer. + char digits[kMaxDigits + 1]; // +1 for null terminator + for (int i = 0; i < digits_comp.len; i++) { + CHAR ch = spec[digits_comp.begin + i]; + if (!IsPortDigit(ch)) { + // Invalid port digit, fail. + return PORT_INVALID; + } + digits[i] = static_cast(ch); + } + + // Null-terminate the string and convert to integer. Since we guarantee + // only digits, atoi's lack of error handling is OK.
+ digits[digits_comp.len] = 0; + int port = atoi(digits); + if (port > 65535) + return PORT_INVALID; // Out of range. + return port; +} + +// Sets |*file_name| to the part of |path| that follows its last slash, cut +// off at the first ';' after that slash. An empty |path| resets |*file_name|; +// a path without any slash is treated as being all file name. +template +void DoExtractFileName(const CHAR* spec, + const Component& path, + Component* file_name) { + // Handle empty paths: they have no file names. + if (path.is_empty()) { + file_name->reset(); + return; + } + + // Extract the filename range from the path which is between + // the last slash and the following semicolon. The scan runs backwards, so + // |file_end| ends up at the first ';' that follows the last slash. + int file_end = path.end(); + for (int i = path.end() - 1; i >= path.begin; i--) { + if (spec[i] == ';') { + file_end = i; + } else if (IsURLSlash(spec[i])) { + // File name is everything following this character up to |file_end|. + *file_name = MakeRange(i + 1, file_end); + return; + } + } + + // No slash found, this means the input was degenerate (generally paths + // will start with a slash). Let's call everything the file name. + *file_name = MakeRange(path.begin, file_end); + return; +} + +// Splits the first key/value pair off the front of |*query|. Returns false, +// leaving |*key| and |*value| untouched, when |*query| is empty or invalid. +// Otherwise fills in |*key| and |*value|, advances |*query| past the pair and +// the '&' that follows it (if any), and returns true. +template +bool DoExtractQueryKeyValue(const CHAR* spec, + Component* query, + Component* key, + Component* value) { + if (!query->is_nonempty()) + return false; + + int start = query->begin; + int cur = start; + int end = query->end(); + + // We assume the beginning of the input is the beginning of the "key" and we + // skip to the end of it. + key->begin = cur; + while (cur < end && spec[cur] != '&' && spec[cur] != '=') + cur++; + key->len = cur - key->begin; + + // Skip the separator after the key (if any). + if (cur < end && spec[cur] == '=') + cur++; + + // Find the value part.
+ value->begin = cur; + while (cur < end && spec[cur] != '&') + cur++; + value->len = cur - value->begin; + + // Finally skip the next separator if any + if (cur < end && spec[cur] == '&') + cur++; + + // Save the new query + *query = MakeRange(cur, end); + return true; +} + +} // namespace + +COMPONENT_EXPORT(URL) +std::ostream& operator<<(std::ostream& os, const Component& component) { + return os << '{' << component.begin << ", " << component.len << "}"; +} + +Parsed::Parsed() : potentially_dangling_markup(false), inner_parsed_(NULL) {} + +Parsed::Parsed(const Parsed& other) + : scheme(other.scheme), + username(other.username), + password(other.password), + host(other.host), + port(other.port), + path(other.path), + query(other.query), + ref(other.ref), + potentially_dangling_markup(other.potentially_dangling_markup), + inner_parsed_(NULL) { + if (other.inner_parsed_) + set_inner_parsed(*other.inner_parsed_); +} + +Parsed& Parsed::operator=(const Parsed& other) { + if (this != &other) { + scheme = other.scheme; + username = other.username; + password = other.password; + host = other.host; + port = other.port; + path = other.path; + query = other.query; + ref = other.ref; + potentially_dangling_markup = other.potentially_dangling_markup; + if (other.inner_parsed_) + set_inner_parsed(*other.inner_parsed_); + else + clear_inner_parsed(); + } + return *this; +} + +Parsed::~Parsed() { + delete inner_parsed_; +} + +int Parsed::Length() const { + if (ref.is_valid()) + return ref.end(); + return CountCharactersBefore(REF, false); +} + +int Parsed::CountCharactersBefore(ComponentType type, + bool include_delimiter) const { + if (type == SCHEME) + return scheme.begin; + + // There will be some characters after the scheme like "://" and we don't + // know how many. Search forwards for the next thing until we find one. + int cur = 0; + if (scheme.is_valid()) + cur = scheme.end() + 1; // Advance over the ':' at the end of the scheme. 
+ + if (username.is_valid()) { + if (type <= USERNAME) + return username.begin; + cur = username.end() + 1; // Advance over the '@' or ':' at the end. + } + + if (password.is_valid()) { + if (type <= PASSWORD) + return password.begin; + cur = password.end() + 1; // Advance over the '@' at the end. + } + + if (host.is_valid()) { + if (type <= HOST) + return host.begin; + cur = host.end(); + } + + if (port.is_valid()) { + if (type < PORT || (type == PORT && include_delimiter)) + return port.begin - 1; // Back over delimiter. + if (type == PORT) + return port.begin; // Don't want delimiter counted. + cur = port.end(); + } + + if (path.is_valid()) { + if (type <= PATH) + return path.begin; + cur = path.end(); + } + + if (query.is_valid()) { + if (type < QUERY || (type == QUERY && include_delimiter)) + return query.begin - 1; // Back over delimiter. + if (type == QUERY) + return query.begin; // Don't want delimiter counted. + cur = query.end(); + } + + if (ref.is_valid()) { + if (type == REF && !include_delimiter) + return ref.begin; // Don't want delimiter counted. + + // When there is a ref and we get here, either the ref was requested with + // its delimiter, or the component we wanted was before this and not found, + // so we always know the beginning of the ref is right. + return ref.begin - 1; // Back over delimiter. + } + + return cur; +} + +Component Parsed::GetContent() const { + const int begin = CountCharactersBefore(USERNAME, false); + const int len = Length() - begin; + // For compatibility with the standard URL parser, we treat no content as + // -1, rather than having a length of 0 (we normally wouldn't care so + // much for these non-standard URLs). + return len ? 
Component(begin, len) : Component(); +} + +bool ExtractScheme(const char* url, int url_len, Component* scheme) { + return DoExtractScheme(url, url_len, scheme); +} + +bool ExtractScheme(const char16_t* url, int url_len, Component* scheme) { + return DoExtractScheme(url, url_len, scheme); +} + +// This handles everything that may be an authority terminator, including +// backslash. For special backslash handling see DoParseAfterScheme. +bool IsAuthorityTerminator(char16_t ch) { + return IsURLSlash(ch) || ch == '?' || ch == '#'; +} + +void ExtractFileName(const char* url, + const Component& path, + Component* file_name) { + DoExtractFileName(url, path, file_name); +} + +void ExtractFileName(const char16_t* url, + const Component& path, + Component* file_name) { + DoExtractFileName(url, path, file_name); +} + +bool ExtractQueryKeyValue(const char* url, + Component* query, + Component* key, + Component* value) { + return DoExtractQueryKeyValue(url, query, key, value); +} + +bool ExtractQueryKeyValue(const char16_t* url, + Component* query, + Component* key, + Component* value) { + return DoExtractQueryKeyValue(url, query, key, value); +} + +void ParseAuthority(const char* spec, + const Component& auth, + Component* username, + Component* password, + Component* hostname, + Component* port_num) { + DoParseAuthority(spec, auth, username, password, hostname, port_num); +} + +void ParseAuthority(const char16_t* spec, + const Component& auth, + Component* username, + Component* password, + Component* hostname, + Component* port_num) { + DoParseAuthority(spec, auth, username, password, hostname, port_num); +} + +int ParsePort(const char* url, const Component& port) { + return DoParsePort(url, port); +} + +int ParsePort(const char16_t* url, const Component& port) { + return DoParsePort(url, port); +} + +void ParseStandardURL(const char* url, int url_len, Parsed* parsed) { + DoParseStandardURL(url, url_len, parsed); +} + +void ParseStandardURL(const char16_t* url, int url_len, 
Parsed* parsed) { + DoParseStandardURL(url, url_len, parsed); +} + +void ParsePathURL(const char* url, + int url_len, + bool trim_path_end, + Parsed* parsed) { + DoParsePathURL(url, url_len, trim_path_end, parsed); +} + +void ParsePathURL(const char16_t* url, + int url_len, + bool trim_path_end, + Parsed* parsed) { + DoParsePathURL(url, url_len, trim_path_end, parsed); +} + +void ParseFileSystemURL(const char* url, int url_len, Parsed* parsed) { + DoParseFileSystemURL(url, url_len, parsed); +} + +void ParseFileSystemURL(const char16_t* url, int url_len, Parsed* parsed) { + DoParseFileSystemURL(url, url_len, parsed); +} + +void ParseMailtoURL(const char* url, int url_len, Parsed* parsed) { + DoParseMailtoURL(url, url_len, parsed); +} + +void ParseMailtoURL(const char16_t* url, int url_len, Parsed* parsed) { + DoParseMailtoURL(url, url_len, parsed); +} + +void ParsePathInternal(const char* spec, + const Component& path, + Component* filepath, + Component* query, + Component* ref) { + ParsePath(spec, path, filepath, query, ref); +} + +void ParsePathInternal(const char16_t* spec, + const Component& path, + Component* filepath, + Component* query, + Component* ref) { + ParsePath(spec, path, filepath, query, ref); +} + +void ParseAfterScheme(const char* spec, + int spec_len, + int after_scheme, + Parsed* parsed) { + DoParseAfterScheme(spec, spec_len, after_scheme, parsed); +} + +void ParseAfterScheme(const char16_t* spec, + int spec_len, + int after_scheme, + Parsed* parsed) { + DoParseAfterScheme(spec, spec_len, after_scheme, parsed); +} + +} // namespace url diff --git a/third_party/mozilla/url_parse.h b/third_party/mozilla/url_parse.h new file mode 100644 index 00000000000..9e824bae201 --- /dev/null +++ b/third_party/mozilla/url_parse.h @@ -0,0 +1,377 @@ +// Copyright 2013 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#ifndef URL_THIRD_PARTY_MOZILLA_URL_PARSE_H_ +#define URL_THIRD_PARTY_MOZILLA_URL_PARSE_H_ + +#include + +#include "base/component_export.h" + +namespace url { + +// Component ------------------------------------------------------------------ + +// Represents a substring for URL parsing. +struct Component { + Component() : begin(0), len(-1) {} + + // Normal constructor: takes an offset and a length. + Component(int b, int l) : begin(b), len(l) {} + + int end() const { + return begin + len; + } + + // Returns true if this component is valid, meaning the length is given. + // Valid components may be empty to record the fact that they exist. + bool is_valid() const { return len >= 0; } + + // Determine if the component is empty or not. Empty means the length is + // zero or the component is invalid. + bool is_empty() const { return len <= 0; } + bool is_nonempty() const { return len > 0; } + + void reset() { + begin = 0; + len = -1; + } + + bool operator==(const Component& other) const { + return begin == other.begin && len == other.len; + } + + int begin; // Byte offset in the string of this component. + int len; // Will be -1 if the component is unspecified. +}; + +// Permit printing Components by CHECK macros. +COMPONENT_EXPORT(URL) +std::ostream& operator<<(std::ostream& os, const Component& component); + +// Helper that returns a component created with the given begin and ending +// points. The ending point is non-inclusive. +inline Component MakeRange(int begin, int end) { + return Component(begin, end - begin); +} + +// Parsed --------------------------------------------------------------------- + +// A structure that holds the identified parts of an input URL. This structure +// does NOT store the URL itself. The caller will have to store the URL text +// and its corresponding Parsed structure separately. 
+// +// Typical usage would be: +// +// Parsed parsed; +// Component scheme; +// if (!ExtractScheme(url, url_len, &scheme)) +// return I_CAN_NOT_FIND_THE_SCHEME_DUDE; +// +// if (IsStandardScheme(url, scheme)) // Not provided by this component +// ParseStandardURL(url, url_len, &parsed); +// else if (IsFileURL(url, scheme)) // Not provided by this component +// ParseFileURL(url, url_len, &parsed); +// else +// ParsePathURL(url, url_len, trim_path_end, &parsed); +// +struct COMPONENT_EXPORT(URL) Parsed { + // Identifies different components. The enumerators are declared in URL + // order; CountCharactersBefore() compares them with < and <=. + enum ComponentType { + SCHEME, + USERNAME, + PASSWORD, + HOST, + PORT, + PATH, + QUERY, + REF, + }; + + // The default constructor is sufficient for the components, but inner_parsed_ + // requires special handling. + Parsed(); + Parsed(const Parsed&); + Parsed& operator=(const Parsed&); + ~Parsed(); + + // Returns the length of the URL (the end of the last component). + // + // Note that for some invalid, non-canonical URLs, this may not be the length + // of the string. For example "http://": the parsed structure will only + // contain an entry for the four-character scheme, and it doesn't know about + // the "://". For all other last-components, it will return the real length. + int Length() const; + + // Returns the number of characters before the given component if it exists, + // or where the component would be if it did exist. This will return the + // string length if the component would be appended to the end. + // + // Note that this can get a little funny for the port, query, and ref + // components which have a delimiter that is not counted as part of the + // component. The |include_delimiter| flag controls if you want this counted + // as part of the component or not when the component exists. + // + // This example shows the difference between the two flags for two of these + // delimited components that are present (the port and query) and one that + // isn't (the reference). 
The components that this flag affects are marked + // with a *. + // 0 1 2 + // 012345678901234567890 + // Example input: http://foo:80/?query + // include_delim=true, ...=false ("<-" indicates different) + // SCHEME: 0 0 + // USERNAME: 5 5 + // PASSWORD: 5 5 + // HOST: 7 7 + // *PORT: 10 11 <- + // PATH: 13 13 + // *QUERY: 14 15 <- + // *REF: 20 20 + // + int CountCharactersBefore(ComponentType type, bool include_delimiter) const; + + // Scheme without the colon: "http://foo/" would have a scheme of "http". + // The length will be -1 if no scheme is specified ("foo.com"), or 0 if there + // is a colon but no scheme (":foo"). Note that the scheme is not guaranteed + // to start at the beginning of the string if there are preceding whitespace + // or control characters. + Component scheme; + + // Username. Specified in URLs with an @ sign before the host. See |password| + Component username; + + // Password. The length will be -1 if unspecified, 0 if specified but empty. + // Not all URLs with a username have a password, as in "http://me@host/". + // The password is separated from the username with a colon, as in + // "http://me:secret@host/" + Component password; + + // Host name. + Component host; + + // Port number. + Component port; + + // Path, this is everything following the host name, stopping at the query or + // ref delimiter (if any). Length will be -1 if unspecified. This includes + // the preceding slash, so the path on "http://www.google.com/asdf" is + // "/asdf". As a result, it is impossible to have a 0 length path, it will + // be -1 in cases like "http://host?foo". + // Note that we treat backslashes the same as slashes. + Component path; + + // Stuff between the ? and the # after the path. This does not include the + // preceding ? character. Length will be -1 if unspecified, 0 if there is + // a question mark but no query string. + Component query; + + // Indicated by a #, this is everything following the hash sign (not + // including it). 
If there are multiple hash signs, we'll use the last one. + // Length will be -1 if there is no hash sign, or 0 if there is one but + // nothing follows it. + Component ref; + + // The URL spec from the character after the scheme: until the end of the + // URL, regardless of the scheme. This is mostly useful for 'opaque' non- + // hierarchical schemes like data: and javascript: as a convenient way to get + // the string with the scheme stripped off. + Component GetContent() const; + + // True if the URL's source contained a raw `<` character, and whitespace was + // removed from the URL during parsing. + // + // TODO(mkwst): Link this to something in a spec if + // https://github.com/whatwg/url/pull/284 lands. + bool potentially_dangling_markup; + + // This is used for nested URL types, currently only filesystem. If you + // parse a filesystem URL, the resulting Parsed will have a nested + // inner_parsed_ to hold the parsed inner URL's component information. + // For all other url types [including the inner URL], it will be NULL. + Parsed* inner_parsed() const { + return inner_parsed_; + } + + // Stores a copy of |inner_parsed|: allocates the nested Parsed on first use + // and assigns over the existing one afterwards. + void set_inner_parsed(const Parsed& inner_parsed) { + if (!inner_parsed_) + inner_parsed_ = new Parsed(inner_parsed); + else + *inner_parsed_ = inner_parsed; + } + + // Deletes the nested Parsed, if any, and nulls the pointer. + void clear_inner_parsed() { + if (inner_parsed_) { + delete inner_parsed_; + inner_parsed_ = nullptr; + } + } + + private: + Parsed* inner_parsed_; // This object is owned and managed by this struct. +}; + +// Initialization functions --------------------------------------------------- +// +// These functions parse the given URL, filling in all of the structure's +// components. These functions can not fail, they will always do their best +// at interpreting the input given. +// +// The string length of the URL MUST be specified, we do not check for NULLs +// at any point in the process, and will actually handle embedded NULLs. 
+// +// IMPORTANT: These functions do NOT hang on to the given pointer or copy it +// in any way. See the comment above the struct. +// +// The 8-bit versions require UTF-8 encoding. + +// StandardURL is for when the scheme is known to be one that has an +// authority (host) like "http". This function will not handle weird ones +// like "about:" and "javascript:", or do the right thing for "file:" URLs. +COMPONENT_EXPORT(URL) +void ParseStandardURL(const char* url, int url_len, Parsed* parsed); +COMPONENT_EXPORT(URL) +void ParseStandardURL(const char16_t* url, int url_len, Parsed* parsed); + +// PathURL is for when the scheme is known not to have an authority (host) +// section but that aren't file URLs either. The scheme is parsed, and +// everything after the scheme is considered as the path. This is used for +// things like "about:" and "javascript:" +COMPONENT_EXPORT(URL) +void ParsePathURL(const char* url, + int url_len, + bool trim_path_end, + Parsed* parsed); +COMPONENT_EXPORT(URL) +void ParsePathURL(const char16_t* url, + int url_len, + bool trim_path_end, + Parsed* parsed); + +// FileURL is for file URLs. There are some special rules for interpreting +// these. +COMPONENT_EXPORT(URL) +void ParseFileURL(const char* url, int url_len, Parsed* parsed); +COMPONENT_EXPORT(URL) +void ParseFileURL(const char16_t* url, int url_len, Parsed* parsed); + +// Filesystem URLs are structured differently than other URLs. +COMPONENT_EXPORT(URL) +void ParseFileSystemURL(const char* url, int url_len, Parsed* parsed); +COMPONENT_EXPORT(URL) +void ParseFileSystemURL(const char16_t* url, int url_len, Parsed* parsed); + +// MailtoURL is for mailto: urls. 
They are made up of a scheme, a path, and a query. +COMPONENT_EXPORT(URL) +void ParseMailtoURL(const char* url, int url_len, Parsed* parsed); +COMPONENT_EXPORT(URL) +void ParseMailtoURL(const char16_t* url, int url_len, Parsed* parsed); + +// Helper functions ----------------------------------------------------------- + +// Locates the scheme according to the URL parser's rules. This function is +// designed so the caller can find the scheme and call the correct Parse* +// function according to their known scheme types. +// +// It also does not perform any validation on the scheme. +// +// This function will return true if the scheme is found and will put the +// scheme's range into *scheme. False means no scheme could be found. Note +// that a URL beginning with a colon has a scheme, but it is empty, so this +// function will return true but *scheme will = (0,0). +// +// The scheme is found by skipping spaces and control characters at the +// beginning, and taking everything from there to the first colon to be the +// scheme. The character at scheme.end() will be the colon (we may enhance +// this to handle full width colons or something, so don't count on the +// actual character value). The character at scheme.end()+1 will be the +// beginning of the rest of the URL, be it the authority or the path (or the +// end of the string). +// +// The 8-bit version requires UTF-8 encoding. +COMPONENT_EXPORT(URL) +bool ExtractScheme(const char* url, int url_len, Component* scheme); +COMPONENT_EXPORT(URL) +bool ExtractScheme(const char16_t* url, int url_len, Component* scheme); + +// Returns true if ch is a character that terminates the authority segment +// of a URL: a slash, a backslash, '?' or '#'. +COMPONENT_EXPORT(URL) bool IsAuthorityTerminator(char16_t ch); + +// Does a best effort parse of input |spec|, in range |auth|. If a particular +// component is not found, it will be set to invalid. 
+COMPONENT_EXPORT(URL) +void ParseAuthority(const char* spec, + const Component& auth, + Component* username, + Component* password, + Component* hostname, + Component* port_num); +COMPONENT_EXPORT(URL) +void ParseAuthority(const char16_t* spec, + const Component& auth, + Component* username, + Component* password, + Component* hostname, + Component* port_num); + +// Computes the integer port value from the given port component. The port +// component should have been identified by one of the parse functions that +// fill in |Parsed| for the given input url. +// +// The return value will be an integer in the range [0, 65535], or one of +// the two special values below. +enum SpecialPort { PORT_UNSPECIFIED = -1, PORT_INVALID = -2 }; +COMPONENT_EXPORT(URL) int ParsePort(const char* url, const Component& port); +COMPONENT_EXPORT(URL) +int ParsePort(const char16_t* url, const Component& port); + +// Extracts the range of the file name in the given url. The path must +// already have been computed by the parse function, and the matching URL +// and extracted path are provided to this function. The filename is +// defined as being everything from the last slash/backslash of the path +// to the end of the path, or to the first ';' after that slash if there is +// one. +// +// The file name will be empty if the path is empty or there is nothing +// following the last slash. +// +// The 8-bit version requires UTF-8 encoding. +COMPONENT_EXPORT(URL) +void ExtractFileName(const char* url, + const Component& path, + Component* file_name); +COMPONENT_EXPORT(URL) +void ExtractFileName(const char16_t* url, + const Component& path, + Component* file_name); + +// Extract the first key/value from the range defined by |*query|. Updates +// |*query| to start at the end of the extracted key/value pair. This is +// designed for use in a loop: you can keep calling it with the same query +// object and it will iterate over all items in the query. 
+// +// Some key/value pairs may have the key, the value, or both be empty (for +// example, the query string "?&"). These will be returned. Note that an empty +// last parameter "foo.com?" or foo.com?a&" will not be returned, this case +// is the same as "done." +// +// The initial query component should not include the '?' (this is the default +// for parsed URLs). +// +// If no key/value are found |*key| and |*value| will be unchanged and it will +// return false. +COMPONENT_EXPORT(URL) +bool ExtractQueryKeyValue(const char* url, + Component* query, + Component* key, + Component* value); +COMPONENT_EXPORT(URL) +bool ExtractQueryKeyValue(const char16_t* url, + Component* query, + Component* key, + Component* value); + +} // namespace url + +#endif // URL_THIRD_PARTY_MOZILLA_URL_PARSE_H_ diff --git a/url_canon.cc b/url_canon.cc new file mode 100644 index 00000000000..bbacaa7cdc6 --- /dev/null +++ b/url_canon.cc @@ -0,0 +1,15 @@ +// Copyright 2017 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "url/url_canon.h" + +#include "base/component_export.h" + +namespace url { + +template class EXPORT_TEMPLATE_DEFINE(COMPONENT_EXPORT(URL)) CanonOutputT; +template class EXPORT_TEMPLATE_DEFINE(COMPONENT_EXPORT(URL)) + CanonOutputT; + +} // namespace url diff --git a/url_canon.h b/url_canon.h new file mode 100644 index 00000000000..94b44426fa3 --- /dev/null +++ b/url_canon.h @@ -0,0 +1,1037 @@ +// Copyright 2013 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#ifndef URL_URL_CANON_H_ +#define URL_URL_CANON_H_ + +#include +#include + +#include "base/component_export.h" +#include "base/export_template.h" +#include "base/memory/raw_ptr_exclusion.h" +#include "base/numerics/clamped_math.h" +#include "url/third_party/mozilla/url_parse.h" + +namespace url { + +// Canonicalizer output ------------------------------------------------------- + +// Base class for the canonicalizer output, this maintains a buffer and +// supports simple resizing and append operations on it. +// +// It is VERY IMPORTANT that no virtual function calls be made on the common +// code path. We only have two virtual function calls, the destructor and a +// resize function that is called when the existing buffer is not big enough. +// The derived class is then in charge of setting up our buffer which we will +// manage. +template +class CanonOutputT { + public: + CanonOutputT() = default; + virtual ~CanonOutputT() = default; + + // Implemented to resize the buffer. This function should update the buffer + // pointer to point to the new buffer, and any old data up to |cur_len_| in + // the buffer must be copied over. + // + // The new size |sz| must be larger than buffer_len_. + virtual void Resize(size_t sz) = 0; + + // Accessor for returning a character at a given position. The input offset + // must be in the valid range. + inline T at(size_t offset) const { return buffer_[offset]; } + + // Sets the character at the given position. The given position MUST be less + // than the length(). + inline void set(size_t offset, T ch) { buffer_[offset] = ch; } + + // Returns the number of characters currently in the buffer. + inline size_t length() const { return cur_len_; } + + // Returns the current capacity of the buffer. The length() is the number of + // characters that have been declared to be written, but the capacity() is + // the number that can be written without reallocation. 
If the caller must + // write many characters at once, it can make sure there is enough capacity, + // write the data, then use set_length() to declare the new length(). + size_t capacity() const { return buffer_len_; } + + // Called by the user of this class to get the output. The output will NOT + // be NULL-terminated. Call length() to get the + // length. + const T* data() const { return buffer_; } + T* data() { return buffer_; } + + // Shortens the URL to the new length. Used for "backing up" when processing + // relative paths. This can also be used if an external function writes a lot + // of data to the buffer (when using the "Raw" version below) beyond the end, + // to declare the new length. + // + // This MUST NOT be used to expand the size of the buffer beyond capacity(). + void set_length(size_t new_len) { cur_len_ = new_len; } + + // This is the most performance critical function, since it is called for + // every character. + void push_back(T ch) { + // In VC2005, putting this common case first speeds up execution + // dramatically because this branch is predicted as taken. + if (cur_len_ < buffer_len_) { + buffer_[cur_len_] = ch; + cur_len_++; + return; + } + + // Grow the buffer to hold at least one more item. Hopefully we won't have + // to do this very often. If Grow() fails, |ch| is silently dropped. + if (!Grow(1)) + return; + + // Actually do the insertion. + buffer_[cur_len_] = ch; + cur_len_++; + } + + // Appends the given string to the output. Nothing is appended if the + // buffer needs to grow and Grow() fails. + void Append(const T* str, size_t str_len) { + if (str_len > buffer_len_ - cur_len_) { + if (!Grow(str_len - (buffer_len_ - cur_len_))) + return; + } + memcpy(buffer_ + cur_len_, str, str_len * sizeof(T)); + cur_len_ += str_len; + } + + // Resizes the buffer when |estimated_size| exceeds the current capacity. + void ReserveSizeIfNeeded(size_t estimated_size) { + // Reserve a bit extra to account for escaped chars. 
+ if (estimated_size > buffer_len_) + Resize((base::ClampedNumeric(estimated_size) + 8).RawValue()); + } + + protected: + // Grows the given buffer so that it can fit at least |min_additional| + // characters. Returns true once the buffer has been resized, or false, + // without resizing, when the length to double has already reached 2^30. + bool Grow(size_t min_additional) { + static const size_t kMinBufferLen = 16; + size_t new_len = (buffer_len_ == 0) ? kMinBufferLen : buffer_len_; + do { + if (new_len >= (1 << 30)) // Prevent overflow below. + return false; + new_len *= 2; + } while (new_len < buffer_len_ + min_additional); + Resize(new_len); + return true; + } + + // `buffer_` is not a raw_ptr<...> for performance reasons (based on analysis + // of sampling profiler data). + RAW_PTR_EXCLUSION T* buffer_ = nullptr; + size_t buffer_len_ = 0; + + // Used characters in the buffer. + size_t cur_len_ = 0; +}; + +// Simple implementation of the CanonOutput using new[]. This class +// also supports a static buffer so if it is allocated on the stack, most +// URLs can be canonicalized with no heap allocations. +template +class RawCanonOutputT : public CanonOutputT { + public: + RawCanonOutputT() : CanonOutputT() { + this->buffer_ = fixed_buffer_; + this->buffer_len_ = fixed_capacity; + } + ~RawCanonOutputT() override { + if (this->buffer_ != fixed_buffer_) + delete[] this->buffer_; + } + + // Allocates |sz| characters with new[], copies over the first + // min(cur_len_, sz) existing characters, and frees the previous buffer + // unless it was the inline |fixed_buffer_|. + void Resize(size_t sz) override { + T* new_buf = new T[sz]; + memcpy(new_buf, this->buffer_, + sizeof(T) * (this->cur_len_ < sz ? this->cur_len_ : sz)); + if (this->buffer_ != fixed_buffer_) + delete[] this->buffer_; + this->buffer_ = new_buf; + this->buffer_len_ = sz; + } + + protected: + T fixed_buffer_[fixed_capacity]; +}; + +// Explicit instantiation declarations for the commonly used instantiations. +extern template class EXPORT_TEMPLATE_DECLARE(COMPONENT_EXPORT(URL)) + CanonOutputT; +extern template class EXPORT_TEMPLATE_DECLARE(COMPONENT_EXPORT(URL)) + CanonOutputT; + +// Normally, all canonicalization output is in narrow characters. 
We support +// the templates so it can also be used internally if a wide buffer is +// required. +typedef CanonOutputT CanonOutput; +typedef CanonOutputT CanonOutputW; + +template +class RawCanonOutput : public RawCanonOutputT {}; +template +class RawCanonOutputW : public RawCanonOutputT {}; + +// Character set converter ---------------------------------------------------- +// +// Converts query strings into a custom encoding. The embedder can supply an +// implementation of this class to interface with their own character set +// conversion libraries. +// +// Embedders will want to see the unit test for the ICU version. + +class COMPONENT_EXPORT(URL) CharsetConverter { + public: + CharsetConverter() {} + virtual ~CharsetConverter() {} + + // Converts the given input string from UTF-16 to whatever output format the + // converter supports. This is used only for the query encoding conversion, + // which does not fail. Instead, the converter should insert "invalid + // character" characters in the output for invalid sequences, and do the + // best it can. + // + // If the input contains a character not representable in the output + // character set, the converter should append the HTML entity sequence in + // decimal, (such as "&#20320;") with escaping of the ampersand, number + // sign, and semicolon (in the previous example it would be + // "%26%2320320%3B"). This rule is based on what IE does in this situation. + virtual void ConvertFromUTF16(const char16_t* input, + int input_len, + CanonOutput* output) = 0; +}; + +// Schemes -------------------------------------------------------------------- + +// Types of a scheme representing the requirements on the data represented by +// the authority component of a URL with the scheme. +enum SchemeType { + // The authority component of a URL with the scheme has the form + // "username:password@host:port". The username and password entries are + // optional; the host may not be empty.
The default value of the port can be + // omitted in serialization. This type occurs with network schemes like http, + // https, and ftp. + SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION, + // The authority component of a URL with the scheme has the form "host:port", + // and does not include username or password. The default value of the port + // can be omitted in serialization. Used by inner URLs of filesystem URLs of + // origins with network hosts, from which the username and password are + // stripped. + SCHEME_WITH_HOST_AND_PORT, + // The authority component of an URL with the scheme has the form "host", and + // does not include port, username, or password. Used when the hosts are not + // network addresses; for example, schemes used internally by the browser. + SCHEME_WITH_HOST, + // A URL with the scheme doesn't have the authority component. + SCHEME_WITHOUT_AUTHORITY, +}; + +// Whitespace ----------------------------------------------------------------- + +// Searches for whitespace that should be removed from the middle of URLs, and +// removes it. Removed whitespace are tabs and newlines, but NOT spaces. Spaces +// are preserved, which is what most browsers do. A pointer to the output will +// be returned, and the length of that output will be in |output_len|. +// +// This should be called before parsing if whitespace removal is desired (which +// it normally is when you are canonicalizing). +// +// If no whitespace is removed, this function will not use the buffer and will +// return a pointer to the input, to avoid the extra copy. If modification is +// required, the given |buffer| will be used and the returned pointer will +// point to the beginning of the buffer. +// +// Therefore, callers should not use the buffer, since it may actually be empty, +// use the computed pointer and |*output_len| instead. +// +// If |input| contained both removable whitespace and a raw `<` character, +// |potentially_dangling_markup| will be set to `true`. 
Otherwise, it will be +// left untouched. +COMPONENT_EXPORT(URL) +const char* RemoveURLWhitespace(const char* input, + int input_len, + CanonOutputT* buffer, + int* output_len, + bool* potentially_dangling_markup); +COMPONENT_EXPORT(URL) +const char16_t* RemoveURLWhitespace(const char16_t* input, + int input_len, + CanonOutputT* buffer, + int* output_len, + bool* potentially_dangling_markup); + +// IDN ------------------------------------------------------------------------ + +// Converts the Unicode input representing a hostname to ASCII using IDN rules. +// The output must fall in the ASCII range, but will be encoded in UTF-16. +// +// On success, the output will be filled with the ASCII host name and it will +// return true. Unlike most other canonicalization functions, this assumes that +// the output is empty. The beginning of the host will be at offset 0, and +// the length of the output will be set to the length of the new host name. +// +// On error, returns false. The output in this case is undefined. +COMPONENT_EXPORT(URL) +bool IDNToASCII(const char16_t* src, int src_len, CanonOutputW* output); + +// Piece-by-piece canonicalizers ---------------------------------------------- +// +// These individual canonicalizers append the canonicalized versions of the +// corresponding URL component to the given CanonOutput. The spec and the +// previously-identified range of that component are the input. The range of +// the canonicalized component will be written to the output component. +// +// These functions all append to the output so they can be chained. Make sure +// the output is empty when you start. +// +// These functions return boolean values indicating success. On failure, they +// will attempt to write something reasonable to the output so that, if +// displayed to the user, they will recognise it as something that's messed up. +// Nothing more should ever be done with these invalid URLs, however. + +// Scheme: Appends the scheme and colon to the URL.
The output component will +// indicate the range of characters up to but not including the colon. +// +// Canonical URLs always have a scheme. If the scheme is not present in the +// input, this will just write the colon to indicate an empty scheme. Does not +// append slashes which will be needed before any authority components for most +// URLs. +// +// The 8-bit version requires UTF-8 encoding. +COMPONENT_EXPORT(URL) +bool CanonicalizeScheme(const char* spec, + const Component& scheme, + CanonOutput* output, + Component* out_scheme); +COMPONENT_EXPORT(URL) +bool CanonicalizeScheme(const char16_t* spec, + const Component& scheme, + CanonOutput* output, + Component* out_scheme); + +// User info: username/password. If present, this will add the delimiters so +// the output will be ":@" or "@". Empty +// username/password pairs, or empty passwords, will get converted to +// nonexistent in the canonical version. +// +// The components for the username and password refer to ranges in the +// respective source strings. Usually, these will be the same string, which +// is legal as long as the two components don't overlap. +// +// The 8-bit version requires UTF-8 encoding. +COMPONENT_EXPORT(URL) +bool CanonicalizeUserInfo(const char* username_source, + const Component& username, + const char* password_source, + const Component& password, + CanonOutput* output, + Component* out_username, + Component* out_password); +COMPONENT_EXPORT(URL) +bool CanonicalizeUserInfo(const char16_t* username_source, + const Component& username, + const char16_t* password_source, + const Component& password, + CanonOutput* output, + Component* out_username, + Component* out_password); + +// This structure holds detailed state exported from the IP/Host canonicalizers. +// Additional fields may be added as callers require them. +struct CanonHostInfo { + CanonHostInfo() : family(NEUTRAL), num_ipv4_components(0), out_host() {} + + // Convenience function to test if family is an IP address. 
+ bool IsIPAddress() const { return family == IPV4 || family == IPV6; } + + // This field summarizes how the input was classified by the canonicalizer. + enum Family { + NEUTRAL, // - Doesn't resemble an IP address. As far as the IP + // canonicalizer is concerned, it should be treated as a + // hostname. + BROKEN, // - Almost an IP, but was not canonicalized. This could be an + // IPv4 address where truncation occurred, or something + // containing the special characters :[] which did not parse + // as an IPv6 address. Never attempt to connect to this + // address, because it might actually succeed! + IPV4, // - Successfully canonicalized as an IPv4 address. + IPV6, // - Successfully canonicalized as an IPv6 address. + }; + Family family; + + // If |family| is IPV4, then this is the number of nonempty dot-separated + // components in the input text, from 1 to 4. If |family| is not IPV4, + // this value is undefined. + int num_ipv4_components; + + // Location of host within the canonicalized output. + // CanonicalizeIPAddress() only sets this field if |family| is IPV4 or IPV6. + // CanonicalizeHostVerbose() always sets it. + Component out_host; + + // |address| contains the parsed IP Address (if any) in its first + // AddressLength() bytes, in network order. If IsIPAddress() is false + // AddressLength() will return zero and the content of |address| is undefined. + unsigned char address[16]; + + // Convenience function to calculate the length of an IP address corresponding + // to the current IP version in |family|, if any. For use with |address|. + int AddressLength() const { + return family == IPV4 ? 4 : (family == IPV6 ? 16 : 0); + } +}; + +// Host. +// +// The 8-bit version requires UTF-8 encoding. Use this version when you only +// need to know whether canonicalization succeeded. 
+COMPONENT_EXPORT(URL) +bool CanonicalizeHost(const char* spec, + const Component& host, + CanonOutput* output, + Component* out_host); +COMPONENT_EXPORT(URL) +bool CanonicalizeHost(const char16_t* spec, + const Component& host, + CanonOutput* output, + Component* out_host); + +// Extended version of CanonicalizeHost, which returns additional information. +// Use this when you need to know whether the hostname was an IP address. +// A successful return is indicated by host_info->family != BROKEN. See the +// definition of CanonHostInfo above for details. +COMPONENT_EXPORT(URL) +void CanonicalizeHostVerbose(const char* spec, + const Component& host, + CanonOutput* output, + CanonHostInfo* host_info); +COMPONENT_EXPORT(URL) +void CanonicalizeHostVerbose(const char16_t* spec, + const Component& host, + CanonOutput* output, + CanonHostInfo* host_info); + +// Canonicalizes a string according to the host canonicalization rules. Unlike +// CanonicalizeHost, this will not check for IP addresses which can change the +// meaning (and canonicalization) of the components. This means it is possible +// to call this for sub-components of a host name without corruption. +// +// As an example, "01.02.03.04.com" is a canonical hostname. If you called +// CanonicalizeHost on the substring "01.02.03.04" it will get "fixed" to +// "1.2.3.4" which will produce an invalid host name when reassembled. This +// can happen more than one might think because all numbers by themselves are +// considered IP addresses; so "5" canonicalizes to "0.0.0.5". +// +// Be careful: Because Punycode works on each dot-separated substring as a +// unit, you should only pass this function substrings that represent complete +// dot-separated subcomponents of the original host. Even if you have ASCII +// input, percent-escaped characters will have different meanings if split in +// the middle. +// +// Returns true if the host was valid. 
This function will treat a 0-length +// host as valid (because it's designed to be used for substrings) while the +// full version above will mark empty hosts as broken. +COMPONENT_EXPORT(URL) +bool CanonicalizeHostSubstring(const char* spec, + const Component& host, + CanonOutput* output); +COMPONENT_EXPORT(URL) +bool CanonicalizeHostSubstring(const char16_t* spec, + const Component& host, + CanonOutput* output); + +// IP addresses. +// +// Tries to interpret the given host name as an IPv4 or IPv6 address. If it is +// an IP address, it will canonicalize it as such, appending it to |output|. +// Additional status information is returned via the |*host_info| parameter. +// See the definition of CanonHostInfo above for details. +// +// This is called AUTOMATICALLY from the host canonicalizer, which ensures that +// the input is unescaped and name-prepped, etc. It should not normally be +// necessary or wise to call this directly. +COMPONENT_EXPORT(URL) +void CanonicalizeIPAddress(const char* spec, + const Component& host, + CanonOutput* output, + CanonHostInfo* host_info); +COMPONENT_EXPORT(URL) +void CanonicalizeIPAddress(const char16_t* spec, + const Component& host, + CanonOutput* output, + CanonHostInfo* host_info); + +// Port: this function will add the colon for the port if a port is present. +// The caller can pass PORT_UNSPECIFIED as the +// default_port_for_scheme argument if there is no default port. +// +// The 8-bit version requires UTF-8 encoding. +COMPONENT_EXPORT(URL) +bool CanonicalizePort(const char* spec, + const Component& port, + int default_port_for_scheme, + CanonOutput* output, + Component* out_port); +COMPONENT_EXPORT(URL) +bool CanonicalizePort(const char16_t* spec, + const Component& port, + int default_port_for_scheme, + CanonOutput* output, + Component* out_port); + +// Returns the default port for the given canonical scheme, or PORT_UNSPECIFIED +// if the scheme is unknown. 
Based on https://url.spec.whatwg.org/#default-port +COMPONENT_EXPORT(URL) +int DefaultPortForScheme(const char* scheme, int scheme_len); + +// Path. If the input does not begin with a slash (including if the input is +// empty), we'll prepend a slash to the path to make it canonical. +// +// The 8-bit version assumes UTF-8 encoding, but does not verify the validity +// of the UTF-8 (i.e., you can have invalid UTF-8 sequences, invalid +// characters, etc.). Normally, URLs will come in as UTF-16, so this isn't +// an issue. Somebody giving us an 8-bit path is responsible for generating +// the path that the server expects (we'll escape high-bit characters), so +// if something is invalid, it's their problem. +COMPONENT_EXPORT(URL) +bool CanonicalizePath(const char* spec, + const Component& path, + CanonOutput* output, + Component* out_path); +COMPONENT_EXPORT(URL) +bool CanonicalizePath(const char16_t* spec, + const Component& path, + CanonOutput* output, + Component* out_path); + +// Like CanonicalizePath(), but does not assume that it's operating on the +// entire path. It therefore does not prepend a slash, etc. +COMPONENT_EXPORT(URL) +bool CanonicalizePartialPath(const char* spec, + const Component& path, + CanonOutput* output, + Component* out_path); +COMPONENT_EXPORT(URL) +bool CanonicalizePartialPath(const char16_t* spec, + const Component& path, + CanonOutput* output, + Component* out_path); + +// Canonicalizes the input as a file path. This is like CanonicalizePath except +// that it also handles Windows drive specs. For example, the path can begin +// with "c|\" and it will get properly canonicalized to "C:/". +// The string will be appended to |*output| and |*out_path| will be updated. +// +// The 8-bit version requires UTF-8 encoding.
+COMPONENT_EXPORT(URL) +bool FileCanonicalizePath(const char* spec, + const Component& path, + CanonOutput* output, + Component* out_path); +COMPONENT_EXPORT(URL) +bool FileCanonicalizePath(const char16_t* spec, + const Component& path, + CanonOutput* output, + Component* out_path); + +// Query: Prepends the ? if needed. +// +// The 8-bit version requires the input to be UTF-8 encoding. Incorrectly +// encoded characters (in UTF-8 or UTF-16) will be replaced with the Unicode +// "invalid character." This function can not fail, we always just try to do +// our best for crazy input here since web pages can set it themselves. +// +// This will convert the given input into the output encoding that the given +// character set converter object provides. The converter will only be called +// if necessary, for ASCII input, no conversions are necessary. +// +// The converter can be NULL. In this case, the output encoding will be UTF-8. +COMPONENT_EXPORT(URL) +void CanonicalizeQuery(const char* spec, + const Component& query, + CharsetConverter* converter, + CanonOutput* output, + Component* out_query); +COMPONENT_EXPORT(URL) +void CanonicalizeQuery(const char16_t* spec, + const Component& query, + CharsetConverter* converter, + CanonOutput* output, + Component* out_query); + +// Ref: Prepends the # if needed. The output will be UTF-8 (this is the only +// canonicalizer that does not produce ASCII output). The output is +// guaranteed to be valid UTF-8. +// +// This function will not fail. If the input is invalid UTF-8/UTF-16, we'll use +// the "Unicode replacement character" for the confusing bits and copy the rest. 
+COMPONENT_EXPORT(URL) +void CanonicalizeRef(const char* spec, + const Component& path, + CanonOutput* output, + Component* out_path); +COMPONENT_EXPORT(URL) +void CanonicalizeRef(const char16_t* spec, + const Component& path, + CanonOutput* output, + Component* out_path); + +// Full canonicalizer --------------------------------------------------------- +// +// These functions replace any string contents, rather than append as above. +// See the above piece-by-piece functions for information specific to +// canonicalizing individual components. +// +// The output will be ASCII except the reference fragment, which may be UTF-8. +// +// The 8-bit versions require UTF-8 encoding. + +// Use for standard URLs with authorities and paths. +COMPONENT_EXPORT(URL) +bool CanonicalizeStandardURL(const char* spec, + int spec_len, + const Parsed& parsed, + SchemeType scheme_type, + CharsetConverter* query_converter, + CanonOutput* output, + Parsed* new_parsed); +COMPONENT_EXPORT(URL) +bool CanonicalizeStandardURL(const char16_t* spec, + int spec_len, + const Parsed& parsed, + SchemeType scheme_type, + CharsetConverter* query_converter, + CanonOutput* output, + Parsed* new_parsed); + +// Use for file URLs. +COMPONENT_EXPORT(URL) +bool CanonicalizeFileURL(const char* spec, + int spec_len, + const Parsed& parsed, + CharsetConverter* query_converter, + CanonOutput* output, + Parsed* new_parsed); +COMPONENT_EXPORT(URL) +bool CanonicalizeFileURL(const char16_t* spec, + int spec_len, + const Parsed& parsed, + CharsetConverter* query_converter, + CanonOutput* output, + Parsed* new_parsed); + +// Use for filesystem URLs. 
+COMPONENT_EXPORT(URL) +bool CanonicalizeFileSystemURL(const char* spec, + int spec_len, + const Parsed& parsed, + CharsetConverter* query_converter, + CanonOutput* output, + Parsed* new_parsed); +COMPONENT_EXPORT(URL) +bool CanonicalizeFileSystemURL(const char16_t* spec, + int spec_len, + const Parsed& parsed, + CharsetConverter* query_converter, + CanonOutput* output, + Parsed* new_parsed); + +// Use for path URLs such as javascript. This does not modify the path in any +// way, for example, by escaping it. +COMPONENT_EXPORT(URL) +bool CanonicalizePathURL(const char* spec, + int spec_len, + const Parsed& parsed, + CanonOutput* output, + Parsed* new_parsed); +COMPONENT_EXPORT(URL) +bool CanonicalizePathURL(const char16_t* spec, + int spec_len, + const Parsed& parsed, + CanonOutput* output, + Parsed* new_parsed); + +// Use to canonicalize just the path component of a "path" URL; e.g. the +// path of a javascript URL. +COMPONENT_EXPORT(URL) +void CanonicalizePathURLPath(const char* source, + const Component& component, + CanonOutput* output, + Component* new_component); +COMPONENT_EXPORT(URL) +void CanonicalizePathURLPath(const char16_t* source, + const Component& component, + CanonOutput* output, + Component* new_component); + +// Use for mailto URLs. This "canonicalizes" the URL into a path and query +// component. It does not attempt to merge "to" fields. It uses UTF-8 for +// the query encoding if there is a query. This is because a mailto URL is +// really intended for an external mail program, and the encoding of a page, +// etc. which would influence a query encoding normally are irrelevant. 
+COMPONENT_EXPORT(URL) +bool CanonicalizeMailtoURL(const char* spec, + int spec_len, + const Parsed& parsed, + CanonOutput* output, + Parsed* new_parsed); +COMPONENT_EXPORT(URL) +bool CanonicalizeMailtoURL(const char16_t* spec, + int spec_len, + const Parsed& parsed, + CanonOutput* output, + Parsed* new_parsed); + +// Part replacer -------------------------------------------------------------- + +// Internal structure used for storing separate strings for each component. +// The basic canonicalization functions use this structure internally so that +// component replacement (different strings for different components) can be +// treated on the same code path as regular canonicalization (the same string +// for each component). +// +// A Parsed structure usually goes along with this. Those components identify +// offsets within these strings, so that they can all be in the same string, +// or spread arbitrarily across different ones. +// +// This structure does not own any data. It is the caller's responsibility to +// ensure that the data the pointers point to stays in scope and is not +// modified. +template +struct URLComponentSource { + // Constructor normally used by callers wishing to replace components. This + // will make them all NULL, which is no replacement. The caller would then + // override the components they want to replace. + URLComponentSource() + : scheme(nullptr), + username(nullptr), + password(nullptr), + host(nullptr), + port(nullptr), + path(nullptr), + query(nullptr), + ref(nullptr) {} + + // Constructor normally used internally to initialize all the components to + // point to the same spec.
+ explicit URLComponentSource(const CHAR* default_value) + : scheme(default_value), + username(default_value), + password(default_value), + host(default_value), + port(default_value), + path(default_value), + query(default_value), + ref(default_value) {} + + // This field is not a raw_ptr<> because it was filtered by the rewriter for: + // #addr-of + RAW_PTR_EXCLUSION const CHAR* scheme; + // This field is not a raw_ptr<> because it was filtered by the rewriter for: + // #addr-of + RAW_PTR_EXCLUSION const CHAR* username; + // This field is not a raw_ptr<> because it was filtered by the rewriter for: + // #addr-of + RAW_PTR_EXCLUSION const CHAR* password; + // This field is not a raw_ptr<> because it was filtered by the rewriter for: + // #addr-of + RAW_PTR_EXCLUSION const CHAR* host; + // This field is not a raw_ptr<> because it was filtered by the rewriter for: + // #addr-of + RAW_PTR_EXCLUSION const CHAR* port; + // This field is not a raw_ptr<> because it was filtered by the rewriter for: + // #addr-of + RAW_PTR_EXCLUSION const CHAR* path; + // This field is not a raw_ptr<> because it was filtered by the rewriter for: + // #addr-of + RAW_PTR_EXCLUSION const CHAR* query; + // This field is not a raw_ptr<> because it was filtered by the rewriter for: + // #addr-of + RAW_PTR_EXCLUSION const CHAR* ref; +}; + +// This structure encapsulates information on modifying a URL. Each component +// may either be left unchanged, replaced, or deleted. +// +// By default, each component is unchanged. For those components that should be +// modified, call either Set* or Clear* to modify it. +// +// The string passed to Set* functions DOES NOT GET COPIED AND MUST BE KEPT +// IN SCOPE BY THE CALLER for as long as this object exists! +// +// Prefer the 8-bit replacement version if possible since it is more efficient. 
+template +class Replacements { + public: + Replacements() {} + + // Scheme + void SetScheme(const CHAR* s, const Component& comp) { + sources_.scheme = s; + components_.scheme = comp; + } + // Note: we don't have a ClearScheme since this doesn't make any sense. + bool IsSchemeOverridden() const { return sources_.scheme != NULL; } + + // Username + void SetUsername(const CHAR* s, const Component& comp) { + sources_.username = s; + components_.username = comp; + } + void ClearUsername() { + sources_.username = Placeholder(); + components_.username = Component(); + } + bool IsUsernameOverridden() const { return sources_.username != NULL; } + + // Password + void SetPassword(const CHAR* s, const Component& comp) { + sources_.password = s; + components_.password = comp; + } + void ClearPassword() { + sources_.password = Placeholder(); + components_.password = Component(); + } + bool IsPasswordOverridden() const { return sources_.password != NULL; } + + // Host + void SetHost(const CHAR* s, const Component& comp) { + sources_.host = s; + components_.host = comp; + } + void ClearHost() { + sources_.host = Placeholder(); + components_.host = Component(); + } + bool IsHostOverridden() const { return sources_.host != NULL; } + + // Port + void SetPort(const CHAR* s, const Component& comp) { + sources_.port = s; + components_.port = comp; + } + void ClearPort() { + sources_.port = Placeholder(); + components_.port = Component(); + } + bool IsPortOverridden() const { return sources_.port != NULL; } + + // Path + void SetPath(const CHAR* s, const Component& comp) { + sources_.path = s; + components_.path = comp; + } + void ClearPath() { + sources_.path = Placeholder(); + components_.path = Component(); + } + bool IsPathOverridden() const { return sources_.path != NULL; } + + // Query + void SetQuery(const CHAR* s, const Component& comp) { + sources_.query = s; + components_.query = comp; + } + void ClearQuery() { + sources_.query = Placeholder(); + components_.query = 
Component(); + } + bool IsQueryOverridden() const { return sources_.query != NULL; } + + // Ref + void SetRef(const CHAR* s, const Component& comp) { + sources_.ref = s; + components_.ref = comp; + } + void ClearRef() { + sources_.ref = Placeholder(); + components_.ref = Component(); + } + bool IsRefOverridden() const { return sources_.ref != NULL; } + + // Getters for the internal data. See the variables below for how the + // information is encoded. + const URLComponentSource& sources() const { return sources_; } + const Parsed& components() const { return components_; } + + private: + // Returns a pointer to a static empty string that is used as a placeholder + // to indicate a component should be deleted (see below). + const CHAR* Placeholder() { + static const CHAR empty_cstr = 0; + return &empty_cstr; + } + + // We support three states: + // + // Action | Source Component + // -----------------------+-------------------------------------------------- + // Don't change component | NULL (unused) + // Replace component | (replacement string) (replacement component) + // Delete component | (non-NULL) (invalid component: (0,-1)) + // + // We use a pointer to the empty string for the source when the component + // should be deleted. + URLComponentSource sources_; + Parsed components_; +}; + +// The base must be an 8-bit canonical URL. +COMPONENT_EXPORT(URL) +bool ReplaceStandardURL(const char* base, + const Parsed& base_parsed, + const Replacements& replacements, + SchemeType scheme_type, + CharsetConverter* query_converter, + CanonOutput* output, + Parsed* new_parsed); +COMPONENT_EXPORT(URL) +bool ReplaceStandardURL(const char* base, + const Parsed& base_parsed, + const Replacements& replacements, + SchemeType scheme_type, + CharsetConverter* query_converter, + CanonOutput* output, + Parsed* new_parsed); + +// Filesystem URLs can only have the path, query, or ref replaced. +// All other components will be ignored. 
+COMPONENT_EXPORT(URL) +bool ReplaceFileSystemURL(const char* base, + const Parsed& base_parsed, + const Replacements& replacements, + CharsetConverter* query_converter, + CanonOutput* output, + Parsed* new_parsed); +COMPONENT_EXPORT(URL) +bool ReplaceFileSystemURL(const char* base, + const Parsed& base_parsed, + const Replacements& replacements, + CharsetConverter* query_converter, + CanonOutput* output, + Parsed* new_parsed); + +// Replacing some parts of a file URL is not permitted. Everything except +// the host, path, query, and ref will be ignored. +COMPONENT_EXPORT(URL) +bool ReplaceFileURL(const char* base, + const Parsed& base_parsed, + const Replacements& replacements, + CharsetConverter* query_converter, + CanonOutput* output, + Parsed* new_parsed); +COMPONENT_EXPORT(URL) +bool ReplaceFileURL(const char* base, + const Parsed& base_parsed, + const Replacements& replacements, + CharsetConverter* query_converter, + CanonOutput* output, + Parsed* new_parsed); + +// Path URLs can only have the scheme and path replaced. All other components +// will be ignored. +COMPONENT_EXPORT(URL) +bool ReplacePathURL(const char* base, + const Parsed& base_parsed, + const Replacements& replacements, + CanonOutput* output, + Parsed* new_parsed); +COMPONENT_EXPORT(URL) +bool ReplacePathURL(const char* base, + const Parsed& base_parsed, + const Replacements& replacements, + CanonOutput* output, + Parsed* new_parsed); + +// Mailto URLs can only have the scheme, path, and query replaced. +// All other components will be ignored. 
+COMPONENT_EXPORT(URL) +bool ReplaceMailtoURL(const char* base, + const Parsed& base_parsed, + const Replacements& replacements, + CanonOutput* output, + Parsed* new_parsed); +COMPONENT_EXPORT(URL) +bool ReplaceMailtoURL(const char* base, + const Parsed& base_parsed, + const Replacements& replacements, + CanonOutput* output, + Parsed* new_parsed); + +// Relative URL --------------------------------------------------------------- + +// Given an input URL or URL fragment |fragment|, determines if it is a +// relative or absolute URL and places the result into |*is_relative|. If it is +// relative, the relevant portion of the URL will be placed into +// |*relative_component| (there may have been trimmed whitespace, for example). +// This value is passed to ResolveRelativeURL. If the input is not relative, +// this value is UNDEFINED (it may be changed by the function). +// +// Returns true on success (we successfully determined the URL is relative or +// not). Failure means that the combination of URLs doesn't make any sense. +// +// The base URL should always be canonical, therefore is ASCII. +COMPONENT_EXPORT(URL) +bool IsRelativeURL(const char* base, + const Parsed& base_parsed, + const char* fragment, + int fragment_len, + bool is_base_hierarchical, + bool* is_relative, + Component* relative_component); +COMPONENT_EXPORT(URL) +bool IsRelativeURL(const char* base, + const Parsed& base_parsed, + const char16_t* fragment, + int fragment_len, + bool is_base_hierarchical, + bool* is_relative, + Component* relative_component); + +// Given a canonical parsed source URL, a URL fragment known to be relative, +// and the identified relevant portion of the relative URL (computed by +// IsRelativeURL), this produces a new parsed canonical URL in |output| and +// |out_parsed|. +// +// It also requires a flag indicating whether the base URL is a file: URL +// which triggers additional logic. 
+// +// The base URL should be canonical and have a host (may be empty for file +// URLs) and a path. If it doesn't have these, we can't resolve relative +// URLs off of it and will return the base as the output with an error flag. +// Because it is canonical it should also be ASCII. +// +// The query charset converter follows the same rules as CanonicalizeQuery. +// +// Returns true on success. On failure, the output will be "something +// reasonable" that will be consistent and valid, just probably not what +// was intended by the web page author or caller. +COMPONENT_EXPORT(URL) +bool ResolveRelativeURL(const char* base_url, + const Parsed& base_parsed, + bool base_is_file, + const char* relative_url, + const Component& relative_component, + CharsetConverter* query_converter, + CanonOutput* output, + Parsed* out_parsed); +COMPONENT_EXPORT(URL) +bool ResolveRelativeURL(const char* base_url, + const Parsed& base_parsed, + bool base_is_file, + const char16_t* relative_url, + const Component& relative_component, + CharsetConverter* query_converter, + CanonOutput* output, + Parsed* out_parsed); + +} // namespace url + +#endif // URL_URL_CANON_H_ diff --git a/url_canon_etc.cc b/url_canon_etc.cc new file mode 100644 index 00000000000..3d1cb938edb --- /dev/null +++ b/url_canon_etc.cc @@ -0,0 +1,428 @@ +// Copyright 2013 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Canonicalizers for random bits that aren't big enough for their own files. + +#include + +#include "url/url_canon.h" +#include "url/url_canon_internal.h" + +namespace url { + +namespace { + +// Returns true if the given character should be removed from the middle of a +// URL. +inline bool IsRemovableURLWhitespace(int ch) { + return ch == '\r' || ch == '\n' || ch == '\t'; +} + +// Backend for RemoveURLWhitespace (see declaration in url_canon.h).
+// It sucks that we have to do this, since this takes about 13% of the total URL +// canonicalization time. +template +const CHAR* DoRemoveURLWhitespace(const CHAR* input, + int input_len, + CanonOutputT* buffer, + int* output_len, + bool* potentially_dangling_markup) { + // Fast verification that there's nothing that needs removal. This is the 99% + // case, so we want it to be fast and don't care about impacting the speed + // when we do find whitespace. + bool found_whitespace = false; + if (sizeof(*input) == 1 && input_len >= kMinimumLengthForSIMD) { + // For large strings, memchr is much faster than any scalar code we can + // write, even if we need to run it three times. (If this turns out to still + // be a bottleneck, we could write our own vector code, but given that + // memchr is so fast, it's unlikely to be relevant.) + found_whitespace = memchr(input, '\n', input_len) != nullptr || + memchr(input, '\r', input_len) != nullptr || + memchr(input, '\t', input_len) != nullptr; + } else { + for (int i = 0; i < input_len; i++) { + if (!IsRemovableURLWhitespace(input[i])) + continue; + found_whitespace = true; + break; + } + } + + if (!found_whitespace) { + // Didn't find any whitespace, we don't need to do anything. We can just + // return the input as the output. + *output_len = input_len; + return input; + } + + // Skip whitespace removal for `data:` URLs. + // + // TODO(mkwst): Ideally, this would use something like `base::StartsWith`, but + // that turns out to be difficult to do correctly given this function's + // character type templating. + if (input_len > 5 && input[0] == 'd' && input[1] == 'a' && input[2] == 't' && + input[3] == 'a' && input[4] == ':') { + *output_len = input_len; + return input; + } + + // Remove the whitespace into the new buffer and return it. 
+ for (int i = 0; i < input_len; i++) { + if (!IsRemovableURLWhitespace(input[i])) { + if (potentially_dangling_markup && input[i] == 0x3C) + *potentially_dangling_markup = true; + buffer->push_back(input[i]); + } + } + *output_len = buffer->length(); + return buffer->data(); +} + +// Contains the canonical version of each possible input letter in the scheme +// (basically, lower-cased). The corresponding entry will be 0 if the letter +// is not allowed in a scheme. +// clang-format off +const char kSchemeCanonical[0x80] = { +// 00-1f: all are invalid + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +// ' ' ! " # $ % & ' ( ) * + , - . / + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, '+', 0, '-', '.', 0, +// 0 1 2 3 4 5 6 7 8 9 : ; < = > ? + '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 0 , 0 , 0 , 0 , 0 , 0 , +// @ A B C D E F G H I J K L M N O + 0 , 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', +// P Q R S T U V W X Y Z [ \ ] ^ _ + 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 0, 0 , 0, 0 , 0, +// ` a b c d e f g h i j k l m n o + 0 , 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', +// p q r s t u v w x y z { | } ~ + 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 0 , 0 , 0 , 0 , 0 }; +// clang-format on + +// This could be a table lookup as well by setting the high bit for each +// valid character, but it's only called once per URL, and it makes the lookup +// table easier to read not having extra stuff in it. +inline bool IsSchemeFirstChar(unsigned char c) { + return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'); +} + +template +bool DoScheme(const CHAR* spec, + const Component& scheme, + CanonOutput* output, + Component* out_scheme) { + if (scheme.is_empty()) { + // Scheme is unspecified or empty, convert to empty by appending a colon. 
+ *out_scheme = Component(output->length(), 0); + output->push_back(':'); + return false; + } + + // The output scheme starts from the current position. + out_scheme->begin = output->length(); + + // Danger: it's important that this code does not strip any characters; + // it only emits the canonical version (be it valid or escaped) for each + // of the input characters. Stripping would put it out of sync with + // FindAndCompareScheme, which could cause some security checks on + // schemes to be incorrect. + bool success = true; + size_t begin = static_cast(scheme.begin); + size_t end = static_cast(scheme.end()); + for (size_t i = begin; i < end; i++) { + UCHAR ch = static_cast(spec[i]); + char replacement = 0; + if (ch < 0x80) { + if (i == begin) { + // Need to do a special check for the first letter of the scheme. + if (IsSchemeFirstChar(static_cast(ch))) + replacement = kSchemeCanonical[ch]; + } else { + replacement = kSchemeCanonical[ch]; + } + } + + if (replacement) { + output->push_back(replacement); + } else if (ch == '%') { + // Canonicalizing the scheme multiple times should lead to the same + // result. Since invalid characters will be escaped, we need to preserve + // the percent to avoid multiple escaping. The scheme will be invalid. + success = false; + output->push_back('%'); + } else { + // Invalid character, store it but mark this scheme as invalid. + success = false; + + // This will escape the output and also handle encoding issues. + // Ignore the return value since we already failed. + AppendUTF8EscapedChar(spec, &i, end, output); + } + } + + // The output scheme ends with the current position, before appending + // the colon. + out_scheme->len = output->length() - out_scheme->begin; + output->push_back(':'); + return success; +} + +// The username and password components reference ranges in the corresponding +// *_spec strings.
Typically, these specs will be the same (we're +// canonicalizing a single source string), but may be different when +// replacing components. +template +bool DoUserInfo(const CHAR* username_spec, + const Component& username, + const CHAR* password_spec, + const Component& password, + CanonOutput* output, + Component* out_username, + Component* out_password) { + if (username.is_empty() && password.is_empty()) { + // Common case: no user info. We strip empty username/passwords. + *out_username = Component(); + *out_password = Component(); + return true; + } + + // Write the username. + out_username->begin = output->length(); + if (username.is_nonempty()) { + // This will escape characters not valid for the username. + AppendStringOfType(&username_spec[username.begin], + static_cast(username.len), CHAR_USERINFO, + output); + } + out_username->len = output->length() - out_username->begin; + + // When there is a password, we need the separator. Note that we strip + // empty but specified passwords. + if (password.is_nonempty()) { + output->push_back(':'); + out_password->begin = output->length(); + AppendStringOfType(&password_spec[password.begin], + static_cast(password.len), CHAR_USERINFO, + output); + out_password->len = output->length() - out_password->begin; + } else { + *out_password = Component(); + } + + output->push_back('@'); + return true; +} + +// Helper functions for converting port integers to strings. +inline void WritePortInt(char* output, int output_len, int port) { + _itoa_s(port, output, output_len, 10); +} + +// This function will prepend the colon if there will be a port. +template +bool DoPort(const CHAR* spec, + const Component& port, + int default_port_for_scheme, + CanonOutput* output, + Component* out_port) { + int port_num = ParsePort(spec, port); + if (port_num == PORT_UNSPECIFIED || port_num == default_port_for_scheme) { + *out_port = Component(); + return true; // Leave port empty. 
+ } + + if (port_num == PORT_INVALID) { + // Invalid port: We'll copy the text from the input so the user can see + // what the error was, and mark the URL as invalid by returning false. + output->push_back(':'); + out_port->begin = output->length(); + AppendInvalidNarrowString(spec, static_cast(port.begin), + static_cast(port.end()), output); + out_port->len = output->length() - out_port->begin; + return false; + } + + // Convert port number back to a string. Max port value is 5 digits, and + // the Parsed::ExtractPort will have made sure the integer is in range. + const int buf_size = 6; + char buf[buf_size]; + WritePortInt(buf, buf_size, port_num); + + // Append the port number to the output, preceded by a colon. + output->push_back(':'); + out_port->begin = output->length(); + for (int i = 0; i < buf_size && buf[i]; i++) + output->push_back(buf[i]); + + out_port->len = output->length() - out_port->begin; + return true; +} + +// clang-format off +// Percent-escape all characters from the fragment percent-encode set +// https://url.spec.whatwg.org/#fragment-percent-encode-set +const bool kShouldEscapeCharInFragment[0x80] = { +// Control characters (0x00-0x1F) + true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, +// ' ' ! " # $ % & ' + true, false, true, false, false, false, false, false, +// ( ) * + , - . / + false, false, false, false, false, false, false, false, +// 0 1 2 3 4 5 6 7 + false, false, false, false, false, false, false, false, +// 8 9 : ; < = > ?
+ false, false, false, false, true, false, true, false, +// @ A B C D E F G + false, false, false, false, false, false, false, false, +// H I J K L M N O + false, false, false, false, false, false, false, false, +// P Q R S T U V W + false, false, false, false, false, false, false, false, +// X Y Z [ \ ] ^ _ + false, false, false, false, false, false, false, false, +// ` a b c d e f g + true, false, false, false, false, false, false, false, +// h i j k l m n o + false, false, false, false, false, false, false, false, +// p q r s t u v w + false, false, false, false, false, false, false, false, +// x y z { | } ~ DELETE + false, false, false, false, false, false, false, true +}; +// clang-format on + +template +void DoCanonicalizeRef(const CHAR* spec, + const Component& ref, + CanonOutput* output, + Component* out_ref) { + if (!ref.is_valid()) { + // Common case of no ref. + *out_ref = Component(); + return; + } + + // Append the ref separator. Note that we need to do this even when the ref + // is empty but present. + output->push_back('#'); + out_ref->begin = output->length(); + + // Now iterate through all the characters, converting to UTF-8 and validating. 
+ size_t end = static_cast(ref.end()); + for (size_t i = static_cast(ref.begin); i < end; i++) { + UCHAR current_char = static_cast(spec[i]); + if (current_char < 0x80) { + if (kShouldEscapeCharInFragment[current_char]) + AppendEscapedChar(static_cast(spec[i]), output); + else + output->push_back(static_cast(spec[i])); + } else { + AppendUTF8EscapedChar(spec, &i, end, output); + } + } + + out_ref->len = output->length() - out_ref->begin; +} + +} // namespace + +const char* RemoveURLWhitespace(const char* input, + int input_len, + CanonOutputT* buffer, + int* output_len, + bool* potentially_dangling_markup) { + return DoRemoveURLWhitespace(input, input_len, buffer, output_len, + potentially_dangling_markup); +} + +const char16_t* RemoveURLWhitespace(const char16_t* input, + int input_len, + CanonOutputT* buffer, + int* output_len, + bool* potentially_dangling_markup) { + return DoRemoveURLWhitespace(input, input_len, buffer, output_len, + potentially_dangling_markup); +} + +char CanonicalSchemeChar(char16_t ch) { + if (ch >= 0x80) + return 0; // Non-ASCII is not supported by schemes. 
+ return kSchemeCanonical[ch]; +} + +bool CanonicalizeScheme(const char* spec, + const Component& scheme, + CanonOutput* output, + Component* out_scheme) { + return DoScheme(spec, scheme, output, out_scheme); +} + +bool CanonicalizeScheme(const char16_t* spec, + const Component& scheme, + CanonOutput* output, + Component* out_scheme) { + return DoScheme(spec, scheme, output, out_scheme); +} + +bool CanonicalizeUserInfo(const char* username_source, + const Component& username, + const char* password_source, + const Component& password, + CanonOutput* output, + Component* out_username, + Component* out_password) { + return DoUserInfo(username_source, username, + password_source, password, output, + out_username, out_password); +} + +bool CanonicalizeUserInfo(const char16_t* username_source, + const Component& username, + const char16_t* password_source, + const Component& password, + CanonOutput* output, + Component* out_username, + Component* out_password) { + return DoUserInfo(username_source, username, + password_source, password, output, + out_username, out_password); +} + +bool CanonicalizePort(const char* spec, + const Component& port, + int default_port_for_scheme, + CanonOutput* output, + Component* out_port) { + return DoPort(spec, port, default_port_for_scheme, + output, out_port); +} + +bool CanonicalizePort(const char16_t* spec, + const Component& port, + int default_port_for_scheme, + CanonOutput* output, + Component* out_port) { + return DoPort(spec, port, default_port_for_scheme, output, + out_port); +} + +void CanonicalizeRef(const char* spec, + const Component& ref, + CanonOutput* output, + Component* out_ref) { + DoCanonicalizeRef(spec, ref, output, out_ref); +} + +void CanonicalizeRef(const char16_t* spec, + const Component& ref, + CanonOutput* output, + Component* out_ref) { + DoCanonicalizeRef(spec, ref, output, out_ref); +} + +} // namespace url diff --git a/url_canon_filesystemurl.cc b/url_canon_filesystemurl.cc new file mode 100644 index 
00000000000..0472484de7a --- /dev/null +++ b/url_canon_filesystemurl.cc @@ -0,0 +1,135 @@ +// Copyright 2013 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Functions for canonicalizing "filesystem:file:" URLs. + +#include "url/url_canon.h" +#include "url/url_canon_internal.h" +#include "url/url_file.h" +#include "url/url_parse_internal.h" +#include "url/url_util.h" +#include "url/url_util_internal.h" + +namespace url { + +namespace { + +// We use the URLComponentSource for the outer URL, as it can have replacements, +// whereas the inner_url can't, so it uses spec. +template +bool DoCanonicalizeFileSystemURL(const CHAR* spec, + const URLComponentSource& source, + const Parsed& parsed, + CharsetConverter* charset_converter, + CanonOutput* output, + Parsed* new_parsed) { + // filesystem only uses {scheme, path, query, ref} -- clear the rest. + new_parsed->username.reset(); + new_parsed->password.reset(); + new_parsed->host.reset(); + new_parsed->port.reset(); + + const Parsed* inner_parsed = parsed.inner_parsed(); + Parsed new_inner_parsed; + + // Scheme (known, so we don't bother running it through the more + // complicated scheme canonicalizer). 
+ new_parsed->scheme.begin = output->length(); + output->Append("filesystem:", 11); + new_parsed->scheme.len = 10; + + if (!inner_parsed || !inner_parsed->scheme.is_valid()) + return false; + + bool success = true; + SchemeType inner_scheme_type = SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION; + if (CompareSchemeComponent(spec, inner_parsed->scheme, url::kFileScheme)) { + new_inner_parsed.scheme.begin = output->length(); + output->Append("file://", 7); + new_inner_parsed.scheme.len = 4; + success &= CanonicalizePath(spec, inner_parsed->path, output, + &new_inner_parsed.path); + } else if (GetStandardSchemeType(spec, inner_parsed->scheme, + &inner_scheme_type)) { + if (inner_scheme_type == SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION) { + // Strip out the user information from the inner URL, if any. + inner_scheme_type = SCHEME_WITH_HOST_AND_PORT; + } + success = CanonicalizeStandardURL( + spec, inner_parsed->Length(), *inner_parsed, inner_scheme_type, + charset_converter, output, &new_inner_parsed); + } else { + // TODO(ericu): The URL is wrong, but should we try to output more of what + // we were given? Echoing back filesystem:mailto etc. doesn't seem all that + // useful. + return false; + } + // The filesystem type must be more than just a leading slash for validity. + success &= new_inner_parsed.path.len > 1; + + success &= CanonicalizePath(source.path, parsed.path, output, + &new_parsed->path); + + // Ignore failures for query/ref since the URL can probably still be loaded. 
+ CanonicalizeQuery(source.query, parsed.query, charset_converter, + output, &new_parsed->query); + CanonicalizeRef(source.ref, parsed.ref, output, &new_parsed->ref); + if (success) + new_parsed->set_inner_parsed(new_inner_parsed); + + return success; +} + +} // namespace + +bool CanonicalizeFileSystemURL(const char* spec, + int spec_len, + const Parsed& parsed, + CharsetConverter* charset_converter, + CanonOutput* output, + Parsed* new_parsed) { + return DoCanonicalizeFileSystemURL( + spec, URLComponentSource(spec), parsed, charset_converter, output, + new_parsed); +} + +bool CanonicalizeFileSystemURL(const char16_t* spec, + int spec_len, + const Parsed& parsed, + CharsetConverter* charset_converter, + CanonOutput* output, + Parsed* new_parsed) { + return DoCanonicalizeFileSystemURL( + spec, URLComponentSource(spec), parsed, charset_converter, + output, new_parsed); +} + +bool ReplaceFileSystemURL(const char* base, + const Parsed& base_parsed, + const Replacements& replacements, + CharsetConverter* charset_converter, + CanonOutput* output, + Parsed* new_parsed) { + URLComponentSource source(base); + Parsed parsed(base_parsed); + SetupOverrideComponents(base, replacements, &source, &parsed); + return DoCanonicalizeFileSystemURL( + base, source, parsed, charset_converter, output, new_parsed); +} + +bool ReplaceFileSystemURL(const char* base, + const Parsed& base_parsed, + const Replacements& replacements, + CharsetConverter* charset_converter, + CanonOutput* output, + Parsed* new_parsed) { + RawCanonOutput<1024> utf8; + URLComponentSource source(base); + Parsed parsed(base_parsed); + SetupUTF16OverrideComponents(base, replacements, &utf8, &source, &parsed); + return DoCanonicalizeFileSystemURL( + base, source, parsed, charset_converter, output, new_parsed); +} + +} // namespace url diff --git a/url_canon_fileurl.cc b/url_canon_fileurl.cc new file mode 100644 index 00000000000..b45114d77fd --- /dev/null +++ b/url_canon_fileurl.cc @@ -0,0 +1,247 @@ +// Copyright 2013 
The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Functions for canonicalizing "file:" URLs. + +#include "base/strings/string_piece.h" +#include "base/strings/string_util.h" +#include "url/url_canon.h" +#include "url/url_canon_internal.h" +#include "url/url_file.h" +#include "url/url_parse_internal.h" + +namespace url { + +namespace { + +bool IsLocalhost(const char* spec, int begin, int end) { + if (begin > end) + return false; + return base::StringPiece(&spec[begin], end - begin) == "localhost"; +} + +bool IsLocalhost(const char16_t* spec, int begin, int end) { + if (begin > end) + return false; + return base::StringPiece16(&spec[begin], end - begin) == u"localhost"; +} + +template +int DoFindWindowsDriveLetter(const CHAR* spec, int begin, int end) { + if (begin > end) + return -1; + + // First guess the beginning of the drive letter. + // If there is something that looks like a drive letter in the spec between + // begin and end, store its position in drive_letter_pos. + int drive_letter_pos = + DoesContainWindowsDriveSpecUntil(spec, begin, end, end); + if (drive_letter_pos < begin) + return -1; + + // Check if the path up to the drive letter candidate can be canonicalized as + // "/". + Component sub_path = MakeRange(begin, drive_letter_pos); + RawCanonOutput<1024> output; + Component output_path; + bool success = CanonicalizePath(spec, sub_path, &output, &output_path); + if (!success || output_path.len != 1 || output.at(output_path.begin) != '/') { + return -1; + } + + return drive_letter_pos; +} + +#ifdef WIN32 + +// Given a pointer into the spec, this copies and canonicalizes the drive +// letter and colon to the output, if one is found. If there is not a drive +// spec, it won't do anything. The index of the next character in the input +// spec is returned (after the colon when a drive spec is found, the begin +// offset if one is not). 
+template +int FileDoDriveSpec(const CHAR* spec, int begin, int end, CanonOutput* output) { + int drive_letter_pos = FindWindowsDriveLetter(spec, begin, end); + if (drive_letter_pos < begin) + return begin; + + // By now, a valid drive letter is confirmed at position drive_letter_pos, + // followed by a valid drive letter separator (a colon or a pipe). + + output->push_back('/'); + + // Normalize Windows drive letters to uppercase. + if (base::IsAsciiLower(spec[drive_letter_pos])) + output->push_back(static_cast(spec[drive_letter_pos] - 'a' + 'A')); + else + output->push_back(static_cast(spec[drive_letter_pos])); + + // Normalize the character following it to a colon rather than pipe. + output->push_back(':'); + return drive_letter_pos + 2; +} + +#endif // WIN32 + +template +bool DoFileCanonicalizePath(const CHAR* spec, + const Component& path, + CanonOutput* output, + Component* out_path) { + // Copies and normalizes the "c:" at the beginning, if present. + out_path->begin = output->length(); + int after_drive; +#ifdef WIN32 + after_drive = FileDoDriveSpec(spec, path.begin, path.end(), output); +#else + after_drive = path.begin; +#endif + + // Copies the rest of the path, starting from the slash following the + // drive colon (if any, Windows only), or the first slash of the path. + bool success = true; + if (after_drive < path.end()) { + // Use the regular path canonicalizer to canonicalize the rest of the path + // after the drive. + // + // Give it a fake output component to write into, since we will be + // calculating the out_path ourselves (consisting of both the drive and the + // path we canonicalize here). + Component sub_path = MakeRange(after_drive, path.end()); + Component fake_output_path; + success = CanonicalizePath(spec, sub_path, output, &fake_output_path); + } else if (after_drive == path.begin) { + // No input path and no drive spec, canonicalize to a slash. 
+ output->push_back('/'); + } + + out_path->len = output->length() - out_path->begin; + return success; +} + +template +bool DoCanonicalizeFileURL(const URLComponentSource& source, + const Parsed& parsed, + CharsetConverter* query_converter, + CanonOutput* output, + Parsed* new_parsed) { + // Things we don't set in file: URLs. + new_parsed->username = Component(); + new_parsed->password = Component(); + new_parsed->port = Component(); + + // Scheme (known, so we don't bother running it through the more + // complicated scheme canonicalizer). + new_parsed->scheme.begin = output->length(); + output->Append("file://", 7); + new_parsed->scheme.len = 4; + + // If the host is localhost, and the path starts with a Windows drive letter, + // remove the host component. This does the following transformation: + // file://localhost/C:/hello.txt -> file:///C:/hello.txt + // + // Note: we do this on every platform per URL Standard, not just Windows. + // + // TODO(https://crbug.com/688961): According to the latest URL spec, this + // transformation should be done regardless of the path. + Component host_range = parsed.host; + if (IsLocalhost(source.host, host_range.begin, host_range.end()) && + FindWindowsDriveLetter(source.path, parsed.path.begin, + parsed.path.end()) >= parsed.path.begin) { + host_range.reset(); + } + + // Append the host. For many file URLs, this will be empty. For UNC, this + // will be present. + // TODO(brettw) This doesn't do any checking for host name validity. We + // should probably handle validity checking of UNC hosts differently than + // for regular IP hosts. 
+ bool success = + CanonicalizeHost(source.host, host_range, output, &new_parsed->host); + success &= DoFileCanonicalizePath(source.path, parsed.path, + output, &new_parsed->path); + + CanonicalizeQuery(source.query, parsed.query, query_converter, + output, &new_parsed->query); + CanonicalizeRef(source.ref, parsed.ref, output, &new_parsed->ref); + + return success; +} + +} // namespace + +int FindWindowsDriveLetter(const char* spec, int begin, int end) { + return DoFindWindowsDriveLetter(spec, begin, end); +} + +int FindWindowsDriveLetter(const char16_t* spec, int begin, int end) { + return DoFindWindowsDriveLetter(spec, begin, end); +} + +bool CanonicalizeFileURL(const char* spec, + int spec_len, + const Parsed& parsed, + CharsetConverter* query_converter, + CanonOutput* output, + Parsed* new_parsed) { + return DoCanonicalizeFileURL( + URLComponentSource(spec), parsed, query_converter, + output, new_parsed); +} + +bool CanonicalizeFileURL(const char16_t* spec, + int spec_len, + const Parsed& parsed, + CharsetConverter* query_converter, + CanonOutput* output, + Parsed* new_parsed) { + return DoCanonicalizeFileURL( + URLComponentSource(spec), parsed, query_converter, output, + new_parsed); +} + +bool FileCanonicalizePath(const char* spec, + const Component& path, + CanonOutput* output, + Component* out_path) { + return DoFileCanonicalizePath(spec, path, + output, out_path); +} + +bool FileCanonicalizePath(const char16_t* spec, + const Component& path, + CanonOutput* output, + Component* out_path) { + return DoFileCanonicalizePath(spec, path, output, + out_path); +} + +bool ReplaceFileURL(const char* base, + const Parsed& base_parsed, + const Replacements& replacements, + CharsetConverter* query_converter, + CanonOutput* output, + Parsed* new_parsed) { + URLComponentSource source(base); + Parsed parsed(base_parsed); + SetupOverrideComponents(base, replacements, &source, &parsed); + return DoCanonicalizeFileURL( + source, parsed, query_converter, output, new_parsed); 
+} + +bool ReplaceFileURL(const char* base, + const Parsed& base_parsed, + const Replacements& replacements, + CharsetConverter* query_converter, + CanonOutput* output, + Parsed* new_parsed) { + RawCanonOutput<1024> utf8; + URLComponentSource source(base); + Parsed parsed(base_parsed); + SetupUTF16OverrideComponents(base, replacements, &utf8, &source, &parsed); + return DoCanonicalizeFileURL( + source, parsed, query_converter, output, new_parsed); +} + +} // namespace url diff --git a/url_canon_host.cc b/url_canon_host.cc new file mode 100644 index 00000000000..d3b1222f17c --- /dev/null +++ b/url_canon_host.cc @@ -0,0 +1,442 @@ +// Copyright 2013 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "base/check.h" +#include "base/cpu_reduction_experiment.h" +#include "url/url_canon.h" +#include "url/url_canon_internal.h" + +namespace url { + +namespace { + +// For reference, here's what IE supports: +// Key: 0 (disallowed: failure if present in the input) +// + (allowed either escaped or unescaped, and unmodified) +// U (allowed escaped or unescaped but always unescaped if present in +// escaped form) +// E (allowed escaped or unescaped but always escaped if present in +// unescaped form) +// % (only allowed escaped in the input, will be unmodified). +// I left blank alpha numeric characters. +// +// 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f +// ----------------------------------------------- +// 0 0 E E E E E E E E E E E E E E E +// 1 E E E E E E E E E E E E E E E E +// 2 E + E E + E + + + + + + + U U 0 +// 3 % % E + E 0 <-- Those are : ; < = > ? +// 4 % +// 5 U 0 U U U <-- Those are [ \ ] ^ _ +// 6 E <-- That's ` +// 7 E E E U E <-- Those are { | } ~ (UNPRINTABLE) +// +// NOTE: I didn't actually test all the control characters. Some may be +// disallowed in the input, but they are all accepted escaped except for 0. 
+// I also didn't test if characters affecting HTML parsing are allowed +// unescaped, e.g. (") or (#), which would indicate the beginning of the path. +// Surprisingly, space is accepted in the input and always escaped. + +// This table lists the canonical version of all characters we allow in the +// input, with 0 indicating it is disallowed. We use the magic kEsc +// value to indicate that this character should be escaped. We are a little more +// restrictive than IE, but less restrictive than Firefox. +// +// Note that we disallow the % character. We will allow it when part of an +// escape sequence, of course, but this disallows "%25". Even though IE allows +// it, allowing it would put us in a funny state. If there was an invalid +// escape sequence like "%zz", we'll add "%25zz" to the output and fail. +// Allowing percents means we'll succeed a second time, so validity would change +// based on how many times you run the canonicalizer. We prefer to always report +// the same validity, so reject this. +const unsigned char kEsc = 0xff; +const unsigned char kHostCharLookup[0x80] = { +// 00-1f: all are invalid + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +// ' ' ! " # $ % & ' ( ) * + , - . / + kEsc,kEsc,kEsc,kEsc,kEsc, 0, kEsc,kEsc,kEsc,kEsc,kEsc, '+',kEsc, '-', '.', 0, +// 0 1 2 3 4 5 6 7 8 9 : ; < = > ? + '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', 0 ,kEsc,kEsc,kEsc, 0 , +// @ A B C D E F G H I J K L M N O + kEsc, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', +// P Q R S T U V W X Y Z [ \ ] ^ _ + 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '[', 0 , ']', 0 , '_', +// ` a b c d e f g h i j k l m n o + kEsc, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', +// p q r s t u v w x y z { | } ~ + 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',kEsc,kEsc,kEsc, 0 , 0 }; + +// RFC1034 maximum FQDN length.
+constexpr size_t kMaxHostLength = 253; + +// Generous padding to account for the fact that UTS#46 normalization can cause +// a long string to actually shrink and fit within the 253 character RFC1034 +// FQDN length limit. Note that this can still be too short for pathological +// cases: An arbitrary number of characters (e.g. U+00AD SOFT HYPHEN) can be +// removed from the input by UTS#46 processing. However, this should be +// sufficient for all normally-encountered, non-abusive hostname strings. +constexpr size_t kMaxHostBufferLength = kMaxHostLength * 5; + +constexpr size_t kTempHostBufferLen = 1024; +using StackBuffer = RawCanonOutputT; +using StackBufferW = RawCanonOutputT; + +// Scans a host name and fills in the output flags according to what we find. +// |has_non_ascii| will be true if there are any non-7-bit characters, and +// |has_escaped| will be true if there is a percent sign. +template +void ScanHostname(const CHAR* spec, + const Component& host, + bool* has_non_ascii, + bool* has_escaped) { + int end = host.end(); + *has_non_ascii = false; + *has_escaped = false; + for (int i = host.begin; i < end; i++) { + if (static_cast(spec[i]) >= 0x80) + *has_non_ascii = true; + else if (spec[i] == '%') + *has_escaped = true; + } +} + +// Canonicalizes a host name that is entirely 8-bit characters (even though +// the type holding them may be 16 bits). Escaped characters will be unescaped. +// Non-7-bit characters (for example, UTF-8) will be passed unchanged. +// +// The |*has_non_ascii| flag will be true if there are non-7-bit characters in +// the output. +// +// This function is used in two situations: +// +// * When the caller knows there is no non-ASCII or percent escaped +// characters. This is what DoHost does. The result will be a completely +// canonicalized host since we know nothing weird can happen (escaped +// characters could be unescaped to non-7-bit, so they have to be treated +// with suspicion at this point).
It does not use the |has_non_ascii| flag. +// +// * When the caller has an 8-bit string that may need unescaping. +// DoComplexHost calls us in this situation to do unescaping and validation. +// After this, it may do other IDN operations depending on the value of the +// |*has_non_ascii| flag. +// +// The return value indicates if the output is a potentially valid host name. +template +bool DoSimpleHost(const INCHAR* host, + size_t host_len, + CanonOutputT* output, + bool* has_non_ascii) { + *has_non_ascii = false; + + bool success = true; + for (size_t i = 0; i < host_len; ++i) { + unsigned int source = host[i]; + if (source == '%') { + // Unescape first, if possible. + // Source will be used only if decode operation was successful. + if (!DecodeEscaped(host, &i, host_len, + reinterpret_cast(&source))) { + // Invalid escaped character. There is nothing that can make this + // host valid. We append an escaped percent so the URL looks reasonable + // and mark as failed. + AppendEscapedChar('%', output); + success = false; + continue; + } + } + + if (source < 0x80) { + // We have ASCII input, we can use our lookup table. + unsigned char replacement = kHostCharLookup[source]; + if (!replacement) { + // Invalid character, add it as percent-escaped and mark as failed. + AppendEscapedChar(source, output); + success = false; + } else if (replacement == kEsc) { + // This character is valid but should be escaped. + AppendEscapedChar(source, output); + } else { + // Common case, the given character is valid in a hostname, the lookup + // table tells us the canonical representation of that character (lower + // cased). + output->push_back(replacement); + } + } else { + // It's a non-ascii char. Just push it to the output. + // In case where we have char16 input, and char output it's safe to + // cast char16->char only if input string was converted to ASCII.
+ output->push_back(static_cast(source)); + *has_non_ascii = true; + } + } + return success; +} + +// Canonicalizes a host that requires IDN conversion. Returns true on success. +bool DoIDNHost(const char16_t* src, size_t src_len, CanonOutput* output) { + int original_output_len = output->length(); // So we can rewind below. + + // We need to escape URL before doing IDN conversion, since punycode strings + // cannot be escaped after they are created. + RawCanonOutputW url_escaped_host; + bool has_non_ascii; + DoSimpleHost(src, src_len, &url_escaped_host, &has_non_ascii); + if (url_escaped_host.length() > kMaxHostBufferLength) { + AppendInvalidNarrowString(src, 0, src_len, output); + return false; + } + + StackBufferW wide_output; + if (!IDNToASCII(url_escaped_host.data(), + url_escaped_host.length(), + &wide_output)) { + // Some error, give up. This will write some reasonable looking + // representation of the string to the output. + AppendInvalidNarrowString(src, 0, src_len, output); + return false; + } + + // Now we check the ASCII output like a normal host. It will also handle + // unescaping. Although we unescaped everything before this function call, if + // somebody does %00 as fullwidth, ICU will convert this to ASCII. + bool success = DoSimpleHost(wide_output.data(), wide_output.length(), output, + &has_non_ascii); + if (has_non_ascii) { + // ICU generated something that DoSimpleHost didn't think looked like + // ASCII. This is quite rare, but ICU might convert some characters to + // percent signs which might generate new escape sequences which might in + // turn be invalid. An example is U+FE6A "small percent" which ICU will + // name prep into an ASCII percent and then we can interpret the following + // characters as escaped characters. + // + // If DoSimpleHost didn't think the output was ASCII, just escape the + // thing we gave ICU and give up. DoSimpleHost will have handled a further + // level of escaping from ICU for simple ASCII cases (i.e. 
if ICU generates + // a new escaped ASCII sequence like "%41" we'll unescape it) but it won't + // do more (like handle escaped non-ASCII sequences). Handling the escaped + // ASCII isn't strictly necessary, but DoSimpleHost handles this case + // anyway so we handle it. + output->set_length(original_output_len); + AppendInvalidNarrowString(wide_output.data(), 0, wide_output.length(), + output); + return false; + } + return success; +} + +// 8-bit convert host to its ASCII version: this converts the UTF-8 input to +// UTF-16. The has_escaped flag should be set if the input string requires +// unescaping. +bool DoComplexHost(const char* host, + size_t host_len, + bool has_non_ascii, + bool has_escaped, + CanonOutput* output) { + // Save the current position in the output. We may write stuff and rewind it + // below, so we need to know where to rewind to. + size_t begin_length = output->length(); + + // Points to the UTF-8 data we want to convert. This will either be the + // input or the unescaped version written to |*output| if necessary. + const char* utf8_source; + size_t utf8_source_len; + bool are_all_escaped_valid = true; + if (has_escaped) { + // Unescape before converting to UTF-16 for IDN. We write this into the + // output because it most likely does not require IDNization, and we can + // save another huge stack buffer. It will be replaced below if it requires + // IDN. This will also update our non-ASCII flag so we know whether the + // unescaped input requires IDN. + if (!DoSimpleHost(host, host_len, output, &has_non_ascii)) { + // Error with some escape sequence. We'll call the current output + // complete. DoSimpleHost will have written some "reasonable" output + // for the invalid escapes, but the output could be non-ASCII and + // needs to go through re-encoding below. + are_all_escaped_valid = false; + } + + // Unescaping may have left us with ASCII input, in which case the + // unescaped version we wrote to output is complete. 
+ if (!has_non_ascii) { + return are_all_escaped_valid; + } + + // Save the pointer into the data that was just converted (it may be appended to + // other data in the output buffer). + utf8_source = &output->data()[begin_length]; + utf8_source_len = output->length() - begin_length; + } else { + // We don't need to unescape, use input for IDNization later. (We know the + // input has non-ASCII, or the simple version would have been called + // instead of us.) + utf8_source = host; + utf8_source_len = host_len; + } + + // Non-ASCII input requires IDN, convert to UTF-16 and do the IDN conversion. + // Above, we may have used the output to write the unescaped values to, so + // we have to rewind it to where we started after we convert it to UTF-16. + StackBufferW utf16; + if (!ConvertUTF8ToUTF16(utf8_source, utf8_source_len, &utf16)) { + // In this error case, the input may or may not be the output. + StackBuffer utf8; + for (size_t i = 0; i < utf8_source_len; i++) + utf8.push_back(utf8_source[i]); + output->set_length(begin_length); + AppendInvalidNarrowString(utf8.data(), 0, utf8.length(), output); + return false; + } + output->set_length(begin_length); + + // This will call DoSimpleHost which will do normal ASCII canonicalization + // and also check for IP addresses in the output. + return DoIDNHost(utf16.data(), utf16.length(), output) && + are_all_escaped_valid; +} + +// UTF-16 convert host to its ASCII version. The set up is already ready for +// the backend, so we just pass through. The has_escaped flag should be set if +// the input string requires unescaping. +bool DoComplexHost(const char16_t* host, + size_t host_len, + bool has_non_ascii, + bool has_escaped, + CanonOutput* output) { + if (has_escaped) { + // Yikes, we have escaped characters with wide input. The escaped + // characters should be interpreted as UTF-8. To solve this problem, + // we convert to UTF-8, unescape, then convert back to UTF-16 for IDN. 
+ // + // We don't bother to optimize the conversion in the ASCII case (which + // *could* just be a copy) and use the UTF-8 path, because it should be + // very rare that host names have escaped characters, and it is relatively + // fast to do the conversion anyway. + StackBuffer utf8; + if (!ConvertUTF16ToUTF8(host, host_len, &utf8)) { + AppendInvalidNarrowString(host, 0, host_len, output); + return false; + } + + // Once we convert to UTF-8, we can use the 8-bit version of the complex + // host handling code above. + return DoComplexHost(utf8.data(), utf8.length(), has_non_ascii, has_escaped, + output); + } + + // No unescaping necessary, we can safely pass the input to ICU. This + // function will only get called if we either have escaped or non-ascii + // input, so it's safe to just use ICU now. Even if the input is ASCII, + // this function will do the right thing (just slower than we could). + return DoIDNHost(host, host_len, output); +} + +template +bool DoHostSubstring(const CHAR* spec, + const Component& host, + CanonOutput* output) { + DCHECK(host.is_valid()); + + bool has_non_ascii, has_escaped; + ScanHostname(spec, host, &has_non_ascii, &has_escaped); + + if (has_non_ascii || has_escaped) { + return DoComplexHost(&spec[host.begin], static_cast(host.len), + has_non_ascii, has_escaped, output); + } + + const bool success = DoSimpleHost( + &spec[host.begin], static_cast(host.len), output, &has_non_ascii); + DCHECK(!has_non_ascii); + return success; +} + +template +void DoHost(const CHAR* spec, + const Component& host, + CanonOutput* output, + CanonHostInfo* host_info) { + if (host.is_empty()) { + // Empty hosts don't need anything. + host_info->family = CanonHostInfo::NEUTRAL; + host_info->out_host = Component(); + return; + } + + // Keep track of output's initial length, so we can rewind later. 
+ const int output_begin = output->length(); + + if (DoHostSubstring(spec, host, output)) { + // After all the other canonicalization, check if we ended up with an IP + // address. IP addresses are small, so writing into this temporary buffer + // should not cause an allocation. + RawCanonOutput<64> canon_ip; + CanonicalizeIPAddress(output->data(), + MakeRange(output_begin, output->length()), + &canon_ip, host_info); + + // If we got an IPv4/IPv6 address, copy the canonical form back to the + // real buffer. Otherwise, it's a hostname or broken IP, in which case + // we just leave it in place. + if (host_info->IsIPAddress()) { + output->set_length(output_begin); + output->Append(canon_ip.data(), canon_ip.length()); + } + } else { + // Canonicalization failed. Set BROKEN to notify the caller. + host_info->family = CanonHostInfo::BROKEN; + } + + host_info->out_host = MakeRange(output_begin, output->length()); +} + +} // namespace + +bool CanonicalizeHost(const char* spec, + const Component& host, + CanonOutput* output, + Component* out_host) { + CanonHostInfo host_info; + DoHost(spec, host, output, &host_info); + *out_host = host_info.out_host; + return (host_info.family != CanonHostInfo::BROKEN); +} + +bool CanonicalizeHost(const char16_t* spec, + const Component& host, + CanonOutput* output, + Component* out_host) { + CanonHostInfo host_info; + DoHost(spec, host, output, &host_info); + *out_host = host_info.out_host; + return (host_info.family != CanonHostInfo::BROKEN); +} + +void CanonicalizeHostVerbose(const char* spec, + const Component& host, + CanonOutput* output, + CanonHostInfo* host_info) { + DoHost(spec, host, output, host_info); +} + +void CanonicalizeHostVerbose(const char16_t* spec, + const Component& host, + CanonOutput* output, + CanonHostInfo* host_info) { + DoHost(spec, host, output, host_info); +} + +bool CanonicalizeHostSubstring(const char* spec, + const Component& host, + CanonOutput* output) { + return DoHostSubstring(spec, host, output); +} + 
+bool CanonicalizeHostSubstring(const char16_t* spec, + const Component& host, + CanonOutput* output) { + return DoHostSubstring(spec, host, output); +} + +} // namespace url diff --git a/url_canon_icu.cc b/url_canon_icu.cc new file mode 100644 index 00000000000..5adc187748d --- /dev/null +++ b/url_canon_icu.cc @@ -0,0 +1,114 @@ +// Copyright 2013 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// ICU-based character set converter. + +#include +#include +#include + +#include "base/check.h" +#include "base/memory/raw_ptr.h" +#include "base/memory/raw_ptr_exclusion.h" +#include "third_party/icu/source/common/unicode/ucnv.h" +#include "third_party/icu/source/common/unicode/ucnv_cb.h" +#include "third_party/icu/source/common/unicode/utypes.h" +#include "url/url_canon_icu.h" +#include "url/url_canon_internal.h" // for _itoa_s + +namespace url { + +namespace { + +// Called when converting a character that can not be represented, this will +// append an escaped version of the numerical character reference for that code +// point. It is of the form "&#1234;" and we will escape the non-digits to +// "%26%231234%3B". Why? This is what Netscape did back in the olden days. +void appendURLEscapedChar(const void* context, + UConverterFromUnicodeArgs* from_args, + const UChar* code_units, + int32_t length, + UChar32 code_point, + UConverterCallbackReason reason, + UErrorCode* err) { + if (reason == UCNV_UNASSIGNED) { + *err = U_ZERO_ERROR; + + const static int prefix_len = 6; + const static char prefix[prefix_len + 1] = "%26%23"; // "&#" percent-escaped + ucnv_cbFromUWriteBytes(from_args, prefix, prefix_len, 0, err); + + DCHECK(code_point < 0x110000); + char number[8]; // Max Unicode code point is 7 digits. 
+ _itoa_s(code_point, number, 10); + int number_len = static_cast(strlen(number)); + ucnv_cbFromUWriteBytes(from_args, number, number_len, 0, err); + + const static int postfix_len = 3; + const static char postfix[postfix_len + 1] = "%3B"; // ";" percent-escaped + ucnv_cbFromUWriteBytes(from_args, postfix, postfix_len, 0, err); + } +} + +// A class for scoping the installation of the invalid character callback. +class AppendHandlerInstaller { + public: + // The owner of this object must ensure that the converter is alive for the + // duration of this object's lifetime. + AppendHandlerInstaller(UConverter* converter) : converter_(converter) { + UErrorCode err = U_ZERO_ERROR; + ucnv_setFromUCallBack(converter_, appendURLEscapedChar, 0, + &old_callback_, &old_context_, &err); + } + + ~AppendHandlerInstaller() { + UErrorCode err = U_ZERO_ERROR; + ucnv_setFromUCallBack(converter_, old_callback_, old_context_, 0, 0, &err); + } + + private: + raw_ptr converter_; + + UConverterFromUCallback old_callback_; + // This field is not a raw_ptr<> because it was filtered by the rewriter for: + // #addr-of + RAW_PTR_EXCLUSION const void* old_context_; +}; + +} // namespace + +ICUCharsetConverter::ICUCharsetConverter(UConverter* converter) + : converter_(converter) { +} + +ICUCharsetConverter::~ICUCharsetConverter() = default; + +void ICUCharsetConverter::ConvertFromUTF16(const char16_t* input, + int input_len, + CanonOutput* output) { + // Install our error handler. It will be called for character that can not + // be represented in the destination character set. 
+ AppendHandlerInstaller handler(converter_); + + int begin_offset = output->length(); + int dest_capacity = output->capacity() - begin_offset; + output->set_length(output->length()); + + do { + UErrorCode err = U_ZERO_ERROR; + char* dest = &output->data()[begin_offset]; + int required_capacity = ucnv_fromUChars(converter_, dest, dest_capacity, + input, input_len, &err); + if (err != U_BUFFER_OVERFLOW_ERROR) { + output->set_length(begin_offset + required_capacity); + return; + } + + // Output didn't fit, expand + dest_capacity = required_capacity; + output->Resize(begin_offset + dest_capacity); + } while (true); +} + +} // namespace url diff --git a/url_canon_icu.h b/url_canon_icu.h new file mode 100644 index 00000000000..cb5da7d37b0 --- /dev/null +++ b/url_canon_icu.h @@ -0,0 +1,41 @@ +// Copyright 2013 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef URL_URL_CANON_ICU_H_ +#define URL_URL_CANON_ICU_H_ + +// ICU integration functions. + +#include "base/compiler_specific.h" +#include "base/component_export.h" +#include "base/memory/raw_ptr.h" +#include "url/url_canon.h" + +typedef struct UConverter UConverter; + +namespace url { + +// An implementation of CharsetConverter that implementations can use to +// interface the canonicalizer with ICU's conversion routines. +class COMPONENT_EXPORT(URL) ICUCharsetConverter : public CharsetConverter { + public: + // Constructs a converter using an already-existing ICU character set + // converter. This converter is NOT owned by this object; the lifetime must + // be managed by the creator such that it is alive as long as this is. + ICUCharsetConverter(UConverter* converter); + + ~ICUCharsetConverter() override; + + void ConvertFromUTF16(const char16_t* input, + int input_len, + CanonOutput* output) override; + + private: + // The ICU converter, not owned by this class. 
+ raw_ptr converter_; +}; + +} // namespace url + +#endif // URL_URL_CANON_ICU_H_ diff --git a/url_canon_icu_unittest.cc b/url_canon_icu_unittest.cc new file mode 100644 index 00000000000..336da3f20b2 --- /dev/null +++ b/url_canon_icu_unittest.cc @@ -0,0 +1,168 @@ +// Copyright 2014 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "url/url_canon_icu.h" + +#include + +#include "base/logging.h" +#include "base/memory/raw_ptr.h" +#include "testing/gtest/include/gtest/gtest.h" +#include "third_party/icu/source/common/unicode/ucnv.h" +#include "url/url_canon.h" +#include "url/url_canon_stdstring.h" +#include "url/url_test_utils.h" + +namespace url { + +namespace { + +// Wrapper around a UConverter object that manages creation and destruction. +class UConvScoper { + public: + explicit UConvScoper(const char* charset_name) { + UErrorCode err = U_ZERO_ERROR; + converter_ = ucnv_open(charset_name, &err); + if (!converter_) { + LOG(ERROR) << "Failed to open charset " << charset_name << ": " + << u_errorName(err); + } + } + + ~UConvScoper() { + if (converter_) + ucnv_close(converter_.ExtractAsDangling()); + } + + // Returns the converter object, may be NULL. + UConverter* converter() const { return converter_; } + + private: + raw_ptr converter_; +}; + +TEST(URLCanonIcuTest, ICUCharsetConverter) { + struct ICUCase { + const wchar_t* input; + const char* encoding; + const char* expected; + } icu_cases[] = { + // UTF-8. + {L"Hello, world", "utf-8", "Hello, world"}, + {L"\x4f60\x597d", "utf-8", "\xe4\xbd\xa0\xe5\xa5\xbd"}, + // Non-BMP UTF-8. + {L"!\xd800\xdf00!", "utf-8", "!\xf0\x90\x8c\x80!"}, + // Big5 + {L"\x4f60\x597d", "big5", "\xa7\x41\xa6\x6e"}, + // Unrepresentable character in the destination set. 
+ {L"hello\x4f60\x06de\x597dworld", "big5", + "hello\xa7\x41%26%231758%3B\xa6\x6eworld"}, + }; + + for (size_t i = 0; i < std::size(icu_cases); i++) { + UConvScoper conv(icu_cases[i].encoding); + ASSERT_TRUE(conv.converter() != NULL); + ICUCharsetConverter converter(conv.converter()); + + std::string str; + StdStringCanonOutput output(&str); + + std::u16string input_str( + test_utils::TruncateWStringToUTF16(icu_cases[i].input)); + int input_len = static_cast(input_str.length()); + converter.ConvertFromUTF16(input_str.c_str(), input_len, &output); + output.Complete(); + + EXPECT_STREQ(icu_cases[i].expected, str.c_str()); + } + + // Test string sizes around the resize boundary for the output to make sure + // the converter resizes as needed. + const int static_size = 16; + UConvScoper conv("utf-8"); + ASSERT_TRUE(conv.converter()); + ICUCharsetConverter converter(conv.converter()); + for (int i = static_size - 2; i <= static_size + 2; i++) { + // Make a string with the appropriate length. + std::u16string input; + for (int ch = 0; ch < i; ch++) + input.push_back('a'); + + RawCanonOutput output; + converter.ConvertFromUTF16(input.c_str(), static_cast(input.length()), + &output); + EXPECT_EQ(input.length(), output.length()); + } +} + +TEST(URLCanonIcuTest, QueryWithConverter) { + struct QueryCase { + const char* input8; + const wchar_t* input16; + const char* encoding; + const char* expected; + } query_cases[] = { + // Regular ASCII case in some different encodings. + {"foo=bar", L"foo=bar", "utf-8", "?foo=bar"}, + {"foo=bar", L"foo=bar", "shift_jis", "?foo=bar"}, + {"foo=bar", L"foo=bar", "gb2312", "?foo=bar"}, + // Chinese input/output + {"q=\xe4\xbd\xa0\xe5\xa5\xbd", L"q=\x4f60\x597d", "gb2312", + "?q=%C4%E3%BA%C3"}, + {"q=\xe4\xbd\xa0\xe5\xa5\xbd", L"q=\x4f60\x597d", "big5", "?q=%A7A%A6n"}, + // Unencodable character in the destination character set should be + // escaped. 
The escape sequence unescapes to be the entity name: + // "?q=Chinese&#65319;" + {"q=Chinese\xef\xbc\xa7", L"q=Chinese\xff27", "iso-8859-1", + "?q=Chinese%26%2365319%3B"}, + }; + + for (size_t i = 0; i < std::size(query_cases); i++) { + Component out_comp; + + UConvScoper conv(query_cases[i].encoding); + ASSERT_TRUE(!query_cases[i].encoding || conv.converter()); + ICUCharsetConverter converter(conv.converter()); + + if (query_cases[i].input8) { + int len = static_cast(strlen(query_cases[i].input8)); + Component in_comp(0, len); + std::string out_str; + + StdStringCanonOutput output(&out_str); + CanonicalizeQuery(query_cases[i].input8, in_comp, &converter, &output, + &out_comp); + output.Complete(); + + EXPECT_EQ(query_cases[i].expected, out_str); + } + + if (query_cases[i].input16) { + std::u16string input16( + test_utils::TruncateWStringToUTF16(query_cases[i].input16)); + int len = static_cast(input16.length()); + Component in_comp(0, len); + std::string out_str; + + StdStringCanonOutput output(&out_str); + CanonicalizeQuery(input16.c_str(), in_comp, &converter, &output, + &out_comp); + output.Complete(); + + EXPECT_EQ(query_cases[i].expected, out_str); + } + } + + // Extra test for input with embedded NULL; + std::string out_str; + StdStringCanonOutput output(&out_str); + Component out_comp; + CanonicalizeQuery("a \x00z\x01", Component(0, 5), NULL, &output, &out_comp); + output.Complete(); + EXPECT_EQ("?a%20%00z%01", out_str); +} + +} // namespace + +} // namespace url diff --git a/url_canon_internal.cc b/url_canon_internal.cc new file mode 100644 index 00000000000..f6219209e72 --- /dev/null +++ b/url_canon_internal.cc @@ -0,0 +1,502 @@ +// Copyright 2013 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#include "url/url_canon_internal.h" + +#include +#include +#include +#ifdef __SSE2__ +#include +#elif defined(__aarch64__) +#include +#endif + +#include +#include + +#include "base/bits.h" +#include "base/numerics/safe_conversions.h" +#include "base/strings/utf_string_conversion_utils.h" + +namespace url { + +namespace { + +// Find the initial segment of the given string that consists solely +// of characters valid for CHAR_QUERY. (We can have false negatives in +// one specific case, namely the exclamation mark 0x21, but false negatives +// are fine, and it's not worth adding a separate test for.) This is +// a fast path to speed up checking of very long query strings that are +// already valid, which happen on some web pages. +// +// This has some startup cost to load the constants and such, so it's +// usually not worth it for short strings. +size_t FindInitialQuerySafeString(const char* source, size_t length) { +#if defined(__SSE2__) || defined(__aarch64__) + constexpr size_t kChunkSize = 16; + size_t i; + for (i = 0; i < base::bits::AlignDown(length, kChunkSize); i += kChunkSize) { + char b __attribute__((vector_size(16))); + memcpy(&b, source + i, sizeof(b)); + + // Compare each element with the ranges for CHAR_QUERY + // (see kSharedCharTypeTable), vectorized so that it creates + // a mask of which elements match. For completeness, we could + // have had (...) | b == 0x21 here, but exclamation marks are + // rare and the extra test costs us some time. + auto mask = b >= 0x24 && b <= 0x7e && b != 0x27 && b != 0x3c && b != 0x3e; + +#ifdef __SSE2__ + if (_mm_movemask_epi8(reinterpret_cast<__m128i>(mask)) != 0xffff) { + return i; + } +#else + if (vminvq_u8(reinterpret_cast(mask)) == 0) { + return i; + } +#endif + } + return i; +#else + // Need SIMD support (with fast reductions) for this to be efficient. 
+ return 0; +#endif +} + +template +void DoAppendStringOfType(const CHAR* source, + size_t length, + SharedCharTypes type, + CanonOutput* output) { + size_t i = 0; + // We only instantiate this for char, to avoid a Clang crash + // (and because Append() does not support converting). + if constexpr (sizeof(CHAR) == 1) { + if (type == CHAR_QUERY && length >= kMinimumLengthForSIMD) { + i = FindInitialQuerySafeString(source, length); + output->Append(source, i); + } + } + for (; i < length; i++) { + if (static_cast(source[i]) >= 0x80) { + // ReadUTFChar will fill the code point with kUnicodeReplacementCharacter + // when the input is invalid, which is what we want. + base_icu::UChar32 code_point; + ReadUTFChar(source, &i, length, &code_point); + AppendUTF8EscapedValue(code_point, output); + } else { + // Just append the 7-bit character, possibly escaping it. + unsigned char uch = static_cast(source[i]); + if (!IsCharOfType(uch, type)) + AppendEscapedChar(uch, output); + else + output->push_back(uch); + } + } +} + +// This function assumes the input values are all contained in 8-bit, +// although it allows any type. It returns void and only appends to |output|. +template +void DoAppendInvalidNarrowString(const CHAR* spec, + size_t begin, + size_t end, + CanonOutput* output) { + for (size_t i = begin; i < end; i++) { + UCHAR uch = static_cast(spec[i]); + if (uch >= 0x80) { + // Handle UTF-8/16 encodings. This call will correctly handle the error + // case by appending the invalid character. + AppendUTF8EscapedChar(spec, &i, end, output); + } else if (uch <= ' ' || uch == 0x7f) { + // This function is for error handling, so we escape all control + // characters and spaces, but not anything else since we lack + // context to do something more specific. 
+ AppendEscapedChar(static_cast(uch), output); + } else { + output->push_back(static_cast(uch)); + } + } +} + +// Overrides one component, see the Replacements structure for +// what the various combinations of source pointer and component mean. +void DoOverrideComponent(const char* override_source, + const Component& override_component, + const char** dest, + Component* dest_component) { + if (override_source) { + *dest = override_source; + *dest_component = override_component; + } +} + +// Similar to DoOverrideComponent except that it takes a UTF-16 input and does +// not actually set the output character pointer. +// +// The input is converted to UTF-8 at the end of the given buffer as a temporary +// holding place. The component identifying the portion of the buffer used in +// the |utf8_buffer| will be specified in |*dest_component|. +// +// This will not actually set any |dest| pointer like DoOverrideComponent +// does because all of the pointers will point into the |utf8_buffer|, which +// may get resized while we're overriding a subsequent component. Instead, the +// caller should use the beginning of the |utf8_buffer| as the string pointer +// for all components once all overrides have been prepared. +bool PrepareUTF16OverrideComponent(const char16_t* override_source, + const Component& override_component, + CanonOutput* utf8_buffer, + Component* dest_component) { + bool success = true; + if (override_source) { + if (!override_component.is_valid()) { + // Non-"valid" component (means delete), so we need to preserve that. + *dest_component = Component(); + } else { + // Convert to UTF-8. + dest_component->begin = utf8_buffer->length(); + success = ConvertUTF16ToUTF8(&override_source[override_component.begin], + static_cast(override_component.len), + utf8_buffer); + dest_component->len = utf8_buffer->length() - dest_component->begin; + } + } + return success; +} + +} // namespace + +// See the header file for this array's declaration. 
+// clang-format off +const unsigned char kSharedCharTypeTable[0x100] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x00 - 0x0f + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x10 - 0x1f + 0, // 0x20 ' ' (escape spaces in queries) + CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x21 ! + 0, // 0x22 " + 0, // 0x23 # (invalid in query since it marks the ref) + CHAR_QUERY | CHAR_USERINFO, // 0x24 $ + CHAR_QUERY | CHAR_USERINFO, // 0x25 % + CHAR_QUERY | CHAR_USERINFO, // 0x26 & + 0, // 0x27 ' (Try to prevent XSS.) + CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x28 ( + CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x29 ) + CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x2a * + CHAR_QUERY | CHAR_USERINFO, // 0x2b + + CHAR_QUERY | CHAR_USERINFO, // 0x2c , + CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x2d - + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_COMPONENT, // 0x2e . + CHAR_QUERY, // 0x2f / + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT | CHAR_COMPONENT, // 0x30 0 + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT | CHAR_COMPONENT, // 0x31 1 + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT | CHAR_COMPONENT, // 0x32 2 + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT | CHAR_COMPONENT, // 0x33 3 + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT | CHAR_COMPONENT, // 0x34 4 + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT | CHAR_COMPONENT, // 0x35 5 + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT | CHAR_COMPONENT, // 0x36 6 + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT | CHAR_COMPONENT, // 0x37 7 + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_COMPONENT, // 0x38 8 + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_COMPONENT, // 0x39 9 + CHAR_QUERY, // 0x3a : + CHAR_QUERY, // 0x3b ; + 0, // 0x3c < 
(Try to prevent certain types of XSS.) + CHAR_QUERY, // 0x3d = + 0, // 0x3e > (Try to prevent certain types of XSS.) + CHAR_QUERY, // 0x3f ? + CHAR_QUERY, // 0x40 @ + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT, // 0x41 A + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT, // 0x42 B + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT, // 0x43 C + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT, // 0x44 D + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT, // 0x45 E + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT, // 0x46 F + CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x47 G + CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x48 H + CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x49 I + CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x4a J + CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x4b K + CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x4c L + CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x4d M + CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x4e N + CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x4f O + CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x50 P + CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x51 Q + CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x52 R + CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x53 S + CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x54 T + CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x55 U + CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x56 V + CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x57 W + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_COMPONENT, // 0x58 X + CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x59 Y + CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x5a Z + CHAR_QUERY, // 0x5b [ + CHAR_QUERY, // 0x5c '\' + CHAR_QUERY, // 0x5d ] + CHAR_QUERY, // 0x5e ^ + CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x5f _ + CHAR_QUERY, // 0x60 ` + CHAR_QUERY | 
CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT, // 0x61 a + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT, // 0x62 b + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT, // 0x63 c + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT, // 0x64 d + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT, // 0x65 e + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT, // 0x66 f + CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x67 g + CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x68 h + CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x69 i + CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x6a j + CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x6b k + CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x6c l + CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x6d m + CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x6e n + CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x6f o + CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x70 p + CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x71 q + CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x72 r + CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x73 s + CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x74 t + CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x75 u + CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x76 v + CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x77 w + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_COMPONENT, // 0x78 x + CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x79 y + CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x7a z + CHAR_QUERY, // 0x7b { + CHAR_QUERY, // 0x7c | + CHAR_QUERY, // 0x7d } + CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x7e ~ + 0, // 0x7f + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x80 - 0x8f + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x90 - 0x9f + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xa0 - 0xaf + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, // 0xb0 - 0xbf + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xc0 - 0xcf + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xd0 - 0xdf + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xe0 - 0xef + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xf0 - 0xff +}; +// clang-format on + +const char kHexCharLookup[0x10] = { + '0', '1', '2', '3', '4', '5', '6', '7', + '8', '9', 'A', 'B', 'C', 'D', 'E', 'F', +}; + +const char kCharToHexLookup[8] = { + 0, // 0x00 - 0x1f + '0', // 0x20 - 0x3f: digits 0 - 9 are 0x30 - 0x39 + 'A' - 10, // 0x40 - 0x5f: letters A - F are 0x41 - 0x46 + 'a' - 10, // 0x60 - 0x7f: letters a - f are 0x61 - 0x66 + 0, // 0x80 - 0x9F + 0, // 0xA0 - 0xBF + 0, // 0xC0 - 0xDF + 0, // 0xE0 - 0xFF +}; + +const base_icu::UChar32 kUnicodeReplacementCharacter = 0xfffd; + +void AppendStringOfType(const char* source, + size_t length, + SharedCharTypes type, + CanonOutput* output) { + DoAppendStringOfType(source, length, type, output); +} + +void AppendStringOfType(const char16_t* source, + size_t length, + SharedCharTypes type, + CanonOutput* output) { + DoAppendStringOfType(source, length, type, output); +} + +bool ReadUTFChar(const char* str, + size_t* begin, + size_t length, + base_icu::UChar32* code_point_out) { + if (!base::ReadUnicodeCharacter(str, length, begin, code_point_out) || + !base::IsValidCharacter(*code_point_out)) { + *code_point_out = kUnicodeReplacementCharacter; + return false; + } + return true; +} + +bool ReadUTFChar(const char16_t* str, + size_t* begin, + size_t length, + base_icu::UChar32* code_point_out) { + if (!base::ReadUnicodeCharacter(str, length, begin, code_point_out) || + !base::IsValidCharacter(*code_point_out)) { + *code_point_out = kUnicodeReplacementCharacter; + return false; + } + return true; +} + +void AppendInvalidNarrowString(const char* spec, + size_t begin, + size_t end, + CanonOutput* output) { + DoAppendInvalidNarrowString(spec, begin, end, output); +} + +void AppendInvalidNarrowString(const 
char16_t* spec, + size_t begin, + size_t end, + CanonOutput* output) { + DoAppendInvalidNarrowString(spec, begin, end, output); +} + +bool ConvertUTF16ToUTF8(const char16_t* input, + size_t input_len, + CanonOutput* output) { + bool success = true; + for (size_t i = 0; i < input_len; i++) { + base_icu::UChar32 code_point; + success &= ReadUTFChar(input, &i, input_len, &code_point); + AppendUTF8Value(code_point, output); + } + return success; +} + +bool ConvertUTF8ToUTF16(const char* input, + size_t input_len, + CanonOutputT* output) { + bool success = true; + for (size_t i = 0; i < input_len; i++) { + base_icu::UChar32 code_point; + success &= ReadUTFChar(input, &i, input_len, &code_point); + AppendUTF16Value(code_point, output); + } + return success; +} + +void SetupOverrideComponents(const char* base, + const Replacements& repl, + URLComponentSource* source, + Parsed* parsed) { + // Get the source and parsed structures of the things we are replacing. + const URLComponentSource& repl_source = repl.sources(); + const Parsed& repl_parsed = repl.components(); + + DoOverrideComponent(repl_source.scheme, repl_parsed.scheme, &source->scheme, + &parsed->scheme); + DoOverrideComponent(repl_source.username, repl_parsed.username, + &source->username, &parsed->username); + DoOverrideComponent(repl_source.password, repl_parsed.password, + &source->password, &parsed->password); + + // Our host should be empty if not present, so override the default setup. 
+ DoOverrideComponent(repl_source.host, repl_parsed.host, &source->host, + &parsed->host); + if (parsed->host.len == -1) + parsed->host.len = 0; + + DoOverrideComponent(repl_source.port, repl_parsed.port, &source->port, + &parsed->port); + DoOverrideComponent(repl_source.path, repl_parsed.path, &source->path, + &parsed->path); + DoOverrideComponent(repl_source.query, repl_parsed.query, &source->query, + &parsed->query); + DoOverrideComponent(repl_source.ref, repl_parsed.ref, &source->ref, + &parsed->ref); +} + +bool SetupUTF16OverrideComponents(const char* base, + const Replacements& repl, + CanonOutput* utf8_buffer, + URLComponentSource* source, + Parsed* parsed) { + bool success = true; + + // Get the source and parsed structures of the things we are replacing. + const URLComponentSource& repl_source = repl.sources(); + const Parsed& repl_parsed = repl.components(); + + success &= PrepareUTF16OverrideComponent( + repl_source.scheme, repl_parsed.scheme, utf8_buffer, &parsed->scheme); + success &= + PrepareUTF16OverrideComponent(repl_source.username, repl_parsed.username, + utf8_buffer, &parsed->username); + success &= + PrepareUTF16OverrideComponent(repl_source.password, repl_parsed.password, + utf8_buffer, &parsed->password); + success &= PrepareUTF16OverrideComponent(repl_source.host, repl_parsed.host, + utf8_buffer, &parsed->host); + success &= PrepareUTF16OverrideComponent(repl_source.port, repl_parsed.port, + utf8_buffer, &parsed->port); + success &= PrepareUTF16OverrideComponent(repl_source.path, repl_parsed.path, + utf8_buffer, &parsed->path); + success &= PrepareUTF16OverrideComponent(repl_source.query, repl_parsed.query, + utf8_buffer, &parsed->query); + success &= PrepareUTF16OverrideComponent(repl_source.ref, repl_parsed.ref, + utf8_buffer, &parsed->ref); + + // PrepareUTF16OverrideComponent will not have set the data pointer since the + // buffer could be resized, invalidating the pointers. 
We set the data + // pointers for affected components now that the buffer is finalized. + if (repl_source.scheme) + source->scheme = utf8_buffer->data(); + if (repl_source.username) + source->username = utf8_buffer->data(); + if (repl_source.password) + source->password = utf8_buffer->data(); + if (repl_source.host) + source->host = utf8_buffer->data(); + if (repl_source.port) + source->port = utf8_buffer->data(); + if (repl_source.path) + source->path = utf8_buffer->data(); + if (repl_source.query) + source->query = utf8_buffer->data(); + if (repl_source.ref) + source->ref = utf8_buffer->data(); + + return success; +} + +#ifndef WIN32 + +int _itoa_s(int value, char* buffer, size_t size_in_chars, int radix) { + const char* format_str; + if (radix == 10) + format_str = "%d"; + else if (radix == 16) + format_str = "%x"; + else + return EINVAL; + + int written = snprintf(buffer, size_in_chars, format_str, value); + if (static_cast(written) >= size_in_chars) { + // Output was truncated, or written was negative. + return EINVAL; + } + return 0; +} + +int _itow_s(int value, char16_t* buffer, size_t size_in_chars, int radix) { + if (radix != 10) + return EINVAL; + + // No more than 12 characters will be required for a 32-bit integer. + // Add an extra byte for the terminating null. + char temp[13]; + int written = snprintf(temp, sizeof(temp), "%d", value); + if (static_cast(written) >= size_in_chars) { + // Output was truncated, or written was negative. + return EINVAL; + } + + for (int i = 0; i < written; ++i) { + buffer[i] = static_cast(temp[i]); + } + buffer[written] = '\0'; + return 0; +} + +#endif // !WIN32 + +} // namespace url diff --git a/url_canon_internal.h b/url_canon_internal.h new file mode 100644 index 00000000000..13481f5fdb6 --- /dev/null +++ b/url_canon_internal.h @@ -0,0 +1,471 @@ +// Copyright 2013 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#ifndef URL_URL_CANON_INTERNAL_H_ +#define URL_URL_CANON_INTERNAL_H_ + +// This file is intended to be included in another C++ file where the character +// types are defined. This allows us to write mostly generic code, but not have +// template bloat because everything is inlined when anybody calls any of our +// functions. + +#include +#include + +#include "base/component_export.h" +#include "base/notreached.h" +#include "base/third_party/icu/icu_utf.h" +#include "url/url_canon.h" + +namespace url { + +// Character type handling ----------------------------------------------------- + +// Bits that identify different character types. These types identify different +// bits that are set for each 8-bit character in the kSharedCharTypeTable. +enum SharedCharTypes { + // Characters that do not require escaping in queries. Characters that do + // not have this flag will be escaped; see url_canon_query.cc + CHAR_QUERY = 1, + + // Valid in the username/password field. + CHAR_USERINFO = 2, + + // Valid in a IPv4 address (digits plus dot and 'x' for hex). + CHAR_IPV4 = 4, + + // Valid in an ASCII-representation of a hex digit (as in %-escaped). + CHAR_HEX = 8, + + // Valid in an ASCII-representation of a decimal digit. + CHAR_DEC = 16, + + // Valid in an ASCII-representation of an octal digit. + CHAR_OCT = 32, + + // Characters that do not require escaping in encodeURIComponent. Characters + // that do not have this flag will be escaped; see url_util.cc. + CHAR_COMPONENT = 64, +}; + +// This table contains the flags in SharedCharTypes for each 8-bit character. +// Some canonicalization functions have their own specialized lookup table. +// For those with simple requirements, we have collected the flags in one +// place so there are fewer lookup tables to load into the CPU cache. +// +// Using an unsigned char type has a small but measurable performance benefit +// over using a 32-bit number. 
+extern const unsigned char kSharedCharTypeTable[0x100]; + +// More readable wrappers around the character type lookup table. +inline bool IsCharOfType(unsigned char c, SharedCharTypes type) { + return !!(kSharedCharTypeTable[c] & type); +} +inline bool IsQueryChar(unsigned char c) { + return IsCharOfType(c, CHAR_QUERY); +} +inline bool IsIPv4Char(unsigned char c) { + return IsCharOfType(c, CHAR_IPV4); +} +inline bool IsHexChar(unsigned char c) { + return IsCharOfType(c, CHAR_HEX); +} +inline bool IsComponentChar(unsigned char c) { + return IsCharOfType(c, CHAR_COMPONENT); +} + +// Appends the given string to the output, escaping characters that do not +// match the given |type| in SharedCharTypes. +void AppendStringOfType(const char* source, + size_t length, + SharedCharTypes type, + CanonOutput* output); +void AppendStringOfType(const char16_t* source, + size_t length, + SharedCharTypes type, + CanonOutput* output); + +// Maps the hex numerical values 0x0 to 0xf to the corresponding ASCII digit +// that will be used to represent it. +COMPONENT_EXPORT(URL) extern const char kHexCharLookup[0x10]; + +// This lookup table allows fast conversion between ASCII hex letters and their +// corresponding numerical value. The 8-bit range is divided up into 8 +// regions of 0x20 characters each. Each of the three character types (numbers, +// uppercase, lowercase) falls into different regions of this range. The table +// contains the amount to subtract from characters in that range to get at +// the corresponding numerical value. +// +// See HexCharToValue for the lookup. +extern const char kCharToHexLookup[8]; + +// Assumes the input is a valid hex digit! Call IsHexChar before using this. +inline int HexCharToValue(unsigned char c) { + return c - kCharToHexLookup[c / 0x20]; +} + +// Indicates if the given character is a dot or dot equivalent, returning the +// number of characters taken by it. This will be one for a literal dot, 3 for +// an escaped dot.
If the character is not a dot, this will return 0. +template +inline size_t IsDot(const CHAR* spec, size_t offset, size_t end) { + if (spec[offset] == '.') { + return 1; + } else if (spec[offset] == '%' && offset + 3 <= end && + spec[offset + 1] == '2' && + (spec[offset + 2] == 'e' || spec[offset + 2] == 'E')) { + // Found "%2e" + return 3; + } + return 0; +} + +// Returns the canonicalized version of the input character according to scheme +// rules. This is implemented alongside the scheme canonicalizer, and is +// required for relative URL resolving to test for scheme equality. +// +// Returns 0 if the input character is not a valid scheme character. +char CanonicalSchemeChar(char16_t ch); + +// Write a single character, escaped, to the output. This always escapes: it +// does no checking that the character requires escaping. +// Escaping makes sense only for 8-bit chars, so the code works in all cases of +// input parameters (8/16bit). +template +inline void AppendEscapedChar(UINCHAR ch, CanonOutputT* output) { + output->push_back('%'); + output->push_back(static_cast(kHexCharLookup[(ch >> 4) & 0xf])); + output->push_back(static_cast(kHexCharLookup[ch & 0xf])); +} + +// The character we'll substitute for undecodable or invalid characters. +extern const base_icu::UChar32 kUnicodeReplacementCharacter; + +// UTF-8 functions ------------------------------------------------------------ + +// Reads one character in UTF-8 starting at |*begin| in |str| and places +// the decoded value into |*code_point|. If the character is valid, we will +// return true. If invalid, we'll return false and put the +// kUnicodeReplacementCharacter into |*code_point|. +// +// |*begin| will be updated to point to the last character consumed so it +// can be incremented in a loop and will be ready for the next character. +// (for a single-byte ASCII character, it will not be changed).
+COMPONENT_EXPORT(URL) +bool ReadUTFChar(const char* str, + size_t* begin, + size_t length, + base_icu::UChar32* code_point_out); + +// Generic To-UTF-8 converter. This will call the given append method for each +// character that should be appended, with the given output method. Wrappers +// are provided below for escaped and non-escaped versions of this. +// +// The char_value must have already been checked that it's a valid Unicode +// character. +template +inline void DoAppendUTF8(base_icu::UChar32 char_value, Output* output) { + DCHECK(char_value >= 0); + DCHECK(char_value <= 0x10FFFF); + if (char_value <= 0x7f) { + Appender(static_cast(char_value), output); + } else if (char_value <= 0x7ff) { + // 110xxxxx 10xxxxxx + Appender(static_cast(0xC0 | (char_value >> 6)), output); + Appender(static_cast(0x80 | (char_value & 0x3f)), output); + } else if (char_value <= 0xffff) { + // 1110xxxx 10xxxxxx 10xxxxxx + Appender(static_cast(0xe0 | (char_value >> 12)), output); + Appender(static_cast(0x80 | ((char_value >> 6) & 0x3f)), + output); + Appender(static_cast(0x80 | (char_value & 0x3f)), output); + } else { + // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + Appender(static_cast(0xf0 | (char_value >> 18)), output); + Appender(static_cast(0x80 | ((char_value >> 12) & 0x3f)), + output); + Appender(static_cast(0x80 | ((char_value >> 6) & 0x3f)), + output); + Appender(static_cast(0x80 | (char_value & 0x3f)), output); + } +} + +// Helper used by AppendUTF8Value below. We use an unsigned parameter so there +// are no funny sign problems with the input, but then have to convert it to +// a regular char for appending. +inline void AppendCharToOutput(unsigned char ch, CanonOutput* output) { + output->push_back(static_cast(ch)); +} + +// Writes the given character to the output as UTF-8. This does NO checking +// of the validity of the Unicode characters; the caller should ensure that +// the value it is appending is valid to append. 
+inline void AppendUTF8Value(base_icu::UChar32 char_value, CanonOutput* output) { + DoAppendUTF8(char_value, output); +} + +// Writes the given character to the output as UTF-8, escaping ALL +// characters (even when they are ASCII). This does NO checking of the +// validity of the Unicode characters; the caller should ensure that the value +// it is appending is valid to append. +inline void AppendUTF8EscapedValue(base_icu::UChar32 char_value, + CanonOutput* output) { + DoAppendUTF8(char_value, output); +} + +// UTF-16 functions ----------------------------------------------------------- + +// Reads one character in UTF-16 starting at |*begin| in |str| and places +// the decoded value into |*code_point|. If the character is valid, we will +// return true. If invalid, we'll return false and put the +// kUnicodeReplacementCharacter into |*code_point|. +// +// |*begin| will be updated to point to the last character consumed so it +// can be incremented in a loop and will be ready for the next character. +// (for a single-16-bit-word character, it will not be changed). +COMPONENT_EXPORT(URL) +bool ReadUTFChar(const char16_t* str, + size_t* begin, + size_t length, + base_icu::UChar32* code_point_out); + +// Equivalent to U16_APPEND_UNSAFE in ICU but uses our output method. +inline void AppendUTF16Value(base_icu::UChar32 code_point, + CanonOutputT* output) { + if (code_point > 0xffff) { + output->push_back(static_cast((code_point >> 10) + 0xd7c0)); + output->push_back(static_cast((code_point & 0x3ff) | 0xdc00)); + } else { + output->push_back(static_cast(code_point)); + } +} + +// Escaping functions --------------------------------------------------------- + +// Writes the given character to the output as UTF-8, escaped. Call this +// function only when the input is wide. Returns true on success. 
Failure +// means there was some problem with the encoding, we'll still try to +// update the |*begin| pointer and add a placeholder character to the +// output so processing can continue. +// +// We will append the character starting at str[*begin] with the buffer str +// being |length| long. |*begin| will be updated to point to the last character +// consumed (we may consume more than one for UTF-16) so that if called in +// a loop, incrementing the pointer will move to the next character. +// +// Every single output character will be escaped. This means that if you +// give it an ASCII character as input, it will be escaped. Some code uses +// this when it knows that a character is invalid according to its rules +// for validity. If you don't want escaping for ASCII characters, you will +// have to filter them out prior to calling this function. +// +// Assumes that str[*begin] is within range in the array, but does not assume +// that any following characters are. +inline bool AppendUTF8EscapedChar(const char16_t* str, + size_t* begin, + size_t length, + CanonOutput* output) { + // UTF-16 input. ReadUTFChar will handle invalid characters for us and give + // us the kUnicodeReplacementCharacter, so we don't have to do special + // checking after failure, just pass through the failure to the caller. + base_icu::UChar32 char_value; + bool success = ReadUTFChar(str, begin, length, &char_value); + AppendUTF8EscapedValue(char_value, output); + return success; +} + +// Handles UTF-8 input. See the wide version above for usage. +inline bool AppendUTF8EscapedChar(const char* str, + size_t* begin, + size_t length, + CanonOutput* output) { + // ReadUTFChar will handle invalid characters for us and give us the + // kUnicodeReplacementCharacter, so we don't have to do special checking + // after failure, just pass through the failure to the caller.
+ base_icu::UChar32 ch; + bool success = ReadUTFChar(str, begin, length, &ch); + AppendUTF8EscapedValue(ch, output); + return success; +} + +// Given a '%' character at |*begin| in the string |spec|, this will decode +// the escaped value and put it into |*unescaped_value| on success (returns +// true). On failure, this will return false, and will not write into +// |*unescaped_value|. +// +// |*begin| will be updated to point to the last character of the escape +// sequence so that when called with the index of a for loop, the next time +// through it will point to the next character to be considered. On failure, +// |*begin| will be unchanged. +inline bool Is8BitChar(char c) { + return true; // this case is specialized to avoid a warning +} +inline bool Is8BitChar(char16_t c) { + return c <= 255; +} + +template +inline bool DecodeEscaped(const CHAR* spec, + size_t* begin, + size_t end, + unsigned char* unescaped_value) { + if (*begin + 3 > end || !Is8BitChar(spec[*begin + 1]) || + !Is8BitChar(spec[*begin + 2])) { + // Invalid escape sequence because there's not enough room, or the + // digits are not ASCII. + return false; + } + + unsigned char first = static_cast(spec[*begin + 1]); + unsigned char second = static_cast(spec[*begin + 2]); + if (!IsHexChar(first) || !IsHexChar(second)) { + // Invalid hex digits, fail. + return false; + } + + // Valid escape sequence. + *unescaped_value = static_cast((HexCharToValue(first) << 4) + + HexCharToValue(second)); + *begin += 2; + return true; +} + +// Appends the given substring to the output, escaping "some" characters that +// it feels may not be safe. It assumes the input values are all contained in +// 8-bit although it allows any type. +// +// This is used in error cases to append invalid output so that it looks +// approximately correct. Non-error cases should not call this function since +// the escaping rules are not guaranteed! 
+void AppendInvalidNarrowString(const char* spec, + size_t begin, + size_t end, + CanonOutput* output); +void AppendInvalidNarrowString(const char16_t* spec, + size_t begin, + size_t end, + CanonOutput* output); + +// Misc canonicalization helpers ---------------------------------------------- + +// Converts between UTF-8 and UTF-16, returning true on successful conversion. +// The output will be appended to the given canonicalizer output (so make sure +// it's empty if you want to replace). +// +// On invalid input, this will still write as much output as possible, +// replacing the invalid characters with the "invalid character". It will +// return false in the failure case, and the caller should not continue as +// normal. +COMPONENT_EXPORT(URL) +bool ConvertUTF16ToUTF8(const char16_t* input, + size_t input_len, + CanonOutput* output); +COMPONENT_EXPORT(URL) +bool ConvertUTF8ToUTF16(const char* input, + size_t input_len, + CanonOutputT* output); + +// Converts from UTF-16 to 8-bit using the character set converter. If the +// converter is NULL, this will use UTF-8. +void ConvertUTF16ToQueryEncoding(const char16_t* input, + const Component& query, + CharsetConverter* converter, + CanonOutput* output); + +// Applies the replacements to the given component source. The component source +// should be pre-initialized to the "old" base. That is, all pointers will +// point to the spec of the old URL, and all of the Parsed components will +// be indices into that string. +// +// The pointers and components in the |source| for all non-NULL strings in the +// |repl| (replacements) will be updated to reference those strings. +// Canonicalizing with the new |source| and |parsed| can then combine URL +// components from many different strings. 
+void SetupOverrideComponents(const char* base, + const Replacements& repl, + URLComponentSource* source, + Parsed* parsed); + +// Like the above 8-bit version, except that it additionally converts the +// UTF-16 input to UTF-8 before doing the overrides. +// +// The given utf8_buffer is used to store the converted components. They will +// be appended one after another, with the parsed structure identifying the +// appropriate substrings. This buffer is a parameter because the source has +// no storage, so the buffer must have the same lifetime as the source +// parameter owned by the caller. +// +// THE CALLER MUST NOT ADD TO THE |utf8_buffer| AFTER THIS CALL. Members of +// |source| will point into this buffer, which could be invalidated if +// additional data is added and the CanonOutput resizes its buffer. +// +// Returns true on success. False means that the input was not valid UTF-16, +// although we will have still done the override with "invalid characters" in +// place of errors. +bool SetupUTF16OverrideComponents(const char* base, + const Replacements& repl, + CanonOutput* utf8_buffer, + URLComponentSource* source, + Parsed* parsed); + +// Implemented in url_canon_path.cc, these are required by the relative URL +// resolver as well, so we declare them here. +bool CanonicalizePartialPathInternal(const char* spec, + const Component& path, + size_t path_begin_in_output, + CanonOutput* output); +bool CanonicalizePartialPathInternal(const char16_t* spec, + const Component& path, + size_t path_begin_in_output, + CanonOutput* output); + +// Find the position of a bona fide Windows drive letter in the given path. If +// no leading drive letter is found, -1 is returned. This function correctly +// treats /c:/foo and /./c:/foo as having drive letters, and /def/c:/foo as not +// having a drive letter. +// +// Exported for tests. 
+COMPONENT_EXPORT(URL) +int FindWindowsDriveLetter(const char* spec, int begin, int end); +COMPONENT_EXPORT(URL) +int FindWindowsDriveLetter(const char16_t* spec, int begin, int end); + +#ifndef WIN32 + +// Implementations of Windows' int-to-string conversions +COMPONENT_EXPORT(URL) +int _itoa_s(int value, char* buffer, size_t size_in_chars, int radix); +COMPONENT_EXPORT(URL) +int _itow_s(int value, char16_t* buffer, size_t size_in_chars, int radix); + +// Secure template overloads for these functions +template +inline int _itoa_s(int value, char (&buffer)[N], int radix) { + return _itoa_s(value, buffer, N, radix); +} + +template +inline int _itow_s(int value, char16_t (&buffer)[N], int radix) { + return _itow_s(value, buffer, N, radix); +} + +// _strtoui64 and strtoull behave the same +inline unsigned long long _strtoui64(const char* nptr, + char** endptr, + int base) { + return strtoull(nptr, endptr, base); +} + +#endif // !WIN32 + +// The threshold we set to consider SIMD processing, in bytes; there is +// no deep theory here, it's just set empirically to a value that seems +// to be good. (We don't really know why there's a slowdown for zero; +// but a guess would be that there's no need in going into a complex loop +// with a lot of setup for a five-byte string.) +static constexpr int kMinimumLengthForSIMD = 50; + +} // namespace url + +#endif // URL_URL_CANON_INTERNAL_H_ diff --git a/url_canon_internal_file.h b/url_canon_internal_file.h new file mode 100644 index 00000000000..32cb84096b2 --- /dev/null +++ b/url_canon_internal_file.h @@ -0,0 +1,135 @@ +// Copyright 2013 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef URL_URL_CANON_INTERNAL_FILE_H_ +#define URL_URL_CANON_INTERNAL_FILE_H_ + +// As with url_canon_internal.h, this file is intended to be included in +// another C++ file where the template types are defined.
This allows the +// programmer to use these functions for their own string +// types, without bloating the code by having inline templates used in +// every call site. +// +// *** This file must be included after url_canon_internal as we depend on some +// functions in it. *** + +#include "base/strings/string_util.h" +#include "url/url_file.h" +#include "url/url_parse_internal.h" + +namespace url { + +// Given a pointer into the spec, this copies and canonicalizes the drive +// letter and colon to the output, if one is found. If there is not a drive +// spec, it won't do anything. The index of the next character in the input +// spec is returned (after the colon when a drive spec is found, the begin +// offset if one is not). +template +static int FileDoDriveSpec(const CHAR* spec, int begin, int end, + CanonOutput* output) { + // The path could be one of several things: /foo/bar, c:/foo/bar, /c:/foo, + // (with backslashes instead of slashes as well). + int num_slashes = CountConsecutiveSlashes(spec, begin, end); + int after_slashes = begin + num_slashes; + + if (!DoesBeginWindowsDriveSpec(spec, after_slashes, end)) + return begin; // Haven't consumed any characters + + // DoesBeginWindowsDriveSpec will ensure that the drive letter is valid + // and that it is followed by a colon/pipe. + + // Normalize Windows drive letters to uppercase + if (base::IsAsciiLower(spec[after_slashes])) + output->push_back(spec[after_slashes] - 'a' + 'A'); + else + output->push_back(static_cast(spec[after_slashes])); + + // Normalize the character following it to a colon rather than pipe. + output->push_back(':'); + output->push_back('/'); + return after_slashes + 2; +} + +// FileDoDriveSpec will have already added the first slash, so we need to +// write everything following the slashes using the path canonicalizer.
+template +static void FileDoPath(const CHAR* spec, int begin, int end, + CanonOutput* output) { + // Normalize the number of slashes after the drive letter. The path + // canonicalizer expects the input to begin in a slash already so + // doesn't check. We want to handle no-slashes + int num_slashes = CountConsecutiveSlashes(spec, begin, end); + int after_slashes = begin + num_slashes; + + // Now use the regular path canonicalizer to canonicalize the rest of the + // path. We supply it with the path following the slashes. It won't prepend + // a slash because it assumes any nonempty path already starts with one. + // We explicitly filter out calls with no path here to prevent that case. + ParsedComponent sub_path(after_slashes, end - after_slashes); + if (sub_path.len > 0) { + // Give it a fake output component to write into. DoCanonicalizeFile will + // compute the full path component. + ParsedComponent fake_output_path; + URLCanonInternal::DoPath( + spec, sub_path, output, &fake_output_path); + } +} + +template +static bool DoCanonicalizeFileURL(const URLComponentSource& source, + const ParsedURL& parsed, + CanonOutput* output, + ParsedURL* new_parsed) { + // Things we don't set in file: URLs. + new_parsed->username = ParsedComponent(0, -1); + new_parsed->password = ParsedComponent(0, -1); + new_parsed->port = ParsedComponent(0, -1); + + // Scheme (known, so we don't bother running it through the more + // complicated scheme canonicalizer). + new_parsed->scheme.begin = output->length(); + output->push_back('f'); + output->push_back('i'); + output->push_back('l'); + output->push_back('e'); + new_parsed->scheme.len = output->length() - new_parsed->scheme.begin; + output->push_back(':'); + + // Write the separator for the host. + output->push_back('/'); + output->push_back('/'); + + // Append the host. For many file URLs, this will be empty. For UNC, this + // will be present. + // TODO(brettw) This doesn't do any checking for host name validity. 
We + // should probably handle validity checking of UNC hosts differently than + // for regular IP hosts. + bool success = URLCanonInternal::DoHost( + source.host, parsed.host, output, &new_parsed->host); + + // Write a separator for the start of the path. We'll ignore any slashes + // already at the beginning of the path. + new_parsed->path.begin = output->length(); + output->push_back('/'); + + // Copy and normalize the "c:" at the beginning, if present. + int after_drive = FileDoDriveSpec(source.path, parsed.path.begin, + parsed.path.end(), output); + + // Copy the rest of the path. + FileDoPath(source.path, after_drive, parsed.path.end(), output); + new_parsed->path.len = output->length() - new_parsed->path.begin; + + // For things following the path, we can use the standard canonicalizers. + success &= URLCanonInternal::DoQuery( + source.query, parsed.query, output, &new_parsed->query); + success &= URLCanonInternal::DoRef( + source.ref, parsed.ref, output, &new_parsed->ref); + + return success; +} + +} // namespace url + +#endif // URL_URL_CANON_INTERNAL_FILE_H_ diff --git a/url_canon_ip.cc b/url_canon_ip.cc new file mode 100644 index 00000000000..783ddccf852 --- /dev/null +++ b/url_canon_ip.cc @@ -0,0 +1,690 @@ +// Copyright 2013 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "url/url_canon_ip.h" + +#include +#include + +#include + +#include "base/check.h" +#include "url/url_canon_internal.h" +#include "url/url_features.h" + +namespace url { + +namespace { + +// Converts one of the character types that represent a numerical base to the +// corresponding base. +int BaseForType(SharedCharTypes type) { + switch (type) { + case CHAR_HEX: + return 16; + case CHAR_DEC: + return 10; + case CHAR_OCT: + return 8; + default: + return 0; + } +} + +// Converts an IPv4 component to a 32-bit number, while checking for overflow. 
+// +// Possible return values: +// - IPV4 - The number was valid, and did not overflow. +// - BROKEN - The input was numeric, but too large for a 32-bit field. +// - NEUTRAL - Input was not numeric. +// +// The input is assumed to be ASCII. The components are assumed to be non-empty. +template +CanonHostInfo::Family IPv4ComponentToNumber(const CHAR* spec, + const Component& component, + uint32_t* number) { + // Empty components are considered non-numeric. + if (component.is_empty()) + return CanonHostInfo::NEUTRAL; + + // Figure out the base + SharedCharTypes base; + int base_prefix_len = 0; // Size of the prefix for this base. + if (spec[component.begin] == '0') { + // Either hex or octal, or a standalone zero. + if (component.len == 1) { + base = CHAR_DEC; + } else if (spec[component.begin + 1] == 'X' || + spec[component.begin + 1] == 'x') { + base = CHAR_HEX; + base_prefix_len = 2; + } else { + base = CHAR_OCT; + base_prefix_len = 1; + } + } else { + base = CHAR_DEC; + } + + // Extend the prefix to consume all leading zeros. + while (base_prefix_len < component.len && + spec[component.begin + base_prefix_len] == '0') + base_prefix_len++; + + // Put the component, minus any base prefix, into a NULL-terminated buffer so + // we can call the standard library. Because leading zeros have already been + // discarded, filling the entire buffer is guaranteed to trigger the 32-bit + // overflow check. + const int kMaxComponentLen = 16; + char buf[kMaxComponentLen + 1]; // digits + '\0' + int dest_i = 0; + bool may_be_broken_octal = false; + for (int i = component.begin + base_prefix_len; i < component.end(); i++) { + if (spec[i] >= 0x80) + return CanonHostInfo::NEUTRAL; + + // We know the input is 7-bit, so convert to narrow (if this is the wide + // version of the template) by casting. + char input = static_cast(spec[i]); + + // Validate that this character is OK for the given base. 
+ if (!IsCharOfType(input, base)) { + if (IsCharOfType(input, CHAR_DEC)) { + // Entirely numeric components with leading 0s that aren't octal are + // considered broken. + may_be_broken_octal = true; + } else { + return CanonHostInfo::NEUTRAL; + } + } + + // Fill the buffer, if there's space remaining. This check allows us to + // verify that all characters are numeric, even those that don't fit. + if (dest_i < kMaxComponentLen) + buf[dest_i++] = input; + } + + if (may_be_broken_octal) + return CanonHostInfo::BROKEN; + + buf[dest_i] = '\0'; + + // Use the 64-bit strtoi so we get a big number (no hex, decimal, or octal + // number can overflow a 64-bit number in <= 16 characters). + uint64_t num = _strtoui64(buf, NULL, BaseForType(base)); + + // Check for 32-bit overflow. + if (num > std::numeric_limits::max()) + return CanonHostInfo::BROKEN; + + // No overflow. Success! + *number = static_cast(num); + return CanonHostInfo::IPV4; +} + +// See declaration of IPv4AddressToNumber for documentation. +template +CanonHostInfo::Family DoIPv4AddressToNumber(const CHAR* spec, + Component host, + unsigned char address[4], + int* num_ipv4_components) { + // Ignore terminal dot, if present. + if (host.is_nonempty() && spec[host.end() - 1] == '.') + --host.len; + + // Do nothing if empty. + if (host.is_empty()) + return CanonHostInfo::NEUTRAL; + + // Read component values. The first `existing_components` of them are + // populated front to back, with the first one corresponding to the last + // component, which allows for early exit if the last component isn't a + // number. + uint32_t component_values[4]; + int existing_components = 0; + + int current_component_end = host.end(); + int current_position = current_component_end; + while (true) { + // If this is not the first character of a component, go to the next + // component. 
+ if (current_position != host.begin && spec[current_position - 1] != '.') { + --current_position; + continue; + } + + CanonHostInfo::Family family = IPv4ComponentToNumber( + spec, + Component(current_position, current_component_end - current_position), + &component_values[existing_components]); + + // If `family` is NEUTRAL and this is the last component, return NEUTRAL. If + // `family` is NEUTRAL but not the last component, this is considered a + // BROKEN IPv4 address, as opposed to a non-IPv4 hostname. + if (family == CanonHostInfo::NEUTRAL && existing_components == 0) + return CanonHostInfo::NEUTRAL; + + if (family != CanonHostInfo::IPV4) + return CanonHostInfo::BROKEN; + + ++existing_components; + + // If this is the final component, nothing else to do. + if (current_position == host.begin) + break; + + // If there are more than 4 components, fail. + if (existing_components == 4) + return CanonHostInfo::BROKEN; + + current_component_end = current_position - 1; + --current_position; + } + + // Use `component_values` to fill out the 4-component IP address. + + // First, process all components but the last, while making sure each fits + // within an 8-bit field. + for (int i = existing_components - 1; i > 0; i--) { + if (component_values[i] > std::numeric_limits::max()) + return CanonHostInfo::BROKEN; + address[existing_components - i - 1] = + static_cast(component_values[i]); + } + + uint32_t last_value = component_values[0]; + for (int i = 3; i >= existing_components - 1; i--) { + address[i] = static_cast(last_value); + last_value >>= 8; + } + + // If the last component has residual bits, report overflow. + if (last_value != 0) + return CanonHostInfo::BROKEN; + + // Tell the caller how many components we saw. + *num_ipv4_components = existing_components; + + // Success! + return CanonHostInfo::IPV4; +} + +// Return true if we've made a final IPV4/BROKEN decision, false if the result +// is NEUTRAL, and we could use a second opinion. 
+template +bool DoCanonicalizeIPv4Address(const CHAR* spec, + const Component& host, + CanonOutput* output, + CanonHostInfo* host_info) { + host_info->family = IPv4AddressToNumber( + spec, host, host_info->address, &host_info->num_ipv4_components); + + switch (host_info->family) { + case CanonHostInfo::IPV4: + // Definitely an IPv4 address. + host_info->out_host.begin = output->length(); + AppendIPv4Address(host_info->address, output); + host_info->out_host.len = output->length() - host_info->out_host.begin; + return true; + case CanonHostInfo::BROKEN: + // Definitely broken. + return true; + default: + // Could be IPv6 or a hostname. + return false; + } +} + +// Helper class that describes the main components of an IPv6 input string. +// See the following examples to understand how it breaks up an input string: +// +// [Example 1]: input = "[::aa:bb]" +// ==> num_hex_components = 2 +// ==> hex_components[0] = Component(3,2) "aa" +// ==> hex_components[1] = Component(6,2) "bb" +// ==> index_of_contraction = 0 +// ==> ipv4_component = Component(0, -1) +// +// [Example 2]: input = "[1:2::3:4:5]" +// ==> num_hex_components = 5 +// ==> hex_components[0] = Component(1,1) "1" +// ==> hex_components[1] = Component(3,1) "2" +// ==> hex_components[2] = Component(6,1) "3" +// ==> hex_components[3] = Component(8,1) "4" +// ==> hex_components[4] = Component(10,1) "5" +// ==> index_of_contraction = 2 +// ==> ipv4_component = Component(0, -1) +// +// [Example 3]: input = "[::ffff:192.168.0.1]" +// ==> num_hex_components = 1 +// ==> hex_components[0] = Component(3,4) "ffff" +// ==> index_of_contraction = 0 +// ==> ipv4_component = Component(8, 11) "192.168.0.1" +// +// [Example 4]: input = "[1::]" +// ==> num_hex_components = 1 +// ==> hex_components[0] = Component(1,1) "1" +// ==> index_of_contraction = 1 +// ==> ipv4_component = Component(0, -1) +// +// [Example 5]: input = "[::192.168.0.1]" +// ==> num_hex_components = 0 +// ==> index_of_contraction = 0 +// ==> ipv4_component 
= Component(3, 11) "192.168.0.1" +// +struct IPv6Parsed { + // Zero-out the parse information. + void reset() { + num_hex_components = 0; + index_of_contraction = -1; + ipv4_component.reset(); + } + + // There can be up to 8 hex components (colon separated) in the literal. + Component hex_components[8]; + + // The count of hex components present. Ranges from [0,8]. + int num_hex_components; + + // The index of the hex component that the "::" contraction precedes, or + // -1 if there is no contraction. + int index_of_contraction; + + // The range of characters which are an IPv4 literal. + Component ipv4_component; +}; + +// Parse the IPv6 input string. If parsing succeeded returns true and fills +// |parsed| with the information. If parsing failed (because the input is +// invalid) returns false. +template +bool DoParseIPv6(const CHAR* spec, const Component& host, IPv6Parsed* parsed) { + // Zero-out the info. + parsed->reset(); + + if (host.is_empty()) + return false; + + // The index for start and end of address range (no brackets). + int begin = host.begin; + int end = host.end(); + + int cur_component_begin = begin; // Start of the current component. + + // Scan through the input, searching for hex components, "::" contractions, + // and IPv4 components. + for (int i = begin; /* i <= end */; i++) { + bool is_colon = spec[i] == ':'; + bool is_contraction = is_colon && i < end - 1 && spec[i + 1] == ':'; + + // We reached the end of the current component if we encounter a colon + // (separator between hex components, or start of a contraction), or end of + // input. + if (is_colon || i == end) { + int component_len = i - cur_component_begin; + + // A component should not have more than 4 hex digits. + if (component_len > 4) + return false; + + // Don't allow empty components. + if (component_len == 0) { + // The exception is when contractions appear at beginning of the + // input or at the end of the input. 
+ if (!((is_contraction && i == begin) || (i == end && + parsed->index_of_contraction == parsed->num_hex_components))) + return false; + } + + // Add the hex component we just found to running list. + if (component_len > 0) { + // Can't have more than 8 components! + if (parsed->num_hex_components >= 8) + return false; + + parsed->hex_components[parsed->num_hex_components++] = + Component(cur_component_begin, component_len); + } + } + + if (i == end) + break; // Reached the end of the input, DONE. + + // We found a "::" contraction. + if (is_contraction) { + // There can be at most one contraction in the literal. + if (parsed->index_of_contraction != -1) + return false; + parsed->index_of_contraction = parsed->num_hex_components; + ++i; // Consume the colon we peeked. + } + + if (is_colon) { + // Colons are separators between components, keep track of where the + // current component started (after this colon). + cur_component_begin = i + 1; + } else { + if (static_cast(spec[i]) >= 0x80) + return false; // Not ASCII. + + if (!IsHexChar(static_cast(spec[i]))) { + // Regular components are hex numbers. It is also possible for + // a component to be an IPv4 address in dotted form. + if (IsIPv4Char(static_cast(spec[i]))) { + // Since IPv4 address can only appear at the end, assume the rest + // of the string is an IPv4 address. (We will parse this separately + // later). + parsed->ipv4_component = + Component(cur_component_begin, end - cur_component_begin); + break; + } else { + // The character was neither a hex digit, nor an IPv4 character. + return false; + } + } + } + } + + return true; +} + +// Verifies the parsed IPv6 information, checking that the various components +// add up to the right number of bits (hex components are 16 bits, while +// embedded IPv4 formats are 32 bits, and contractions are placeholders for +// 16 or more bits). Returns true if sizes match up, false otherwise. 
On +// success writes the length of the contraction (if any) to +// |out_num_bytes_of_contraction|. +bool CheckIPv6ComponentsSize(const IPv6Parsed& parsed, + int* out_num_bytes_of_contraction) { + // Each group of four hex digits contributes 16 bits. + int num_bytes_without_contraction = parsed.num_hex_components * 2; + + // If an IPv4 address was embedded at the end, it contributes 32 bits. + if (parsed.ipv4_component.is_valid()) + num_bytes_without_contraction += 4; + + // If there was a "::" contraction, its size is going to be: + // MAX([16bits], [128bits] - num_bytes_without_contraction). + int num_bytes_of_contraction = 0; + if (parsed.index_of_contraction != -1) { + num_bytes_of_contraction = 16 - num_bytes_without_contraction; + if (num_bytes_of_contraction < 2) + num_bytes_of_contraction = 2; + } + + // Check that the numbers add up. + if (num_bytes_without_contraction + num_bytes_of_contraction != 16) + return false; + + *out_num_bytes_of_contraction = num_bytes_of_contraction; + return true; +} + +// Converts a hex component into a number. This cannot fail since the caller has +// already verified that each character in the string was a hex digit, and +// that there were no more than 4 characters. +template +uint16_t IPv6HexComponentToNumber(const CHAR* spec, + const Component& component) { + DCHECK(component.len <= 4); + + // Copy the hex string into a C-string. + char buf[5]; + for (int i = 0; i < component.len; ++i) + buf[i] = static_cast(spec[component.begin + i]); + buf[component.len] = '\0'; + + // Convert it to a number (overflow is not possible, since with 4 hex + // characters we can at most have a 16 bit number). + return static_cast(_strtoui64(buf, NULL, 16)); +} + +// Converts an IPv6 address to a 128-bit number (network byte order), returning +// true on success. False means that the input was not a valid IPv6 address. 
+template +bool DoIPv6AddressToNumber(const CHAR* spec, + const Component& host, + unsigned char address[16]) { + // Make sure the component is bounded by '[' and ']'. + int end = host.end(); + if (host.is_empty() || spec[host.begin] != '[' || spec[end - 1] != ']') + return false; + + // Exclude the square brackets. + Component ipv6_comp(host.begin + 1, host.len - 2); + + // Parse the IPv6 address -- identify where all the colon separated hex + // components are, the "::" contraction, and the embedded IPv4 address. + IPv6Parsed ipv6_parsed; + if (!DoParseIPv6(spec, ipv6_comp, &ipv6_parsed)) + return false; + + // Do some basic size checks to make sure that the address doesn't + // specify more than 128 bits or fewer than 128 bits. This also resolves + // how many zero bytes the "::" contraction represents. + int num_bytes_of_contraction; + if (!CheckIPv6ComponentsSize(ipv6_parsed, &num_bytes_of_contraction)) + return false; + + int cur_index_in_address = 0; + + // Loop through each hex component, and the contraction, in order. + for (int i = 0; i <= ipv6_parsed.num_hex_components; ++i) { + // Append the contraction if it appears before this component. + if (i == ipv6_parsed.index_of_contraction) { + for (int j = 0; j < num_bytes_of_contraction; ++j) + address[cur_index_in_address++] = 0; + } + // Append the hex component's value. + if (i != ipv6_parsed.num_hex_components) { + // Get the 16-bit value for this hex component. + uint16_t number = IPv6HexComponentToNumber( + spec, ipv6_parsed.hex_components[i]); + // Append to |address|, in network byte order. + address[cur_index_in_address++] = (number & 0xFF00) >> 8; + address[cur_index_in_address++] = (number & 0x00FF); + } + } + + // If there was an IPv4 section, convert it into a 32-bit number and append + // it to |address|. + if (ipv6_parsed.ipv4_component.is_valid()) { + // Append the 32-bit number to |address|. + int num_ipv4_components = 0; + // IPv4AddressToNumber will remove the trailing dot from the component. 
+ bool trailing_dot = ipv6_parsed.ipv4_component.is_nonempty() && + spec[ipv6_parsed.ipv4_component.end() - 1] == '.'; + // The URL standard requires the embedded IPv4 address to be concisely + // composed of 4 parts and disallows terminal dots. + // See https://url.spec.whatwg.org/#concept-ipv6-parser + if (CanonHostInfo::IPV4 != + IPv4AddressToNumber(spec, ipv6_parsed.ipv4_component, + &address[cur_index_in_address], + &num_ipv4_components)) { + return false; + } + if ((num_ipv4_components != 4 || trailing_dot) && + base::FeatureList::IsEnabled( + url::kStrictIPv4EmbeddedIPv6AddressParsing)) { + return false; + } + } + + return true; +} + +// Searches for the longest sequence of zeros in |address|, and writes the +// range into |contraction_range|. The run of zeros must be at least 16 bits, +// and if there is a tie the first is chosen. +void ChooseIPv6ContractionRange(const unsigned char address[16], + Component* contraction_range) { + // The longest run of zeros in |address| seen so far. + Component max_range; + + // The current run of zeros in |address| being iterated over. + Component cur_range; + + for (int i = 0; i < 16; i += 2) { + // Test for 16 bits worth of zero. + bool is_zero = (address[i] == 0 && address[i + 1] == 0); + + if (is_zero) { + // Add the zero to the current range (or start a new one). + if (!cur_range.is_valid()) + cur_range = Component(i, 0); + cur_range.len += 2; + } + + if (!is_zero || i == 14) { + // Just completed a run of zeros. If the run is greater than 16 bits, + // it is a candidate for the contraction. + if (cur_range.len > 2 && cur_range.len > max_range.len) { + max_range = cur_range; + } + cur_range.reset(); + } + } + *contraction_range = max_range; +} + +// Return true if we've made a final IPV6/BROKEN decision, false if the result +// is NEUTRAL, and we could use a second opinion. 
+template +bool DoCanonicalizeIPv6Address(const CHAR* spec, + const Component& host, + CanonOutput* output, + CanonHostInfo* host_info) { + // Turn the IP address into a 128 bit number. + if (!IPv6AddressToNumber(spec, host, host_info->address)) { + // If it's not an IPv6 address, scan for characters that should *only* + // exist in an IPv6 address. + for (int i = host.begin; i < host.end(); i++) { + switch (spec[i]) { + case '[': + case ']': + case ':': + host_info->family = CanonHostInfo::BROKEN; + return true; + } + } + + // No invalid characters. Could still be IPv4 or a hostname. + host_info->family = CanonHostInfo::NEUTRAL; + return false; + } + + host_info->out_host.begin = output->length(); + output->push_back('['); + AppendIPv6Address(host_info->address, output); + output->push_back(']'); + host_info->out_host.len = output->length() - host_info->out_host.begin; + + host_info->family = CanonHostInfo::IPV6; + return true; +} + +} // namespace + +void AppendIPv4Address(const unsigned char address[4], CanonOutput* output) { + for (int i = 0; i < 4; i++) { + char str[16]; + _itoa_s(address[i], str, 10); + + for (int ch = 0; str[ch] != 0; ch++) + output->push_back(str[ch]); + + if (i != 3) + output->push_back('.'); + } +} + +void AppendIPv6Address(const unsigned char address[16], CanonOutput* output) { + // We will output the address according to the rules in: + // http://tools.ietf.org/html/draft-kawamura-ipv6-text-representation-01#section-4 + + // Start by finding where to place the "::" contraction (if any). + Component contraction_range; + ChooseIPv6ContractionRange(address, &contraction_range); + + for (int i = 0; i <= 14;) { + // We check 2 bytes at a time, from bytes (0, 1) to (14, 15), inclusive. + DCHECK(i % 2 == 0); + if (i == contraction_range.begin && contraction_range.len > 0) { + // Jump over the contraction. 
+ if (i == 0) + output->push_back(':'); + output->push_back(':'); + i = contraction_range.end(); + } else { + // Consume the next 16 bits from |address|. + int x = address[i] << 8 | address[i + 1]; + + i += 2; + + // Stringify the 16 bit number (at most requires 4 hex digits). + char str[5]; + _itoa_s(x, str, 16); + for (int ch = 0; str[ch] != 0; ++ch) + output->push_back(str[ch]); + + // Put a colon after each number, except the last. + if (i < 16) + output->push_back(':'); + } + } +} + +void CanonicalizeIPAddress(const char* spec, + const Component& host, + CanonOutput* output, + CanonHostInfo* host_info) { + if (DoCanonicalizeIPv4Address( + spec, host, output, host_info)) + return; + if (DoCanonicalizeIPv6Address( + spec, host, output, host_info)) + return; +} + +void CanonicalizeIPAddress(const char16_t* spec, + const Component& host, + CanonOutput* output, + CanonHostInfo* host_info) { + if (DoCanonicalizeIPv4Address(spec, host, output, + host_info)) + return; + if (DoCanonicalizeIPv6Address(spec, host, output, + host_info)) + return; +} + +CanonHostInfo::Family IPv4AddressToNumber(const char* spec, + const Component& host, + unsigned char address[4], + int* num_ipv4_components) { + return DoIPv4AddressToNumber(spec, host, address, + num_ipv4_components); +} + +CanonHostInfo::Family IPv4AddressToNumber(const char16_t* spec, + const Component& host, + unsigned char address[4], + int* num_ipv4_components) { + return DoIPv4AddressToNumber(spec, host, address, + num_ipv4_components); +} + +bool IPv6AddressToNumber(const char* spec, + const Component& host, + unsigned char address[16]) { + return DoIPv6AddressToNumber(spec, host, address); +} + +bool IPv6AddressToNumber(const char16_t* spec, + const Component& host, + unsigned char address[16]) { + return DoIPv6AddressToNumber(spec, host, address); +} + +} // namespace url diff --git a/url_canon_ip.h b/url_canon_ip.h new file mode 100644 index 00000000000..86be08a5f14 --- /dev/null +++ b/url_canon_ip.h @@ -0,0 
+1,60 @@ +// Copyright 2013 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef URL_URL_CANON_IP_H_ +#define URL_URL_CANON_IP_H_ + +#include "base/component_export.h" +#include "url/third_party/mozilla/url_parse.h" +#include "url/url_canon.h" + +namespace url { + +// Writes the given IPv4 address to |output|. +COMPONENT_EXPORT(URL) +void AppendIPv4Address(const unsigned char address[4], CanonOutput* output); + +// Writes the given IPv6 address to |output|. +COMPONENT_EXPORT(URL) +void AppendIPv6Address(const unsigned char address[16], CanonOutput* output); + +// Converts an IPv4 address to a 32-bit number (network byte order). +// +// Possible return values: +// IPV4 - IPv4 address was successfully parsed. +// BROKEN - Input was formatted like an IPv4 address, but overflow occurred +// during parsing. +// NEUTRAL - Input couldn't possibly be interpreted as an IPv4 address. +// It might be an IPv6 address, or a hostname. +// +// On success, |num_ipv4_components| will be populated with the number of +// components in the IPv4 address. +COMPONENT_EXPORT(URL) +CanonHostInfo::Family IPv4AddressToNumber(const char* spec, + const Component& host, + unsigned char address[4], + int* num_ipv4_components); +COMPONENT_EXPORT(URL) +CanonHostInfo::Family IPv4AddressToNumber(const char16_t* spec, + const Component& host, + unsigned char address[4], + int* num_ipv4_components); + +// Converts an IPv6 address to a 128-bit number (network byte order), returning +// true on success. False means that the input was not a valid IPv6 address. +// +// NOTE that |host| is expected to be surrounded by square brackets. +// i.e. "[::1]" rather than "::1". 
+COMPONENT_EXPORT(URL) +bool IPv6AddressToNumber(const char* spec, + const Component& host, + unsigned char address[16]); +COMPONENT_EXPORT(URL) +bool IPv6AddressToNumber(const char16_t* spec, + const Component& host, + unsigned char address[16]); + +} // namespace url + +#endif // URL_URL_CANON_IP_H_ diff --git a/url_canon_mailtourl.cc b/url_canon_mailtourl.cc new file mode 100644 index 00000000000..e48b6422f8a --- /dev/null +++ b/url_canon_mailtourl.cc @@ -0,0 +1,127 @@ +// Copyright 2013 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Functions for canonicalizing "mailto:" URLs. + +#include "url/url_canon.h" +#include "url/url_canon_internal.h" +#include "url/url_file.h" +#include "url/url_parse_internal.h" + +namespace url { + +namespace { + +// Certain characters should be percent-encoded when they appear in the path +// component of a mailto URL, to improve compatibility and mitigate against +// command-injection attacks on mailto handlers. See https://crbug.com/711020. +template +bool ShouldEncodeMailboxCharacter(UCHAR uch) { + if (uch < 0x21 || // space & control characters. + uch > 0x7e || // high-ascii characters. + uch == 0x22 || // quote. + uch == 0x3c || uch == 0x3e || // angle brackets. + uch == 0x60 || // backtick. + uch == 0x7b || uch == 0x7c || uch == 0x7d // braces and pipe. + ) { + return true; + } + return false; +} + +template +bool DoCanonicalizeMailtoURL(const URLComponentSource& source, + const Parsed& parsed, + CanonOutput* output, + Parsed* new_parsed) { + // mailto: only uses {scheme, path, query} -- clear the rest. + new_parsed->username = Component(); + new_parsed->password = Component(); + new_parsed->host = Component(); + new_parsed->port = Component(); + new_parsed->ref = Component(); + + // Scheme (known, so we don't bother running it through the more + // complicated scheme canonicalizer). 
+ new_parsed->scheme.begin = output->length(); + output->Append("mailto:", 7); + new_parsed->scheme.len = 6; + + bool success = true; + + // Path + if (parsed.path.is_valid()) { + new_parsed->path.begin = output->length(); + + // Copy the path using path URL's more lax escaping rules. + // We convert to UTF-8 and escape non-ASCII, but leave most + // ASCII characters alone. + size_t end = static_cast(parsed.path.end()); + for (size_t i = static_cast(parsed.path.begin); i < end; ++i) { + UCHAR uch = static_cast(source.path[i]); + if (ShouldEncodeMailboxCharacter(uch)) + success &= AppendUTF8EscapedChar(source.path, &i, end, output); + else + output->push_back(static_cast(uch)); + } + + new_parsed->path.len = output->length() - new_parsed->path.begin; + } else { + // No path at all + new_parsed->path.reset(); + } + + // Query -- always use the default UTF8 charset converter. + CanonicalizeQuery(source.query, parsed.query, NULL, + output, &new_parsed->query); + + return success; +} + +} // namespace + +bool CanonicalizeMailtoURL(const char* spec, + int spec_len, + const Parsed& parsed, + CanonOutput* output, + Parsed* new_parsed) { + return DoCanonicalizeMailtoURL( + URLComponentSource(spec), parsed, output, new_parsed); +} + +bool CanonicalizeMailtoURL(const char16_t* spec, + int spec_len, + const Parsed& parsed, + CanonOutput* output, + Parsed* new_parsed) { + return DoCanonicalizeMailtoURL( + URLComponentSource(spec), parsed, output, new_parsed); +} + +bool ReplaceMailtoURL(const char* base, + const Parsed& base_parsed, + const Replacements& replacements, + CanonOutput* output, + Parsed* new_parsed) { + URLComponentSource source(base); + Parsed parsed(base_parsed); + SetupOverrideComponents(base, replacements, &source, &parsed); + return DoCanonicalizeMailtoURL( + source, parsed, output, new_parsed); +} + +bool ReplaceMailtoURL(const char* base, + const Parsed& base_parsed, + const Replacements& replacements, + CanonOutput* output, + Parsed* new_parsed) { + 
RawCanonOutput<1024> utf8; + URLComponentSource source(base); + Parsed parsed(base_parsed); + SetupUTF16OverrideComponents(base, replacements, &utf8, &source, &parsed); + return DoCanonicalizeMailtoURL( + source, parsed, output, new_parsed); +} + +} // namespace url diff --git a/url_canon_path.cc b/url_canon_path.cc new file mode 100644 index 00000000000..676468d5dfb --- /dev/null +++ b/url_canon_path.cc @@ -0,0 +1,474 @@ +// Copyright 2013 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include + +#include "base/check.h" +#include "base/check_op.h" +#include "third_party/abseil-cpp/absl/types/optional.h" +#include "url/url_canon.h" +#include "url/url_canon_internal.h" +#include "url/url_parse_internal.h" + +namespace url { + +namespace { + +enum CharacterFlags { + // Pass through unchanged, whether escaped or unescaped. This doesn't + // actually set anything so you can't OR it to check, it's just to make the + // table below more clear when neither ESCAPE nor UNESCAPE is set. + PASS = 0, + + // This character requires special handling in DoPartialPathInternal. Doing + // this test + // first allows us to filter out the common cases of regular characters that + // can be directly copied. + SPECIAL = 1, + + // This character must be escaped in the canonical output. Note that all + // escaped chars also have the "special" bit set so that the code that looks + // for this is triggered. Not valid with PASS or UNESCAPE + ESCAPE_BIT = 2, + ESCAPE = ESCAPE_BIT | SPECIAL, + + // This character must be unescaped in canonical output. Not valid with + // ESCAPE or PASS. We DON'T set the SPECIAL flag since if we encounter these + // characters unescaped, they should just be copied. + UNESCAPE = 4, + + // This character is disallowed in URLs. Note that the "special" bit is also + // set to trigger handling. 
+ INVALID_BIT = 8, + INVALID = INVALID_BIT | SPECIAL, +}; + +// This table contains one of the above flag values. Note some flags are more +// than one bits because they also turn on the "special" flag. Special is the +// only flag that may be combined with others. +// +// This table is designed to match exactly what IE does with the characters. +// +// Dot is even more special, and the escaped version is handled specially by +// IsDot. Therefore, we don't need the "escape" flag, and even the "unescape" +// bit is never handled (we just need the "special") bit. +const unsigned char kPathCharLookup[0x100] = { +// NULL control chars... + INVALID, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, +// control chars... + ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, +// ' ' ! " # $ % & ' ( ) * + , - . / + ESCAPE, PASS, ESCAPE, ESCAPE, PASS, ESCAPE, PASS, PASS, PASS, PASS, PASS, PASS, PASS, UNESCAPE,SPECIAL, PASS, +// 0 1 2 3 4 5 6 7 8 9 : ; < = > ? 
+ UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,PASS, PASS, ESCAPE, PASS, ESCAPE, ESCAPE, +// @ A B C D E F G H I J K L M N O + PASS, UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE, +// P Q R S T U V W X Y Z [ \ ] ^ _ + UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,PASS, ESCAPE, PASS, ESCAPE, UNESCAPE, +// ` a b c d e f g h i j k l m n o + ESCAPE, UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE, +// p q r s t u v w x y z { | } ~ + UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,ESCAPE, ESCAPE, ESCAPE, UNESCAPE,ESCAPE, +// ...all the high-bit characters are escaped + ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, + ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, + ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, + ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, + ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, + ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, + ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, + ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE}; + +enum DotDisposition { + // The given dot is just part of a 
filename and is not special. + NOT_A_DIRECTORY, + + // The given dot is the current directory. + DIRECTORY_CUR, + + // The given dot is the first of a double dot that should take us up one. + DIRECTORY_UP +}; + +// When the path resolver finds a dot, this function is called with the +// character following that dot to see what it is. The return value +// indicates what type this dot is (see above). This code handles the case +// where the dot is at the end of the input. +// +// |*consumed_len| will contain the number of characters in the input that +// express what we found. +// +// If the input is "../foo", |after_dot| = 1, |end| = 6, and +// at the end, |*consumed_len| = 2 for the "./" this function consumed. The +// original dot length should be handled by the caller. +template +DotDisposition ClassifyAfterDot(const CHAR* spec, + size_t after_dot, + size_t end, + size_t* consumed_len) { + if (after_dot == end) { + // Single dot at the end. + *consumed_len = 0; + return DIRECTORY_CUR; + } + if (IsURLSlash(spec[after_dot])) { + // Single dot followed by a slash. + *consumed_len = 1; // Consume the slash + return DIRECTORY_CUR; + } + + size_t second_dot_len = IsDot(spec, after_dot, end); + if (second_dot_len) { + size_t after_second_dot = after_dot + second_dot_len; + if (after_second_dot == end) { + // Double dot at the end. + *consumed_len = second_dot_len; + return DIRECTORY_UP; + } + if (IsURLSlash(spec[after_second_dot])) { + // Double dot followed by a slash. + *consumed_len = second_dot_len + 1; + return DIRECTORY_UP; + } + } + + // The dots are followed by something else, not a directory. + *consumed_len = 0; + return NOT_A_DIRECTORY; +} + +// Rewinds the output to the previous slash. It is assumed that the output +// ends with a slash and this doesn't count (we call this when we are +// appending directory paths, so the previous path component has an ending +// slash). 
+//
+// This will stop at the first slash (assumed to be at position
+// |path_begin_in_output|) and not go any higher than that. Some web pages
+// do ".." too many times, so we need to handle that brokenness.
+//
+// It searches for a literal slash rather than including a backslash as well
+// because it is run only on the canonical output.
+//
+// The output is guaranteed to end in a slash when this function completes.
+void BackUpToPreviousSlash(size_t path_begin_in_output, CanonOutput* output) {
+  CHECK(output->length() > 0);
+  CHECK(path_begin_in_output < output->length());
+
+  size_t i = output->length() - 1;
+  DCHECK(output->at(i) == '/');
+  if (i == path_begin_in_output)
+    return;  // We're at the first slash, nothing to do.
+
+  // Now back up (skipping the trailing slash) until we find another slash.
+  do {
+    --i;
+  } while (output->at(i) != '/' && i > path_begin_in_output);
+
+  // Now shrink the output to just include that last slash we found.
+  output->set_length(i + 1);
+}
+
+// Looks for problematic nested escape sequences and escapes the output as
+// needed to ensure they can't be misinterpreted.
+//
+// Our concern is that an input escape sequence that's invalid because it
+// contains nested escape sequences might look valid once those are unescaped.
+// For example, "%%300" is not a valid escape sequence, but after unescaping the
+// inner "%30" this becomes "%00" which is valid. Leaving this in the output
+// string can result in callers re-canonicalizing the string and unescaping this
+// sequence, thus resulting in something fundamentally different than the
+// original input here. This can cause a variety of problems.
+//
+// This function is called after we've just unescaped a sequence that's within
+// two output characters of a previous '%' that we know didn't begin a valid
+// escape sequence in the input string.
+// We look for whether the output is going
+// to turn into a valid escape sequence, and if so, convert the initial '%' into
+// an escaped "%25" so the output can't be misinterpreted.
+//
+// |spec| is the input string we're canonicalizing.
+// |next_input_index| is the index of the next unprocessed character in |spec|.
+// |input_len| is the length of |spec|.
+// |last_invalid_percent_index| is the index in |output| of a previously-seen
+// '%' character. The caller knows this '%' character isn't followed by a valid
+// escape sequence in the input string.
+// |output| is the canonicalized output thus far. The caller guarantees this
+// ends with a '%' followed by one or two characters, and the '%' is the one
+// pointed to by |last_invalid_percent_index|. The last character in the string
+// was just unescaped.
+template <typename CHAR>
+void CheckForNestedEscapes(const CHAR* spec,
+                           size_t next_input_index,
+                           size_t input_len,
+                           size_t last_invalid_percent_index,
+                           CanonOutput* output) {
+  const size_t length = output->length();
+  const char last_unescaped_char = output->at(length - 1);
+
+  // If |output| currently looks like "%c", we need to try appending the next
+  // input character to see if this will result in a problematic escape
+  // sequence. Note that this won't trigger on the first nested escape of a
+  // two-escape sequence like "%%30%30" -- we'll allow the conversion to
+  // "%0%30" -- but the second nested escape will be caught by this function
+  // when it's called again in that case.
+  const bool append_next_char = last_invalid_percent_index == length - 2;
+  if (append_next_char) {
+    // If the input doesn't contain a 7-bit character next, this case won't be a
+    // problem.
+    if ((next_input_index == input_len) || (spec[next_input_index] >= 0x80))
+      return;
+    output->push_back(static_cast<char>(spec[next_input_index]));
+  }
+
+  // Now output ends like "%cc". Try to unescape this.
+  size_t begin = last_invalid_percent_index;
+  unsigned char temp;
+  if (DecodeEscaped(output->data(), &begin, output->length(), &temp)) {
+    // New escape sequence found. Overwrite the characters following the '%'
+    // with "25", and push_back() the one or two characters that were following
+    // the '%' when we were called.
+    if (!append_next_char)
+      output->push_back(output->at(last_invalid_percent_index + 1));
+    output->set(last_invalid_percent_index + 1, '2');
+    output->set(last_invalid_percent_index + 2, '5');
+    output->push_back(last_unescaped_char);
+  } else if (append_next_char) {
+    // Not a valid escape sequence, but we still need to undo appending the next
+    // source character so the caller can process it normally.
+    output->set_length(length);
+  }
+}
+
+// Canonicalizes and appends the given path to the output. It assumes that if
+// the input path starts with a slash, it should be copied to the output.
+//
+// If there are already path components (this mode is used when appending
+// relative paths for resolving), it assumes that the output already has
+// a trailing slash and that if the input begins with a slash, it should be
+// copied to the output.
+//
+// We do not collapse multiple slashes in a row to a single slash. It seems
+// no web browsers do this, and we don't want incompatibilities, even though
+// it would be correct for most systems.
+template <typename CHAR, typename UCHAR>
+bool DoPartialPathInternal(const CHAR* spec,
+                           const Component& path,
+                           size_t path_begin_in_output,
+                           CanonOutput* output) {
+  if (path.is_empty())
+    return true;
+
+  size_t end = static_cast<size_t>(path.end());
+
+  // We use this variable to minimize the amount of work done when unescaping --
+  // we'll only call CheckForNestedEscapes() when this points at one of the last
+  // couple of characters in |output|.
+  absl::optional<size_t> last_invalid_percent_index;
+
+  bool success = true;
+  for (size_t i = static_cast<size_t>(path.begin); i < end; i++) {
+    UCHAR uch = static_cast<UCHAR>(spec[i]);
+    if (sizeof(CHAR) > 1 && uch >= 0x80) {
+      // We only need to test wide input for having non-ASCII characters. For
+      // narrow input, we'll always just use the lookup table. We don't try to
+      // do anything tricky with decoding/validating UTF-8. This function will
+      // read one or two UTF-16 characters and append the output as UTF-8. This
+      // call will be removed in 8-bit mode.
+      success &= AppendUTF8EscapedChar(spec, &i, end, output);
+    } else {
+      // Normal ASCII character or 8-bit input, use the lookup table.
+      unsigned char out_ch = static_cast<unsigned char>(uch);
+      unsigned char flags = kPathCharLookup[out_ch];
+      if (flags & SPECIAL) {
+        // Needs special handling of some sort.
+        size_t dotlen;
+        if ((dotlen = IsDot(spec, i, end)) > 0) {
+          // See if this dot was preceded by a slash in the output.
+          //
+          // Note that we check this in the case of dots so we don't have to
+          // special case slashes. Since slashes are much more common than
+          // dots, this actually increases performance measurably (though
+          // slightly).
+          if (output->length() > path_begin_in_output &&
+              output->at(output->length() - 1) == '/') {
+            // Slash followed by a dot, check to see if this means relative
+            size_t consumed_len;
+            switch (ClassifyAfterDot<CHAR>(spec, i + dotlen, end,
+                                           &consumed_len)) {
+              case NOT_A_DIRECTORY:
+                // Copy the dot to the output, it means nothing special.
+                output->push_back('.');
+                i += dotlen - 1;
+                break;
+              case DIRECTORY_CUR:  // Current directory, just skip the input.
+                i += dotlen + consumed_len - 1;
+                break;
+              case DIRECTORY_UP:
+                BackUpToPreviousSlash(path_begin_in_output, output);
+                if (last_invalid_percent_index >= output->length()) {
+                  last_invalid_percent_index = absl::nullopt;
+                }
+                i += dotlen + consumed_len - 1;
+                break;
+            }
+          } else {
+            // This dot is not preceded by a slash, it is just part of some
+            // file name.
+            output->push_back('.');
+            i += dotlen - 1;
+          }
+
+        } else if (out_ch == '\\') {
+          // Convert backslashes to forward slashes
+          output->push_back('/');
+
+        } else if (out_ch == '%') {
+          // Handle escape sequences.
+          unsigned char unescaped_value;
+          if (DecodeEscaped(spec, &i, end, &unescaped_value)) {
+            // Valid escape sequence, see if we keep, reject, or unescape it.
+            // Note that at this point DecodeEscaped() will have advanced |i| to
+            // the last character of the escape sequence.
+            char unescaped_flags = kPathCharLookup[unescaped_value];
+
+            if (unescaped_flags & UNESCAPE) {
+              // This escaped value shouldn't be escaped. Try to copy it.
+              output->push_back(unescaped_value);
+              // If we just unescaped a value within 2 output characters of the
+              // '%' from a previously-detected invalid escape sequence, we
+              // might have an input string with problematic nested escape
+              // sequences; detect and fix them.
+              if (last_invalid_percent_index.has_value() &&
+                  ((last_invalid_percent_index.value() + 3) >=
+                   output->length())) {
+                CheckForNestedEscapes(spec, i + 1, end,
+                                      last_invalid_percent_index.value(),
+                                      output);
+              }
+            } else {
+              // Either this is an invalid escaped character, or it's a valid
+              // escaped character we should keep escaped. In the first case we
+              // should just copy it exactly and remember the error. In the
+              // second we also copy exactly in case the server is sensitive to
+              // changing the case of any hex letters.
+              output->push_back('%');
+              output->push_back(static_cast<char>(spec[i - 1]));
+              output->push_back(static_cast<char>(spec[i]));
+              if (unescaped_flags & INVALID_BIT)
+                success = false;
+            }
+          } else {
+            // Invalid escape sequence. IE7+ rejects any URLs with such
+            // sequences, while other browsers pass them through unchanged. We
+            // use the permissive behavior.
+            // TODO(brettw): Consider testing IE's strict behavior, which would
+            // allow removing the code to handle nested escapes above.
+            last_invalid_percent_index = output->length();
+            output->push_back('%');
+          }
+
+        } else if (flags & INVALID_BIT) {
+          // For NULLs, etc. fail.
+          AppendEscapedChar(out_ch, output);
+          success = false;
+
+        } else if (flags & ESCAPE_BIT) {
+          // This character should be escaped.
+          AppendEscapedChar(out_ch, output);
+        }
+      } else {
+        // Nothing special about this character, just append it.
+        output->push_back(out_ch);
+      }
+    }
+  }
+  return success;
+}
+
+// Perform the same logic as in DoPartialPathInternal(), but updates the
+// publicly exposed CanonOutput structure similar to DoPath(). Returns
+// true if successful.
+template <typename CHAR, typename UCHAR>
+bool DoPartialPath(const CHAR* spec,
+                   const Component& path,
+                   CanonOutput* output,
+                   Component* out_path) {
+  out_path->begin = output->length();
+  bool success =
+      DoPartialPathInternal<CHAR, UCHAR>(spec, path, out_path->begin, output);
+  out_path->len = output->length() - out_path->begin;
+  return success;
+}
+
+template <typename CHAR, typename UCHAR>
+bool DoPath(const CHAR* spec,
+            const Component& path,
+            CanonOutput* output,
+            Component* out_path) {
+  bool success = true;
+  out_path->begin = output->length();
+  if (path.is_nonempty()) {
+    // Write out an initial slash if the input has none. If we just parse a URL
+    // and then canonicalize it, it will of course have a slash already. This
+    // check is for the replacement and relative URL resolving cases of file
+    // URLs.
+    if (!IsURLSlash(spec[path.begin]))
+      output->push_back('/');
+
+    success =
+        DoPartialPathInternal<CHAR, UCHAR>(spec, path, out_path->begin, output);
+  } else {
+    // No input, canonical path is a slash.
+    output->push_back('/');
+  }
+  out_path->len = output->length() - out_path->begin;
+  return success;
+}
+
+}  // namespace
+
+bool CanonicalizePath(const char* spec,
+                      const Component& path,
+                      CanonOutput* output,
+                      Component* out_path) {
+  return DoPath<char, unsigned char>(spec, path, output, out_path);
+}
+
+bool CanonicalizePath(const char16_t* spec,
+                      const Component& path,
+                      CanonOutput* output,
+                      Component* out_path) {
+  return DoPath<char16_t, char16_t>(spec, path, output, out_path);
+}
+
+bool CanonicalizePartialPath(const char* spec,
+                             const Component& path,
+                             CanonOutput* output,
+                             Component* out_path) {
+  return DoPartialPath<char, unsigned char>(spec, path, output, out_path);
+}
+
+bool CanonicalizePartialPath(const char16_t* spec,
+                             const Component& path,
+                             CanonOutput* output,
+                             Component* out_path) {
+  return DoPartialPath<char16_t, char16_t>(spec, path, output, out_path);
+}
+
+bool CanonicalizePartialPathInternal(const char* spec,
+                                     const Component& path,
+                                     size_t path_begin_in_output,
+                                     CanonOutput* output) {
+  return DoPartialPathInternal<char, unsigned char>(
+      spec, path, path_begin_in_output, output);
+}
+
+bool CanonicalizePartialPathInternal(const char16_t* spec,
+                                     const Component& path,
+                                     size_t path_begin_in_output,
+                                     CanonOutput* output) {
+  return DoPartialPathInternal<char16_t, char16_t>(
+      spec, path, path_begin_in_output, output);
+}
+
+}  // namespace url
diff --git a/url_canon_pathurl.cc b/url_canon_pathurl.cc
new file mode 100644
index 00000000000..85983a8c3a7
--- /dev/null
+++ b/url_canon_pathurl.cc
@@ -0,0 +1,144 @@
+// Copyright 2013 The Chromium Authors
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Functions for canonicalizing "path" URLs. Not to be confused with the path
+// of a URL, these are URLs that have no authority section, only a path. For
+// example, "javascript:" and "data:".
+
+#include "url/url_canon.h"
+#include "url/url_canon_internal.h"
+
+namespace url {
+
+namespace {
+
+// Canonicalize the given |component| from |source| into |output| and
+// |new_component|. If |separator| is non-zero, it is pre-pended to |output|
+// prior to the canonicalized component; i.e. for the '?' or '#' characters.
+template <typename CHAR, typename UCHAR>
+void DoCanonicalizePathComponent(const CHAR* source,
+                                 const Component& component,
+                                 char separator,
+                                 CanonOutput* output,
+                                 Component* new_component) {
+  if (component.is_valid()) {
+    if (separator)
+      output->push_back(separator);
+    // Copy the path using path URL's more lax escaping rules (think for
+    // javascript:). We convert to UTF-8 and escape characters from the
+    // C0 control percent-encode set, but leave all other characters alone.
+    // This helps readability of JavaScript.
+    // https://url.spec.whatwg.org/#cannot-be-a-base-url-path-state
+    // https://url.spec.whatwg.org/#c0-control-percent-encode-set
+    new_component->begin = output->length();
+    size_t end = static_cast<size_t>(component.end());
+    for (size_t i = static_cast<size_t>(component.begin); i < end; i++) {
+      UCHAR uch = static_cast<UCHAR>(source[i]);
+      if (uch < 0x20 || uch > 0x7E)
+        AppendUTF8EscapedChar(source, &i, end, output);
+      else
+        output->push_back(static_cast<char>(uch));
+    }
+    new_component->len = output->length() - new_component->begin;
+  } else {
+    // Empty part.
+    new_component->reset();
+  }
+}
+
+template <typename CHAR, typename UCHAR>
+bool DoCanonicalizePathURL(const URLComponentSource<CHAR>& source,
+                           const Parsed& parsed,
+                           CanonOutput* output,
+                           Parsed* new_parsed) {
+  // Scheme: this will append the colon.
+  bool success = CanonicalizeScheme(source.scheme, parsed.scheme,
+                                    output, &new_parsed->scheme);
+
+  // We assume there's no authority for path URLs. Note that hosts should never
+  // have -1 length.
+  new_parsed->username.reset();
+  new_parsed->password.reset();
+  new_parsed->host.reset();
+  new_parsed->port.reset();
+
+  // Canonicalize path via the weaker path URL rules.
+  //
+  // Note: parsing the path part should never cause a failure, see
+  // https://url.spec.whatwg.org/#cannot-be-a-base-url-path-state
+  DoCanonicalizePathComponent<CHAR, UCHAR>(source.path, parsed.path, '\0',
+                                           output, &new_parsed->path);
+
+  // Similar to mailto:, always use the default UTF-8 charset converter for
+  // query.
+  CanonicalizeQuery(source.query, parsed.query, nullptr, output,
+                    &new_parsed->query);
+
+  CanonicalizeRef(source.ref, parsed.ref, output, &new_parsed->ref);
+
+  return success;
+}
+
+}  // namespace
+
+bool CanonicalizePathURL(const char* spec,
+                         int spec_len,
+                         const Parsed& parsed,
+                         CanonOutput* output,
+                         Parsed* new_parsed) {
+  return DoCanonicalizePathURL<char, unsigned char>(
+      URLComponentSource<char>(spec), parsed, output, new_parsed);
+}
+
+bool CanonicalizePathURL(const char16_t* spec,
+                         int spec_len,
+                         const Parsed& parsed,
+                         CanonOutput* output,
+                         Parsed* new_parsed) {
+  return DoCanonicalizePathURL<char16_t, char16_t>(
+      URLComponentSource<char16_t>(spec), parsed, output, new_parsed);
+}
+
+void CanonicalizePathURLPath(const char* source,
+                             const Component& component,
+                             CanonOutput* output,
+                             Component* new_component) {
+  DoCanonicalizePathComponent<char, unsigned char>(source, component, '\0',
+                                                   output, new_component);
+}
+
+void CanonicalizePathURLPath(const char16_t* source,
+                             const Component& component,
+                             CanonOutput* output,
+                             Component* new_component) {
+  DoCanonicalizePathComponent<char16_t, char16_t>(source, component, '\0',
+                                                  output, new_component);
+}
+
+bool ReplacePathURL(const char* base,
+                    const Parsed& base_parsed,
+                    const Replacements<char>& replacements,
+                    CanonOutput* output,
+                    Parsed* new_parsed) {
+  URLComponentSource<char> source(base);
+  Parsed parsed(base_parsed);
+  SetupOverrideComponents(base, replacements, &source, &parsed);
+  return DoCanonicalizePathURL<char, unsigned char>(
+      source, parsed, output, new_parsed);
+}
+
+bool ReplacePathURL(const char* base,
+                    const Parsed& base_parsed,
+                    const Replacements<char16_t>& replacements,
+                    CanonOutput* output,
+                    Parsed* new_parsed) {
+  RawCanonOutput<1024> utf8;
+  URLComponentSource<char> source(base);
+  Parsed parsed(base_parsed);
+  SetupUTF16OverrideComponents(base, replacements, &utf8, &source, &parsed);
+  return DoCanonicalizePathURL<char, unsigned char>(
+      source, parsed, output, new_parsed);
+}
+
+}  // namespace url
diff --git a/url_canon_query.cc b/url_canon_query.cc
new file mode 100644
index 00000000000..47d20d1ffb6
--- /dev/null
+++ b/url_canon_query.cc
@@ -0,0 +1,149 @@
+// Copyright 2013 The Chromium Authors
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "url/url_canon.h"
+#include "url/url_canon_internal.h"
+
+// Query canonicalization in IE
+// ----------------------------
+// IE is very permissive for query parameters specified in links on the page
+// (in contrast to links that it constructs itself based on form data). It does
+// not unescape any character. It does not reject any escape sequence (be they
+// invalid like "%2y" or freaky like %00).
+//
+// IE only escapes spaces and nothing else. Embedded NULLs, tabs (0x09),
+// LF (0x0a), and CR (0x0d) are removed (this probably happens at an earlier
+// layer since they are removed from all portions of the URL). All other
+// characters are passed unmodified. Invalid UTF-16 sequences are preserved as
+// well, with each character in the input being converted to UTF-8. It is the
+// server's job to make sense of this invalid query.
+//
+// Invalid multibyte sequences (for example, invalid UTF-8 on a UTF-8 page)
+// are converted to the invalid character and sent as unescaped UTF-8 (0xef,
+// 0xbf, 0xbd). This may not be canonicalization, the parser may generate these
+// strings before the URL handler ever sees them.
+//
+// Our query canonicalization
+// --------------------------
+// We escape all non-ASCII characters and control characters, like Firefox.
+// This is more conformant to the URL spec, and there do not seem to be many
+// problems relating to Firefox's behavior.
+//
+// Like IE, we will never unescape (although the application may want to try
+// unescaping to present the user with a more understandable URL). We will
+// replace all invalid sequences (including invalid UTF-16 sequences, which IE
+// doesn't) with the "invalid character," and we will escape it.
+
+namespace url {
+
+namespace {
+
+// Appends the given string to the output, escaping characters that do not
+// match the given |type| in SharedCharTypes. This version will accept 8 or 16
+// bit characters, but assumes that they have only 7-bit values. It also assumes
+// that all UTF-8 values are correct, so doesn't bother checking
+template <typename CHAR>
+void AppendRaw8BitQueryString(const CHAR* source, int length,
+                              CanonOutput* output) {
+  for (int i = 0; i < length; i++) {
+    if (!IsQueryChar(static_cast<unsigned char>(source[i])))
+      AppendEscapedChar(static_cast<unsigned char>(source[i]), output);
+    else  // Doesn't need escaping.
+      output->push_back(static_cast<char>(source[i]));
+  }
+}
+
+// Runs the converter on the given UTF-8 input. Since the converter expects
+// UTF-16, we have to convert first. The converter must be non-NULL.
+void RunConverter(const char* spec,
+                  const Component& query,
+                  CharsetConverter* converter,
+                  CanonOutput* output) {
+  DCHECK(query.is_valid());
+  // This function will replace any misencoded values with the invalid
+  // character. This is what we want so we don't have to check for error.
+  RawCanonOutputW<1024> utf16;
+  ConvertUTF8ToUTF16(&spec[query.begin], static_cast<size_t>(query.len),
+                     &utf16);
+  converter->ConvertFromUTF16(utf16.data(), utf16.length(), output);
+}
+
+// Runs the converter with the given UTF-16 input. We don't have to do
+// anything, but this overridden function allows us to use the same code
+// for both UTF-8 and UTF-16 input.
+void RunConverter(const char16_t* spec,
+                  const Component& query,
+                  CharsetConverter* converter,
+                  CanonOutput* output) {
+  DCHECK(query.is_valid());
+  converter->ConvertFromUTF16(&spec[query.begin],
+                              static_cast<size_t>(query.len), output);
+}
+
+template <typename CHAR, typename UCHAR>
+void DoConvertToQueryEncoding(const CHAR* spec,
+                              const Component& query,
+                              CharsetConverter* converter,
+                              CanonOutput* output) {
+  if (converter) {
+    // Run the converter to get an 8-bit string, then append it, escaping
+    // necessary values.
+    RawCanonOutput<1024> eight_bit;
+    RunConverter(spec, query, converter, &eight_bit);
+    AppendRaw8BitQueryString(eight_bit.data(), eight_bit.length(), output);
+
+  } else {
+    // No converter, do our own UTF-8 conversion.
+    AppendStringOfType(&spec[query.begin], static_cast<size_t>(query.len),
+                       CHAR_QUERY, output);
+  }
+}
+
+template <typename CHAR, typename UCHAR>
+void DoCanonicalizeQuery(const CHAR* spec,
+                         const Component& query,
+                         CharsetConverter* converter,
+                         CanonOutput* output,
+                         Component* out_query) {
+  if (!query.is_valid()) {
+    *out_query = Component();
+    return;
+  }
+
+  output->push_back('?');
+  out_query->begin = output->length();
+
+  DoConvertToQueryEncoding<CHAR, UCHAR>(spec, query, converter, output);
+
+  out_query->len = output->length() - out_query->begin;
+}
+
+}  // namespace
+
+void CanonicalizeQuery(const char* spec,
+                       const Component& query,
+                       CharsetConverter* converter,
+                       CanonOutput* output,
+                       Component* out_query) {
+  DoCanonicalizeQuery<char, unsigned char>(spec, query, converter,
+                                           output, out_query);
+}
+
+void CanonicalizeQuery(const char16_t* spec,
+                       const Component& query,
+                       CharsetConverter* converter,
+                       CanonOutput* output,
+                       Component* out_query) {
+  DoCanonicalizeQuery<char16_t, char16_t>(spec, query, converter, output,
+                                          out_query);
+}
+
+void ConvertUTF16ToQueryEncoding(const char16_t* input,
+                                 const Component& query,
+                                 CharsetConverter* converter,
+                                 CanonOutput* output) {
+  DoConvertToQueryEncoding<char16_t, char16_t>(input, query, converter, output);
+}
+
+}  // namespace url
diff --git a/url_canon_relative.cc b/url_canon_relative.cc
new file mode
100644
index 00000000000..d8ea528a25a
--- /dev/null
+++ b/url_canon_relative.cc
@@ -0,0 +1,623 @@
+// Copyright 2013 The Chromium Authors
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Canonicalizer functions for working with and resolving relative URLs.
+
+#include <algorithm>
+#include <ostream>
+
+#include "base/check_op.h"
+#include "base/strings/string_util.h"
+#include "url/url_canon.h"
+#include "url/url_canon_internal.h"
+#include "url/url_constants.h"
+#include "url/url_features.h"
+#include "url/url_file.h"
+#include "url/url_parse_internal.h"
+#include "url/url_util.h"
+#include "url/url_util_internal.h"
+
+namespace url {
+
+namespace {
+
+// Firefox does a case-sensitive compare (which is probably wrong--Mozilla bug
+// 379034), whereas IE is case-insensitive.
+//
+// We choose to be more permissive like IE. We don't need to worry about
+// unescaping or anything here: neither IE nor Firefox allows this. We also
+// don't have to worry about invalid scheme characters since we are comparing
+// against the canonical scheme of the base.
+//
+// The base URL should always be canonical, therefore it should be ASCII.
+template <typename CHAR>
+bool AreSchemesEqual(const char* base,
+                     const Component& base_scheme,
+                     const CHAR* cmp,
+                     const Component& cmp_scheme) {
+  if (base_scheme.len != cmp_scheme.len)
+    return false;
+  for (int i = 0; i < base_scheme.len; i++) {
+    // We assume the base is already canonical, so we don't have to
+    // canonicalize it.
+    if (CanonicalSchemeChar(cmp[cmp_scheme.begin + i]) !=
+        base[base_scheme.begin + i])
+      return false;
+  }
+  return true;
+}
+
+#ifdef WIN32
+
+// Here, we also allow Windows paths to be represented as "/C:/" so we can be
+// consistent about URL paths beginning with slashes. This function is like
+// DoesBeginWindowsDrivePath except that it also requires a slash at the
+// beginning.
+template <typename CHAR>
+bool DoesBeginSlashWindowsDriveSpec(const CHAR* spec, int start_offset,
+                                    int spec_len) {
+  if (start_offset >= spec_len)
+    return false;
+  return IsURLSlash(spec[start_offset]) &&
+         DoesBeginWindowsDriveSpec(spec, start_offset + 1, spec_len);
+}
+
+#endif  // WIN32
+
+template <typename CHAR>
+bool IsValidScheme(const CHAR* url, const Component& scheme) {
+  // Caller should ensure that the |scheme| is not empty.
+  DCHECK_NE(0, scheme.len);
+
+  // From https://url.spec.whatwg.org/#scheme-start-state:
+  //   scheme start state:
+  //     1. If c is an ASCII alpha, append c, lowercased, to buffer, and set
+  //        state to scheme state.
+  //     2. Otherwise, if state override is not given, set state to no scheme
+  //        state, and decrease pointer by one.
+  //     3. Otherwise, validation error, return failure.
+  // Note that both step 2 and step 3 mean that the scheme was not valid.
+  if (!base::IsAsciiAlpha(url[scheme.begin]))
+    return false;
+
+  // From https://url.spec.whatwg.org/#scheme-state:
+  //   scheme state:
+  //     1. If c is an ASCII alphanumeric, U+002B (+), U+002D (-), or U+002E
+  //        (.), append c, lowercased, to buffer.
+  //     2. Otherwise, if c is U+003A (:), then [...]
+  //
+  // We begin at |scheme.begin + 1|, because the character at |scheme.begin| has
+  // already been checked by base::IsAsciiAlpha above.
+  int scheme_end = scheme.end();
+  for (int i = scheme.begin + 1; i < scheme_end; i++) {
+    if (!CanonicalSchemeChar(url[i]))
+      return false;
+  }
+
+  return true;
+}
+
+// See IsRelativeURL in the header file for usage.
+template <typename CHAR>
+bool DoIsRelativeURL(const char* base,
+                     const Parsed& base_parsed,
+                     const CHAR* url,
+                     int url_len,
+                     bool is_base_hierarchical,
+                     bool* is_relative,
+                     Component* relative_component) {
+  *is_relative = false;  // So we can default later to not relative.
+
+  // Trim whitespace and construct a new range for the substring.
+  int begin = 0;
+  TrimURL(url, &begin, &url_len);
+  if (begin >= url_len) {
+    // Empty URLs are relative, but do nothing.
+    if (!is_base_hierarchical) {
+      // Don't allow relative URLs if the base scheme doesn't support it.
+      return false;
+    }
+    *relative_component = Component(begin, 0);
+    *is_relative = true;
+    return true;
+  }
+
+#ifdef WIN32
+  // We special case paths like "C:\foo" so they can link directly to the
+  // file on Windows (IE compatibility). The security domain stuff should
+  // prevent a link like this from actually being followed if it's on a
+  // web page.
+  //
+  // We treat "C:/foo" as an absolute URL. We can go ahead and treat "/c:/"
+  // as relative, as this will just replace the path when the base scheme
+  // is a file and the answer will still be correct.
+  //
+  // We require strict backslashes when detecting UNC since two forward
+  // slashes should be treated as a relative URL with a hostname.
+  if (DoesBeginWindowsDriveSpec(url, begin, url_len) ||
+      DoesBeginUNCPath(url, begin, url_len, true))
+    return true;
+#endif  // WIN32
+
+  // See if we've got a scheme, if not, we know this is a relative URL.
+  // BUT, just because we have a scheme, doesn't make it absolute.
+  // "http:foo.html" is a relative URL with path "foo.html". If the scheme is
+  // empty, we treat it as relative (":foo"), like IE does.
+  Component scheme;
+  const bool scheme_is_empty =
+      !ExtractScheme(url, url_len, &scheme) || scheme.len == 0;
+  if (scheme_is_empty) {
+    if (url[begin] == '#') {
+      // |url| is a bare fragment (e.g. "#foo"). This can be resolved against
+      // any base. Fall-through.
+    } else if (!is_base_hierarchical) {
+      // Don't allow relative URLs if the base scheme doesn't support it.
+      return false;
+    }
+
+    *relative_component = MakeRange(begin, url_len);
+    *is_relative = true;
+    return true;
+  }
+
+  // If the scheme isn't valid, then it's relative.
+  if (!IsValidScheme(url, scheme)) {
+    if (url[begin] == '#' &&
+        base::FeatureList::IsEnabled(
+            kResolveBareFragmentWithColonOnNonHierarchical)) {
+      // |url| is a bare fragment (e.g. "#foo:bar"). This can be resolved
+      // against any base. Fall-through.
+    } else if (!is_base_hierarchical) {
+      // Don't allow relative URLs if the base scheme doesn't support it.
+      return false;
+    }
+    *relative_component = MakeRange(begin, url_len);
+    *is_relative = true;
+    return true;
+  }
+
+  // If the scheme is not the same, then we can't count it as relative.
+  if (!AreSchemesEqual(base, base_parsed.scheme, url, scheme))
+    return true;
+
+  // When the scheme that they both share is not hierarchical, treat the
+  // incoming scheme as absolute (this way with the base of "data:foo",
+  // "data:bar" will be reported as absolute).
+  if (!is_base_hierarchical)
+    return true;
+
+  int colon_offset = scheme.end();
+
+  // If it's a filesystem URL, the only valid way to make it relative is not to
+  // supply a scheme. There's no equivalent to e.g. http:index.html.
+  if (CompareSchemeComponent(url, scheme, kFileSystemScheme))
+    return true;
+
+  // ExtractScheme guarantees that the colon immediately follows what it
+  // considers to be the scheme. CountConsecutiveSlashes will handle the
+  // case where the begin offset is the end of the input.
+  int num_slashes = CountConsecutiveSlashes(url, colon_offset + 1, url_len);
+
+  if (num_slashes == 0 || num_slashes == 1) {
+    // No slashes means it's a relative path like "http:foo.html". One slash
+    // is an absolute path. "http:/home/foo.html"
+    *is_relative = true;
+    *relative_component = MakeRange(colon_offset + 1, url_len);
+    return true;
+  }
+
+  // Two or more slashes after the scheme we treat as absolute.
+  return true;
+}
+
+// Copies all characters in the range [begin, end) of |spec| to the output,
+// up until and including the last slash. There should be a slash in the
+// range, if not, nothing will be copied.
+//
+// For standard URLs the input should be canonical, but when resolving relative
+// URLs on a non-standard base (like "data:") the input can be anything.
+void CopyToLastSlash(const char* spec,
+                     int begin,
+                     int end,
+                     CanonOutput* output) {
+  // Find the last slash.
+  int last_slash = -1;
+  for (int i = end - 1; i >= begin; i--) {
+    if (spec[i] == '/' || spec[i] == '\\') {
+      last_slash = i;
+      break;
+    }
+  }
+  if (last_slash < 0)
+    return;  // No slash.
+
+  // Copy.
+  for (int i = begin; i <= last_slash; i++)
+    output->push_back(spec[i]);
+}
+
+// Copies a single component from the source to the output. This is used
+// when resolving relative URLs and a given component is unchanged. Since the
+// source should already be canonical, we don't have to do anything special,
+// and the input is ASCII.
+void CopyOneComponent(const char* source,
+                      const Component& source_component,
+                      CanonOutput* output,
+                      Component* output_component) {
+  if (!source_component.is_valid()) {
+    // This component is not present.
+    *output_component = Component();
+    return;
+  }
+
+  output_component->begin = output->length();
+  int source_end = source_component.end();
+  for (int i = source_component.begin; i < source_end; i++)
+    output->push_back(source[i]);
+  output_component->len = output->length() - output_component->begin;
+}
+
+#ifdef WIN32
+
+// Called on Windows when the base URL is a file URL, this will copy the "C:"
+// to the output, if there is a drive letter and if that drive letter is not
+// being overridden by the relative URL. Otherwise, do nothing.
+//
+// It will return the index of the beginning of the next character in the
+// base to be processed: if there is a "C:", the slash after it, or if
+// there is no drive letter, the slash at the beginning of the path, or
+// the end of the base. This can be used as the starting offset for further
+// path processing.
+template +int CopyBaseDriveSpecIfNecessary(const char* base_url, + int base_path_begin, + int base_path_end, + const CHAR* relative_url, + int path_start, + int relative_url_len, + CanonOutput* output) { + if (base_path_begin >= base_path_end) + return base_path_begin; // No path. + + // If the relative begins with a drive spec, don't do anything. The existing + // drive spec in the base will be replaced. + if (DoesBeginWindowsDriveSpec(relative_url, path_start, relative_url_len)) { + return base_path_begin; // Relative URL path is "C:/foo" + } + + // The path should begin with a slash (as all canonical paths do). We check + // if it is followed by a drive letter and copy it. + if (DoesBeginSlashWindowsDriveSpec(base_url, + base_path_begin, + base_path_end)) { + // Copy the two-character drive spec to the output. It will now look like + // "file:///C:" so the rest of it can be treated like a standard path. + output->push_back('/'); + output->push_back(base_url[base_path_begin + 1]); + output->push_back(base_url[base_path_begin + 2]); + return base_path_begin + 3; + } + + return base_path_begin; +} + +#endif // WIN32 + +// A subroutine of DoResolveRelativeURL, this resolves the URL knowing that +// the input is a relative path or less (query or ref). +template +bool DoResolveRelativePath(const char* base_url, + const Parsed& base_parsed, + bool base_is_file, + const CHAR* relative_url, + const Component& relative_component, + CharsetConverter* query_converter, + CanonOutput* output, + Parsed* out_parsed) { + bool success = true; + + // We know the authority section didn't change, copy it to the output. We + // also know we have a path so can copy up to there. + Component path, query, ref; + ParsePathInternal(relative_url, relative_component, &path, &query, &ref); + + // Canonical URLs always have a path, so we can use that offset. Reserve + // enough room for the base URL, the new path, and some extra bytes for + // possible escaped characters. 
+ output->ReserveSizeIfNeeded(base_parsed.path.begin + + std::max({path.end(), query.end(), ref.end()})); + output->Append(base_url, base_parsed.path.begin); + + if (path.is_nonempty()) { + // The path is replaced or modified. + int true_path_begin = output->length(); + + // For file: URLs on Windows, we don't want to treat the drive letter and + // colon as part of the path for relative file resolution when the + // incoming URL does not provide a drive spec. We save the true path + // beginning so we can fix it up after we are done. + int base_path_begin = base_parsed.path.begin; +#ifdef WIN32 + if (base_is_file) { + base_path_begin = CopyBaseDriveSpecIfNecessary( + base_url, base_parsed.path.begin, base_parsed.path.end(), + relative_url, relative_component.begin, relative_component.end(), + output); + // Now the output looks like either "file://" or "file:///C:" + // and we can start appending the rest of the path. |base_path_begin| + // points to the character in the base that comes next. + } +#endif // WIN32 + + if (IsURLSlash(relative_url[path.begin])) { + // Easy case: the path is an absolute path on the server, so we can + // just replace everything from the path on with the new versions. + // Since the input should be canonical hierarchical URL, we should + // always have a path. + success &= CanonicalizePath(relative_url, path, + output, &out_parsed->path); + } else { + // Relative path, replace the query, and reference. We take the + // original path with the file part stripped, and append the new path. + // The canonicalizer will take care of resolving ".." and "." + size_t path_begin = output->length(); + CopyToLastSlash(base_url, base_path_begin, base_parsed.path.end(), + output); + success &= CanonicalizePartialPathInternal(relative_url, path, path_begin, + output); + out_parsed->path = MakeRange(path_begin, output->length()); + + // Copy the rest of the stuff after the path from the relative path. 
+ } + + // Finish with the query and reference part (these can't fail). + CanonicalizeQuery(relative_url, query, query_converter, + output, &out_parsed->query); + CanonicalizeRef(relative_url, ref, output, &out_parsed->ref); + + // Fix the path beginning to add back the "C:" we may have written above. + out_parsed->path = MakeRange(true_path_begin, out_parsed->path.end()); + return success; + } + + // If we get here, the path is unchanged: copy to output. + CopyOneComponent(base_url, base_parsed.path, output, &out_parsed->path); + + if (query.is_valid()) { + // Just the query specified, replace the query and reference (ignore + // failures for refs) + CanonicalizeQuery(relative_url, query, query_converter, + output, &out_parsed->query); + CanonicalizeRef(relative_url, ref, output, &out_parsed->ref); + return success; + } + + // If we get here, the query is unchanged: copy to output. Note that the + // range of the query parameter doesn't include the question mark, so we + // have to add it manually if there is a component. + if (base_parsed.query.is_valid()) + output->push_back('?'); + CopyOneComponent(base_url, base_parsed.query, output, &out_parsed->query); + + if (ref.is_valid()) { + // Just the reference specified: replace it (ignoring failures). + CanonicalizeRef(relative_url, ref, output, &out_parsed->ref); + return success; + } + + // We should always have something to do in this function, the caller checks + // that some component is being replaced. + DCHECK(false) << "Not reached"; + return success; +} + +// Resolves a relative URL that contains a host. Typically, these will +// be of the form "//www.google.com/foo/bar?baz#ref" and the only thing which +// should be kept from the original URL is the scheme. 
+template +bool DoResolveRelativeHost(const char* base_url, + const Parsed& base_parsed, + const CHAR* relative_url, + const Component& relative_component, + CharsetConverter* query_converter, + CanonOutput* output, + Parsed* out_parsed) { + // Parse the relative URL, just like we would for anything following a + // scheme. + Parsed relative_parsed; // Everything but the scheme is valid. + ParseAfterScheme(relative_url, relative_component.end(), + relative_component.begin, &relative_parsed); + + // Now we can just use the replacement function to replace all the necessary + // parts of the old URL with the new one. + Replacements replacements; + replacements.SetUsername(relative_url, relative_parsed.username); + replacements.SetPassword(relative_url, relative_parsed.password); + replacements.SetHost(relative_url, relative_parsed.host); + replacements.SetPort(relative_url, relative_parsed.port); + replacements.SetPath(relative_url, relative_parsed.path); + replacements.SetQuery(relative_url, relative_parsed.query); + replacements.SetRef(relative_url, relative_parsed.ref); + + // Length() does not include the old scheme, so make sure to add it from the + // base URL. + output->ReserveSizeIfNeeded( + replacements.components().Length() + + base_parsed.CountCharactersBefore(Parsed::USERNAME, false)); + SchemeType scheme_type = SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION; + if (!GetStandardSchemeType(base_url, base_parsed.scheme, &scheme_type)) { + // A path with an authority section gets canonicalized under standard URL + // rules, even though the base was not known to be standard. + scheme_type = SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION; + } + return ReplaceStandardURL(base_url, base_parsed, replacements, scheme_type, + query_converter, output, out_parsed); +} + +// Resolves a relative URL that happens to be an absolute file path. Examples +// include: "//hostname/path", "/c:/foo", and "//hostname/c:/foo". 
+template +bool DoResolveAbsoluteFile(const CHAR* relative_url, + const Component& relative_component, + CharsetConverter* query_converter, + CanonOutput* output, + Parsed* out_parsed) { + // Parse the file URL. The file URL parsing function uses the same logic + // as we do for determining if the file is absolute, in which case it will + // not bother to look for a scheme. + Parsed relative_parsed; + ParseFileURL(&relative_url[relative_component.begin], relative_component.len, + &relative_parsed); + + return CanonicalizeFileURL(&relative_url[relative_component.begin], + relative_component.len, relative_parsed, + query_converter, output, out_parsed); +} + +// TODO(brettw) treat two slashes as root like Mozilla for FTP? +template +bool DoResolveRelativeURL(const char* base_url, + const Parsed& base_parsed, + bool base_is_file, + const CHAR* relative_url, + const Component& relative_component, + CharsetConverter* query_converter, + CanonOutput* output, + Parsed* out_parsed) { + // |base_parsed| is the starting point for our output. Since we may have + // removed whitespace from |relative_url| before entering this method, we'll + // carry over the |potentially_dangling_markup| flag. + bool potentially_dangling_markup = out_parsed->potentially_dangling_markup; + *out_parsed = base_parsed; + if (potentially_dangling_markup) + out_parsed->potentially_dangling_markup = true; + + // Sanity check: the input should have a host or we'll break badly below. + // We can only resolve relative URLs with base URLs that have hosts and + // paths (even the default path of "/" is OK). + // + // We allow hosts with no length so we can handle file URLs, for example. + if (base_parsed.path.is_empty()) { + // On error, return the input (resolving a relative URL on a non-relative + // base = the base). 
+ int base_len = base_parsed.Length(); + for (int i = 0; i < base_len; i++) + output->push_back(base_url[i]); + return false; + } + + if (relative_component.is_empty()) { + // Empty relative URL, leave unchanged, only removing the ref component. + int base_len = base_parsed.Length(); + base_len -= base_parsed.ref.len + 1; + out_parsed->ref.reset(); + output->Append(base_url, base_len); + return true; + } + + int num_slashes = CountConsecutiveSlashes( + relative_url, relative_component.begin, relative_component.end()); + +#ifdef WIN32 + // On Windows, two slashes for a file path (regardless of which direction + // they are) means that it's UNC. Two backslashes on any base scheme mean + // that it's an absolute UNC path (we use the base_is_file flag to control + // how strict the UNC finder is). + // + // We also allow Windows absolute drive specs on any scheme (for example + // "c:\foo") like IE does. There must be no preceding slashes in this + // case (we reject anything like "/c:/foo") because that should be treated + // as a path. For file URLs, we allow any number of slashes since that would + // be setting the path. + // + // This assumes the absolute path resolver handles absolute URLs like this + // properly. DoCanonicalize does this. + int after_slashes = relative_component.begin + num_slashes; + if (DoesBeginUNCPath(relative_url, relative_component.begin, + relative_component.end(), !base_is_file) || + ((num_slashes == 0 || base_is_file) && + DoesBeginWindowsDriveSpec( + relative_url, after_slashes, relative_component.end()))) { + return DoResolveAbsoluteFile(relative_url, relative_component, + query_converter, output, out_parsed); + } +#else + // Other platforms need explicit handling for file: URLs with multiple + // slashes because the generic scheme parsing always extracts a host, but a + // file: URL only has a host if it has exactly 2 slashes. 
Even if it does + // have a host, we want to use the special host detection logic for file + // URLs provided by DoResolveAbsoluteFile(), as opposed to the generic host + // detection logic, for consistency with parsing file URLs from scratch. + if (base_is_file && num_slashes >= 2) { + return DoResolveAbsoluteFile(relative_url, relative_component, + query_converter, output, out_parsed); + } +#endif + + // Any other double-slashes mean that this is relative to the scheme. + if (num_slashes >= 2) { + return DoResolveRelativeHost(base_url, base_parsed, + relative_url, relative_component, + query_converter, output, out_parsed); + } + + // When we get here, we know that the relative URL is on the same host. + return DoResolveRelativePath(base_url, base_parsed, base_is_file, + relative_url, relative_component, + query_converter, output, out_parsed); +} + +} // namespace + +bool IsRelativeURL(const char* base, + const Parsed& base_parsed, + const char* fragment, + int fragment_len, + bool is_base_hierarchical, + bool* is_relative, + Component* relative_component) { + return DoIsRelativeURL( + base, base_parsed, fragment, fragment_len, is_base_hierarchical, + is_relative, relative_component); +} + +bool IsRelativeURL(const char* base, + const Parsed& base_parsed, + const char16_t* fragment, + int fragment_len, + bool is_base_hierarchical, + bool* is_relative, + Component* relative_component) { + return DoIsRelativeURL(base, base_parsed, fragment, fragment_len, + is_base_hierarchical, is_relative, + relative_component); +} + +bool ResolveRelativeURL(const char* base_url, + const Parsed& base_parsed, + bool base_is_file, + const char* relative_url, + const Component& relative_component, + CharsetConverter* query_converter, + CanonOutput* output, + Parsed* out_parsed) { + return DoResolveRelativeURL( + base_url, base_parsed, base_is_file, relative_url, + relative_component, query_converter, output, out_parsed); +} + +bool ResolveRelativeURL(const char* base_url, + const 
Parsed& base_parsed, + bool base_is_file, + const char16_t* relative_url, + const Component& relative_component, + CharsetConverter* query_converter, + CanonOutput* output, + Parsed* out_parsed) { + return DoResolveRelativeURL(base_url, base_parsed, base_is_file, + relative_url, relative_component, + query_converter, output, out_parsed); +} + +} // namespace url diff --git a/url_canon_stdstring.cc b/url_canon_stdstring.cc new file mode 100644 index 00000000000..60e2a26747c --- /dev/null +++ b/url_canon_stdstring.cc @@ -0,0 +1,30 @@ +// Copyright 2013 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "url/url_canon_stdstring.h" + +namespace url { + +StdStringCanonOutput::StdStringCanonOutput(std::string* str) : str_(str) { + cur_len_ = str_->size(); // Append to existing data. + buffer_ = str_->empty() ? nullptr : &(*str_)[0]; + buffer_len_ = str_->size(); +} + +StdStringCanonOutput::~StdStringCanonOutput() { + // Nothing to do, we don't own the string. +} + +void StdStringCanonOutput::Complete() { + str_->resize(cur_len_); + buffer_len_ = cur_len_; +} + +void StdStringCanonOutput::Resize(size_t sz) { + str_->resize(sz); + buffer_ = str_->empty() ? nullptr : &(*str_)[0]; + buffer_len_ = sz; +} + +} // namespace url diff --git a/url_canon_stdstring.h b/url_canon_stdstring.h new file mode 100644 index 00000000000..528f91f2f10 --- /dev/null +++ b/url_canon_stdstring.h @@ -0,0 +1,132 @@ +// Copyright 2013 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef URL_URL_CANON_STDSTRING_H_ +#define URL_URL_CANON_STDSTRING_H_ + +// This header file defines a canonicalizer output method class for STL +// strings. Because the canonicalizer tries not to be dependent on the STL, +// we have segregated it here. 
+ +#include + +#include "base/compiler_specific.h" +#include "base/component_export.h" +#include "base/memory/raw_ptr_exclusion.h" +#include "base/strings/string_piece.h" +#include "url/url_canon.h" + +namespace url { + +// Write into a std::string given in the constructor. This object does not own +// the string itself, and the user must ensure that the string stays alive +// throughout the lifetime of this object. +// +// The given string will be appended to; any existing data in the string will +// be preserved. +// +// Note that when canonicalization is complete, the string will likely have +// unused space at the end because we make the string very big to start out +// with (by |initial_size|). This ends up being important because resize +// operations are slow, and because the base class needs to write directly +// into the buffer. +// +// Therefore, the user should call Complete() before using the string that +// this class wrote into. +class COMPONENT_EXPORT(URL) StdStringCanonOutput : public CanonOutput { + public: + StdStringCanonOutput(std::string* str); + + StdStringCanonOutput(const StdStringCanonOutput&) = delete; + StdStringCanonOutput& operator=(const StdStringCanonOutput&) = delete; + + ~StdStringCanonOutput() override; + + // Must be called after writing has completed but before the string is used. + void Complete(); + + void Resize(size_t sz) override; + + protected: + // `str_` is not a raw_ptr<...> for performance reasons (based on analysis of + // sampling profiler data and tab_search:top100:2020). + RAW_PTR_EXCLUSION std::string* str_; +}; + +// An extension of the Replacements class that allows the setters to use +// StringPieces (implicitly allowing strings or char*s). +// +// The contents of the StringPieces are not copied and must remain valid until +// the StringPieceReplacements object goes out of scope. +// +// In order to make it harder to misuse the API the setters do not accept rvalue +// references to std::strings. 
+// Note: Extra const char* overloads are necessary to break ambiguities that +// would otherwise exist for char literals. +template +class StringPieceReplacements : public Replacements { + private: + using StringT = std::basic_string; + using StringPieceT = base::BasicStringPiece; + using ParentT = Replacements; + using SetterFun = void (ParentT::*)(const CharT*, const Component&); + + void SetImpl(SetterFun fun, StringPieceT str) { + (this->*fun)(str.data(), Component(0, static_cast(str.size()))); + } + + public: + void SetSchemeStr(const CharT* str) { SetImpl(&ParentT::SetScheme, str); } + void SetSchemeStr(StringPieceT str) { SetImpl(&ParentT::SetScheme, str); } + void SetSchemeStr(const StringT&&) = delete; + + void SetUsernameStr(const CharT* str) { SetImpl(&ParentT::SetUsername, str); } + void SetUsernameStr(StringPieceT str) { SetImpl(&ParentT::SetUsername, str); } + void SetUsernameStr(const StringT&&) = delete; + using ParentT::ClearUsername; + + void SetPasswordStr(const CharT* str) { SetImpl(&ParentT::SetPassword, str); } + void SetPasswordStr(StringPieceT str) { SetImpl(&ParentT::SetPassword, str); } + void SetPasswordStr(const StringT&&) = delete; + using ParentT::ClearPassword; + + void SetHostStr(const CharT* str) { SetImpl(&ParentT::SetHost, str); } + void SetHostStr(StringPieceT str) { SetImpl(&ParentT::SetHost, str); } + void SetHostStr(const StringT&&) = delete; + using ParentT::ClearHost; + + void SetPortStr(const CharT* str) { SetImpl(&ParentT::SetPort, str); } + void SetPortStr(StringPieceT str) { SetImpl(&ParentT::SetPort, str); } + void SetPortStr(const StringT&&) = delete; + using ParentT::ClearPort; + + void SetPathStr(const CharT* str) { SetImpl(&ParentT::SetPath, str); } + void SetPathStr(StringPieceT str) { SetImpl(&ParentT::SetPath, str); } + void SetPathStr(const StringT&&) = delete; + using ParentT::ClearPath; + + void SetQueryStr(const CharT* str) { SetImpl(&ParentT::SetQuery, str); } + void SetQueryStr(StringPieceT str) { 
SetImpl(&ParentT::SetQuery, str); } + void SetQueryStr(const StringT&&) = delete; + using ParentT::ClearQuery; + + void SetRefStr(const CharT* str) { SetImpl(&ParentT::SetRef, str); } + void SetRefStr(StringPieceT str) { SetImpl(&ParentT::SetRef, str); } + void SetRefStr(const StringT&&) = delete; + using ParentT::ClearRef; + + private: + using ParentT::SetHost; + using ParentT::SetPassword; + using ParentT::SetPath; + using ParentT::SetPort; + using ParentT::SetQuery; + using ParentT::SetRef; + using ParentT::SetScheme; + using ParentT::SetUsername; +}; + +} // namespace url + +#endif // URL_URL_CANON_STDSTRING_H_ diff --git a/url_canon_stdurl.cc b/url_canon_stdurl.cc new file mode 100644 index 00000000000..8096b568bc7 --- /dev/null +++ b/url_canon_stdurl.cc @@ -0,0 +1,209 @@ +// Copyright 2013 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Functions to canonicalize "standard" URLs, which are ones that have an +// authority section including a host name. + +#include "url/url_canon.h" +#include "url/url_canon_internal.h" +#include "url/url_constants.h" + +namespace url { + +namespace { + +template +bool DoCanonicalizeStandardURL(const URLComponentSource& source, + const Parsed& parsed, + SchemeType scheme_type, + CharsetConverter* query_converter, + CanonOutput* output, + Parsed* new_parsed) { + // Scheme: this will append the colon. 
+ bool success = CanonicalizeScheme(source.scheme, parsed.scheme, + output, &new_parsed->scheme); + + bool scheme_supports_user_info = + (scheme_type == SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION); + bool scheme_supports_ports = + (scheme_type == SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION || + scheme_type == SCHEME_WITH_HOST_AND_PORT); + + // Authority (username, password, host, port) + bool have_authority; + if ((scheme_supports_user_info && + (parsed.username.is_valid() || parsed.password.is_valid())) || + parsed.host.is_nonempty() || + (scheme_supports_ports && parsed.port.is_valid())) { + have_authority = true; + + // Only write the authority separators when we have a scheme. + if (parsed.scheme.is_valid()) { + output->push_back('/'); + output->push_back('/'); + } + + // User info: the canonicalizer will handle the : and @. + if (scheme_supports_user_info) { + success &= CanonicalizeUserInfo( + source.username, parsed.username, source.password, parsed.password, + output, &new_parsed->username, &new_parsed->password); + } else { + new_parsed->username.reset(); + new_parsed->password.reset(); + } + + success &= CanonicalizeHost(source.host, parsed.host, + output, &new_parsed->host); + + // Host must not be empty for standard URLs. + if (parsed.host.is_empty()) + success = false; + + // Port: the port canonicalizer will handle the colon. + if (scheme_supports_ports) { + int default_port = DefaultPortForScheme( + &output->data()[new_parsed->scheme.begin], new_parsed->scheme.len); + success &= CanonicalizePort(source.port, parsed.port, default_port, + output, &new_parsed->port); + } else { + new_parsed->port.reset(); + } + } else { + // No authority, clear the components. + have_authority = false; + new_parsed->host.reset(); + new_parsed->username.reset(); + new_parsed->password.reset(); + new_parsed->port.reset(); + success = false; // Standard URLs must have an authority. 
+ } + + // Path + if (parsed.path.is_valid()) { + success &= CanonicalizePath(source.path, parsed.path, + output, &new_parsed->path); + } else if (have_authority || + parsed.query.is_valid() || parsed.ref.is_valid()) { + // When we have an empty path, make up a path when we have an authority + // or something following the path. The only time we allow an empty + // output path is when there is nothing else. + new_parsed->path = Component(output->length(), 1); + output->push_back('/'); + } else { + // No path at all + new_parsed->path.reset(); + } + + // Query + CanonicalizeQuery(source.query, parsed.query, query_converter, + output, &new_parsed->query); + + // Ref: ignore failure for this, since the page can probably still be loaded. + CanonicalizeRef(source.ref, parsed.ref, output, &new_parsed->ref); + + // Carry over the flag for potentially dangling markup: + if (parsed.potentially_dangling_markup) + new_parsed->potentially_dangling_markup = true; + + return success; +} + +} // namespace + +// Returns the default port for the given canonical scheme, or PORT_UNSPECIFIED +// if the scheme is unknown. +// +// Please keep blink::DefaultPortForProtocol and url::DefaultPortForProtocol in +// sync. 
+int DefaultPortForScheme(const char* scheme, int scheme_len) { + int default_port = PORT_UNSPECIFIED; + switch (scheme_len) { + case 4: + if (!strncmp(scheme, kHttpScheme, scheme_len)) + default_port = 80; + break; + case 5: + if (!strncmp(scheme, kHttpsScheme, scheme_len)) + default_port = 443; + break; + case 3: + if (!strncmp(scheme, kFtpScheme, scheme_len)) + default_port = 21; + else if (!strncmp(scheme, kWssScheme, scheme_len)) + default_port = 443; + break; + case 2: + if (!strncmp(scheme, kWsScheme, scheme_len)) + default_port = 80; + break; + } + return default_port; +} + +bool CanonicalizeStandardURL(const char* spec, + int spec_len, + const Parsed& parsed, + SchemeType scheme_type, + CharsetConverter* query_converter, + CanonOutput* output, + Parsed* new_parsed) { + return DoCanonicalizeStandardURL( + URLComponentSource(spec), parsed, scheme_type, query_converter, + output, new_parsed); +} + +bool CanonicalizeStandardURL(const char16_t* spec, + int spec_len, + const Parsed& parsed, + SchemeType scheme_type, + CharsetConverter* query_converter, + CanonOutput* output, + Parsed* new_parsed) { + return DoCanonicalizeStandardURL( + URLComponentSource(spec), parsed, scheme_type, query_converter, + output, new_parsed); +} + +// It might be nice in the future to optimize this so unchanged components don't +// need to be recanonicalized. This is especially true since the common case for +// ReplaceComponents is removing things we don't want, like reference fragments +// and usernames. These cases can become more efficient if we can assume the +// rest of the URL is OK with these removed (or only the modified parts +// recanonicalized). This would be much more complex to implement, however. +// +// You would also need to update DoReplaceComponents in url_util.cc which +// relies on this re-checking everything (see the comment there for why). 
+bool ReplaceStandardURL(const char* base, + const Parsed& base_parsed, + const Replacements& replacements, + SchemeType scheme_type, + CharsetConverter* query_converter, + CanonOutput* output, + Parsed* new_parsed) { + URLComponentSource source(base); + Parsed parsed(base_parsed); + SetupOverrideComponents(base, replacements, &source, &parsed); + return DoCanonicalizeStandardURL( + source, parsed, scheme_type, query_converter, output, new_parsed); +} + +// For 16-bit replacements, we turn all the replacements into UTF-8 so the +// regular code path can be used. +bool ReplaceStandardURL(const char* base, + const Parsed& base_parsed, + const Replacements& replacements, + SchemeType scheme_type, + CharsetConverter* query_converter, + CanonOutput* output, + Parsed* new_parsed) { + RawCanonOutput<1024> utf8; + URLComponentSource source(base); + Parsed parsed(base_parsed); + SetupUTF16OverrideComponents(base, replacements, &utf8, &source, &parsed); + return DoCanonicalizeStandardURL( + source, parsed, scheme_type, query_converter, output, new_parsed); +} + +} // namespace url diff --git a/url_canon_unittest.cc b/url_canon_unittest.cc new file mode 100644 index 00000000000..dee00d86e22 --- /dev/null +++ b/url_canon_unittest.cc @@ -0,0 +1,2748 @@ +// Copyright 2013 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#include "url/url_canon.h" + +#include +#include + +#include "base/strings/string_piece.h" +#include "base/strings/utf_string_conversions.h" +#include "base/test/gtest_util.h" +#include "base/test/scoped_feature_list.h" +#include "testing/gtest/include/gtest/gtest.h" +#include "url/third_party/mozilla/url_parse.h" +#include "url/url_canon_internal.h" +#include "url/url_canon_stdstring.h" +#include "url/url_features.h" +#include "url/url_test_utils.h" + +namespace url { + +namespace { + +struct ComponentCase { + const char* input; + const char* expected; + Component expected_component; + bool expected_success; +}; + +// ComponentCase but with dual 8-bit/16-bit input. Generally, the unit tests +// treat each input as optional, and will only try processing if non-NULL. +// The output is always 8-bit. +struct DualComponentCase { + const char* input8; + const wchar_t* input16; + const char* expected; + Component expected_component; + bool expected_success; +}; + +// Test cases for CanonicalizeIPAddress(). The inputs are identical to +// DualComponentCase, but the output has extra CanonHostInfo fields. +struct IPAddressCase { + const char* input8; + const wchar_t* input16; + const char* expected; + Component expected_component; + + // CanonHostInfo fields, for verbose output. + CanonHostInfo::Family expected_family; + int expected_num_ipv4_components; + const char* expected_address_hex; // Two hex chars per IP address byte. 
+}; + +std::string BytesToHexString(unsigned char bytes[16], int length) { + EXPECT_TRUE(length == 0 || length == 4 || length == 16) + << "Bad IP address length: " << length; + std::string result; + for (int i = 0; i < length; ++i) { + result.push_back(kHexCharLookup[(bytes[i] >> 4) & 0xf]); + result.push_back(kHexCharLookup[bytes[i] & 0xf]); + } + return result; +} + +struct ReplaceCase { + const char* base; + const char* scheme; + const char* username; + const char* password; + const char* host; + const char* port; + const char* path; + const char* query; + const char* ref; + const char* expected; +}; + +// Magic string used in the replacements code that tells SetupReplComp to +// call the clear function. +const char kDeleteComp[] = "|"; + +// Sets up a replacement for a single component. This is given pointers to +// the set and clear function for the component being replaced, and will +// either set the component (if it exists) or clear it (if the replacement +// string matches kDeleteComp). +// +// This template is currently used only for the 8-bit case, and the strlen +// causes it to fail in other cases. It is left a template in case we have +// tests for wide replacements. +template +void SetupReplComp( + void (Replacements::*set)(const CHAR*, const Component&), + void (Replacements::*clear)(), + Replacements* rep, + const CHAR* str) { + if (str && str[0] == kDeleteComp[0]) { + (rep->*clear)(); + } else if (str) { + (rep->*set)(str, Component(0, static_cast(strlen(str)))); + } +} + +} // namespace + +TEST(URLCanonTest, DoAppendUTF8) { + struct UTF8Case { + unsigned input; + const char* output; + } utf_cases[] = { + // Valid code points. 
+ {0x24, "\x24"}, + {0xA2, "\xC2\xA2"}, + {0x20AC, "\xE2\x82\xAC"}, + {0x24B62, "\xF0\xA4\xAD\xA2"}, + {0x10FFFF, "\xF4\x8F\xBF\xBF"}, + }; + std::string out_str; + for (size_t i = 0; i < std::size(utf_cases); i++) { + out_str.clear(); + StdStringCanonOutput output(&out_str); + AppendUTF8Value(utf_cases[i].input, &output); + output.Complete(); + EXPECT_EQ(utf_cases[i].output, out_str); + } +} + +TEST(URLCanonTest, DoAppendUTF8Invalid) { + std::string out_str; + StdStringCanonOutput output(&out_str); + // Invalid code point (too large). + EXPECT_DCHECK_DEATH({ + AppendUTF8Value(0x110000, &output); + output.Complete(); + }); +} + +TEST(URLCanonTest, UTF) { + // Low-level test that we handle reading, canonicalization, and writing + // UTF-8/UTF-16 strings properly. + struct UTFCase { + const char* input8; + const wchar_t* input16; + bool expected_success; + const char* output; + } utf_cases[] = { + // Valid canonical input should get passed through & escaped. + {"\xe4\xbd\xa0\xe5\xa5\xbd", L"\x4f60\x597d", true, "%E4%BD%A0%E5%A5%BD"}, + // Test a character that takes > 16 bits (U+10300 = old italic letter A) + {"\xF0\x90\x8C\x80", L"\xd800\xdf00", true, "%F0%90%8C%80"}, + // Non-shortest-form UTF-8 characters are invalid. The bad bytes should + // each be replaced with the invalid character (EF BF BD in UTF-8). + {"\xf0\x84\xbd\xa0\xe5\xa5\xbd", nullptr, false, + "%EF%BF%BD%EF%BF%BD%EF%BF%BD%EF%BF%BD%E5%A5%BD"}, + // Invalid UTF-8 sequences should be marked as invalid (the first + // sequence is truncated). + {"\xe4\xa0\xe5\xa5\xbd", L"\xd800\x597d", false, "%EF%BF%BD%E5%A5%BD"}, + // Character going off the end. + {"\xe4\xbd\xa0\xe5\xa5", L"\x4f60\xd800", false, "%E4%BD%A0%EF%BF%BD"}, + // ...same with low surrogates with no high surrogate. + {nullptr, L"\xdc00", false, "%EF%BF%BD"}, + // Test a UTF-8 encoded surrogate value is marked as invalid. + // ED A0 80 = U+D800 + {"\xed\xa0\x80", nullptr, false, "%EF%BF%BD%EF%BF%BD%EF%BF%BD"}, + // ...even when paired. 
+      {"\xed\xa0\x80\xed\xb0\x80", nullptr, false,
+       "%EF%BF%BD%EF%BF%BD%EF%BF%BD%EF%BF%BD%EF%BF%BD%EF%BF%BD"},
+  };
+
+  std::string out_str;
+  for (const auto& utf_case : utf_cases) {
+    // 8-bit input: escape it character by character. The callee advances
+    // |ch| past multi-byte sequences.
+    if (utf_case.input8) {
+      out_str.clear();
+      StdStringCanonOutput output(&out_str);
+
+      size_t input_len = strlen(utf_case.input8);
+      bool success = true;
+      for (size_t ch = 0; ch < input_len; ch++) {
+        success &=
+            AppendUTF8EscapedChar(utf_case.input8, &ch, input_len, &output);
+      }
+      output.Complete();
+      EXPECT_EQ(utf_case.expected_success, success);
+      EXPECT_EQ(std::string(utf_case.output), out_str);
+    }
+
+    // 16-bit input: same procedure on the UTF-16 form of the case.
+    if (utf_case.input16) {
+      out_str.clear();
+      StdStringCanonOutput output(&out_str);
+
+      std::u16string input_str(
+          test_utils::TruncateWStringToUTF16(utf_case.input16));
+      size_t input_len = input_str.length();
+      bool success = true;
+      for (size_t ch = 0; ch < input_len; ch++) {
+        success &=
+            AppendUTF8EscapedChar(input_str.c_str(), &ch, input_len, &output);
+      }
+      output.Complete();
+      EXPECT_EQ(utf_case.expected_success, success);
+      EXPECT_EQ(std::string(utf_case.output), out_str);
+    }
+
+    // For valid cases that carry both encodings, check that the UTF-8 and
+    // UTF-16 inputs are equivalent.
+    const bool has_valid_pair =
+        utf_case.input8 && utf_case.input16 && utf_case.expected_success;
+    if (has_valid_pair) {
+      std::string input8_str(utf_case.input8);
+      std::u16string input16_str(
+          test_utils::TruncateWStringToUTF16(utf_case.input16));
+
+      // UTF-16 -> UTF-8
+      EXPECT_EQ(input8_str, base::UTF16ToUTF8(input16_str));
+
+      // UTF-8 -> UTF-16
+      EXPECT_EQ(input16_str, base::UTF8ToUTF16(input8_str));
+    }
+  }
+}
+
+TEST(URLCanonTest, Scheme) {
+  // Here, we're mostly testing that unusual characters are handled properly.
+  // The canonicalizer doesn't do any parsing or whitespace detection. It will
+  // also do its best on error, and will escape funny sequences (these won't be
+  // valid schemes and it will return error).
+  //
+  // Note that the canonicalizer will append a colon to the output to separate
+  // out the rest of the URL, which is not present in the input. We check,
+  // however, that the output range includes everything but the colon.
+  ComponentCase scheme_cases[] = {
+      {"http", "http:", Component(0, 4), true},
+      {"HTTP", "http:", Component(0, 4), true},
+      {" HTTP ", "%20http%20:", Component(0, 10), false},
+      {"htt: ", "htt%3A%20:", Component(0, 9), false},
+      {"\xe4\xbd\xa0\xe5\xa5\xbdhttp", "%E4%BD%A0%E5%A5%BDhttp:", Component(0, 22), false},
+      // Don't re-escape something already escaped. Note that it will
+      // "canonicalize" the 'A' to 'a', but that's OK.
+      {"ht%3Atp", "ht%3atp:", Component(0, 7), false},
+      {"", ":", Component(0, 0), false},
+  };
+
+  std::string out_str;
+
+  for (size_t i = 0; i < std::size(scheme_cases); i++) {
+    // Name the failing input in any expectation failure below.
+    SCOPED_TRACE(scheme_cases[i].input);
+
+    // Component lengths are ints, so narrow strlen's size_t explicitly.
+    int url_len = static_cast<int>(strlen(scheme_cases[i].input));
+    Component in_comp(0, url_len);
+    Component out_comp;
+
+    out_str.clear();
+    StdStringCanonOutput output1(&out_str);
+    bool success = CanonicalizeScheme(scheme_cases[i].input, in_comp, &output1,
+                                      &out_comp);
+    output1.Complete();
+
+    EXPECT_EQ(scheme_cases[i].expected_success, success);
+    EXPECT_EQ(std::string(scheme_cases[i].expected), out_str);
+    EXPECT_EQ(scheme_cases[i].expected_component.begin, out_comp.begin);
+    EXPECT_EQ(scheme_cases[i].expected_component.len, out_comp.len);
+
+    // Now try the wide version.
+    out_str.clear();
+    StdStringCanonOutput output2(&out_str);
+
+    std::u16string wide_input(base::UTF8ToUTF16(scheme_cases[i].input));
+    // Component lengths are ints, so narrow the string length explicitly.
+    in_comp.len = static_cast<int>(wide_input.length());
+    success = CanonicalizeScheme(wide_input.c_str(), in_comp, &output2,
+                                 &out_comp);
+    output2.Complete();
+
+    EXPECT_EQ(scheme_cases[i].expected_success, success);
+    EXPECT_EQ(std::string(scheme_cases[i].expected), out_str);
+    EXPECT_EQ(scheme_cases[i].expected_component.begin, out_comp.begin);
+    EXPECT_EQ(scheme_cases[i].expected_component.len, out_comp.len);
+  }
+
+  // Test the case where the scheme is declared nonexistent, it should be
+  // converted into an empty scheme.
+  Component out_comp;
+  out_str.clear();
+  StdStringCanonOutput output(&out_str);
+
+  EXPECT_FALSE(CanonicalizeScheme("", Component(0, -1), &output, &out_comp));
+  output.Complete();
+
+  EXPECT_EQ(std::string(":"), out_str);
+  EXPECT_EQ(0, out_comp.begin);
+  EXPECT_EQ(0, out_comp.len);
+}
+
+// IDNA mode to use in CanonHost tests.
+enum class IDNAMode { kTransitional, kNonTransitional };
+
+// Fixture parameterized on IDNAMode. The constructor enables
+// kUseIDNA2008NonTransitional for kNonTransitional and disables it otherwise,
+// so each host test runs once per mode.
+class URLCanonHostTest
+    : public ::testing::Test,
+      public ::testing::WithParamInterface<IDNAMode> {
+ public:
+  URLCanonHostTest() {
+    if (GetParam() == IDNAMode::kNonTransitional) {
+      scoped_feature_list_.InitAndEnableFeature(kUseIDNA2008NonTransitional);
+    } else {
+      scoped_feature_list_.InitAndDisableFeature(kUseIDNA2008NonTransitional);
+    }
+  }
+
+ private:
+  base::test::ScopedFeatureList scoped_feature_list_;
+};
+
+INSTANTIATE_TEST_SUITE_P(All,
+                         URLCanonHostTest,
+                         ::testing::Values(IDNAMode::kTransitional,
+                                           IDNAMode::kNonTransitional));
+
+TEST_P(URLCanonHostTest, Host) {
+  bool use_idna_non_transitional = IsUsingIDNA2008NonTransitional();
+
+  IPAddressCase host_cases[] = {
+      // Basic canonicalization, uppercase should be converted to lowercase.
+      {"GoOgLe.CoM", L"GoOgLe.CoM", "google.com", Component(0, 10),
+       CanonHostInfo::NEUTRAL, -1, ""},
+      // Spaces and some other characters should be escaped.
+ {"Goo%20 goo%7C|.com", L"Goo%20 goo%7C|.com", "goo%20%20goo%7C%7C.com", + Component(0, 22), CanonHostInfo::NEUTRAL, -1, ""}, + // Exciting different types of spaces! + {NULL, L"GOO\x00a0\x3000goo.com", "goo%20%20goo.com", Component(0, 16), + CanonHostInfo::NEUTRAL, -1, ""}, + // Other types of space (no-break, zero-width, zero-width-no-break) are + // name-prepped away to nothing. + {NULL, L"GOO\x200b\x2060\xfeffgoo.com", "googoo.com", Component(0, 10), + CanonHostInfo::NEUTRAL, -1, ""}, + // Ideographic full stop (full-width period for Chinese, etc.) should be + // treated as a dot. + {NULL, + L"www.foo\x3002" + L"bar.com", + "www.foo.bar.com", Component(0, 15), CanonHostInfo::NEUTRAL, -1, ""}, + // Invalid unicode characters should fail... + // ...In wide input, ICU will barf and we'll end up with the input as + // escaped UTF-8 (the invalid character should be replaced with the + // replacement character). + {"\xef\xb7\x90zyx.com", L"\xfdd0zyx.com", "%EF%BF%BDzyx.com", + Component(0, 16), CanonHostInfo::BROKEN, -1, ""}, + // ...This is the same as previous but with with escaped. + {"%ef%b7%90zyx.com", L"%ef%b7%90zyx.com", "%EF%BF%BDzyx.com", + Component(0, 16), CanonHostInfo::BROKEN, -1, ""}, + // Test name prepping, fullwidth input should be converted to ASCII and + // NOT + // IDN-ized. This is "Go" in fullwidth UTF-8/UTF-16. + {"\xef\xbc\xa7\xef\xbd\x8f.com", L"\xff27\xff4f.com", "go.com", + Component(0, 6), CanonHostInfo::NEUTRAL, -1, ""}, + // Test that fullwidth escaped values are properly name-prepped, + // then converted or rejected. 
+ // ...%41 in fullwidth = 'A' (also as escaped UTF-8 input) + {"\xef\xbc\x85\xef\xbc\x94\xef\xbc\x91.com", L"\xff05\xff14\xff11.com", + "a.com", Component(0, 5), CanonHostInfo::NEUTRAL, -1, ""}, + {"%ef%bc%85%ef%bc%94%ef%bc%91.com", L"%ef%bc%85%ef%bc%94%ef%bc%91.com", + "a.com", Component(0, 5), CanonHostInfo::NEUTRAL, -1, ""}, + // ...%00 in fullwidth should fail (also as escaped UTF-8 input) + {"\xef\xbc\x85\xef\xbc\x90\xef\xbc\x90.com", L"\xff05\xff10\xff10.com", + "%00.com", Component(0, 7), CanonHostInfo::BROKEN, -1, ""}, + {"%ef%bc%85%ef%bc%90%ef%bc%90.com", L"%ef%bc%85%ef%bc%90%ef%bc%90.com", + "%00.com", Component(0, 7), CanonHostInfo::BROKEN, -1, ""}, + // ICU will convert weird percents into ASCII percents, but not unescape + // further. A weird percent is U+FE6A (EF B9 AA in UTF-8) which is a + // "small percent". At this point we should be within our rights to mark + // anything as invalid since the URL is corrupt or malicious. The code + // happens to allow ASCII characters (%41 = "A" -> 'a') to be unescaped + // and kept as valid, so we validate that behavior here, but this level + // of fixing the input shouldn't be seen as required. "%81" is invalid. 
+ {"\xef\xb9\xaa" + "41.com", + L"\xfe6a" + L"41.com", + "a.com", Component(0, 5), CanonHostInfo::NEUTRAL, -1, ""}, + {"%ef%b9%aa" + "41.com", + L"\xfe6a" + L"41.com", + "a.com", Component(0, 5), CanonHostInfo::NEUTRAL, -1, ""}, + {"\xef\xb9\xaa" + "81.com", + L"\xfe6a" + L"81.com", + "%81.com", Component(0, 7), CanonHostInfo::BROKEN, -1, ""}, + {"%ef%b9%aa" + "81.com", + L"\xfe6a" + L"81.com", + "%81.com", Component(0, 7), CanonHostInfo::BROKEN, -1, ""}, + // Basic IDN support, UTF-8 and UTF-16 input should be converted to IDN + {"\xe4\xbd\xa0\xe5\xa5\xbd\xe4\xbd\xa0\xe5\xa5\xbd", + L"\x4f60\x597d\x4f60\x597d", "xn--6qqa088eba", Component(0, 14), + CanonHostInfo::NEUTRAL, -1, ""}, + // See http://unicode.org/cldr/utility/idna.jsp for other + // examples/experiments and http://goo.gl/7yG11o + // for the full list of characters handled differently by + // IDNA 2003, UTS 46 (http://unicode.org/reports/tr46/ ) and IDNA 2008. + + // 4 Deviation characters are mapped/ignored in UTS 46 transitional + // mechansm. UTS 46, table 4 row (g). + // Sharp-s is mapped to 'ss' in IDNA 2003, not in IDNA 2008 or UTF 46 + // after transitional period. + // Previously, it'd be "fussball.de". + {"fu\xc3\x9f" + "ball.de", + L"fu\x00df" + L"ball.de", + use_idna_non_transitional ? "xn--fuball-cta.de" : "fussball.de", + use_idna_non_transitional ? Component(0, 17) : Component(0, 11), + CanonHostInfo::NEUTRAL, -1, ""}, + + // Final-sigma (U+03C3) was mapped to regular sigma (U+03C2). + // Previously, it'd be "xn--wxaikc9b". + {"\xcf\x83\xcf\x8c\xce\xbb\xce\xbf\xcf\x82", L"\x3c3\x3cc\x3bb\x3bf\x3c2", + use_idna_non_transitional ? "xn--wxaijb9b" : "xn--wxaikc6b", + Component(0, 12), CanonHostInfo::NEUTRAL, -1, ""}, + + // ZWNJ (U+200C) and ZWJ (U+200D) are mapped away in UTS 46 transitional + // handling as well as in IDNA 2003, but not thereafter. + {"a\xe2\x80\x8c" + "b\xe2\x80\x8d" + "c", + L"a\x200c" + L"b\x200d" + L"c", + use_idna_non_transitional ? 
"xn--abc-9m0ag" : "abc", + use_idna_non_transitional ? Component(0, 13) : Component(0, 3), + CanonHostInfo::NEUTRAL, -1, ""}, + + // ZWJ between Devanagari characters was still mapped away in UTS 46 + // transitional handling. IDNA 2008 gives xn--11bo0mv54g. + // Previously "xn--11bo0m". + {"\xe0\xa4\x95\xe0\xa5\x8d\xe2\x80\x8d\xe0\xa4\x9c", + L"\x915\x94d\x200d\x91c", + use_idna_non_transitional ? "xn--11bo0mv54g" : "xn--11bo0m", + use_idna_non_transitional ? Component(0, 14) : Component(0, 10), + CanonHostInfo::NEUTRAL, -1, ""}, + + // Fullwidth exclamation mark is disallowed. UTS 46, table 4, row (b) + // However, we do allow this at the moment because we don't use + // STD3 rules and canonicalize full-width ASCII to ASCII. + {"wow\xef\xbc\x81", L"wow\xff01", "wow%21", Component(0, 6), + CanonHostInfo::NEUTRAL, -1, ""}, + // U+2132 (turned capital F) is disallowed. UTS 46, table 4, row (c) + // Allowed in IDNA 2003, but the mapping changed after Unicode 3.2 + {"\xe2\x84\xb2oo", L"\x2132oo", "%E2%84%B2oo", Component(0, 11), + CanonHostInfo::BROKEN, -1, ""}, + // U+2F868 (CJK Comp) is disallowed. UTS 46, table 4, row (d) + // Allowed in IDNA 2003, but the mapping changed after Unicode 3.2 + {"\xf0\xaf\xa1\xa8\xe5\xa7\xbb.cn", L"\xd87e\xdc68\x59fb.cn", + "%F0%AF%A1%A8%E5%A7%BB.cn", Component(0, 24), CanonHostInfo::BROKEN, -1, + ""}, + // Maps uppercase letters to lower case letters. UTS 46 table 4 row (e) + {"M\xc3\x9cNCHEN", L"M\xdcNCHEN", "xn--mnchen-3ya", Component(0, 14), + CanonHostInfo::NEUTRAL, -1, ""}, + // An already-IDNA host is not modified. + {"xn--mnchen-3ya", L"xn--mnchen-3ya", "xn--mnchen-3ya", Component(0, 14), + CanonHostInfo::NEUTRAL, -1, ""}, + // Symbol/punctuations are allowed in IDNA 2003/UTS46. + // Not allowed in IDNA 2008. UTS 46 table 4 row (f). + {"\xe2\x99\xa5ny.us", L"\x2665ny.us", "xn--ny-s0x.us", Component(0, 13), + CanonHostInfo::NEUTRAL, -1, ""}, + // U+11013 is new in Unicode 6.0 and is allowed. 
UTS 46 table 4, row (h) + // We used to allow it because we passed through unassigned code points. + {"\xf0\x91\x80\x93.com", L"\xd804\xdc13.com", "xn--n00d.com", + Component(0, 12), CanonHostInfo::NEUTRAL, -1, ""}, + // U+0602 is disallowed in UTS46/IDNA 2008. UTS 46 table 4, row(i) + // Used to be allowed in INDA 2003. + {"\xd8\x82.eg", L"\x602.eg", "%D8%82.eg", Component(0, 9), + CanonHostInfo::BROKEN, -1, ""}, + // U+20B7 is new in Unicode 5.2 (not a part of IDNA 2003 based + // on Unicode 3.2). We did allow it in the past because we let unassigned + // code point pass. We continue to allow it even though it's a + // "punctuation and symbol" blocked in IDNA 2008. + // UTS 46 table 4, row (j) + {"\xe2\x82\xb7.com", L"\x20b7.com", "xn--wzg.com", Component(0, 11), + CanonHostInfo::NEUTRAL, -1, ""}, + // Maps uppercase letters to lower case letters. + // In IDNA 2003, it's allowed without case-folding + // ( xn--bc-7cb.com ) because it's not defined in Unicode 3.2 + // (added in Unicode 4.1). UTS 46 table 4 row (k) + {"bc\xc8\xba.com", L"bc\x23a.com", "xn--bc-is1a.com", Component(0, 15), + CanonHostInfo::NEUTRAL, -1, ""}, + // Maps U+FF43 (Full Width Small Letter C) to 'c'. + {"ab\xef\xbd\x83.xyz", L"ab\xff43.xyz", "abc.xyz", Component(0, 7), + CanonHostInfo::NEUTRAL, -1, ""}, + // Maps U+1D68C (Math Monospace Small C) to 'c'. + // U+1D68C = \xD835\xDE8C in UTF-16 + {"ab\xf0\x9d\x9a\x8c.xyz", L"ab\xd835\xde8c.xyz", "abc.xyz", + Component(0, 7), CanonHostInfo::NEUTRAL, -1, ""}, + // BiDi check test + // "Divehi" in Divehi (Thaana script) ends with BidiClass=NSM. + // Disallowed in IDNA 2003 but now allowed in UTS 46/IDNA 2008. + {"\xde\x8b\xde\xa8\xde\x88\xde\xac\xde\x80\xde\xa8", + L"\x78b\x7a8\x788\x7ac\x780\x7a8", "xn--hqbpi0jcw", Component(0, 13), + CanonHostInfo::NEUTRAL, -1, ""}, + // Disallowed in both IDNA 2003 and 2008 with BiDi check. + // Labels starting with a RTL character cannot end with a LTR character. 
+ {"\xd8\xac\xd8\xa7\xd8\xb1xyz", L"\x62c\x627\x631xyz", + "%D8%AC%D8%A7%D8%B1xyz", Component(0, 21), CanonHostInfo::BROKEN, -1, + ""}, + // Labels starting with a RTL character can end with BC=EN (European + // number). Disallowed in IDNA 2003 but now allowed. + {"\xd8\xac\xd8\xa7\xd8\xb1" + "2", + L"\x62c\x627\x631" + L"2", + "xn--2-ymcov", Component(0, 11), CanonHostInfo::NEUTRAL, -1, ""}, + // Labels starting with a RTL character cannot have "L" characters + // even if it ends with an BC=EN. Disallowed in both IDNA 2003/2008. + {"\xd8\xac\xd8\xa7\xd8\xb1xy2", L"\x62c\x627\x631xy2", + "%D8%AC%D8%A7%D8%B1xy2", Component(0, 21), CanonHostInfo::BROKEN, -1, + ""}, + // Labels starting with a RTL character can end with BC=AN (Arabic number) + // Disallowed in IDNA 2003, but now allowed. + {"\xd8\xac\xd8\xa7\xd8\xb1\xd9\xa2", L"\x62c\x627\x631\x662", + "xn--mgbjq0r", Component(0, 11), CanonHostInfo::NEUTRAL, -1, ""}, + // Labels starting with a RTL character cannot have "L" characters + // even if it ends with an BC=AN (Arabic number). + // Disallowed in both IDNA 2003/2008. + {"\xd8\xac\xd8\xa7\xd8\xb1xy\xd9\xa2", L"\x62c\x627\x631xy\x662", + "%D8%AC%D8%A7%D8%B1xy%D9%A2", Component(0, 26), CanonHostInfo::BROKEN, + -1, ""}, + // Labels starting with a RTL character cannot mix BC=EN and BC=AN + {"\xd8\xac\xd8\xa7\xd8\xb1xy2\xd9\xa2", L"\x62c\x627\x631xy2\x662", + "%D8%AC%D8%A7%D8%B1xy2%D9%A2", Component(0, 27), CanonHostInfo::BROKEN, + -1, ""}, + // As of Unicode 6.2, U+20CF is not assigned. We do not allow it. + {"\xe2\x83\x8f.com", L"\x20cf.com", "%E2%83%8F.com", Component(0, 13), + CanonHostInfo::BROKEN, -1, ""}, + // U+0080 is not allowed. + {"\xc2\x80.com", L"\x80.com", "%C2%80.com", Component(0, 10), + CanonHostInfo::BROKEN, -1, ""}, + // Mixed UTF-8 and escaped UTF-8 (narrow case) and UTF-16 and escaped + // Mixed UTF-8 and escaped UTF-8 (narrow case) and UTF-16 and escaped + // UTF-8 (wide case). 
The output should be equivalent to the true wide + // character input above). + {"%E4%BD%A0%E5%A5%BD\xe4\xbd\xa0\xe5\xa5\xbd", + L"%E4%BD%A0%E5%A5%BD\x4f60\x597d", "xn--6qqa088eba", Component(0, 14), + CanonHostInfo::NEUTRAL, -1, ""}, + // Invalid escaped characters should fail and the percents should be + // escaped. + {"%zz%66%a", L"%zz%66%a", "%25zzf%25a", Component(0, 10), + CanonHostInfo::BROKEN, -1, ""}, + // If we get an invalid character that has been escaped. + {"%25", L"%25", "%25", Component(0, 3), CanonHostInfo::BROKEN, -1, ""}, + {"hello%00", L"hello%00", "hello%00", Component(0, 8), + CanonHostInfo::BROKEN, -1, ""}, + // Escaped numbers should be treated like IP addresses if they are. + {"%30%78%63%30%2e%30%32%35%30.01", L"%30%78%63%30%2e%30%32%35%30.01", + "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 3, "C0A80001"}, + {"%30%78%63%30%2e%30%32%35%30.01%2e", + L"%30%78%63%30%2e%30%32%35%30.01%2e", "192.168.0.1", Component(0, 11), + CanonHostInfo::IPV4, 3, "C0A80001"}, + // Invalid escaping should trigger the regular host error handling. + {"%3g%78%63%30%2e%30%32%35%30%2E.01", + L"%3g%78%63%30%2e%30%32%35%30%2E.01", "%253gxc0.0250..01", + Component(0, 17), CanonHostInfo::BROKEN, -1, ""}, + // Something that isn't exactly an IP should get treated as a host and + // spaces escaped. + {"192.168.0.1 hello", L"192.168.0.1 hello", "192.168.0.1%20hello", + Component(0, 19), CanonHostInfo::NEUTRAL, -1, ""}, + // Fullwidth and escaped UTF-8 fullwidth should still be treated as IP. + // These are "0Xc0.0250.01" in fullwidth. + {"\xef\xbc\x90%Ef%bc\xb8%ef%Bd%83\xef\xbc\x90%EF%BC%" + "8E\xef\xbc\x90\xef\xbc\x92\xef\xbc\x95\xef\xbc\x90\xef\xbc%" + "8E\xef\xbc\x90\xef\xbc\x91", + L"\xff10\xff38\xff43\xff10\xff0e\xff10\xff12\xff15\xff10\xff0e\xff10" + L"\xff11", + "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 3, "C0A80001"}, + // Broken IP addresses get marked as such. 
+ {"192.168.0.257", L"192.168.0.257", "192.168.0.257", Component(0, 13), + CanonHostInfo::BROKEN, -1, ""}, + {"[google.com]", L"[google.com]", "[google.com]", Component(0, 12), + CanonHostInfo::BROKEN, -1, ""}, + // Cyrillic letter followed by '(' should return punycode for '(' escaped + // before punycode string was created. I.e. + // if '(' is escaped after punycode is created we would get xn--%28-8tb + // (incorrect). + {"\xd1\x82(", L"\x0442(", "xn--%28-7ed", Component(0, 11), + CanonHostInfo::NEUTRAL, -1, ""}, + // Address with all hexadecimal characters with leading number of 1<<32 + // or greater and should return NEUTRAL rather than BROKEN if not all + // components are numbers. + {"12345678912345.de", L"12345678912345.de", "12345678912345.de", + Component(0, 17), CanonHostInfo::NEUTRAL, -1, ""}, + {"1.12345678912345.de", L"1.12345678912345.de", "1.12345678912345.de", + Component(0, 19), CanonHostInfo::NEUTRAL, -1, ""}, + {"12345678912345.12345678912345.de", L"12345678912345.12345678912345.de", + "12345678912345.12345678912345.de", Component(0, 32), + CanonHostInfo::NEUTRAL, -1, ""}, + {"1.2.0xB3A73CE5B59.de", L"1.2.0xB3A73CE5B59.de", "1.2.0xb3a73ce5b59.de", + Component(0, 20), CanonHostInfo::NEUTRAL, -1, ""}, + {"12345678912345.0xde", L"12345678912345.0xde", "12345678912345.0xde", + Component(0, 19), CanonHostInfo::BROKEN, -1, ""}, + // A label that starts with "xn--" but contains non-ASCII characters + // should + // be an error. Escape the invalid characters. + {"xn--m\xc3\xbcnchen", L"xn--m\xfcnchen", "xn--m%C3%BCnchen", + Component(0, 16), CanonHostInfo::BROKEN, -1, ""}, + }; + + // CanonicalizeHost() non-verbose. + std::string out_str; + for (size_t i = 0; i < std::size(host_cases); i++) { + // Narrow version. 
+    if (host_cases[i].input8) {
+      // Component lengths are ints, so narrow strlen's size_t explicitly.
+      int host_len = static_cast<int>(strlen(host_cases[i].input8));
+      Component in_comp(0, host_len);
+      Component out_comp;
+
+      out_str.clear();
+      StdStringCanonOutput output(&out_str);
+
+      bool success = CanonicalizeHost(host_cases[i].input8, in_comp, &output,
+                                      &out_comp);
+      output.Complete();
+
+      // Anything not expected to be BROKEN must canonicalize successfully.
+      EXPECT_EQ(host_cases[i].expected_family != CanonHostInfo::BROKEN,
+                success) << "for input: " << host_cases[i].input8;
+      EXPECT_EQ(std::string(host_cases[i].expected), out_str) <<
+                "for input: " << host_cases[i].input8;
+      EXPECT_EQ(host_cases[i].expected_component.begin, out_comp.begin) <<
+                "for input: " << host_cases[i].input8;
+      EXPECT_EQ(host_cases[i].expected_component.len, out_comp.len) <<
+                "for input: " << host_cases[i].input8;
+    }
+
+    // Wide version. The 8-bit input may be null for these cases, so failures
+    // are identified by case index instead of by input string.
+    if (host_cases[i].input16) {
+      std::u16string input16(
+          test_utils::TruncateWStringToUTF16(host_cases[i].input16));
+      int host_len = static_cast<int>(input16.length());
+      Component in_comp(0, host_len);
+      Component out_comp;
+
+      out_str.clear();
+      StdStringCanonOutput output(&out_str);
+
+      bool success = CanonicalizeHost(input16.c_str(), in_comp, &output,
+                                      &out_comp);
+      output.Complete();
+
+      EXPECT_EQ(host_cases[i].expected_family != CanonHostInfo::BROKEN,
+                success) << "for wide input, case index: " << i;
+      EXPECT_EQ(std::string(host_cases[i].expected), out_str) <<
+                "for wide input, case index: " << i;
+      EXPECT_EQ(host_cases[i].expected_component.begin, out_comp.begin) <<
+                "for wide input, case index: " << i;
+      EXPECT_EQ(host_cases[i].expected_component.len, out_comp.len) <<
+                "for wide input, case index: " << i;
+    }
+  }
+
+  // CanonicalizeHostVerbose()
+  for (size_t i = 0; i < std::size(host_cases); i++) {
+    // Narrow version.
+    if (host_cases[i].input8) {
+      // Component lengths are ints, so narrow strlen's size_t explicitly.
+      int host_len = static_cast<int>(strlen(host_cases[i].input8));
+      Component in_comp(0, host_len);
+
+      out_str.clear();
+      StdStringCanonOutput output(&out_str);
+      CanonHostInfo host_info;
+
+      CanonicalizeHostVerbose(host_cases[i].input8, in_comp, &output,
+                              &host_info);
+      output.Complete();
+
+      EXPECT_EQ(host_cases[i].expected_family, host_info.family);
+      EXPECT_EQ(std::string(host_cases[i].expected), out_str);
+      EXPECT_EQ(host_cases[i].expected_component.begin,
+                host_info.out_host.begin);
+      EXPECT_EQ(host_cases[i].expected_component.len, host_info.out_host.len);
+      EXPECT_EQ(std::string(host_cases[i].expected_address_hex),
+                BytesToHexString(host_info.address, host_info.AddressLength()));
+      // The IPv4 component count is only checked for expected-IPv4 cases.
+      if (host_cases[i].expected_family == CanonHostInfo::IPV4) {
+        EXPECT_EQ(host_cases[i].expected_num_ipv4_components,
+                  host_info.num_ipv4_components);
+      }
+    }
+
+    // Wide version.
+    if (host_cases[i].input16) {
+      std::u16string input16(
+          test_utils::TruncateWStringToUTF16(host_cases[i].input16));
+      int host_len = static_cast<int>(input16.length());
+      Component in_comp(0, host_len);
+
+      out_str.clear();
+      StdStringCanonOutput output(&out_str);
+      CanonHostInfo host_info;
+
+      CanonicalizeHostVerbose(input16.c_str(), in_comp, &output, &host_info);
+      output.Complete();
+
+      EXPECT_EQ(host_cases[i].expected_family, host_info.family);
+      EXPECT_EQ(std::string(host_cases[i].expected), out_str);
+      EXPECT_EQ(host_cases[i].expected_component.begin,
+                host_info.out_host.begin);
+      EXPECT_EQ(host_cases[i].expected_component.len, host_info.out_host.len);
+      EXPECT_EQ(std::string(host_cases[i].expected_address_hex),
+                BytesToHexString(host_info.address, host_info.AddressLength()));
+      if (host_cases[i].expected_family == CanonHostInfo::IPV4) {
+        EXPECT_EQ(host_cases[i].expected_num_ipv4_components,
+                  host_info.num_ipv4_components);
+      }
+    }
+  }
+}
+
+TEST(URLCanonTest, IPv4) {
+  // clang-format off
+  IPAddressCase cases[] = {
+    // Empty is not an IP address.
+ {"", L"", "", Component(), CanonHostInfo::NEUTRAL, -1, ""}, + {".", L".", "", Component(), CanonHostInfo::NEUTRAL, -1, ""}, + // Regular IP addresses in different bases. + {"192.168.0.1", L"192.168.0.1", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 4, "C0A80001"}, + {"0300.0250.00.01", L"0300.0250.00.01", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 4, "C0A80001"}, + {"0xC0.0Xa8.0x0.0x1", L"0xC0.0Xa8.0x0.0x1", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 4, "C0A80001"}, + // Non-IP addresses due to invalid characters. + {"192.168.9.com", L"192.168.9.com", "", Component(), CanonHostInfo::NEUTRAL, -1, ""}, + // Hostnames with a numeric final component but other components that don't + // parse as numbers should be considered broken. + {"19a.168.0.1", L"19a.168.0.1", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + {"19a.168.0.1.", L"19a.168.0.1.", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + {"0308.0250.00.01", L"0308.0250.00.01", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + {"0308.0250.00.01.", L"0308.0250.00.01.", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + {"0xCG.0xA8.0x0.0x1", L"0xCG.0xA8.0x0.0x1", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + {"0xCG.0xA8.0x0.0x1.", L"0xCG.0xA8.0x0.0x1.", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + // Non-numeric terminal compeonent should be considered not IPv4 hostnames, but valid. + {"19.168.0.1a", L"19.168.0.1a", "", Component(), CanonHostInfo::NEUTRAL, -1, ""}, + {"0xC.0xA8.0x0.0x1G", L"0xC.0xA8.0x0.0x1G", "", Component(), CanonHostInfo::NEUTRAL, -1, ""}, + // Hostnames that would be considered broken IPv4 hostnames should be considered valid non-IPv4 hostnames if they end with two dots instead of 0 or 1. 
+ {"19a.168.0.1..", L"19a.168.0.1..", "", Component(), CanonHostInfo::NEUTRAL, -1, ""}, + {"0308.0250.00.01..", L"0308.0250.00.01..", "", Component(), CanonHostInfo::NEUTRAL, -1, ""}, + {"0xCG.0xA8.0x0.0x1..", L"0xCG.0xA8.0x0.0x1..", "", Component(), CanonHostInfo::NEUTRAL, -1, ""}, + // Hosts with components that aren't considered valid IPv4 numbers but are entirely numeric should be considered invalid. + {"1.2.3.08", L"1.2.3.08", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + {"1.2.3.08.", L"1.2.3.08.", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + // If there are not enough components, the last one should fill them out. + {"192", L"192", "0.0.0.192", Component(0, 9), CanonHostInfo::IPV4, 1, "000000C0"}, + {"0xC0a80001", L"0xC0a80001", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 1, "C0A80001"}, + {"030052000001", L"030052000001", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 1, "C0A80001"}, + {"000030052000001", L"000030052000001", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 1, "C0A80001"}, + {"192.168", L"192.168", "192.0.0.168", Component(0, 11), CanonHostInfo::IPV4, 2, "C00000A8"}, + {"192.0x00A80001", L"192.0x000A80001", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 2, "C0A80001"}, + {"0xc0.052000001", L"0xc0.052000001", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 2, "C0A80001"}, + {"192.168.1", L"192.168.1", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 3, "C0A80001"}, + // Hostnames with too many components, but a numeric final numeric component are invalid. + {"192.168.0.0.1", L"192.168.0.0.1", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + // We allow a single trailing dot. + {"192.168.0.1.", L"192.168.0.1.", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 4, "C0A80001"}, + {"192.168.0.1. hello", L"192.168.0.1. 
hello", "", Component(), CanonHostInfo::NEUTRAL, -1, ""}, + {"192.168.0.1..", L"192.168.0.1..", "", Component(), CanonHostInfo::NEUTRAL, -1, ""}, + // Hosts with two dots in a row with a final numeric component are considered invalid. + {"192.168..1", L"192.168..1", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + {"192.168..1.", L"192.168..1.", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + // Any numerical overflow should be marked as BROKEN. + {"0x100.0", L"0x100.0", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + {"0x100.0.0", L"0x100.0.0", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + {"0x100.0.0.0", L"0x100.0.0.0", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + {"0.0x100.0.0", L"0.0x100.0.0", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + {"0.0.0x100.0", L"0.0.0x100.0", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + {"0.0.0.0x100", L"0.0.0.0x100", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + {"0.0.0x10000", L"0.0.0x10000", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + {"0.0x1000000", L"0.0x1000000", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + {"0x100000000", L"0x100000000", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + // Repeat the previous tests, minus 1, to verify boundaries. 
+ {"0xFF.0", L"0xFF.0", "255.0.0.0", Component(0, 9), CanonHostInfo::IPV4, 2, "FF000000"}, + {"0xFF.0.0", L"0xFF.0.0", "255.0.0.0", Component(0, 9), CanonHostInfo::IPV4, 3, "FF000000"}, + {"0xFF.0.0.0", L"0xFF.0.0.0", "255.0.0.0", Component(0, 9), CanonHostInfo::IPV4, 4, "FF000000"}, + {"0.0xFF.0.0", L"0.0xFF.0.0", "0.255.0.0", Component(0, 9), CanonHostInfo::IPV4, 4, "00FF0000"}, + {"0.0.0xFF.0", L"0.0.0xFF.0", "0.0.255.0", Component(0, 9), CanonHostInfo::IPV4, 4, "0000FF00"}, + {"0.0.0.0xFF", L"0.0.0.0xFF", "0.0.0.255", Component(0, 9), CanonHostInfo::IPV4, 4, "000000FF"}, + {"0.0.0xFFFF", L"0.0.0xFFFF", "0.0.255.255", Component(0, 11), CanonHostInfo::IPV4, 3, "0000FFFF"}, + {"0.0xFFFFFF", L"0.0xFFFFFF", "0.255.255.255", Component(0, 13), CanonHostInfo::IPV4, 2, "00FFFFFF"}, + {"0xFFFFFFFF", L"0xFFFFFFFF", "255.255.255.255", Component(0, 15), CanonHostInfo::IPV4, 1, "FFFFFFFF"}, + // Old trunctations tests. They're all "BROKEN" now. + {"276.256.0xf1a2.077777", L"276.256.0xf1a2.077777", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + {"192.168.0.257", L"192.168.0.257", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + {"192.168.0xa20001", L"192.168.0xa20001", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + {"192.015052000001", L"192.015052000001", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + {"0X12C0a80001", L"0X12C0a80001", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + {"276.1.2", L"276.1.2", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + // Too many components should be rejected, in valid ranges or not. + {"255.255.255.255.255", L"255.255.255.255.255", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + {"256.256.256.256.256", L"256.256.256.256.256", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + // Spaces should be rejected. + {"192.168.0.1 hello", L"192.168.0.1 hello", "", Component(), CanonHostInfo::NEUTRAL, -1, ""}, + // Very large numbers. 
+ {"0000000000000300.0x00000000000000fF.00000000000000001", L"0000000000000300.0x00000000000000fF.00000000000000001", "192.255.0.1", Component(0, 11), CanonHostInfo::IPV4, 3, "C0FF0001"}, + {"0000000000000300.0xffffffffFFFFFFFF.3022415481470977", L"0000000000000300.0xffffffffFFFFFFFF.3022415481470977", "", Component(0, 11), CanonHostInfo::BROKEN, -1, ""}, + // A number has no length limit, but long numbers can still overflow. + {"00000000000000000001", L"00000000000000000001", "0.0.0.1", Component(0, 7), CanonHostInfo::IPV4, 1, "00000001"}, + {"0000000000000000100000000000000001", L"0000000000000000100000000000000001", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + // If a long component is non-numeric, it's a hostname, *not* a broken IP. + {"0.0.0.000000000000000000z", L"0.0.0.000000000000000000z", "", Component(), CanonHostInfo::NEUTRAL, -1, ""}, + {"0.0.0.100000000000000000z", L"0.0.0.100000000000000000z", "", Component(), CanonHostInfo::NEUTRAL, -1, ""}, + // Truncation of all zeros should still result in 0. + {"0.00.0x.0x0", L"0.00.0x.0x0", "0.0.0.0", Component(0, 7), CanonHostInfo::IPV4, 4, "00000000"}, + // Non-ASCII characters in final component should return NEUTRAL. + {"1.2.3.\xF0\x9F\x92\xA9", L"1.2.3.\xD83D\xDCA9", "", Component(), CanonHostInfo::NEUTRAL, -1, ""}, + {"1.2.3.4\xF0\x9F\x92\xA9", L"1.2.3.4\xD83D\xDCA9", "", Component(), CanonHostInfo::NEUTRAL, -1, ""}, + {"1.2.3.0x\xF0\x9F\x92\xA9", L"1.2.3.0x\xD83D\xDCA9", "", Component(), CanonHostInfo::NEUTRAL, -1, ""}, + {"1.2.3.0\xF0\x9F\x92\xA9", L"1.2.3.0\xD83D\xDCA9", "", Component(), CanonHostInfo::NEUTRAL, -1, ""}, + // Non-ASCII characters in other components should result in broken IPs when final component is numeric. 
+    {"1.2.\xF0\x9F\x92\xA9.4", L"1.2.\xD83D\xDCA9.4", "", Component(), CanonHostInfo::BROKEN, -1, ""},
+    {"1.2.3\xF0\x9F\x92\xA9.4", L"1.2.3\xD83D\xDCA9.4", "", Component(), CanonHostInfo::BROKEN, -1, ""},
+    {"1.2.0x\xF0\x9F\x92\xA9.4", L"1.2.0x\xD83D\xDCA9.4", "", Component(), CanonHostInfo::BROKEN, -1, ""},
+    {"1.2.0\xF0\x9F\x92\xA9.4", L"1.2.0\xD83D\xDCA9.4", "", Component(), CanonHostInfo::BROKEN, -1, ""},
+    {"\xF0\x9F\x92\xA9.2.3.4", L"\xD83D\xDCA9.2.3.4", "", Component(), CanonHostInfo::BROKEN, -1, ""},
+  };
+  // clang-format on
+
+  for (const auto& test_case : cases) {
+    SCOPED_TRACE(test_case.input8);
+
+    // 8-bit version.
+    // Component lengths are ints, so narrow strlen's size_t explicitly.
+    Component component(0, static_cast<int>(strlen(test_case.input8)));
+
+    std::string out_str1;
+    StdStringCanonOutput output1(&out_str1);
+    CanonHostInfo host_info;
+    CanonicalizeIPAddress(test_case.input8, component, &output1, &host_info);
+    output1.Complete();
+
+    EXPECT_EQ(test_case.expected_family, host_info.family);
+    EXPECT_EQ(std::string(test_case.expected_address_hex),
+              BytesToHexString(host_info.address, host_info.AddressLength()));
+    // The output string, component and component count are only checked when
+    // the input was actually recognized as IPv4.
+    if (host_info.family == CanonHostInfo::IPV4) {
+      EXPECT_STREQ(test_case.expected, out_str1.c_str());
+      EXPECT_EQ(test_case.expected_component.begin, host_info.out_host.begin);
+      EXPECT_EQ(test_case.expected_component.len, host_info.out_host.len);
+      EXPECT_EQ(test_case.expected_num_ipv4_components,
+                host_info.num_ipv4_components);
+    }
+
+    // 16-bit version.
+ std::u16string input16( + test_utils::TruncateWStringToUTF16(test_case.input16)); + component = Component(0, static_cast<int>(input16.length())); + + std::string out_str2; + StdStringCanonOutput output2(&out_str2); + CanonicalizeIPAddress(input16.c_str(), component, &output2, &host_info); + output2.Complete(); + + EXPECT_EQ(test_case.expected_family, host_info.family); + EXPECT_EQ(std::string(test_case.expected_address_hex), + BytesToHexString(host_info.address, host_info.AddressLength())); + if (host_info.family == CanonHostInfo::IPV4) { + EXPECT_STREQ(test_case.expected, out_str2.c_str()); + EXPECT_EQ(test_case.expected_component.begin, host_info.out_host.begin); + EXPECT_EQ(test_case.expected_component.len, host_info.out_host.len); + EXPECT_EQ(test_case.expected_num_ipv4_components, + host_info.num_ipv4_components); + } + } +} + +class URLCanonIPv6Test + : public ::testing::Test, + public ::testing::WithParamInterface<bool> { + public: + URLCanonIPv6Test() { + if (GetParam()) { + scoped_feature_list_.InitAndEnableFeature(kStrictIPv4EmbeddedIPv6AddressParsing); + } else { + scoped_feature_list_.InitAndDisableFeature(kStrictIPv4EmbeddedIPv6AddressParsing); + } + } + + private: + base::test::ScopedFeatureList scoped_feature_list_; +}; + +INSTANTIATE_TEST_SUITE_P(All, + URLCanonIPv6Test, + ::testing::Bool()); + +TEST_P(URLCanonIPv6Test, IPv6) { + bool strict_ipv4_embedded_ipv6_parsing = + base::FeatureList::IsEnabled(url::kStrictIPv4EmbeddedIPv6AddressParsing); + + IPAddressCase cases[] = { + // Empty is not an IP address. + {"", L"", "", Component(), CanonHostInfo::NEUTRAL, -1, ""}, + // Non-IPs with [:] characters are marked BROKEN.
+ {":", L":", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + {"[", L"[", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + {"[:", L"[:", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + {"]", L"]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + {":]", L":]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + {"[]", L"[]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + {"[:]", L"[:]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + // Regular IP address is invalid without bounding '[' and ']'. + {"2001:db8::1", L"2001:db8::1", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + {"[2001:db8::1", L"[2001:db8::1", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + {"2001:db8::1]", L"2001:db8::1]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + // Regular IP addresses. + {"[::]", L"[::]", "[::]", Component(0,4), CanonHostInfo::IPV6, -1, "00000000000000000000000000000000"}, + {"[::1]", L"[::1]", "[::1]", Component(0,5), CanonHostInfo::IPV6, -1, "00000000000000000000000000000001"}, + {"[1::]", L"[1::]", "[1::]", Component(0,5), CanonHostInfo::IPV6, -1, "00010000000000000000000000000000"}, + + // Leading zeros should be stripped. + {"[000:01:02:003:004:5:6:007]", L"[000:01:02:003:004:5:6:007]", "[0:1:2:3:4:5:6:7]", Component(0,17), CanonHostInfo::IPV6, -1, "00000001000200030004000500060007"}, + + // Upper case letters should be lowercased. + {"[A:b:c:DE:fF:0:1:aC]", L"[A:b:c:DE:fF:0:1:aC]", "[a:b:c:de:ff:0:1:ac]", Component(0,20), CanonHostInfo::IPV6, -1, "000A000B000C00DE00FF0000000100AC"}, + + // The same address can be written with different contractions, but should + // get canonicalized to the same thing. + {"[1:0:0:2::3:0]", L"[1:0:0:2::3:0]", "[1::2:0:0:3:0]", Component(0,14), CanonHostInfo::IPV6, -1, "00010000000000020000000000030000"}, + {"[1::2:0:0:3:0]", L"[1::2:0:0:3:0]", "[1::2:0:0:3:0]", Component(0,14), CanonHostInfo::IPV6, -1, "00010000000000020000000000030000"}, + + // Addresses with embedded IPv4. 
+ {"[::192.168.0.1]", L"[::192.168.0.1]", "[::c0a8:1]", Component(0,10), CanonHostInfo::IPV6, -1, "000000000000000000000000C0A80001"}, + {"[::ffff:192.168.0.1]", L"[::ffff:192.168.0.1]", "[::ffff:c0a8:1]", Component(0,15), CanonHostInfo::IPV6, -1, "00000000000000000000FFFFC0A80001"}, + {"[::eeee:192.168.0.1]", L"[::eeee:192.168.0.1]", "[::eeee:c0a8:1]", Component(0, 15), CanonHostInfo::IPV6, -1, "00000000000000000000EEEEC0A80001"}, + {"[2001::192.168.0.1]", L"[2001::192.168.0.1]", "[2001::c0a8:1]", Component(0, 14), CanonHostInfo::IPV6, -1, "200100000000000000000000C0A80001"}, + {"[1:2:192.168.0.1:5:6]", L"[1:2:192.168.0.1:5:6]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + + // IPv4 embedded IPv6 addresses + {"[::ffff:192.1.2]", + L"[::ffff:192.1.2]", + "[::ffff:c001:2]", + strict_ipv4_embedded_ipv6_parsing ? Component() : Component(0,15), + strict_ipv4_embedded_ipv6_parsing ? CanonHostInfo::BROKEN : CanonHostInfo::IPV6, + -1, + (strict_ipv4_embedded_ipv6_parsing ? "" : "00000000000000000000FFFFC0010002")}, + {"[::ffff:192.1]", + L"[::ffff:192.1]", + "[::ffff:c000:1]", + strict_ipv4_embedded_ipv6_parsing ? Component() : Component(0,15), + strict_ipv4_embedded_ipv6_parsing ? CanonHostInfo::BROKEN : CanonHostInfo::IPV6, + -1, + (strict_ipv4_embedded_ipv6_parsing ? "" : "00000000000000000000FFFFC0000001")}, + {"[::ffff:192.1.2.3.4]", + L"[::ffff:192.1.2.3.4]", + "", Component(), CanonHostInfo::BROKEN, -1, ""}, + + // IPv4 using hex. + // TODO(eroman): Should this format be disallowed? + {"[::ffff:0xC0.0Xa8.0x0.0x1]", L"[::ffff:0xC0.0Xa8.0x0.0x1]", "[::ffff:c0a8:1]", Component(0,15), CanonHostInfo::IPV6, -1, "00000000000000000000FFFFC0A80001"}, + + // There may be zeros surrounding the "::" contraction. 
+ {"[0:0::0:0:8]", L"[0:0::0:0:8]", "[::8]", Component(0,5), CanonHostInfo::IPV6, -1, "00000000000000000000000000000008"}, + + {"[2001:db8::1]", L"[2001:db8::1]", "[2001:db8::1]", Component(0,13), CanonHostInfo::IPV6, -1, "20010DB8000000000000000000000001"}, + + // Can only have one "::" contraction in an IPv6 string literal. + {"[2001::db8::1]", L"[2001::db8::1]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + // No more than 2 consecutive ':'s. + {"[2001:db8:::1]", L"[2001:db8:::1]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + {"[:::]", L"[:::]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + // Non-IP addresses due to invalid characters. + {"[2001::.com]", L"[2001::.com]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + // If there are not enough components, the last one should fill them out. + // ... omitted at this time ... + // Too many components means not an IP address. Similarly, with too few + // if using IPv4 compat or mapped addresses. + {"[::192.168.0.0.1]", L"[::192.168.0.0.1]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + {"[::ffff:192.168.0.0.1]", L"[::ffff:192.168.0.0.1]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + {"[1:2:3:4:5:6:7:8:9]", L"[1:2:3:4:5:6:7:8:9]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + // Too many bits (even though 8 components, the last one holds 32 bits). + {"[0:0:0:0:0:0:0:192.168.0.1]", L"[0:0:0:0:0:0:0:192.168.0.1]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + + // Too many bits specified -- the contraction would have to be zero-length + // to not exceed 128 bits. + {"[1:2:3:4:5:6::192.168.0.1]", L"[1:2:3:4:5:6::192.168.0.1]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + + // The contraction is for 16 bits of zero. + {"[1:2:3:4:5:6::8]", L"[1:2:3:4:5:6::8]", "[1:2:3:4:5:6:0:8]", Component(0,17), CanonHostInfo::IPV6, -1, "00010002000300040005000600000008"}, + + // Cannot have a trailing colon.
+ {"[1:2:3:4:5:6:7:8:]", L"[1:2:3:4:5:6:7:8:]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + {"[1:2:3:4:5:6:192.168.0.1:]", L"[1:2:3:4:5:6:192.168.0.1:]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + + // Cannot have negative numbers. + {"[-1:2:3:4:5:6:7:8]", L"[-1:2:3:4:5:6:7:8]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + + // Scope ID -- the URL may contain an optional ["%" <scope_id>] section. + // The scope_id should be included in the canonicalized URL, and is an + // unsigned decimal number. + + // Invalid because no ID was given after the percent. + + // Don't allow scope-id + {"[1::%1]", L"[1::%1]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + {"[1::%eth0]", L"[1::%eth0]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + {"[1::%]", L"[1::%]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + {"[%]", L"[%]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + {"[::%:]", L"[::%:]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + + // Don't allow leading or trailing colons. + {"[:0:0::0:0:8]", L"[:0:0::0:0:8]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + {"[0:0::0:0:8:]", L"[0:0::0:0:8:]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + {"[:0:0::0:0:8:]", L"[:0:0::0:0:8:]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + + // We allow a single trailing dot. + // ... omitted at this time ... + // Two dots in a row means not an IP address. + {"[::192.168..1]", L"[::192.168..1]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + // Any non-first components get truncated to one byte. + // ... omitted at this time ... + // Spaces should be rejected. + {"[::1 hello]", L"[::1 hello]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, + }; + + for (size_t i = 0; i < std::size(cases); i++) { + // 8-bit version.
+ Component component(0, static_cast<int>(strlen(cases[i].input8))); + + std::string out_str1; + StdStringCanonOutput output1(&out_str1); + CanonHostInfo host_info; + CanonicalizeIPAddress(cases[i].input8, component, &output1, &host_info); + output1.Complete(); + + EXPECT_EQ(cases[i].expected_family, host_info.family); + EXPECT_EQ(std::string(cases[i].expected_address_hex), + BytesToHexString(host_info.address, host_info.AddressLength())) << "iter " << i << " host " << cases[i].input8; + if (host_info.family == CanonHostInfo::IPV6) { + EXPECT_STREQ(cases[i].expected, out_str1.c_str()); + EXPECT_EQ(cases[i].expected_component.begin, + host_info.out_host.begin); + EXPECT_EQ(cases[i].expected_component.len, host_info.out_host.len); + } + + // 16-bit version. + std::u16string input16( + test_utils::TruncateWStringToUTF16(cases[i].input16)); + component = Component(0, static_cast<int>(input16.length())); + + std::string out_str2; + StdStringCanonOutput output2(&out_str2); + CanonicalizeIPAddress(input16.c_str(), component, &output2, &host_info); + output2.Complete(); + + EXPECT_EQ(cases[i].expected_family, host_info.family); + EXPECT_EQ(std::string(cases[i].expected_address_hex), + BytesToHexString(host_info.address, host_info.AddressLength())); + if (host_info.family == CanonHostInfo::IPV6) { + EXPECT_STREQ(cases[i].expected, out_str2.c_str()); + EXPECT_EQ(cases[i].expected_component.begin, host_info.out_host.begin); + EXPECT_EQ(cases[i].expected_component.len, host_info.out_host.len); + } + } +} + +TEST(URLCanonTest, IPEmpty) { + std::string out_str1; + StdStringCanonOutput output1(&out_str1); + CanonHostInfo host_info; + + // Invalid and zero-length components must never be reported as IP addresses.
+ const char spec[] = "192.168.0.1"; + CanonicalizeIPAddress(spec, Component(), &output1, &host_info); + EXPECT_FALSE(host_info.IsIPAddress()); + + CanonicalizeIPAddress(spec, Component(0, 0), &output1, &host_info); + EXPECT_FALSE(host_info.IsIPAddress()); +} + +// Verifies that CanonicalizeHostSubstring produces the expected output and +// does not "fix" IP addresses. Because this code is a subset of +// CanonicalizeHost, the shared functionality is not tested. +TEST(URLCanonTest, CanonicalizeHostSubstring) { + // Basic sanity check. + { + std::string out_str; + StdStringCanonOutput output(&out_str); + EXPECT_TRUE(CanonicalizeHostSubstring("M\xc3\x9cNCHEN.com", + Component(0, 12), &output)); + output.Complete(); + EXPECT_EQ("xn--mnchen-3ya.com", out_str); + } + + // Failure case. + { + std::string out_str; + StdStringCanonOutput output(&out_str); + EXPECT_FALSE(CanonicalizeHostSubstring( + test_utils::TruncateWStringToUTF16(L"\xfdd0zyx.com").c_str(), + Component(0, 8), &output)); + output.Complete(); + EXPECT_EQ("%EF%BF%BDzyx.com", out_str); + } + + // Should return true for empty input strings. + { + std::string out_str; + StdStringCanonOutput output(&out_str); + EXPECT_TRUE(CanonicalizeHostSubstring("", Component(0, 0), &output)); + output.Complete(); + EXPECT_EQ(std::string(), out_str); + } + + // Numbers that look like IP addresses should not be changed. + { + std::string out_str; + StdStringCanonOutput output(&out_str); + EXPECT_TRUE( + CanonicalizeHostSubstring("01.02.03.04", Component(0, 11), &output)); + output.Complete(); + EXPECT_EQ("01.02.03.04", out_str); + } +} + +TEST(URLCanonTest, UserInfo) { + // Note that the canonicalizer should escape and treat empty components as + // not being there. + + // We actually parse a full input URL so we can get the initial components. 
+ struct UserComponentCase { + const char* input; + const char* expected; + Component expected_username; + Component expected_password; + bool expected_success; + } user_info_cases[] = { + {"http://user:pass@host.com/", "user:pass@", Component(0, 4), Component(5, 4), true}, + {"http://@host.com/", "", Component(0, -1), Component(0, -1), true}, + {"http://:@host.com/", "", Component(0, -1), Component(0, -1), true}, + {"http://foo:@host.com/", "foo@", Component(0, 3), Component(0, -1), true}, + {"http://:foo@host.com/", ":foo@", Component(0, 0), Component(1, 3), true}, + {"http://^ :$\t@host.com/", "%5E%20:$%09@", Component(0, 6), Component(7, 4), true}, + {"http://user:pass@/", "user:pass@", Component(0, 4), Component(5, 4), true}, + {"http://%2540:bar@domain.com/", "%2540:bar@", Component(0, 5), Component(6, 3), true }, + + // IE7 compatibility: old versions allowed backslashes in usernames, but + // IE7 does not. We disallow it as well. + {"ftp://me\\mydomain:pass@foo.com/", "", Component(0, -1), Component(0, -1), true}, + }; + + for (size_t i = 0; i < std::size(user_info_cases); i++) { + int url_len = static_cast<int>(strlen(user_info_cases[i].input)); + Parsed parsed; + ParseStandardURL(user_info_cases[i].input, url_len, &parsed); + Component out_user, out_pass; + std::string out_str; + StdStringCanonOutput output1(&out_str); + + bool success = CanonicalizeUserInfo(user_info_cases[i].input, + parsed.username, + user_info_cases[i].input, + parsed.password, + &output1, + &out_user, + &out_pass); + output1.Complete(); + + EXPECT_EQ(user_info_cases[i].expected_success, success); + EXPECT_EQ(std::string(user_info_cases[i].expected), out_str); + EXPECT_EQ(user_info_cases[i].expected_username.begin, out_user.begin); + EXPECT_EQ(user_info_cases[i].expected_username.len, out_user.len); + EXPECT_EQ(user_info_cases[i].expected_password.begin, out_pass.begin); + EXPECT_EQ(user_info_cases[i].expected_password.len, out_pass.len); + + // Now try the wide version + out_str.clear();
+ StdStringCanonOutput output2(&out_str); + std::u16string wide_input(base::UTF8ToUTF16(user_info_cases[i].input)); + success = CanonicalizeUserInfo(wide_input.c_str(), + parsed.username, + wide_input.c_str(), + parsed.password, + &output2, + &out_user, + &out_pass); + output2.Complete(); + + EXPECT_EQ(user_info_cases[i].expected_success, success); + EXPECT_EQ(std::string(user_info_cases[i].expected), out_str); + EXPECT_EQ(user_info_cases[i].expected_username.begin, out_user.begin); + EXPECT_EQ(user_info_cases[i].expected_username.len, out_user.len); + EXPECT_EQ(user_info_cases[i].expected_password.begin, out_pass.begin); + EXPECT_EQ(user_info_cases[i].expected_password.len, out_pass.len); + } +} + +TEST(URLCanonTest, Port) { + // We only need to test that the number gets properly put into the output + // buffer. The parser unit tests will test scanning the number correctly. + // + // Note that the CanonicalizePort will always prepend a colon to the output + // to separate it from the colon that it assumes precedes it. + struct PortCase { + const char* input; + int default_port; + const char* expected; + Component expected_component; + bool expected_success; + } port_cases[] = { + // Invalid input should be copied w/ failure. + {"as df", 80, ":as%20df", Component(1, 7), false}, + {"-2", 80, ":-2", Component(1, 2), false}, + // Default port should be omitted. + {"80", 80, "", Component(0, -1), true}, + {"8080", 80, ":8080", Component(1, 4), true}, + // PORT_UNSPECIFIED should mean always keep the port. 
+ {"80", PORT_UNSPECIFIED, ":80", Component(1, 2), true}, + }; + + for (size_t i = 0; i < std::size(port_cases); i++) { + int url_len = static_cast<int>(strlen(port_cases[i].input)); + Component in_comp(0, url_len); + Component out_comp; + std::string out_str; + StdStringCanonOutput output1(&out_str); + bool success = CanonicalizePort(port_cases[i].input, + in_comp, + port_cases[i].default_port, + &output1, + &out_comp); + output1.Complete(); + + EXPECT_EQ(port_cases[i].expected_success, success); + EXPECT_EQ(std::string(port_cases[i].expected), out_str); + EXPECT_EQ(port_cases[i].expected_component.begin, out_comp.begin); + EXPECT_EQ(port_cases[i].expected_component.len, out_comp.len); + + // Now try the wide version + out_str.clear(); + StdStringCanonOutput output2(&out_str); + std::u16string wide_input(base::UTF8ToUTF16(port_cases[i].input)); + success = CanonicalizePort(wide_input.c_str(), + in_comp, + port_cases[i].default_port, + &output2, + &out_comp); + output2.Complete(); + + EXPECT_EQ(port_cases[i].expected_success, success); + EXPECT_EQ(std::string(port_cases[i].expected), out_str); + EXPECT_EQ(port_cases[i].expected_component.begin, out_comp.begin); + EXPECT_EQ(port_cases[i].expected_component.len, out_comp.len); + } +} + +DualComponentCase kCommonPathCases[] = { + // ----- path collapsing tests ----- + {"/././foo", L"/././foo", "/foo", Component(0, 4), true}, + {"/./.foo", L"/./.foo", "/.foo", Component(0, 5), true}, + {"/foo/.", L"/foo/.", "/foo/", Component(0, 5), true}, + {"/foo/./", L"/foo/./", "/foo/", Component(0, 5), true}, + // double dots followed by a slash or the end of the string count + {"/foo/bar/..", L"/foo/bar/..", "/foo/", Component(0, 5), true}, + {"/foo/bar/../", L"/foo/bar/../", "/foo/", Component(0, 5), true}, + // don't count double dots when they aren't followed by a slash + {"/foo/..bar", L"/foo/..bar", "/foo/..bar", Component(0, 10), true}, + // some in the middle + {"/foo/bar/../ton", L"/foo/bar/../ton", "/foo/ton", Component(0, 
8), true}, + {"/foo/bar/../ton/../../a", L"/foo/bar/../ton/../../a", "/a", + Component(0, 2), true}, + // we should not be able to go above the root + {"/foo/../../..", L"/foo/../../..", "/", Component(0, 1), true}, + {"/foo/../../../ton", L"/foo/../../../ton", "/ton", Component(0, 4), true}, + // escaped dots should be unescaped and treated the same as dots + {"/foo/%2e", L"/foo/%2e", "/foo/", Component(0, 5), true}, + {"/foo/%2e%2", L"/foo/%2e%2", "/foo/.%2", Component(0, 8), true}, + {"/foo/%2e./%2e%2e/.%2e/%2e.bar", L"/foo/%2e./%2e%2e/.%2e/%2e.bar", + "/..bar", Component(0, 6), true}, + // Multiple slashes in a row should be preserved and treated like empty + // directory names. + {"////../..", L"////../..", "//", Component(0, 2), true}, + + // ----- escaping tests ----- + {"/foo", L"/foo", "/foo", Component(0, 4), true}, + // Valid escape sequence + {"/%20foo", L"/%20foo", "/%20foo", Component(0, 7), true}, + // Invalid escape sequence we should pass through unchanged. + {"/foo%", L"/foo%", "/foo%", Component(0, 5), true}, + {"/foo%2", L"/foo%2", "/foo%2", Component(0, 6), true}, + // Invalid escape sequence: bad characters should be treated the same as + // the surrounding text, not as escaped (in this case, UTF-8). + {"/foo%2zbar", L"/foo%2zbar", "/foo%2zbar", Component(0, 10), true}, + {"/foo%2\xc2\xa9zbar", nullptr, "/foo%2%C2%A9zbar", Component(0, 16), true}, + {nullptr, L"/foo%2\xc2\xa9zbar", "/foo%2%C3%82%C2%A9zbar", Component(0, 22), + true}, + // Regular characters that are escaped should be unescaped + {"/foo%41%7a", L"/foo%41%7a", "/fooAz", Component(0, 6), true}, + // Funny characters that are unescaped should be escaped + {"/foo\x09\x91%91", nullptr, "/foo%09%91%91", Component(0, 13), true}, + {nullptr, L"/foo\x09\x91%91", "/foo%09%C2%91%91", Component(0, 16), true}, + // Invalid characters that are escaped should cause a failure. 
+ {"/foo%00%51", L"/foo%00%51", "/foo%00Q", Component(0, 8), false}, + // Some characters should be passed through unchanged regardless of esc. + {"/(%28:%3A%29)", L"/(%28:%3A%29)", "/(%28:%3A%29)", Component(0, 13), + true}, + // Characters that are properly escaped should not have the case changed + // of hex letters. + {"/%3A%3a%3C%3c", L"/%3A%3a%3C%3c", "/%3A%3a%3C%3c", Component(0, 13), + true}, + // Funny characters that are unescaped should be escaped + {"/foo\tbar", L"/foo\tbar", "/foo%09bar", Component(0, 10), true}, + // Backslashes should get converted to forward slashes + {"\\foo\\bar", L"\\foo\\bar", "/foo/bar", Component(0, 8), true}, + // Hashes found in paths (possibly only when the caller explicitly sets + // the path on an already-parsed URL) should be escaped. + {"/foo#bar", L"/foo#bar", "/foo%23bar", Component(0, 10), true}, + // %7f should be allowed and %3D should not be unescaped (these were wrong + // in a previous version). + {"/%7Ffp3%3Eju%3Dduvgw%3Dd", L"/%7Ffp3%3Eju%3Dduvgw%3Dd", + "/%7Ffp3%3Eju%3Dduvgw%3Dd", Component(0, 24), true}, + // @ should be passed through unchanged (escaped or unescaped). + {"/@asdf%40", L"/@asdf%40", "/@asdf%40", Component(0, 9), true}, + // Nested escape sequences should result in escaping the leading '%' if + // unescaping would result in a new escape sequence. + {"/%A%42", L"/%A%42", "/%25AB", Component(0, 6), true}, + {"/%%41B", L"/%%41B", "/%25AB", Component(0, 6), true}, + {"/%%41%42", L"/%%41%42", "/%25AB", Component(0, 6), true}, + // Make sure truncated "nested" escapes don't result in reading off the + // string end. + {"/%%41", L"/%%41", "/%A", Component(0, 3), true}, + // Don't unescape the leading '%' if unescaping doesn't result in a valid + // new escape sequence. + {"/%%470", L"/%%470", "/%G0", Component(0, 4), true}, + {"/%%2D%41", L"/%%2D%41", "/%-A", Component(0, 4), true}, + // Don't erroneously downcast a UTF-16 character in a way that makes it + // look like part of an escape sequence. 
+ {nullptr, L"/%%41\x0130", "/%A%C4%B0", Component(0, 9), true}, + + // ----- encoding tests ----- + // Basic conversions + {"/\xe4\xbd\xa0\xe5\xa5\xbd\xe4\xbd\xa0\xe5\xa5\xbd", + L"/\x4f60\x597d\x4f60\x597d", "/%E4%BD%A0%E5%A5%BD%E4%BD%A0%E5%A5%BD", + Component(0, 37), true}, + // Invalid unicode characters should fail. We only do validation on + // UTF-16 input, so this doesn't happen on 8-bit. + {"/\xef\xb7\x90zyx", nullptr, "/%EF%B7%90zyx", Component(0, 13), true}, + {nullptr, L"/\xfdd0zyx", "/%EF%BF%BDzyx", Component(0, 13), false}, +}; + +typedef bool (*CanonFunc8Bit)(const char*, + const Component&, + CanonOutput*, + Component*); +typedef bool (*CanonFunc16Bit)(const char16_t*, + const Component&, + CanonOutput*, + Component*); + +void DoPathTest(const DualComponentCase* path_cases, + size_t num_cases, + CanonFunc8Bit canon_func_8, + CanonFunc16Bit canon_func_16) { + for (size_t i = 0; i < num_cases; i++) { + testing::Message scope_message; + scope_message << path_cases[i].input8 << "," << path_cases[i].input16; + SCOPED_TRACE(scope_message); + if (path_cases[i].input8) { + int len = static_cast<int>(strlen(path_cases[i].input8)); + Component in_comp(0, len); + Component out_comp; + std::string out_str; + StdStringCanonOutput output(&out_str); + bool success = + canon_func_8(path_cases[i].input8, in_comp, &output, &out_comp); + output.Complete(); + + EXPECT_EQ(path_cases[i].expected_success, success); + EXPECT_EQ(path_cases[i].expected_component.begin, out_comp.begin); + EXPECT_EQ(path_cases[i].expected_component.len, out_comp.len); + EXPECT_EQ(path_cases[i].expected, out_str); + } + + if (path_cases[i].input16) { + std::u16string input16( + test_utils::TruncateWStringToUTF16(path_cases[i].input16)); + int len = static_cast<int>(input16.length()); + Component in_comp(0, len); + Component out_comp; + std::string out_str; + StdStringCanonOutput output(&out_str); + + bool success = + canon_func_16(input16.c_str(), in_comp, &output, &out_comp); + output.Complete(); + + 
EXPECT_EQ(path_cases[i].expected_success, success); + EXPECT_EQ(path_cases[i].expected_component.begin, out_comp.begin); + EXPECT_EQ(path_cases[i].expected_component.len, out_comp.len); + EXPECT_EQ(path_cases[i].expected, out_str); + } + } +} + +TEST(URLCanonTest, Path) { + DoPathTest(kCommonPathCases, std::size(kCommonPathCases), CanonicalizePath, + CanonicalizePath); + + // Manual test: embedded NULLs should be escaped and the URL should be marked + // as invalid. + const char path_with_null[] = "/ab\0c"; + Component in_comp(0, 5); + Component out_comp; + + std::string out_str; + StdStringCanonOutput output(&out_str); + bool success = CanonicalizePath(path_with_null, in_comp, &output, &out_comp); + output.Complete(); + EXPECT_FALSE(success); + EXPECT_EQ("/ab%00c", out_str); +} + +TEST(URLCanonTest, PartialPath) { + DualComponentCase partial_path_cases[] = { + {".html", L".html", ".html", Component(0, 5), true}, + {"", L"", "", Component(0, 0), true}, + }; + + DoPathTest(kCommonPathCases, std::size(kCommonPathCases), + CanonicalizePartialPath, CanonicalizePartialPath); + DoPathTest(partial_path_cases, std::size(partial_path_cases), + CanonicalizePartialPath, CanonicalizePartialPath); +} + +TEST(URLCanonTest, Query) { + struct QueryCase { + const char* input8; + const wchar_t* input16; + const char* expected; + } query_cases[] = { + // Regular ASCII case. + {"foo=bar", L"foo=bar", "?foo=bar"}, + // Allow question marks in the query without escaping + {"as?df", L"as?df", "?as?df"}, + // Always escape '#' since it would mark the ref. + {"as#df", L"as#df", "?as%23df"}, + // Escape some questionable 8-bit characters, but never unescape. + {"\x02hello\x7f bye", L"\x02hello\x7f bye", "?%02hello%7F%20bye"}, + {"%40%41123", L"%40%41123", "?%40%41123"}, + // Chinese input/output + {"q=\xe4\xbd\xa0\xe5\xa5\xbd", L"q=\x4f60\x597d", "?q=%E4%BD%A0%E5%A5%BD"}, + // Invalid UTF-8/16 input should be replaced with invalid characters. 
+ {"q=\xed\xed", L"q=\xd800\xd800", "?q=%EF%BF%BD%EF%BF%BD"}, + // Don't allow < or > because sometimes they are used for XSS if the + // URL is echoed in content. Firefox does this, IE doesn't. + {"q=<asdf>", L"q=<asdf>", "?q=%3Casdf%3E"}, + // Escape double quotemarks in the query. + {"q=\"asdf\"", L"q=\"asdf\"", "?q=%22asdf%22"}, + }; + + for (size_t i = 0; i < std::size(query_cases); i++) { + Component out_comp; + + if (query_cases[i].input8) { + int len = static_cast<int>(strlen(query_cases[i].input8)); + Component in_comp(0, len); + std::string out_str; + + StdStringCanonOutput output(&out_str); + CanonicalizeQuery(query_cases[i].input8, in_comp, NULL, &output, + &out_comp); + output.Complete(); + + EXPECT_EQ(query_cases[i].expected, out_str); + } + + if (query_cases[i].input16) { + std::u16string input16( + test_utils::TruncateWStringToUTF16(query_cases[i].input16)); + int len = static_cast<int>(input16.length()); + Component in_comp(0, len); + std::string out_str; + + StdStringCanonOutput output(&out_str); + CanonicalizeQuery(input16.c_str(), in_comp, NULL, &output, &out_comp); + output.Complete(); + + EXPECT_EQ(query_cases[i].expected, out_str); + } + } + + // Extra test for input with embedded NULL; + std::string out_str; + StdStringCanonOutput output(&out_str); + Component out_comp; + CanonicalizeQuery("a \x00z\x01", Component(0, 5), NULL, &output, &out_comp); + output.Complete(); + EXPECT_EQ("?a%20%00z%01", out_str); +} + +TEST(URLCanonTest, Ref) { + // Refs are trivial, it just checks the encoding. + DualComponentCase ref_cases[] = { + {"hello!", L"hello!", "#hello!", Component(1, 6), true}, + // We should escape spaces, double-quotes, angled braces, and backticks.
+ {"hello, world", L"hello, world", "#hello,%20world", Component(1, 14), + true}, + {"hello,\"world", L"hello,\"world", "#hello,%22world", Component(1, 14), + true}, + {"hello,<world", L"hello,<world", "#hello,%3Cworld", Component(1, 14), + true}, + {"hello,>world", L"hello,>world", "#hello,%3Eworld", Component(1, 14), + true}, + {"hello,`world", L"hello,`world", "#hello,%60world", Component(1, 14), + true}, + // UTF-8/wide input should be preserved + {"\xc2\xa9", L"\xa9", "#%C2%A9", Component(1, 6), true}, + // Test a character that takes > 16 bits (U+10300 = old italic letter A) + {"\xF0\x90\x8C\x80ss", L"\xd800\xdf00ss", "#%F0%90%8C%80ss", + Component(1, 14), true}, + // Escaping should be preserved unchanged, even invalid ones + {"%41%a", L"%41%a", "#%41%a", Component(1, 5), true}, + // Invalid UTF-8/16 input should be flagged and the input made valid + {"\xc2", nullptr, "#%EF%BF%BD", Component(1, 9), true}, + {nullptr, L"\xd800\x597d", "#%EF%BF%BD%E5%A5%BD", Component(1, 18), true}, + // Test a Unicode invalid character. + {"a\xef\xb7\x90", L"a\xfdd0", "#a%EF%BF%BD", Component(1, 10), true}, + // Refs can have # signs and we should preserve them.
+ {"asdf#qwer", L"asdf#qwer", "#asdf#qwer", Component(1, 9), true}, + {"#asdf", L"#asdf", "##asdf", Component(1, 5), true}, + }; + + for (size_t i = 0; i < std::size(ref_cases); i++) { + // 8-bit input + if (ref_cases[i].input8) { + int len = static_cast<int>(strlen(ref_cases[i].input8)); + Component in_comp(0, len); + Component out_comp; + + std::string out_str; + StdStringCanonOutput output(&out_str); + CanonicalizeRef(ref_cases[i].input8, in_comp, &output, &out_comp); + output.Complete(); + + EXPECT_EQ(ref_cases[i].expected_component.begin, out_comp.begin); + EXPECT_EQ(ref_cases[i].expected_component.len, out_comp.len); + EXPECT_EQ(ref_cases[i].expected, out_str); + } + + // 16-bit input + if (ref_cases[i].input16) { + std::u16string input16( + test_utils::TruncateWStringToUTF16(ref_cases[i].input16)); + int len = static_cast<int>(input16.length()); + Component in_comp(0, len); + Component out_comp; + + std::string out_str; + StdStringCanonOutput output(&out_str); + CanonicalizeRef(input16.c_str(), in_comp, &output, &out_comp); + output.Complete(); + + EXPECT_EQ(ref_cases[i].expected_component.begin, out_comp.begin); + EXPECT_EQ(ref_cases[i].expected_component.len, out_comp.len); + EXPECT_EQ(ref_cases[i].expected, out_str); + } + } + + // Try one with an embedded NUL. It should be escaped as %00, not stripped. + const char null_input[5] = "ab\x00z"; + Component null_input_component(0, 4); + Component out_comp; + + std::string out_str; + StdStringCanonOutput output(&out_str); + CanonicalizeRef(null_input, null_input_component, &output, &out_comp); + output.Complete(); + + EXPECT_EQ(1, out_comp.begin); + EXPECT_EQ(6, out_comp.len); + EXPECT_EQ("#ab%00z", out_str); +} + +TEST(URLCanonTest, CanonicalizeStandardURL) { + // The individual component canonicalize tests should have caught the cases + // for each of those components. Here, we just need to test that the various + // parts are included or excluded properly, and have the correct separators.
+ struct URLCase { + const char* input; + const char* expected; + bool expected_success; + } cases[] = { + {"http://www.google.com/foo?bar=baz#", + "http://www.google.com/foo?bar=baz#", true}, + {"http://[www.google.com]/", "http://[www.google.com]/", false}, + {"ht\ttp:@www.google.com:80/;p?#", "ht%09tp://www.google.com:80/;p?#", + false}, + {"http:////////user:@google.com:99?foo", "http://user@google.com:99/?foo", + true}, + {"www.google.com", ":www.google.com/", false}, + {"http://192.0x00A80001", "http://192.168.0.1/", true}, + {"http://www/foo%2Ehtml", "http://www/foo.html", true}, + {"http://user:pass@/", "http://user:pass@/", false}, + {"http://%25DOMAIN:foobar@foodomain.com/", + "http://%25DOMAIN:foobar@foodomain.com/", true}, + + // Backslashes should get converted to forward slashes. + {"http:\\\\www.google.com\\foo", "http://www.google.com/foo", true}, + + // Busted refs shouldn't make the whole thing fail. + {"http://www.google.com/asdf#\xc2", + "http://www.google.com/asdf#%EF%BF%BD", true}, + + // Basic port tests. + {"http://foo:80/", "http://foo/", true}, + {"http://foo:81/", "http://foo:81/", true}, + {"httpa://foo:80/", "httpa://foo:80/", true}, + {"http://foo:-80/", "http://foo:-80/", false}, + + {"https://foo:443/", "https://foo/", true}, + {"https://foo:80/", "https://foo:80/", true}, + {"ftp://foo:21/", "ftp://foo/", true}, + {"ftp://foo:80/", "ftp://foo:80/", true}, + {"gopher://foo:70/", "gopher://foo:70/", true}, + {"gopher://foo:443/", "gopher://foo:443/", true}, + {"ws://foo:80/", "ws://foo/", true}, + {"ws://foo:81/", "ws://foo:81/", true}, + {"ws://foo:443/", "ws://foo:443/", true}, + {"ws://foo:815/", "ws://foo:815/", true}, + {"wss://foo:80/", "wss://foo:80/", true}, + {"wss://foo:81/", "wss://foo:81/", true}, + {"wss://foo:443/", "wss://foo/", true}, + {"wss://foo:815/", "wss://foo:815/", true}, + + // This particular code path ends up "backing up" to replace an invalid + // host ICU generated with an escaped version. 
Test that in the context + // of a full URL to make sure the backing up doesn't mess up the non-host + // parts of the URL. "EF B9 AA" is U+FE6A which is a type of percent that + // ICU will convert to an ASCII one, generating "%81". + {"ws:)W\x1eW\xef\xb9\xaa" + "81:80/", + "ws://%29w%1ew%81/", false}, + // Regression test for the last_invalid_percent_index bug described in + // https://crbug.com/1080890#c10. + {R"(HTTP:S/5%\../>%41)", "http://s/%3EA", true}, + }; + + for (size_t i = 0; i < std::size(cases); i++) { + int url_len = static_cast(strlen(cases[i].input)); + Parsed parsed; + ParseStandardURL(cases[i].input, url_len, &parsed); + + Parsed out_parsed; + std::string out_str; + StdStringCanonOutput output(&out_str); + bool success = CanonicalizeStandardURL( + cases[i].input, url_len, parsed, + SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION, NULL, &output, &out_parsed); + output.Complete(); + + EXPECT_EQ(cases[i].expected_success, success); + EXPECT_EQ(cases[i].expected, out_str); + } +} + +// The codepath here is the same as for regular canonicalization, so we just +// need to test that things are replaced or not correctly. +TEST(URLCanonTest, ReplaceStandardURL) { + ReplaceCase replace_cases[] = { + // Common case of truncating the path. + {"http://www.google.com/foo?bar=baz#ref", nullptr, nullptr, nullptr, + nullptr, nullptr, "/", kDeleteComp, kDeleteComp, + "http://www.google.com/"}, + // Replace everything + {"http://a:b@google.com:22/foo;bar?baz@cat", "https", "me", "pw", + "host.com", "99", "/path", "query", "ref", + "https://me:pw@host.com:99/path?query#ref"}, + // Replace nothing + {"http://a:b@google.com:22/foo?baz@cat", nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, nullptr, nullptr, + "http://a:b@google.com:22/foo?baz@cat"}, + // Replace scheme with filesystem. The result is garbage, but you asked + // for it. 
+ {"http://a:b@google.com:22/foo?baz@cat", "filesystem", nullptr, nullptr, + nullptr, nullptr, nullptr, nullptr, nullptr, + "filesystem://a:b@google.com:22/foo?baz@cat"}, + }; + + for (size_t i = 0; i < std::size(replace_cases); i++) { + const ReplaceCase& cur = replace_cases[i]; + int base_len = static_cast(strlen(cur.base)); + Parsed parsed; + ParseStandardURL(cur.base, base_len, &parsed); + + Replacements r; + typedef Replacements R; // Clean up syntax. + + // Note that for the scheme we pass in a different clear function since + // there is no function to clear the scheme. + SetupReplComp(&R::SetScheme, &R::ClearRef, &r, cur.scheme); + SetupReplComp(&R::SetUsername, &R::ClearUsername, &r, cur.username); + SetupReplComp(&R::SetPassword, &R::ClearPassword, &r, cur.password); + SetupReplComp(&R::SetHost, &R::ClearHost, &r, cur.host); + SetupReplComp(&R::SetPort, &R::ClearPort, &r, cur.port); + SetupReplComp(&R::SetPath, &R::ClearPath, &r, cur.path); + SetupReplComp(&R::SetQuery, &R::ClearQuery, &r, cur.query); + SetupReplComp(&R::SetRef, &R::ClearRef, &r, cur.ref); + + std::string out_str; + StdStringCanonOutput output(&out_str); + Parsed out_parsed; + ReplaceStandardURL(replace_cases[i].base, parsed, r, + SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION, NULL, + &output, &out_parsed); + output.Complete(); + + EXPECT_EQ(replace_cases[i].expected, out_str); + } + + // The path pointer should be ignored if the address is invalid. + { + const char src[] = "http://www.google.com/here_is_the_path"; + int src_len = static_cast(strlen(src)); + + Parsed parsed; + ParseStandardURL(src, src_len, &parsed); + + // Replace the path to 0 length string. By using 1 as the string address, + // the test should get an access violation if it tries to dereference it. 
+ Replacements r; + r.SetPath(reinterpret_cast(0x00000001), Component(0, 0)); + std::string out_str1; + StdStringCanonOutput output1(&out_str1); + Parsed new_parsed; + ReplaceStandardURL(src, parsed, r, + SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION, NULL, + &output1, &new_parsed); + output1.Complete(); + EXPECT_STREQ("http://www.google.com/", out_str1.c_str()); + + // Same with an "invalid" path. + r.SetPath(reinterpret_cast(0x00000001), Component()); + std::string out_str2; + StdStringCanonOutput output2(&out_str2); + ReplaceStandardURL(src, parsed, r, + SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION, NULL, + &output2, &new_parsed); + output2.Complete(); + EXPECT_STREQ("http://www.google.com/", out_str2.c_str()); + } +} + +TEST(URLCanonTest, ReplaceFileURL) { + ReplaceCase replace_cases[] = { + // Replace everything + {"file:///C:/gaba?query#ref", nullptr, nullptr, nullptr, "filer", nullptr, + "/foo", "b", "c", "file://filer/foo?b#c"}, + // Replace nothing + {"file:///C:/gaba?query#ref", nullptr, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, "file:///C:/gaba?query#ref"}, + {"file:///Y:", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, "file:///Y:"}, + {"file:///Y:/", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, "file:///Y:/"}, + {"file:///./Y", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, "file:///Y"}, + {"file:///./Y:", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, "file:///Y:"}, + // Clear non-path components (common) + {"file:///C:/gaba?query#ref", nullptr, nullptr, nullptr, nullptr, nullptr, + nullptr, kDeleteComp, kDeleteComp, "file:///C:/gaba"}, + // Replace path with something that doesn't begin with a slash and make + // sure it gets added properly. 
+ {"file:///C:/gaba", nullptr, nullptr, nullptr, nullptr, nullptr, + "interesting/", nullptr, nullptr, "file:///interesting/"}, + {"file:///home/gaba?query#ref", nullptr, nullptr, nullptr, "filer", + nullptr, "/foo", "b", "c", "file://filer/foo?b#c"}, + {"file:///home/gaba?query#ref", nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, nullptr, "file:///home/gaba?query#ref"}, + {"file:///home/gaba?query#ref", nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, kDeleteComp, kDeleteComp, "file:///home/gaba"}, + {"file:///home/gaba", nullptr, nullptr, nullptr, nullptr, nullptr, + "interesting/", nullptr, nullptr, "file:///interesting/"}, + // Replace scheme -- shouldn't do anything. + {"file:///C:/gaba?query#ref", "http", nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, "file:///C:/gaba?query#ref"}, + }; + + for (size_t i = 0; i < std::size(replace_cases); i++) { + const ReplaceCase& cur = replace_cases[i]; + SCOPED_TRACE(cur.base); + int base_len = static_cast(strlen(cur.base)); + Parsed parsed; + ParseFileURL(cur.base, base_len, &parsed); + + Replacements r; + typedef Replacements R; // Clean up syntax. 
+ SetupReplComp(&R::SetScheme, &R::ClearRef, &r, cur.scheme); + SetupReplComp(&R::SetUsername, &R::ClearUsername, &r, cur.username); + SetupReplComp(&R::SetPassword, &R::ClearPassword, &r, cur.password); + SetupReplComp(&R::SetHost, &R::ClearHost, &r, cur.host); + SetupReplComp(&R::SetPort, &R::ClearPort, &r, cur.port); + SetupReplComp(&R::SetPath, &R::ClearPath, &r, cur.path); + SetupReplComp(&R::SetQuery, &R::ClearQuery, &r, cur.query); + SetupReplComp(&R::SetRef, &R::ClearRef, &r, cur.ref); + + std::string out_str; + StdStringCanonOutput output(&out_str); + Parsed out_parsed; + ReplaceFileURL(cur.base, parsed, r, NULL, &output, &out_parsed); + output.Complete(); + + EXPECT_EQ(replace_cases[i].expected, out_str); + } +} + +TEST(URLCanonTest, ReplaceFileSystemURL) { + ReplaceCase replace_cases[] = { + // Replace everything in the outer URL. + {"filesystem:file:///temporary/gaba?query#ref", nullptr, nullptr, nullptr, + nullptr, nullptr, "/foo", "b", "c", + "filesystem:file:///temporary/foo?b#c"}, + // Replace nothing + {"filesystem:file:///temporary/gaba?query#ref", nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, nullptr, nullptr, + "filesystem:file:///temporary/gaba?query#ref"}, + // Clear non-path components (common) + {"filesystem:file:///temporary/gaba?query#ref", nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, kDeleteComp, kDeleteComp, + "filesystem:file:///temporary/gaba"}, + // Replace path with something that doesn't begin with a slash and make + // sure it gets added properly. + {"filesystem:file:///temporary/gaba?query#ref", nullptr, nullptr, nullptr, + nullptr, nullptr, "interesting/", nullptr, nullptr, + "filesystem:file:///temporary/interesting/?query#ref"}, + // Replace scheme -- shouldn't do anything except canonicalize. 
+ {"filesystem:http://u:p@bar.com/t/gaba?query#ref", "http", nullptr, + nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, + "filesystem:http://bar.com/t/gaba?query#ref"}, + // Replace username -- shouldn't do anything except canonicalize. + {"filesystem:http://u:p@bar.com/t/gaba?query#ref", nullptr, "u2", nullptr, + nullptr, nullptr, nullptr, nullptr, nullptr, + "filesystem:http://bar.com/t/gaba?query#ref"}, + // Replace password -- shouldn't do anything except canonicalize. + {"filesystem:http://u:p@bar.com/t/gaba?query#ref", nullptr, nullptr, + "pw2", nullptr, nullptr, nullptr, nullptr, nullptr, + "filesystem:http://bar.com/t/gaba?query#ref"}, + // Replace host -- shouldn't do anything except canonicalize. + {"filesystem:http://u:p@bar.com:80/t/gaba?query#ref", nullptr, nullptr, + nullptr, "foo.com", nullptr, nullptr, nullptr, nullptr, + "filesystem:http://bar.com/t/gaba?query#ref"}, + // Replace port -- shouldn't do anything except canonicalize. + {"filesystem:http://u:p@bar.com:40/t/gaba?query#ref", nullptr, nullptr, + nullptr, nullptr, "41", nullptr, nullptr, nullptr, + "filesystem:http://bar.com:40/t/gaba?query#ref"}, + }; + + for (size_t i = 0; i < std::size(replace_cases); i++) { + const ReplaceCase& cur = replace_cases[i]; + int base_len = static_cast(strlen(cur.base)); + Parsed parsed; + ParseFileSystemURL(cur.base, base_len, &parsed); + + Replacements r; + typedef Replacements R; // Clean up syntax. 
+ SetupReplComp(&R::SetScheme, &R::ClearRef, &r, cur.scheme); + SetupReplComp(&R::SetUsername, &R::ClearUsername, &r, cur.username); + SetupReplComp(&R::SetPassword, &R::ClearPassword, &r, cur.password); + SetupReplComp(&R::SetHost, &R::ClearHost, &r, cur.host); + SetupReplComp(&R::SetPort, &R::ClearPort, &r, cur.port); + SetupReplComp(&R::SetPath, &R::ClearPath, &r, cur.path); + SetupReplComp(&R::SetQuery, &R::ClearQuery, &r, cur.query); + SetupReplComp(&R::SetRef, &R::ClearRef, &r, cur.ref); + + std::string out_str; + StdStringCanonOutput output(&out_str); + Parsed out_parsed; + ReplaceFileSystemURL(cur.base, parsed, r, NULL, &output, &out_parsed); + output.Complete(); + + EXPECT_EQ(replace_cases[i].expected, out_str); + } +} + +TEST(URLCanonTest, ReplacePathURL) { + ReplaceCase replace_cases[] = { + // Replace everything + {"data:foo", "javascript", nullptr, nullptr, nullptr, nullptr, + "alert('foo?');", nullptr, nullptr, "javascript:alert('foo?');"}, + // Replace nothing + {"data:foo", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, "data:foo"}, + // Replace one or the other + {"data:foo", "javascript", nullptr, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, "javascript:foo"}, + {"data:foo", nullptr, nullptr, nullptr, nullptr, nullptr, "bar", nullptr, + nullptr, "data:bar"}, + {"data:foo", nullptr, nullptr, nullptr, nullptr, nullptr, kDeleteComp, + nullptr, nullptr, "data:"}, + }; + + for (size_t i = 0; i < std::size(replace_cases); i++) { + const ReplaceCase& cur = replace_cases[i]; + int base_len = static_cast(strlen(cur.base)); + Parsed parsed; + ParsePathURL(cur.base, base_len, false, &parsed); + + Replacements r; + typedef Replacements R; // Clean up syntax. 
+ SetupReplComp(&R::SetScheme, &R::ClearRef, &r, cur.scheme); + SetupReplComp(&R::SetUsername, &R::ClearUsername, &r, cur.username); + SetupReplComp(&R::SetPassword, &R::ClearPassword, &r, cur.password); + SetupReplComp(&R::SetHost, &R::ClearHost, &r, cur.host); + SetupReplComp(&R::SetPort, &R::ClearPort, &r, cur.port); + SetupReplComp(&R::SetPath, &R::ClearPath, &r, cur.path); + SetupReplComp(&R::SetQuery, &R::ClearQuery, &r, cur.query); + SetupReplComp(&R::SetRef, &R::ClearRef, &r, cur.ref); + + std::string out_str; + StdStringCanonOutput output(&out_str); + Parsed out_parsed; + ReplacePathURL(cur.base, parsed, r, &output, &out_parsed); + output.Complete(); + + EXPECT_EQ(replace_cases[i].expected, out_str); + } +} + +TEST(URLCanonTest, ReplaceMailtoURL) { + ReplaceCase replace_cases[] = { + // Replace everything + {"mailto:jon@foo.com?body=sup", "mailto", NULL, NULL, NULL, NULL, "addr1", "to=tony", NULL, "mailto:addr1?to=tony"}, + // Replace nothing + {"mailto:jon@foo.com?body=sup", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "mailto:jon@foo.com?body=sup"}, + // Replace the path + {"mailto:jon@foo.com?body=sup", NULL, NULL, NULL, NULL, NULL, "jason", NULL, NULL, "mailto:jason?body=sup"}, + // Replace the query + {"mailto:jon@foo.com?body=sup", NULL, NULL, NULL, NULL, NULL, NULL, "custom=1", NULL, "mailto:jon@foo.com?custom=1"}, + // Replace the path and query + {"mailto:jon@foo.com?body=sup", NULL, NULL, NULL, NULL, NULL, "jason", "custom=1", NULL, "mailto:jason?custom=1"}, + // Set the query to empty (should leave trailing question mark) + {"mailto:jon@foo.com?body=sup", NULL, NULL, NULL, NULL, NULL, NULL, "", NULL, "mailto:jon@foo.com?"}, + // Clear the query + {"mailto:jon@foo.com?body=sup", NULL, NULL, NULL, NULL, NULL, NULL, "|", NULL, "mailto:jon@foo.com"}, + // Clear the path + {"mailto:jon@foo.com?body=sup", NULL, NULL, NULL, NULL, NULL, "|", NULL, NULL, "mailto:?body=sup"}, + // Clear the path + query + {"mailto:", NULL, NULL, NULL, NULL, NULL, "|", 
"|", NULL, "mailto:"}, + // Setting the ref should have no effect + {"mailto:addr1", NULL, NULL, NULL, NULL, NULL, NULL, NULL, "BLAH", "mailto:addr1"}, + }; + + for (size_t i = 0; i < std::size(replace_cases); i++) { + const ReplaceCase& cur = replace_cases[i]; + int base_len = static_cast(strlen(cur.base)); + Parsed parsed; + ParseMailtoURL(cur.base, base_len, &parsed); + + Replacements r; + typedef Replacements R; + SetupReplComp(&R::SetScheme, &R::ClearRef, &r, cur.scheme); + SetupReplComp(&R::SetUsername, &R::ClearUsername, &r, cur.username); + SetupReplComp(&R::SetPassword, &R::ClearPassword, &r, cur.password); + SetupReplComp(&R::SetHost, &R::ClearHost, &r, cur.host); + SetupReplComp(&R::SetPort, &R::ClearPort, &r, cur.port); + SetupReplComp(&R::SetPath, &R::ClearPath, &r, cur.path); + SetupReplComp(&R::SetQuery, &R::ClearQuery, &r, cur.query); + SetupReplComp(&R::SetRef, &R::ClearRef, &r, cur.ref); + + std::string out_str; + StdStringCanonOutput output(&out_str); + Parsed out_parsed; + ReplaceMailtoURL(cur.base, parsed, r, &output, &out_parsed); + output.Complete(); + + EXPECT_EQ(replace_cases[i].expected, out_str); + } +} + +TEST(URLCanonTest, CanonicalizeFileURL) { + struct URLCase { + const char* input; + const char* expected; + bool expected_success; + Component expected_host; + Component expected_path; + } cases[] = { +#ifdef _WIN32 + // Windows-style paths + {"file:c:\\foo\\bar.html", "file:///C:/foo/bar.html", true, Component(), + Component(7, 16)}, + {" File:c|////foo\\bar.html", "file:///C:////foo/bar.html", true, + Component(), Component(7, 19)}, + {"file:", "file:///", true, Component(), Component(7, 1)}, + {"file:UNChost/path", "file://unchost/path", true, Component(7, 7), + Component(14, 5)}, + // CanonicalizeFileURL supports absolute Windows style paths for IE + // compatibility. Note that the caller must decide that this is a file + // URL itself so it can call the file canonicalizer. 
This is usually + // done automatically as part of relative URL resolving. + {"c:\\foo\\bar", "file:///C:/foo/bar", true, Component(), + Component(7, 11)}, + {"C|/foo/bar", "file:///C:/foo/bar", true, Component(), Component(7, 11)}, + {"/C|\\foo\\bar", "file:///C:/foo/bar", true, Component(), + Component(7, 11)}, + {"//C|/foo/bar", "file:///C:/foo/bar", true, Component(), + Component(7, 11)}, + {"//server/file", "file://server/file", true, Component(7, 6), + Component(13, 5)}, + {"\\\\server\\file", "file://server/file", true, Component(7, 6), + Component(13, 5)}, + {"/\\server/file", "file://server/file", true, Component(7, 6), + Component(13, 5)}, + // We should preserve the number of slashes after the colon for IE + // compatibility, except when there is none, in which case we should + // add one. + {"file:c:foo/bar.html", "file:///C:/foo/bar.html", true, Component(), + Component(7, 16)}, + {"file:/\\/\\C:\\\\//foo\\bar.html", "file:///C:////foo/bar.html", true, + Component(), Component(7, 19)}, + // Three slashes should be non-UNC, even if there is no drive spec (IE + // does this, which makes the resulting request invalid). + {"file:///foo/bar.txt", "file:///foo/bar.txt", true, Component(), + Component(7, 12)}, + // TODO(brettw) we should probably fail for invalid host names, which + // would change the expected result on this test. We also currently allow + // colon even though it's probably invalid, because its currently the + // "natural" result of the way the canonicalizer is written. There doesn't + // seem to be a strong argument for why allowing it here would be bad, so + // we just tolerate it and the load will fail later. 
+ {"FILE:/\\/\\7:\\\\//foo\\bar.html", "file://7:////foo/bar.html", false, + Component(7, 2), Component(9, 16)}, + {"file:filer/home\\me", "file://filer/home/me", true, Component(7, 5), + Component(12, 8)}, + // Make sure relative paths can't go above the "C:" + {"file:///C:/foo/../../../bar.html", "file:///C:/bar.html", true, + Component(), Component(7, 12)}, + // Busted refs shouldn't make the whole thing fail. + {"file:///C:/asdf#\xc2", "file:///C:/asdf#%EF%BF%BD", true, Component(), + Component(7, 8)}, + {"file:///./s:", "file:///S:", true, Component(), Component(7, 3)}, +#else + // Unix-style paths + {"file:///home/me", "file:///home/me", true, Component(), + Component(7, 8)}, + // Windowsy ones should get still treated as Unix-style. + {"file:c:\\foo\\bar.html", "file:///c:/foo/bar.html", true, Component(), + Component(7, 16)}, + {"file:c|//foo\\bar.html", "file:///c%7C//foo/bar.html", true, + Component(), Component(7, 19)}, + {"file:///./s:", "file:///s:", true, Component(), Component(7, 3)}, + // file: tests from WebKit (LayoutTests/fast/loader/url-parse-1.html) + {"//", "file:///", true, Component(), Component(7, 1)}, + {"///", "file:///", true, Component(), Component(7, 1)}, + {"///test", "file:///test", true, Component(), Component(7, 5)}, + {"file://test", "file://test/", true, Component(7, 4), Component(11, 1)}, + {"file://localhost", "file://localhost/", true, Component(7, 9), + Component(16, 1)}, + {"file://localhost/", "file://localhost/", true, Component(7, 9), + Component(16, 1)}, + {"file://localhost/test", "file://localhost/test", true, Component(7, 9), + Component(16, 5)}, +#endif // _WIN32 + }; + + for (size_t i = 0; i < std::size(cases); i++) { + int url_len = static_cast(strlen(cases[i].input)); + Parsed parsed; + ParseFileURL(cases[i].input, url_len, &parsed); + + Parsed out_parsed; + std::string out_str; + StdStringCanonOutput output(&out_str); + bool success = CanonicalizeFileURL(cases[i].input, url_len, parsed, NULL, + &output, 
&out_parsed); + output.Complete(); + + EXPECT_EQ(cases[i].expected_success, success); + EXPECT_EQ(cases[i].expected, out_str); + + // Make sure the spec was properly identified, the file canonicalizer has + // different code for writing the spec. + EXPECT_EQ(0, out_parsed.scheme.begin); + EXPECT_EQ(4, out_parsed.scheme.len); + + EXPECT_EQ(cases[i].expected_host.begin, out_parsed.host.begin); + EXPECT_EQ(cases[i].expected_host.len, out_parsed.host.len); + + EXPECT_EQ(cases[i].expected_path.begin, out_parsed.path.begin); + EXPECT_EQ(cases[i].expected_path.len, out_parsed.path.len); + } +} + +TEST(URLCanonTest, CanonicalizeFileSystemURL) { + struct URLCase { + const char* input; + const char* expected; + bool expected_success; + } cases[] = { + {"Filesystem:htTp://www.Foo.com:80/tempoRary", + "filesystem:http://www.foo.com/tempoRary/", true}, + {"filesystem:httpS://www.foo.com/temporary/", + "filesystem:https://www.foo.com/temporary/", true}, + {"filesystem:http://www.foo.com//", "filesystem:http://www.foo.com//", + false}, + {"filesystem:http://www.foo.com/persistent/bob?query#ref", + "filesystem:http://www.foo.com/persistent/bob?query#ref", true}, + {"filesystem:fIle://\\temporary/", "filesystem:file:///temporary/", true}, + {"filesystem:fiLe:///temporary", "filesystem:file:///temporary/", true}, + {"filesystem:File:///temporary/Bob?qUery#reF", + "filesystem:file:///temporary/Bob?qUery#reF", true}, + {"FilEsysteM:htTp:E=/.", "filesystem:http://e%3D//", false}, + }; + + for (size_t i = 0; i < std::size(cases); i++) { + int url_len = static_cast(strlen(cases[i].input)); + Parsed parsed; + ParseFileSystemURL(cases[i].input, url_len, &parsed); + + Parsed out_parsed; + std::string out_str; + StdStringCanonOutput output(&out_str); + bool success = CanonicalizeFileSystemURL(cases[i].input, url_len, parsed, + NULL, &output, &out_parsed); + output.Complete(); + + EXPECT_EQ(cases[i].expected_success, success); + EXPECT_EQ(cases[i].expected, out_str); + + // Make sure the spec 
was properly identified, the filesystem canonicalizer + // has different code for writing the spec. + EXPECT_EQ(0, out_parsed.scheme.begin); + EXPECT_EQ(10, out_parsed.scheme.len); + if (success) + EXPECT_GT(out_parsed.path.len, 0); + } +} + +TEST(URLCanonTest, CanonicalizePathURL) { + // Path URLs should get canonicalized schemes but nothing else. + struct PathCase { + const char* input; + const char* expected; + } path_cases[] = { + {"javascript:", "javascript:"}, + {"JavaScript:Foo", "javascript:Foo"}, + {"Foo:\":This /is interesting;?#", "foo:\":This /is interesting;?#"}, + + // Validation errors should not cause failure. See + // https://crbug.com/925614. + {"javascript:\uFFFF", "javascript:%EF%BF%BD"}, + }; + + for (size_t i = 0; i < std::size(path_cases); i++) { + int url_len = static_cast(strlen(path_cases[i].input)); + Parsed parsed; + ParsePathURL(path_cases[i].input, url_len, true, &parsed); + + Parsed out_parsed; + std::string out_str; + StdStringCanonOutput output(&out_str); + bool success = CanonicalizePathURL(path_cases[i].input, url_len, parsed, + &output, &out_parsed); + output.Complete(); + + EXPECT_TRUE(success); + EXPECT_EQ(path_cases[i].expected, out_str); + + EXPECT_EQ(0, out_parsed.host.begin); + EXPECT_EQ(-1, out_parsed.host.len); + + // When we end with a colon at the end, there should be no path. 
+ if (path_cases[i].input[url_len - 1] == ':') { + EXPECT_EQ(0, out_parsed.GetContent().begin); + EXPECT_EQ(-1, out_parsed.GetContent().len); + } + } +} + +TEST(URLCanonTest, CanonicalizePathURLPath) { + struct PathCase { + std::string input; + std::wstring input16; + std::string expected; + } path_cases[] = { + {"Foo", L"Foo", "Foo"}, + {"\":This /is interesting;?#", L"\":This /is interesting;?#", + "\":This /is interesting;?#"}, + {"\uFFFF", L"\uFFFF", "%EF%BF%BD"}, + }; + + for (size_t i = 0; i < std::size(path_cases); i++) { + // 8-bit string input + std::string out_str; + StdStringCanonOutput output(&out_str); + url::Component out_component; + CanonicalizePathURLPath(path_cases[i].input.data(), + Component(0, path_cases[i].input.size()), &output, + &out_component); + output.Complete(); + + EXPECT_EQ(path_cases[i].expected, out_str); + + EXPECT_EQ(0, out_component.begin); + EXPECT_EQ(path_cases[i].expected.size(), + static_cast(out_component.len)); + + // 16-bit string input + std::string out_str16; + StdStringCanonOutput output16(&out_str16); + url::Component out_component16; + std::u16string input16( + test_utils::TruncateWStringToUTF16(path_cases[i].input16.data())); + CanonicalizePathURLPath(input16.c_str(), + Component(0, path_cases[i].input16.size()), + &output16, &out_component16); + output16.Complete(); + + EXPECT_EQ(path_cases[i].expected, out_str16); + + EXPECT_EQ(0, out_component16.begin); + EXPECT_EQ(path_cases[i].expected.size(), + static_cast(out_component16.len)); + } +} + +TEST(URLCanonTest, CanonicalizeMailtoURL) { + struct URLCase { + const char* input; + const char* expected; + bool expected_success; + Component expected_path; + Component expected_query; + } cases[] = { + // Null character should be escaped to %00. + // Keep this test first in the list as it is handled specially below. 
+ {"mailto:addr1\0addr2?foo", + "mailto:addr1%00addr2?foo", + true, Component(7, 13), Component(21, 3)}, + {"mailto:addr1", + "mailto:addr1", + true, Component(7, 5), Component()}, + {"mailto:addr1@foo.com", + "mailto:addr1@foo.com", + true, Component(7, 13), Component()}, + // Trailing whitespace is stripped. + {"MaIlTo:addr1 \t ", + "mailto:addr1", + true, Component(7, 5), Component()}, + {"MaIlTo:addr1?to=jon", + "mailto:addr1?to=jon", + true, Component(7, 5), Component(13,6)}, + {"mailto:addr1,addr2", + "mailto:addr1,addr2", + true, Component(7, 11), Component()}, + // Embedded spaces must be encoded. + {"mailto:addr1, addr2", + "mailto:addr1,%20addr2", + true, Component(7, 14), Component()}, + {"mailto:addr1, addr2?subject=one two ", + "mailto:addr1,%20addr2?subject=one%20two", + true, Component(7, 14), Component(22, 17)}, + {"mailto:addr1%2caddr2", + "mailto:addr1%2caddr2", + true, Component(7, 13), Component()}, + {"mailto:\xF0\x90\x8C\x80", + "mailto:%F0%90%8C%80", + true, Component(7, 12), Component()}, + // Invalid -- UTF-8 encoded surrogate value. + {"mailto:\xed\xa0\x80", + "mailto:%EF%BF%BD%EF%BF%BD%EF%BF%BD", + false, Component(7, 27), Component()}, + {"mailto:addr1?", + "mailto:addr1?", + true, Component(7, 5), Component(13, 0)}, + // Certain characters have special meanings and must be encoded. + {"mailto:! \x22$&()+,-./09:;<=>@AZ[\\]&_`az{|}~\x7f?Query! \x22$&()+,-./09:;<=>@AZ[\\]&_`az{|}~", + "mailto:!%20%22$&()+,-./09:;%3C=%3E@AZ[\\]&_%60az%7B%7C%7D~%7F?Query!%20%22$&()+,-./09:;%3C=%3E@AZ[\\]&_`az{|}~", + true, Component(7, 53), Component(61, 47)}, + }; + + // Define outside of loop to catch bugs where components aren't reset + Parsed parsed; + Parsed out_parsed; + + for (size_t i = 0; i < std::size(cases); i++) { + int url_len = static_cast(strlen(cases[i].input)); + if (i == 0) { + // The first test case purposely has a '\0' in it -- don't count it + // as the string terminator. 
+ url_len = 22; + } + ParseMailtoURL(cases[i].input, url_len, &parsed); + + std::string out_str; + StdStringCanonOutput output(&out_str); + bool success = CanonicalizeMailtoURL(cases[i].input, url_len, parsed, + &output, &out_parsed); + output.Complete(); + + EXPECT_EQ(cases[i].expected_success, success); + EXPECT_EQ(cases[i].expected, out_str); + + // Make sure the spec was properly identified + EXPECT_EQ(0, out_parsed.scheme.begin); + EXPECT_EQ(6, out_parsed.scheme.len); + + EXPECT_EQ(cases[i].expected_path.begin, out_parsed.path.begin); + EXPECT_EQ(cases[i].expected_path.len, out_parsed.path.len); + + EXPECT_EQ(cases[i].expected_query.begin, out_parsed.query.begin); + EXPECT_EQ(cases[i].expected_query.len, out_parsed.query.len); + } +} + +#ifndef WIN32 + +TEST(URLCanonTest, _itoa_s) { + // We fill the buffer with 0xff to ensure that it's getting properly + // null-terminated. We also allocate one byte more than what we tell + // _itoa_s about, and ensure that the extra byte is untouched. + char buf[6]; + memset(buf, 0xff, sizeof(buf)); + EXPECT_EQ(0, _itoa_s(12, buf, sizeof(buf) - 1, 10)); + EXPECT_STREQ("12", buf); + EXPECT_EQ('\xFF', buf[3]); + + // Test the edge cases - exactly the buffer size and one over + memset(buf, 0xff, sizeof(buf)); + EXPECT_EQ(0, _itoa_s(1234, buf, sizeof(buf) - 1, 10)); + EXPECT_STREQ("1234", buf); + EXPECT_EQ('\xFF', buf[5]); + + memset(buf, 0xff, sizeof(buf)); + EXPECT_EQ(EINVAL, _itoa_s(12345, buf, sizeof(buf) - 1, 10)); + EXPECT_EQ('\xFF', buf[5]); // should never write to this location + + // Test the template overload (note that this will see the full buffer) + memset(buf, 0xff, sizeof(buf)); + EXPECT_EQ(0, _itoa_s(12, buf, 10)); + EXPECT_STREQ("12", buf); + EXPECT_EQ('\xFF', buf[3]); + + memset(buf, 0xff, sizeof(buf)); + EXPECT_EQ(0, _itoa_s(12345, buf, 10)); + EXPECT_STREQ("12345", buf); + + EXPECT_EQ(EINVAL, _itoa_s(123456, buf, 10)); + + // Test that radix 16 is supported. 
+ memset(buf, 0xff, sizeof(buf)); + EXPECT_EQ(0, _itoa_s(1234, buf, sizeof(buf) - 1, 16)); + EXPECT_STREQ("4d2", buf); + EXPECT_EQ('\xFF', buf[5]); +} + +TEST(URLCanonTest, _itow_s) { + // We fill the buffer with 0xff to ensure that it's getting properly + // null-terminated. We also allocate one byte more than what we tell + // _itoa_s about, and ensure that the extra byte is untouched. + char16_t buf[6]; + const char fill_mem = 0xff; + const char16_t fill_char = 0xffff; + memset(buf, fill_mem, sizeof(buf)); + EXPECT_EQ(0, _itow_s(12, buf, sizeof(buf) / 2 - 1, 10)); + EXPECT_EQ(u"12", std::u16string(buf)); + EXPECT_EQ(fill_char, buf[3]); + + // Test the edge cases - exactly the buffer size and one over + EXPECT_EQ(0, _itow_s(1234, buf, sizeof(buf) / 2 - 1, 10)); + EXPECT_EQ(u"1234", std::u16string(buf)); + EXPECT_EQ(fill_char, buf[5]); + + memset(buf, fill_mem, sizeof(buf)); + EXPECT_EQ(EINVAL, _itow_s(12345, buf, sizeof(buf) / 2 - 1, 10)); + EXPECT_EQ(fill_char, buf[5]); // should never write to this location + + // Test the template overload (note that this will see the full buffer) + memset(buf, fill_mem, sizeof(buf)); + EXPECT_EQ(0, _itow_s(12, buf, 10)); + EXPECT_EQ(u"12", std::u16string(buf)); + EXPECT_EQ(fill_char, buf[3]); + + memset(buf, fill_mem, sizeof(buf)); + EXPECT_EQ(0, _itow_s(12345, buf, 10)); + EXPECT_EQ(u"12345", std::u16string(buf)); + + EXPECT_EQ(EINVAL, _itow_s(123456, buf, 10)); +} + +#endif // !WIN32 + +// Returns true if the given two structures are the same. 
+static bool ParsedIsEqual(const Parsed& a, const Parsed& b) { + return a.scheme.begin == b.scheme.begin && a.scheme.len == b.scheme.len && + a.username.begin == b.username.begin && a.username.len == b.username.len && + a.password.begin == b.password.begin && a.password.len == b.password.len && + a.host.begin == b.host.begin && a.host.len == b.host.len && + a.port.begin == b.port.begin && a.port.len == b.port.len && + a.path.begin == b.path.begin && a.path.len == b.path.len && + a.query.begin == b.query.begin && a.query.len == b.query.len && + a.ref.begin == b.ref.begin && a.ref.len == b.ref.len; +} + +TEST(URLCanonTest, ResolveRelativeURL) { + struct RelativeCase { + const char* base; // Input base URL: MUST BE CANONICAL + bool is_base_hier; // Is the base URL hierarchical + bool is_base_file; // Tells us if the base is a file URL. + const char* test; // Input URL to test against. + bool succeed_relative; // Whether we expect IsRelativeURL to succeed + bool is_rel; // Whether we expect |test| to be relative or not. + bool succeed_resolve; // Whether we expect ResolveRelativeURL to succeed. + const char* resolved; // What we expect in the result when resolving. + } rel_cases[] = { + // Basic absolute input. + {"http://host/a", true, false, "http://another/", true, false, false, NULL}, + {"http://host/a", true, false, "http:////another/", true, false, false, NULL}, + // Empty relative URLs should only remove the ref part of the URL, + // leaving the rest unchanged. + {"http://foo/bar", true, false, "", true, true, true, "http://foo/bar"}, + {"http://foo/bar#ref", true, false, "", true, true, true, "http://foo/bar"}, + {"http://foo/bar#", true, false, "", true, true, true, "http://foo/bar"}, + // Spaces at the ends of the relative path should be ignored. + {"http://foo/bar", true, false, " another ", true, true, true, "http://foo/another"}, + {"http://foo/bar", true, false, " . 
", true, true, true, "http://foo/"}, + {"http://foo/bar", true, false, " \t ", true, true, true, "http://foo/bar"}, + // Matching schemes without two slashes are treated as relative. + {"http://host/a", true, false, "http:path", true, true, true, "http://host/path"}, + {"http://host/a/", true, false, "http:path", true, true, true, "http://host/a/path"}, + {"http://host/a", true, false, "http:/path", true, true, true, "http://host/path"}, + {"http://host/a", true, false, "HTTP:/path", true, true, true, "http://host/path"}, + // Nonmatching schemes are absolute. + {"http://host/a", true, false, "https:host2", true, false, false, NULL}, + {"http://host/a", true, false, "htto:/host2", true, false, false, NULL}, + // Absolute path input + {"http://host/a", true, false, "/b/c/d", true, true, true, "http://host/b/c/d"}, + {"http://host/a", true, false, "\\b\\c\\d", true, true, true, "http://host/b/c/d"}, + {"http://host/a", true, false, "/b/../c", true, true, true, "http://host/c"}, + {"http://host/a?b#c", true, false, "/b/../c", true, true, true, "http://host/c"}, + {"http://host/a", true, false, "\\b/../c?x#y", true, true, true, "http://host/c?x#y"}, + {"http://host/a?b#c", true, false, "/b/../c?x#y", true, true, true, "http://host/c?x#y"}, + // Relative path input + {"http://host/a", true, false, "b", true, true, true, "http://host/b"}, + {"http://host/a", true, false, "bc/de", true, true, true, "http://host/bc/de"}, + {"http://host/a/", true, false, "bc/de?query#ref", true, true, true, "http://host/a/bc/de?query#ref"}, + {"http://host/a/", true, false, ".", true, true, true, "http://host/a/"}, + {"http://host/a/", true, false, "..", true, true, true, "http://host/"}, + {"http://host/a/", true, false, "./..", true, true, true, "http://host/"}, + {"http://host/a/", true, false, "../.", true, true, true, "http://host/"}, + {"http://host/a/", true, false, "././.", true, true, true, "http://host/a/"}, + {"http://host/a?query#ref", true, false, "../../../foo", true, true, 
true, "http://host/foo"}, + // Query input + {"http://host/a", true, false, "?foo=bar", true, true, true, "http://host/a?foo=bar"}, + {"http://host/a?x=y#z", true, false, "?", true, true, true, "http://host/a?"}, + {"http://host/a?x=y#z", true, false, "?foo=bar#com", true, true, true, "http://host/a?foo=bar#com"}, + // Ref input + {"http://host/a", true, false, "#ref", true, true, true, "http://host/a#ref"}, + {"http://host/a#b", true, false, "#", true, true, true, "http://host/a#"}, + {"http://host/a?foo=bar#hello", true, false, "#bye", true, true, true, "http://host/a?foo=bar#bye"}, + // Non-hierarchical base: no relative handling. Relative input should + // error, and if a scheme is present, it should be treated as absolute. + {"data:foobar", false, false, "baz.html", false, false, false, NULL}, + {"data:foobar", false, false, "data:baz", true, false, false, NULL}, + {"data:foobar", false, false, "data:/base", true, false, false, NULL}, + // Non-hierarchical base: absolute input should succeed. + {"data:foobar", false, false, "http://host/", true, false, false, NULL}, + {"data:foobar", false, false, "http:host", true, false, false, NULL}, + // Non-hierarchical base: empty URL should give error. + {"data:foobar", false, false, "", false, false, false, NULL}, + // Invalid schemes should be treated as relative. 
+ {"http://foo/bar", true, false, "./asd:fgh", true, true, true, "http://foo/asd:fgh"}, + {"http://foo/bar", true, false, ":foo", true, true, true, "http://foo/:foo"}, + {"http://foo/bar", true, false, " hello world", true, true, true, "http://foo/hello%20world"}, + {"data:asdf", false, false, ":foo", false, false, false, NULL}, + {"data:asdf", false, false, "bad(':foo')", false, false, false, NULL}, + // We should treat semicolons like any other character in URL resolving + {"http://host/a", true, false, ";foo", true, true, true, "http://host/;foo"}, + {"http://host/a;", true, false, ";foo", true, true, true, "http://host/;foo"}, + {"http://host/a", true, false, ";/../bar", true, true, true, "http://host/bar"}, + // Relative URLs can also be written as "//foo/bar" which is relative to + // the scheme. In this case, it would take the old scheme, so for http + // the example would resolve to "http://foo/bar". + {"http://host/a", true, false, "//another", true, true, true, "http://another/"}, + {"http://host/a", true, false, "//another/path?query#ref", true, true, true, "http://another/path?query#ref"}, + {"http://host/a", true, false, "///another/path", true, true, true, "http://another/path"}, + {"http://host/a", true, false, "//Another\\path", true, true, true, "http://another/path"}, + {"http://host/a", true, false, "//", true, true, false, "http:"}, + // IE will also allow one or the other to be a backslash to get the same + // behavior. + {"http://host/a", true, false, "\\/another/path", true, true, true, "http://another/path"}, + {"http://host/a", true, false, "/\\Another\\path", true, true, true, "http://another/path"}, +#ifdef WIN32 + // Resolving against Windows file base URLs. 
+ {"file:///C:/foo", true, true, "http://host/", true, false, false, NULL}, + {"file:///C:/foo", true, true, "bar", true, true, true, "file:///C:/bar"}, + {"file:///C:/foo", true, true, "../../../bar.html", true, true, true, "file:///C:/bar.html"}, + {"file:///C:/foo", true, true, "/../bar.html", true, true, true, "file:///C:/bar.html"}, + // But two backslashes on Windows should be UNC so should be treated + // as absolute. + {"http://host/a", true, false, "\\\\another\\path", true, false, false, NULL}, + // IE doesn't support drive specs starting with two slashes. It fails + // immediately and doesn't even try to load. We fix it up to either + // an absolute path or UNC depending on what it looks like. + {"file:///C:/something", true, true, "//c:/foo", true, true, true, "file:///C:/foo"}, + {"file:///C:/something", true, true, "//localhost/c:/foo", true, true, true, "file:///C:/foo"}, + // Windows drive specs should be allowed and treated as absolute. + {"file:///C:/foo", true, true, "c:", true, false, false, NULL}, + {"file:///C:/foo", true, true, "c:/foo", true, false, false, NULL}, + {"http://host/a", true, false, "c:\\foo", true, false, false, NULL}, + // Relative paths with drive letters should be allowed when the base is + // also a file. + {"file:///C:/foo", true, true, "/z:/bar", true, true, true, "file:///Z:/bar"}, + // Treat absolute paths as being off of the drive. + {"file:///C:/foo", true, true, "/bar", true, true, true, "file:///C:/bar"}, + {"file://localhost/C:/foo", true, true, "/bar", true, true, true, "file://localhost/C:/bar"}, + {"file:///C:/foo/com/", true, true, "/bar", true, true, true, "file:///C:/bar"}, + // On Windows, two slashes without a drive letter when the base is a file + // means that the path is UNC. 
+ {"file:///C:/something", true, true, "//somehost/path", true, true, true, "file://somehost/path"}, + {"file:///C:/something", true, true, "/\\//somehost/path", true, true, true, "file://somehost/path"}, +#else + // On Unix we fall back to relative behavior since there's nothing else + // reasonable to do. + {"http://host/a", true, false, "\\\\Another\\path", true, true, true, "http://another/path"}, +#endif + // Even on Windows, we don't allow relative drive specs when the base + // is not file. + {"http://host/a", true, false, "/c:\\foo", true, true, true, "http://host/c:/foo"}, + {"http://host/a", true, false, "//c:\\foo", true, true, true, "http://c/foo"}, + // Cross-platform relative file: resolution behavior. + {"file://host/a", true, true, "/", true, true, true, "file://host/"}, + {"file://host/a", true, true, "//", true, true, true, "file:///"}, + {"file://host/a", true, true, "/b", true, true, true, "file://host/b"}, + {"file://host/a", true, true, "//b", true, true, true, "file://b/"}, + // Ensure that ports aren't allowed for hosts relative to a file url. + // Although the result string shows a host:port portion, the call to + // resolve the relative URL returns false, indicating parse failure, + // which is what is required. + {"file:///foo.txt", true, true, "//host:80/bar.txt", true, true, false, "file://host:80/bar.txt"}, + // Filesystem URL tests; filesystem URLs are only valid and relative if + // they have no scheme, e.g. "./index.html". There's no valid equivalent + // to http:index.html. 
+ {"filesystem:http://host/t/path", true, false, "filesystem:http://host/t/path2", true, false, false, NULL}, + {"filesystem:http://host/t/path", true, false, "filesystem:https://host/t/path2", true, false, false, NULL}, + {"filesystem:http://host/t/path", true, false, "http://host/t/path2", true, false, false, NULL}, + {"http://host/t/path", true, false, "filesystem:http://host/t/path2", true, false, false, NULL}, + {"filesystem:http://host/t/path", true, false, "./path2", true, true, true, "filesystem:http://host/t/path2"}, + {"filesystem:http://host/t/path/", true, false, "path2", true, true, true, "filesystem:http://host/t/path/path2"}, + {"filesystem:http://host/t/path", true, false, "filesystem:http:path2", true, false, false, NULL}, + // Absolute URLs are still not relative to a non-standard base URL. + {"about:blank", false, false, "http://X/A", true, false, true, ""}, + {"about:blank", false, false, "content://content.Provider/", true, false, true, ""}, + }; + + for (size_t i = 0; i < std::size(rel_cases); i++) { + const RelativeCase& cur_case = rel_cases[i]; + + Parsed parsed; + int base_len = static_cast(strlen(cur_case.base)); + if (cur_case.is_base_file) + ParseFileURL(cur_case.base, base_len, &parsed); + else if (cur_case.is_base_hier) + ParseStandardURL(cur_case.base, base_len, &parsed); + else + ParsePathURL(cur_case.base, base_len, false, &parsed); + + // First see if it is relative. + int test_len = static_cast(strlen(cur_case.test)); + bool is_relative; + Component relative_component; + bool succeed_is_rel = IsRelativeURL( + cur_case.base, parsed, cur_case.test, test_len, cur_case.is_base_hier, + &is_relative, &relative_component); + + EXPECT_EQ(cur_case.succeed_relative, succeed_is_rel) << + "succeed is rel failure on " << cur_case.test; + EXPECT_EQ(cur_case.is_rel, is_relative) << + "is rel failure on " << cur_case.test; + // Now resolve it. 
+ if (succeed_is_rel && is_relative && cur_case.is_rel) { + std::string resolved; + StdStringCanonOutput output(&resolved); + Parsed resolved_parsed; + + bool succeed_resolve = ResolveRelativeURL( + cur_case.base, parsed, cur_case.is_base_file, cur_case.test, + relative_component, NULL, &output, &resolved_parsed); + output.Complete(); + + EXPECT_EQ(cur_case.succeed_resolve, succeed_resolve); + EXPECT_EQ(cur_case.resolved, resolved) << " on " << cur_case.test; + + // Verify that the output parsed structure is the same as parsing + // the URL freshly. + Parsed ref_parsed; + int resolved_len = static_cast(resolved.size()); + if (cur_case.is_base_file) { + ParseFileURL(resolved.c_str(), resolved_len, &ref_parsed); + } else if (cur_case.is_base_hier) { + ParseStandardURL(resolved.c_str(), resolved_len, &ref_parsed); + } else { + ParsePathURL(resolved.c_str(), resolved_len, false, &ref_parsed); + } + EXPECT_TRUE(ParsedIsEqual(ref_parsed, resolved_parsed)); + } + } +} + +// It used to be the case that when we did a replacement with a long buffer of +// UTF-16 characters, we would get invalid data in the URL. This is because the +// buffer that it used to hold the UTF-8 data was resized, while some pointers +// were still kept to the old buffer that was removed. +TEST(URLCanonTest, ReplacementOverflow) { + const char src[] = "file:///C:/foo/bar"; + int src_len = static_cast(strlen(src)); + Parsed parsed; + ParseFileURL(src, src_len, &parsed); + + // Override two components, the path with something short, and the query with + // something long enough to trigger the bug. + Replacements repl; + std::u16string new_query; + for (int i = 0; i < 4800; i++) + new_query.push_back('a'); + + std::u16string new_path(test_utils::TruncateWStringToUTF16(L"/foo")); + repl.SetPath(new_path.c_str(), Component(0, 4)); + repl.SetQuery(new_query.c_str(), + Component(0, static_cast(new_query.length()))); + + // Call ReplaceComponents on the string.
It doesn't matter if we call it for + // standard URLs, file URLs, etc, since they will go to the same replacement + // function that was buggy. + Parsed repl_parsed; + std::string repl_str; + StdStringCanonOutput repl_output(&repl_str); + ReplaceFileURL(src, parsed, repl, NULL, &repl_output, &repl_parsed); + repl_output.Complete(); + + // Generate the expected string and check. + std::string expected("file:///foo?"); + for (size_t i = 0; i < new_query.length(); i++) + expected.push_back('a'); + EXPECT_TRUE(expected == repl_str); +} + +TEST(URLCanonTest, DefaultPortForScheme) { + struct TestCases { + const char* scheme; + const int expected_port; + } cases[]{ + {"http", 80}, + {"https", 443}, + {"ftp", 21}, + {"ws", 80}, + {"wss", 443}, + {"fake-scheme", PORT_UNSPECIFIED}, + {"HTTP", PORT_UNSPECIFIED}, + {"HTTPS", PORT_UNSPECIFIED}, + {"FTP", PORT_UNSPECIFIED}, + {"WS", PORT_UNSPECIFIED}, + {"WSS", PORT_UNSPECIFIED}, + }; + + for (auto& test_case : cases) { + SCOPED_TRACE(test_case.scheme); + EXPECT_EQ(test_case.expected_port, + DefaultPortForScheme(test_case.scheme, strlen(test_case.scheme))); + } +} + +TEST(URLCanonTest, FindWindowsDriveLetter) { + struct TestCase { + base::StringPiece spec; + int begin; + int end; // -1 for end of spec + int expected_drive_letter_pos; + } cases[] = { + {"/", 0, -1, -1}, + + {"c:/foo", 0, -1, 0}, + {"/c:/foo", 0, -1, 1}, + {"//c:/foo", 0, -1, -1}, // "//" does not canonicalize to "/" + {"\\C|\\foo", 0, -1, 1}, + {"/cd:/foo", 0, -1, -1}, // "/c" does not canonicalize to "/" + {"/./c:/foo", 0, -1, 3}, + {"/.//c:/foo", 0, -1, -1}, // "/.//" does not canonicalize to "/" + {"/././c:/foo", 0, -1, 5}, + {"/abc/c:/foo", 0, -1, -1}, // "/abc/" does not canonicalize to "/" + {"/abc/./../c:/foo", 0, -1, 10}, + + {"/c:/c:/foo", 3, -1, 4}, // actual input is "/c:/foo" + {"/c:/foo", 3, -1, -1}, // actual input is "/foo" + {"/c:/foo", 0, 1, -1}, // actual input is "/" + }; + + for (const auto& c : cases) { + int end = c.end; + if (end == -1) + 
end = c.spec.size(); + + EXPECT_EQ(c.expected_drive_letter_pos, + FindWindowsDriveLetter(c.spec.data(), c.begin, end)) + << "for " << c.spec << "[" << c.begin << ":" << end << "] (UTF-8)"; + + std::u16string spec16 = base::ASCIIToUTF16(c.spec); + EXPECT_EQ(c.expected_drive_letter_pos, + FindWindowsDriveLetter(spec16.data(), c.begin, end)) + << "for " << c.spec << "[" << c.begin << ":" << end << "] (UTF-16)"; + } +} + +TEST(URLCanonTest, IDNToASCII) { + RawCanonOutputW<1024> output; + + // Basic ASCII test. + std::u16string str = u"hello"; + EXPECT_TRUE(IDNToASCII(str.data(), str.length(), &output)); + EXPECT_EQ(u"hello", std::u16string(output.data())); + output.set_length(0); + + // Mixed ASCII/non-ASCII. + str = u"hellö"; + EXPECT_TRUE(IDNToASCII(str.data(), str.length(), &output)); + EXPECT_EQ(u"xn--hell-8qa", std::u16string(output.data())); + output.set_length(0); + + // All non-ASCII. + str = u"你好"; + EXPECT_TRUE(IDNToASCII(str.data(), str.length(), &output)); + EXPECT_EQ(u"xn--6qq79v", std::u16string(output.data())); + output.set_length(0); + + // Characters that need mapping (the resulting Punycode is the encoding for + // "1⁄4"). + str = u"¼"; + EXPECT_TRUE(IDNToASCII(str.data(), str.length(), &output)); + EXPECT_EQ(u"xn--14-c6t", std::u16string(output.data())); + output.set_length(0); + + // String to encode already starts with "xn--", and all ASCII. Should not + // modify the string. + str = u"xn--hell-8qa"; + EXPECT_TRUE(IDNToASCII(str.data(), str.length(), &output)); + EXPECT_EQ(u"xn--hell-8qa", std::u16string(output.data())); + output.set_length(0); + + // String to encode already starts with "xn--", and mixed ASCII/non-ASCII. + // Should fail, due to a special case: if the label starts with "xn--", it + // should be parsed as Punycode, which must be all ASCII. + str = u"xn--hellö"; + EXPECT_FALSE(IDNToASCII(str.data(), str.length(), &output)); + output.set_length(0); + + // String to encode already starts with "xn--", and mixed ASCII/non-ASCII. 
+ // This tests that there is still an error for the character '⁄' (U+2044), + // which would be a valid ASCII character, U+0044, if the high byte were + // ignored. + str = u"xn--1⁄4"; + EXPECT_FALSE(IDNToASCII(str.data(), str.length(), &output)); + output.set_length(0); +} + +} // namespace url diff --git a/url_constants.cc b/url_constants.cc new file mode 100644 index 00000000000..850a31ce22d --- /dev/null +++ b/url_constants.cc @@ -0,0 +1,61 @@ +// Copyright 2014 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "url/url_constants.h" + +namespace url { + +const char kAboutBlankURL[] = "about:blank"; +const char16_t kAboutBlankURL16[] = u"about:blank"; +const char kAboutSrcdocURL[] = "about:srcdoc"; +const char16_t kAboutSrcdocURL16[] = u"about:srcdoc"; + +const char kAboutBlankPath[] = "blank"; +const char16_t kAboutBlankPath16[] = u"blank"; +const char kAboutSrcdocPath[] = "srcdoc"; +const char16_t kAboutSrcdocPath16[] = u"srcdoc"; + +const char kAboutScheme[] = "about"; +const char16_t kAboutScheme16[] = u"about"; +const char kBlobScheme[] = "blob"; +const char16_t kBlobScheme16[] = u"blob"; +const char kContentScheme[] = "content"; +const char16_t kContentScheme16[] = u"content"; +const char kContentIDScheme[] = "cid"; +const char16_t kContentIDScheme16[] = u"cid"; +const char kDataScheme[] = "data"; +const char16_t kDataScheme16[] = u"data"; +const char kFileScheme[] = "file"; +const char16_t kFileScheme16[] = u"file"; +const char kFileSystemScheme[] = "filesystem"; +const char16_t kFileSystemScheme16[] = u"filesystem"; +const char kFtpScheme[] = "ftp"; +const char16_t kFtpScheme16[] = u"ftp"; +const char kHttpScheme[] = "http"; +const char16_t kHttpScheme16[] = u"http"; +const char kHttpsScheme[] = "https"; +const char16_t kHttpsScheme16[] = u"https"; +const char kJavaScriptScheme[] = "javascript"; +const char16_t kJavaScriptScheme16[] = u"javascript"; +const char 
kMailToScheme[] = "mailto"; +const char16_t kMailToScheme16[] = u"mailto"; +const char kTelScheme[] = "tel"; +const char16_t kTelScheme16[] = u"tel"; +const char kUrnScheme[] = "urn"; +const char16_t kUrnScheme16[] = u"urn"; +const char kUuidInPackageScheme[] = "uuid-in-package"; +const char16_t kUuidInPackageScheme16[] = u"uuid-in-package"; +const char kWebcalScheme[] = "webcal"; +const char16_t kWebcalScheme16[] = u"webcal"; +const char kWsScheme[] = "ws"; +const char16_t kWsScheme16[] = u"ws"; +const char kWssScheme[] = "wss"; +const char16_t kWssScheme16[] = u"wss"; + +const char kStandardSchemeSeparator[] = "://"; +const char16_t kStandardSchemeSeparator16[] = u"://"; + +const size_t kMaxURLChars = 2 * 1024 * 1024; + +} // namespace url diff --git a/url_constants.h b/url_constants.h new file mode 100644 index 00000000000..5eda4e89f25 --- /dev/null +++ b/url_constants.h @@ -0,0 +1,70 @@ +// Copyright 2014 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#ifndef URL_URL_CONSTANTS_H_ +#define URL_URL_CONSTANTS_H_ + +#include + +#include "base/component_export.h" + +namespace url { + +COMPONENT_EXPORT(URL) extern const char kAboutBlankURL[]; +COMPONENT_EXPORT(URL) extern const char16_t kAboutBlankURL16[]; +COMPONENT_EXPORT(URL) extern const char kAboutSrcdocURL[]; +COMPONENT_EXPORT(URL) extern const char16_t kAboutSrcdocURL16[]; + +COMPONENT_EXPORT(URL) extern const char kAboutBlankPath[]; +COMPONENT_EXPORT(URL) extern const char16_t kAboutBlankPath16[]; +COMPONENT_EXPORT(URL) extern const char kAboutSrcdocPath[]; +COMPONENT_EXPORT(URL) extern const char16_t kAboutSrcdocPath16[]; + +COMPONENT_EXPORT(URL) extern const char kAboutScheme[]; +COMPONENT_EXPORT(URL) extern const char16_t kAboutScheme16[]; +COMPONENT_EXPORT(URL) extern const char kBlobScheme[]; +COMPONENT_EXPORT(URL) extern const char16_t kBlobScheme16[]; +// The content scheme is specific to Android for identifying a stored file. +COMPONENT_EXPORT(URL) extern const char kContentScheme[]; +COMPONENT_EXPORT(URL) extern const char16_t kContentScheme16[]; +COMPONENT_EXPORT(URL) extern const char kContentIDScheme[]; +COMPONENT_EXPORT(URL) extern const char16_t kContentIDScheme16[]; +COMPONENT_EXPORT(URL) extern const char kDataScheme[]; +COMPONENT_EXPORT(URL) extern const char16_t kDataScheme16[]; +COMPONENT_EXPORT(URL) extern const char kFileScheme[]; +COMPONENT_EXPORT(URL) extern const char16_t kFileScheme16[]; +COMPONENT_EXPORT(URL) extern const char kFileSystemScheme[]; +COMPONENT_EXPORT(URL) extern const char16_t kFileSystemScheme16[]; +COMPONENT_EXPORT(URL) extern const char kFtpScheme[]; +COMPONENT_EXPORT(URL) extern const char16_t kFtpScheme16[]; +COMPONENT_EXPORT(URL) extern const char kHttpScheme[]; +COMPONENT_EXPORT(URL) extern const char16_t kHttpScheme16[]; +COMPONENT_EXPORT(URL) extern const char kHttpsScheme[]; +COMPONENT_EXPORT(URL) extern const char16_t kHttpsScheme16[]; +COMPONENT_EXPORT(URL) extern const char kJavaScriptScheme[]; 
+COMPONENT_EXPORT(URL) extern const char16_t kJavaScriptScheme16[]; +COMPONENT_EXPORT(URL) extern const char kMailToScheme[]; +COMPONENT_EXPORT(URL) extern const char16_t kMailToScheme16[]; +COMPONENT_EXPORT(URL) extern const char kTelScheme[]; +COMPONENT_EXPORT(URL) extern const char16_t kTelScheme16[]; +COMPONENT_EXPORT(URL) extern const char kUrnScheme[]; +COMPONENT_EXPORT(URL) extern const char16_t kUrnScheme16[]; +COMPONENT_EXPORT(URL) extern const char kUuidInPackageScheme[]; +COMPONENT_EXPORT(URL) extern const char16_t kUuidInPackageScheme16[]; +COMPONENT_EXPORT(URL) extern const char kWebcalScheme[]; +COMPONENT_EXPORT(URL) extern const char16_t kWebcalScheme16[]; +COMPONENT_EXPORT(URL) extern const char kWsScheme[]; +COMPONENT_EXPORT(URL) extern const char16_t kWsScheme16[]; +COMPONENT_EXPORT(URL) extern const char kWssScheme[]; +COMPONENT_EXPORT(URL) extern const char16_t kWssScheme16[]; + +// Used to separate a standard scheme and the hostname: "://". +COMPONENT_EXPORT(URL) extern const char kStandardSchemeSeparator[]; +COMPONENT_EXPORT(URL) extern const char16_t kStandardSchemeSeparator16[]; + +COMPONENT_EXPORT(URL) extern const size_t kMaxURLChars; + +} // namespace url + +#endif // URL_URL_CONSTANTS_H_ diff --git a/url_features.cc b/url_features.cc new file mode 100644 index 00000000000..8f38ff257a8 --- /dev/null +++ b/url_features.cc @@ -0,0 +1,35 @@ +// Copyright 2022 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "url/url_features.h" + +namespace url { + +BASE_FEATURE(kUseIDNA2008NonTransitional, + "UseIDNA2008NonTransitional", + base::FEATURE_ENABLED_BY_DEFAULT); + +// Kill switch for crbug.com/1362507. 
+BASE_FEATURE(kRecordIDNA2008Metrics, + "RecordIDNA2008Metrics", + base::FEATURE_ENABLED_BY_DEFAULT); + +BASE_FEATURE(kStrictIPv4EmbeddedIPv6AddressParsing, + "StrictIPv4EmbeddedIPv6AddressParsing", + base::FEATURE_DISABLED_BY_DEFAULT); + +// Kill switch for crbug.com/1220361. +BASE_FEATURE(kResolveBareFragmentWithColonOnNonHierarchical, + "ResolveBareFragmentWithColonOnNonHierarchical", + base::FEATURE_ENABLED_BY_DEFAULT); + +bool IsUsingIDNA2008NonTransitional() { + return base::FeatureList::IsEnabled(kUseIDNA2008NonTransitional); +} + +bool IsRecordingIDNA2008Metrics() { + return base::FeatureList::IsEnabled(kRecordIDNA2008Metrics); +} + +} // namespace url diff --git a/url_features.h b/url_features.h new file mode 100644 index 00000000000..e95752141f3 --- /dev/null +++ b/url_features.h @@ -0,0 +1,33 @@ +// Copyright 2022 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef URL_URL_FEATURES_H_ +#define URL_URL_FEATURES_H_ + +#include "base/component_export.h" +#include "base/feature_list.h" + +namespace url { + +COMPONENT_EXPORT(URL) BASE_DECLARE_FEATURE(kUseIDNA2008NonTransitional); + +// Returns true if Chrome is using IDNA 2008 in Non-Transitional mode. +COMPONENT_EXPORT(URL) bool IsUsingIDNA2008NonTransitional(); + +// Returns true if Chrome is recording IDNA 2008 related metrics. +COMPONENT_EXPORT(URL) bool IsRecordingIDNA2008Metrics(); + +// When enabled, Chrome enforces the 4 part check for IPv4 embedded IPv6 +// addresses. +COMPONENT_EXPORT(URL) +BASE_DECLARE_FEATURE(kStrictIPv4EmbeddedIPv6AddressParsing); + +// When enabled, allows resolving of a bare fragment containing a colon against +// a non-hierarchical URL. (For example '#foo:bar' against 'about:blank'.)
+COMPONENT_EXPORT(URL) +BASE_DECLARE_FEATURE(kResolveBareFragmentWithColonOnNonHierarchical); + +} // namespace url + +#endif // URL_URL_FEATURES_H_ diff --git a/url_file.h b/url_file.h new file mode 100644 index 00000000000..65ce98ac404 --- /dev/null +++ b/url_file.h @@ -0,0 +1,101 @@ +// Copyright 2013 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef URL_URL_FILE_H_ +#define URL_URL_FILE_H_ + +// Provides shared functions used by the internals of the parser and +// canonicalizer for file URLs. Do not use outside of these modules. + +#include "base/strings/string_util.h" +#include "url/url_parse_internal.h" + +namespace url { + +// We allow both "c:" and "c|" as drive identifiers. +inline bool IsWindowsDriveSeparator(char16_t ch) { + return ch == ':' || ch == '|'; +} +inline bool IsWindowsDriveSeparator(char ch) { + return IsWindowsDriveSeparator(static_cast(ch)); +} + +// Returns the index of the first slash in the input at or after the given +// index, or spec_len if the end of the input is reached. +template +inline int FindNextSlash(const CHAR* spec, int begin_index, int spec_len) { + int idx = begin_index; + while (idx < spec_len && !IsURLSlash(spec[idx])) + idx++; + return idx; +} + +// DoesContainWindowsDriveSpecUntil returns the least number between +// start_offset and max_offset such that the spec has a valid drive +// specification starting at that offset. Otherwise it returns -1. This function +// gracefully handles, by returning -1, start_offset values that are equal to or +// larger than the spec_len, and caps max_offset appropriately to simplify +// callers. max_offset must be at least start_offset. +template +inline int DoesContainWindowsDriveSpecUntil(const CHAR* spec, + int start_offset, + int max_offset, + int spec_len) { + CHECK_LE(start_offset, max_offset); + if (start_offset > spec_len - 2) + return -1; // Not enough room.
+ if (max_offset > spec_len - 2) + max_offset = spec_len - 2; + for (int offset = start_offset; offset <= max_offset; ++offset) { + if (!base::IsAsciiAlpha(spec[offset])) + continue; // Doesn't contain a valid drive letter. + if (!IsWindowsDriveSeparator(spec[offset + 1])) + continue; // Isn't followed with a drive separator. + return offset; + } + return -1; +} + +// Returns true if the start_offset in the given spec looks like it begins a +// drive spec, for example "c:". This function explicitly handles start_offset +// values that are equal to or larger than the spec_len to simplify callers. +// +// If this returns true, the spec is guaranteed to have a valid drive letter +// plus a drive letter separator (a colon or a pipe) starting at |start_offset|. +template +inline bool DoesBeginWindowsDriveSpec(const CHAR* spec, + int start_offset, + int spec_len) { + return DoesContainWindowsDriveSpecUntil(spec, start_offset, start_offset, + spec_len) == start_offset; +} + +#ifdef WIN32 + +// Returns true if the start_offset in the given text looks like it begins a +// UNC path, for example "\\". This function explicitly handles start_offset +// values that are equal to or larger than the spec_len to simplify callers. +// +// When strict_slashes is set, this function will only accept backslashes as is +// standard for Windows. Otherwise, it will accept forward slashes as well +// which we use for a lot of URL handling. 
+template +inline bool DoesBeginUNCPath(const CHAR* text, + int start_offset, + int len, + bool strict_slashes) { + int remaining_len = len - start_offset; + if (remaining_len < 2) + return false; + + if (strict_slashes) + return text[start_offset] == '\\' && text[start_offset + 1] == '\\'; + return IsURLSlash(text[start_offset]) && IsURLSlash(text[start_offset + 1]); +} + +#endif // WIN32 + +} // namespace url + +#endif // URL_URL_FILE_H_ diff --git a/url_idna_icu.cc b/url_idna_icu.cc new file mode 100644 index 00000000000..0a552a88013 --- /dev/null +++ b/url_idna_icu.cc @@ -0,0 +1,144 @@ +// Copyright 2013 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// ICU-based IDNA converter. + +#include +#include +#include + +#include + +#include "base/check_op.h" +#include "third_party/icu/source/common/unicode/uidna.h" +#include "third_party/icu/source/common/unicode/utypes.h" +#include "url/url_canon_icu.h" +#include "url/url_canon_internal.h" // for _itoa_s +#include "url/url_features.h" + +namespace url { + +namespace { + +// Use UIDNA, a C pointer to a UTS46/IDNA 2008 handling object opened with +// uidna_openUTS46(). +// +// We use UTS46 with BiDiCheck to migrate from IDNA 2003 (with unassigned +// code points allowed) to IDNA 2008 with the backward compatibility in mind. +// What it does: +// +// 1. Use the up-to-date Unicode data. +// 2. Define a case folding/mapping with the up-to-date Unicode data as +// in IDNA 2003. +// 3. If `use_idna_non_transitional` is true, use non-transitional mechanism for +// 4 deviation characters (sharp-s, final sigma, ZWJ and ZWNJ) per +// url.spec.whatwg.org. +// 4. Continue to allow symbols and punctuations. +// 5. Apply new BiDi check rules more permissive than the IDNA 2003 BiDI rules. +// 6. Do not apply STD3 rules +// 7. Do not allow unassigned code points. 
+// +// It also closely matches what IE 10 does except for the BiDi check ( +// http://goo.gl/3XBhqw ). +// See http://unicode.org/reports/tr46/ and references therein +// for more details. +UIDNA* CreateIDNA(bool use_idna_non_transitional) { + uint32_t options = UIDNA_CHECK_BIDI; + if (use_idna_non_transitional) { + // Use non-transitional processing if enabled. See + // https://url.spec.whatwg.org/#idna for details. + options |= + UIDNA_NONTRANSITIONAL_TO_ASCII | UIDNA_NONTRANSITIONAL_TO_UNICODE; + } + UErrorCode err = U_ZERO_ERROR; + UIDNA* idna = uidna_openUTS46(options, &err); + if (U_FAILURE(err)) { + CHECK(false) << "failed to open UTS46 data with error: " << u_errorName(err) + << ". If you see this error message in a test environment " + << "your test environment likely lacks the required data " + << "tables for libicu. See https://crbug.com/778929."; + idna = nullptr; + } + return idna; +} + +UIDNA* GetUIDNA() { + // This logic results in having two UIDNA instances in tests. This is okay. + if (IsUsingIDNA2008NonTransitional()) { + static UIDNA* uidna = CreateIDNA(/*use_idna_non_transitional=*/true); + return uidna; + } else { + static UIDNA* uidna = CreateIDNA(/*use_idna_non_transitional=*/false); + return uidna; + } +} + +} // namespace + +// Converts the Unicode input representing a hostname to ASCII using IDN rules. +// The output must be ASCII, but is represented as wide characters. +// +// On success, the output will be filled with the ASCII host name and it will +// return true. Unlike most other canonicalization functions, this assumes that +// the output is empty. The beginning of the host will be at offset 0, and +// the length of the output will be set to the length of the new host name. +// +// On error, this will return false. The output in this case is undefined. +// TODO(jungshik): use UTF-8/ASCII version of nameToASCII. +// Change the function signature and callers accordingly to avoid unnecessary +// conversions in our code.
In addition, consider using icu::IDNA's UTF-8/ASCII +// version with StringByteSink. That way, we can avoid C wrappers and additional +// string conversion. +bool IDNToASCII(const char16_t* src, int src_len, CanonOutputW* output) { + DCHECK(output->length() == 0); // Output buffer is assumed empty. + + UIDNA* uidna = GetUIDNA(); + DCHECK(uidna != nullptr); + while (true) { + UErrorCode err = U_ZERO_ERROR; + UIDNAInfo info = UIDNA_INFO_INITIALIZER; + int output_length = uidna_nameToASCII(uidna, src, src_len, output->data(), + output->capacity(), &info, &err); + + // Ignore various errors for web compatibility. The options are specified + // by the WHATWG URL Standard. See + // - https://unicode.org/reports/tr46/ + // - https://url.spec.whatwg.org/#concept-domain-to-ascii + // (we set beStrict to false) + + // Disable the "CheckHyphens" option in UTS #46. See + // - https://crbug.com/804688 + // - https://github.com/whatwg/url/issues/267 + info.errors &= ~UIDNA_ERROR_HYPHEN_3_4; + info.errors &= ~UIDNA_ERROR_LEADING_HYPHEN; + info.errors &= ~UIDNA_ERROR_TRAILING_HYPHEN; + + // Disable the "VerifyDnsLength" option in UTS #46. + info.errors &= ~UIDNA_ERROR_EMPTY_LABEL; + info.errors &= ~UIDNA_ERROR_LABEL_TOO_LONG; + info.errors &= ~UIDNA_ERROR_DOMAIN_NAME_TOO_LONG; + + if (U_SUCCESS(err) && info.errors == 0) { + // Per WHATWG URL, it is a failure if the ToASCII output is empty. + // + // ICU would usually return UIDNA_ERROR_EMPTY_LABEL in this case, but we + // want to continue allowing http://abc..def/ while forbidding http:///. + // + if (output_length == 0) { + return false; + } + + output->set_length(output_length); + return true; + } + + if (err != U_BUFFER_OVERFLOW_ERROR || info.errors != 0) + return false; // Unknown error, give up. + + // Not enough room in our buffer, expand. 
+ output->Resize(output_length); + } +} + +} // namespace url diff --git a/url_idna_icu_alternatives_android.cc b/url_idna_icu_alternatives_android.cc new file mode 100644 index 00000000000..9faf5710f90 --- /dev/null +++ b/url_idna_icu_alternatives_android.cc @@ -0,0 +1,40 @@ +// Copyright 2014 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include + +#include + +#include "base/android/jni_android.h" +#include "base/android/jni_string.h" +#include "base/strings/string_piece.h" +#include "url/url_canon_internal.h" +#include "url/url_jni_headers/IDNStringUtil_jni.h" + +using base::android::ScopedJavaLocalRef; + +namespace url { + +// This uses the JDK's conversion function, which uses IDNA 2003, unlike the +// ICU implementation. +bool IDNToASCII(const char16_t* src, int src_len, CanonOutputW* output) { + DCHECK_EQ(0u, output->length()); // Output buffer is assumed empty. + + JNIEnv* env = base::android::AttachCurrentThread(); + base::android::ScopedJavaLocalRef java_src = + base::android::ConvertUTF16ToJavaString( + env, base::StringPiece16(src, src_len)); + ScopedJavaLocalRef java_result = + android::Java_IDNStringUtil_idnToASCII(env, java_src); + // NULL indicates failure. + if (java_result.is_null()) + return false; + + std::u16string utf16_result = + base::android::ConvertJavaStringToUTF16(java_result); + output->Append(utf16_result.data(), utf16_result.size()); + return true; +} + +} // namespace url diff --git a/url_idna_icu_alternatives_ios.mm b/url_idna_icu_alternatives_ios.mm new file mode 100644 index 00000000000..d604b351632 --- /dev/null +++ b/url_idna_icu_alternatives_ios.mm @@ -0,0 +1,28 @@ +// Copyright 2016 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#include + +#include +#include + +#include "base/strings/string_piece.h" +#include "base/strings/string_util.h" +#include "base/strings/utf_string_conversions.h" +#include "url/url_canon_internal.h" + +namespace url { + +// Only allow ASCII to avoid ICU dependency. Use NSString+IDN +// to convert non-ASCII URL prior to passing to API. +bool IDNToASCII(const char16_t* src, int src_len, CanonOutputW* output) { + if (base::IsStringASCII(base::StringPiece16(src, src_len))) { + output->Append(src, src_len); + return true; + } + DCHECK(false) << "IDN URL support is not available."; + return false; +} + +} // namespace url diff --git a/url_parse_file.cc b/url_parse_file.cc new file mode 100644 index 00000000000..979ec82b97a --- /dev/null +++ b/url_parse_file.cc @@ -0,0 +1,198 @@ +// Copyright 2013 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "base/check.h" +#include "url/third_party/mozilla/url_parse.h" +#include "url/url_file.h" +#include "url/url_parse_internal.h" + +// Interesting IE file:isms... +// +// INPUT OUTPUT +// ========================= ============================== +// file:/foo/bar file:///foo/bar +// The result here seems totally invalid!?!? This isn't UNC. +// +// file:/ +// file:// or any other number of slashes +// IE6 doesn't do anything at all if you click on this link. No error: +// nothing. IE6's history system seems to always color this link, so I'm +// guessing that it maps internally to the empty URL. +// +// C:\ file:///C:/ +// When on a file: URL source page, this link will work. When over HTTP, +// the file: URL will appear in the status bar but the link will not work +// (security restriction for all file URLs). +// +// file:foo/ file:foo/ (invalid?!?!?) +// file:/foo/ file:///foo/ (invalid?!?!?) 
+// file://foo/ file://foo/ (UNC to server "foo") +// file:///foo/ file:///foo/ (invalid, seems to be a file) +// file:////foo/ file://foo/ (UNC to server "foo") +// Any more than four slashes is also treated as UNC. +// +// file:C:/ file://C:/ +// file:/C:/ file://C:/ +// The number of slashes after "file:" doesn't matter if the thing following +// it looks like an absolute drive path. Also, slashes and backslashes are +// equally valid here. + +namespace url { + +namespace { + +// A subcomponent of DoParseFileURL, the input of this function should be a UNC +// path name, with the index of the first character after the slashes following +// the scheme given in |after_slashes|. This will initialize the host, path, +// query, and ref, and leave the other output components untouched +// (DoParseFileURL handles these for us). +template +void DoParseUNC(const CHAR* spec, + int after_slashes, + int spec_len, + Parsed* parsed) { + int next_slash = FindNextSlash(spec, after_slashes, spec_len); + + // Everything up until that first slash we found (or end of string) is the + // host name, which will end up being the UNC host. For example, + // "file://foo/bar.txt" will get a server name of "foo" and a path of "/bar.txt". + // Later, on Windows, this should be treated as the filename "\\foo\bar.txt" + // in proper UNC notation. + if (after_slashes < next_slash) + parsed->host = MakeRange(after_slashes, next_slash); + else + parsed->host.reset(); + if (next_slash < spec_len) { + ParsePathInternal(spec, MakeRange(next_slash, spec_len), + &parsed->path, &parsed->query, &parsed->ref); + } else { + parsed->path.reset(); + } +} + +// A subcomponent of DoParseFileURL, the input should be a local file, with the +// beginning of the path indicated by the index in |path_begin|. This will +// initialize the host, path, query, and ref, and leave the other output +// components untouched (DoParseFileURL handles these for us).
+template +void DoParseLocalFile(const CHAR* spec, + int path_begin, + int spec_len, + Parsed* parsed) { + parsed->host.reset(); + ParsePathInternal(spec, MakeRange(path_begin, spec_len), + &parsed->path, &parsed->query, &parsed->ref); +} + +// Backend for the external functions that operates on either char type. +// Handles cases where there is a scheme, but also when handed the first +// character following the "file:" at the beginning of the spec. If so, +// this is usually a slash, but needn't be; we allow paths like "file:c:\foo". +template +void DoParseFileURL(const CHAR* spec, int spec_len, Parsed* parsed) { + DCHECK(spec_len >= 0); + + // Get the parts we never use for file URLs out of the way. + parsed->username.reset(); + parsed->password.reset(); + parsed->port.reset(); + + // Many of the code paths don't set these, so it's convenient to just clear + // them. We'll write them in those cases we need them. + parsed->query.reset(); + parsed->ref.reset(); + + // Strip leading & trailing spaces and control characters. + int begin = 0; + TrimURL(spec, &begin, &spec_len); + + // Find the scheme, if any. + int num_slashes = CountConsecutiveSlashes(spec, begin, spec_len); + int after_scheme; + int after_slashes; +#ifdef WIN32 + // See how many slashes there are. We want to handle cases like UNC but also + // "/c:/foo". This is when there is no scheme, so we can allow pages to do + // links like "c:/foo/bar" or "//foo/bar". This is also called by the + // relative URL resolver when it determines there is an absolute URL, which + // may give us input like "/c:/foo". + after_slashes = begin + num_slashes; + if (DoesBeginWindowsDriveSpec(spec, after_slashes, spec_len)) { + // Windows path, don't try to extract the scheme (for example, "c:\foo"). + parsed->scheme.reset(); + after_scheme = after_slashes; + } else if (DoesBeginUNCPath(spec, begin, spec_len, false)) { + // Windows UNC path: don't try to extract the scheme, but keep the slashes. 
+ parsed->scheme.reset(); + after_scheme = begin; + } else +#endif + { + // ExtractScheme doesn't understand the possibility of filenames with + // colons in them, in which case it returns the entire spec up to the + // colon as the scheme. So handle /foo.c:5 as a file but foo.c:5 as + // the foo.c: scheme. + if (!num_slashes && + ExtractScheme(&spec[begin], spec_len - begin, &parsed->scheme)) { + // Offset the results since we gave ExtractScheme a substring. + parsed->scheme.begin += begin; + after_scheme = parsed->scheme.end() + 1; + } else { + // No scheme found, remember that. + parsed->scheme.reset(); + after_scheme = begin; + } + } + + // Handle empty specs, ones that contain only whitespace or control chars, + // or ones that are just the scheme (for example "file:"). + if (after_scheme == spec_len) { + parsed->host.reset(); + parsed->path.reset(); + return; + } + + num_slashes = CountConsecutiveSlashes(spec, after_scheme, spec_len); + after_slashes = after_scheme + num_slashes; +#ifdef WIN32 + // Check whether the input is a drive again. We checked above for windows + // drive specs, but that's only at the very beginning to see if we have a + // scheme at all. This test will be duplicated in that case, but will + // additionally handle all cases with a real scheme such as "file:///C:/". + if (!DoesBeginWindowsDriveSpec(spec, after_slashes, spec_len) && + num_slashes != 3) { + // Anything not beginning with a drive spec ("c:\") on Windows is treated + // as UNC, with the exception of three slashes which always means a file. + // Even IE7 treats file:///foo/bar as "/foo/bar", which then fails. + DoParseUNC(spec, after_slashes, spec_len, parsed); + return; + } +#else + // file: URL with exactly 2 slashes is considered to have a host component.
+ if (num_slashes == 2) { + DoParseUNC(spec, after_slashes, spec_len, parsed); + return; + } +#endif // WIN32 + + // Easy and common case, the full path immediately follows the scheme + // (modulo slashes), as in "file://c:/foo". Just treat everything from + // there to the end as the path. Empty hosts have 0 length instead of -1. + // We include the last slash as part of the path if there is one. + DoParseLocalFile(spec, + num_slashes > 0 ? after_scheme + num_slashes - 1 : after_scheme, + spec_len, parsed); +} + +} // namespace + +void ParseFileURL(const char* url, int url_len, Parsed* parsed) { + DoParseFileURL(url, url_len, parsed); +} + +void ParseFileURL(const char16_t* url, int url_len, Parsed* parsed) { + DoParseFileURL(url, url_len, parsed); +} + +} // namespace url diff --git a/url_parse_internal.h b/url_parse_internal.h new file mode 100644 index 00000000000..a73f13b184c --- /dev/null +++ b/url_parse_internal.h @@ -0,0 +1,96 @@ +// Copyright 2013 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef URL_URL_PARSE_INTERNAL_H_ +#define URL_URL_PARSE_INTERNAL_H_ + +// Contains common inline helper functions used by the URL parsing routines. + +#include "url/third_party/mozilla/url_parse.h" + +namespace url { + +// We treat slashes and backslashes the same for IE compatibility. +inline bool IsURLSlash(char16_t ch) { + return ch == '/' || ch == '\\'; +} +inline bool IsURLSlash(char ch) { + return IsURLSlash(static_cast(ch)); +} + +// Returns true if we should trim this character from the URL because it is a +// space or a control character. +inline bool ShouldTrimFromURL(char16_t ch) { + return ch <= ' '; +} +inline bool ShouldTrimFromURL(char ch) { + return ShouldTrimFromURL(static_cast(ch)); +} + +// Given an already-initialized begin index and length, this shrinks the range +// to eliminate "should-be-trimmed" characters. 
Note that the length does *not* +// indicate the length of untrimmed data from |*begin|, but rather the position +// in the input string (so the string starts at character |*begin| in the spec, +// and goes until |*len|). +template +inline void TrimURL(const CHAR* spec, int* begin, int* len, + bool trim_path_end = true) { + // Strip leading whitespace and control characters. + while (*begin < *len && ShouldTrimFromURL(spec[*begin])) + (*begin)++; + + if (trim_path_end) { + // Strip trailing whitespace and control characters. We need the + // |*len > *begin| test for when the input string is all blanks; we don't + // want to back up past the start of the input. + while (*len > *begin && ShouldTrimFromURL(spec[*len - 1])) + (*len)--; + } +} + +// Counts the number of consecutive slashes starting at the given offset +// in the given string of the given length. +template +inline int CountConsecutiveSlashes(const CHAR *str, + int begin_offset, int str_len) { + int count = 0; + while (begin_offset + count < str_len && + IsURLSlash(str[begin_offset + count])) + ++count; + return count; +} + +// Internal functions in url_parse.cc that parse the path, that is, everything +// following the authority section. The input is the range of everything +// following the authority section, and the output is the identified ranges. +// +// This is designed for the file URL parser or other consumers who may do +// special stuff at the beginning, but want regular path parsing, it just +// maps to the internal parsing function for paths. +void ParsePathInternal(const char* spec, + const Component& path, + Component* filepath, + Component* query, + Component* ref); +void ParsePathInternal(const char16_t* spec, + const Component& path, + Component* filepath, + Component* query, + Component* ref); + +// Given a spec and the index of the character after the colon following the +// scheme, this parses it and fills in the structure. Every item in the parsed +// structure is filled EXCEPT for the scheme, which is untouched.
+void ParseAfterScheme(const char* spec, + int spec_len, + int after_scheme, + Parsed* parsed); +void ParseAfterScheme(const char16_t* spec, + int spec_len, + int after_scheme, + Parsed* parsed); + +} // namespace url + +#endif // URL_URL_PARSE_INTERNAL_H_ diff --git a/url_parse_perftest.cc b/url_parse_perftest.cc new file mode 100644 index 00000000000..7fe1d39b1e9 --- /dev/null +++ b/url_parse_perftest.cc @@ -0,0 +1,135 @@ +// Copyright 2006-2008 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "base/strings/string_piece.h" +#include "base/test/perf_time_logger.h" +#include "testing/gtest/include/gtest/gtest.h" +#include "url/gurl.h" +#include "url/third_party/mozilla/url_parse.h" +#include "url/url_canon.h" +#include "url/url_canon_stdstring.h" + +namespace { + +TEST(URLParse, FullURL) { + constexpr base::StringPiece kUrl = + "http://me:pass@host/foo/bar.html;param?query=yes#ref"; + + url::Parsed parsed; + base::PerfTimeLogger timer("Full_URL_Parse_AMillion"); + + for (int i = 0; i < 1000000; i++) + url::ParseStandardURL(kUrl.data(), kUrl.size(), &parsed); + timer.Done(); +} + +constexpr base::StringPiece kTypicalUrl1 = + "http://www.google.com/" + "search?q=url+parsing&ie=utf-8&oe=utf-8&aq=t&rls=org.mozilla:en-US:" + "official&client=firefox-a"; + +constexpr base::StringPiece kTypicalUrl2 = + "http://www.amazon.com/Stephen-King-Thrillers-Horror-People/dp/0766012336/" + "ref=sr_1_2/133-4144931-4505264?ie=UTF8&s=books&qid=2144880915&sr=8-2"; + +constexpr base::StringPiece kTypicalUrl3 = + "http://store.apple.com/1-800-MY-APPLE/WebObjects/AppleStore.woa/wa/" + "RSLID?nnmm=browse&mco=578E9744&node=home/desktop/mac_pro"; + +TEST(URLParse, TypicalURLParse) { + url::Parsed parsed1; + url::Parsed parsed2; + url::Parsed parsed3; + + // Do this 1/3 of a million times since we do 3 different URLs. 
+ base::PerfTimeLogger parse_timer("Typical_URL_Parse_AMillion"); + for (int i = 0; i < 333333; i++) { + url::ParseStandardURL(kTypicalUrl1.data(), kTypicalUrl1.size(), &parsed1); + url::ParseStandardURL(kTypicalUrl2.data(), kTypicalUrl2.size(), &parsed2); + url::ParseStandardURL(kTypicalUrl3.data(), kTypicalUrl3.size(), &parsed3); + } + parse_timer.Done(); +} + +// Includes both parsing and canonicalization with no mallocs. +TEST(URLParse, TypicalURLParseCanon) { + url::Parsed parsed1; + url::Parsed parsed2; + url::Parsed parsed3; + + base::PerfTimeLogger canon_timer("Typical_Parse_Canon_AMillion"); + url::Parsed out_parsed; + url::RawCanonOutput<1024> output; + for (int i = 0; i < 333333; i++) { // divide by 3 so we get 1M + url::ParseStandardURL(kTypicalUrl1.data(), kTypicalUrl1.size(), &parsed1); + output.set_length(0); + url::CanonicalizeStandardURL( + kTypicalUrl1.data(), kTypicalUrl1.size(), parsed1, + url::SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION, nullptr, &output, + &out_parsed); + + url::ParseStandardURL(kTypicalUrl2.data(), kTypicalUrl2.size(), &parsed2); + output.set_length(0); + url::CanonicalizeStandardURL( + kTypicalUrl2.data(), kTypicalUrl2.size(), parsed2, + url::SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION, nullptr, &output, + &out_parsed); + + url::ParseStandardURL(kTypicalUrl3.data(), kTypicalUrl3.size(), &parsed3); + output.set_length(0); + url::CanonicalizeStandardURL( + kTypicalUrl3.data(), kTypicalUrl3.size(), parsed3, + url::SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION, nullptr, &output, + &out_parsed); + } + canon_timer.Done(); +} + +// Includes both parsing and canonicalization, and mallocs for the output. 
+TEST(URLParse, TypicalURLParseCanonStdString) { + url::Parsed parsed1; + url::Parsed parsed2; + url::Parsed parsed3; + + base::PerfTimeLogger canon_timer("Typical_Parse_Canon_AMillion"); + url::Parsed out_parsed; + for (int i = 0; i < 333333; i++) { // divide by 3 so we get 1M + url::ParseStandardURL(kTypicalUrl1.data(), kTypicalUrl1.size(), &parsed1); + std::string out1; + url::StdStringCanonOutput output1(&out1); + url::CanonicalizeStandardURL( + kTypicalUrl1.data(), kTypicalUrl1.size(), parsed1, + url::SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION, nullptr, &output1, + &out_parsed); + + url::ParseStandardURL(kTypicalUrl2.data(), kTypicalUrl2.size(), &parsed2); + std::string out2; + url::StdStringCanonOutput output2(&out2); + url::CanonicalizeStandardURL( + kTypicalUrl2.data(), kTypicalUrl2.size(), parsed2, + url::SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION, nullptr, &output2, + &out_parsed); + + url::ParseStandardURL(kTypicalUrl3.data(), kTypicalUrl3.size(), &parsed3); + std::string out3; + url::StdStringCanonOutput output3(&out3); + url::CanonicalizeStandardURL( + kTypicalUrl3.data(), kTypicalUrl3.size(), parsed3, + url::SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION, nullptr, &output3, + &out_parsed); + } + canon_timer.Done(); +} + +TEST(URLParse, GURL) { + base::PerfTimeLogger gurl_timer("Typical_GURL_AMillion"); + for (int i = 0; i < 333333; i++) { // divide by 3 so we get 1M + GURL gurl1(kTypicalUrl1); + GURL gurl2(kTypicalUrl2); + GURL gurl3(kTypicalUrl3); + } + gurl_timer.Done(); +} + +} // namespace diff --git a/url_parse_unittest.cc b/url_parse_unittest.cc new file mode 100644 index 00000000000..88b6f05efb5 --- /dev/null +++ b/url_parse_unittest.cc @@ -0,0 +1,687 @@ +// Copyright 2013 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include + +#include "testing/gtest/include/gtest/gtest.h" +#include "url/third_party/mozilla/url_parse.h" + +// Interesting IE file:isms... 
+// +// file:/foo/bar file:///foo/bar +// The result here seems totally invalid!?!? This isn't UNC. +// +// file:/ +// file:// or any other number of slashes +// IE6 doesn't do anything at all if you click on this link. No error: +// nothing. IE6's history system seems to always color this link, so I'm +// guessing that it maps internally to the empty URL. +// +// C:\ file:///C:/ +// / file:///C:/ +// /foo file:///C:/foo +// Interestingly, IE treats "/" as an alias for "c:\", which makes sense, +// but is weird to think about on Windows. +// +// file:foo/ file:foo/ (invalid?!?!?) +// file:/foo/ file:///foo/ (invalid?!?!?) +// file://foo/ file://foo/ (UNC to server "foo") +// file:///foo/ file:///foo/ (invalid) +// file:////foo/ file://foo/ (UNC to server "foo") +// Any more than four slashes is also treated as UNC. +// +// file:C:/ file://C:/ +// file:/C:/ file://C:/ +// The number of slashes after "file:" don't matter if the thing following +// it looks like an absolute drive path. Also, slashes and backslashes are +// equally valid here. + +namespace url { +namespace { + +// Used for regular URL parse cases. +struct URLParseCase { + const char* input; + + const char* scheme; + const char* username; + const char* password; + const char* host; + int port; + const char* path; + const char* query; + const char* ref; +}; + +// Simpler version of URLParseCase for testing path URLs. +struct PathURLParseCase { + const char* input; + + const char* scheme; + const char* path; +}; + +// Simpler version of URLParseCase for testing mailto URLs. +struct MailtoURLParseCase { + const char* input; + + const char* scheme; + const char* path; + const char* query; +}; + +// More complicated version of URLParseCase for testing filesystem URLs. 
+struct FileSystemURLParseCase { + const char* input; + + const char* inner_scheme; + const char* inner_username; + const char* inner_password; + const char* inner_host; + int inner_port; + const char* inner_path; + const char* path; + const char* query; + const char* ref; +}; + +bool ComponentMatches(const char* input, + const char* reference, + const Component& component) { + // Check that the -1 sentinel is the only allowed negative value. + EXPECT_TRUE(component.is_valid() || component.len == -1); + + // Begin should be valid. + EXPECT_LE(0, component.begin); + + // A NULL reference means the component should be nonexistent. + if (!reference) + return component.len == -1; + if (!component.is_valid()) + return false; // Reference is not NULL but we don't have anything + + if (strlen(reference) != static_cast(component.len)) + return false; // Lengths don't match + + // Now check the actual characters. + return strncmp(reference, &input[component.begin], component.len) == 0; +} + +void ExpectInvalidComponent(const Component& component) { + EXPECT_EQ(0, component.begin); + EXPECT_EQ(-1, component.len); +} + +// Parsed ---------------------------------------------------------------------- + +TEST(URLParser, Length) { + const char* length_cases[] = { + // One with everything in it. + "http://user:pass@host:99/foo?bar#baz", + // One with nothing in it. + "", + // Working backwards, let's start taking off stuff from the full one. 
+ "http://user:pass@host:99/foo?bar#", + "http://user:pass@host:99/foo?bar", + "http://user:pass@host:99/foo?", + "http://user:pass@host:99/foo", + "http://user:pass@host:99/", + "http://user:pass@host:99", + "http://user:pass@host:", + "http://user:pass@host", + "http://host", + "http://user@", + "http:", + }; + for (size_t i = 0; i < std::size(length_cases); i++) { + int true_length = static_cast(strlen(length_cases[i])); + + Parsed parsed; + ParseStandardURL(length_cases[i], true_length, &parsed); + + EXPECT_EQ(true_length, parsed.Length()); + } +} + +TEST(URLParser, CountCharactersBefore) { + struct CountCase { + const char* url; + Parsed::ComponentType component; + bool include_delimiter; + int expected_count; + } count_cases[] = { + // Test each possibility in the case where all components are present. + // 0 1 2 + // 0123456789012345678901 + {"http://u:p@h:8/p?q#r", Parsed::SCHEME, true, 0}, + {"http://u:p@h:8/p?q#r", Parsed::SCHEME, false, 0}, + {"http://u:p@h:8/p?q#r", Parsed::USERNAME, true, 7}, + {"http://u:p@h:8/p?q#r", Parsed::USERNAME, false, 7}, + {"http://u:p@h:8/p?q#r", Parsed::PASSWORD, true, 9}, + {"http://u:p@h:8/p?q#r", Parsed::PASSWORD, false, 9}, + {"http://u:p@h:8/p?q#r", Parsed::HOST, true, 11}, + {"http://u:p@h:8/p?q#r", Parsed::HOST, false, 11}, + {"http://u:p@h:8/p?q#r", Parsed::PORT, true, 12}, + {"http://u:p@h:8/p?q#r", Parsed::PORT, false, 13}, + {"http://u:p@h:8/p?q#r", Parsed::PATH, false, 14}, + {"http://u:p@h:8/p?q#r", Parsed::PATH, true, 14}, + {"http://u:p@h:8/p?q#r", Parsed::QUERY, true, 16}, + {"http://u:p@h:8/p?q#r", Parsed::QUERY, false, 17}, + {"http://u:p@h:8/p?q#r", Parsed::REF, true, 18}, + {"http://u:p@h:8/p?q#r", Parsed::REF, false, 19}, + // Now test when the requested component is missing. 
+ {"http://u:p@h:8/p?", Parsed::REF, true, 17}, + {"http://u:p@h:8/p?q", Parsed::REF, true, 18}, + {"http://u:p@h:8/p#r", Parsed::QUERY, true, 16}, + {"http://u:p@h:8#r", Parsed::PATH, true, 14}, + {"http://u:p@h/", Parsed::PORT, true, 12}, + {"http://u:p@/", Parsed::HOST, true, 11}, + // This case is a little weird. It will report that the password would + // start where the host begins. This is arguably correct, although you + // could also argue that it should start at the '@' sign. Doing it + // starting with the '@' sign is actually harder, so we don't bother. + {"http://u@h/", Parsed::PASSWORD, true, 9}, + {"http://h/", Parsed::USERNAME, true, 7}, + {"http:", Parsed::USERNAME, true, 5}, + {"", Parsed::SCHEME, true, 0}, + // Make sure a random component still works when there's nothing there. + {"", Parsed::REF, true, 0}, + // File URLs are special with no host, so we test those. + {"file:///c:/foo", Parsed::USERNAME, true, 7}, + {"file:///c:/foo", Parsed::PASSWORD, true, 7}, + {"file:///c:/foo", Parsed::HOST, true, 7}, + {"file:///c:/foo", Parsed::PATH, true, 7}, + }; + for (size_t i = 0; i < std::size(count_cases); i++) { + int length = static_cast(strlen(count_cases[i].url)); + + // Simple test to distinguish file and standard URLs. 
+ Parsed parsed; + if (length > 0 && count_cases[i].url[0] == 'f') + ParseFileURL(count_cases[i].url, length, &parsed); + else + ParseStandardURL(count_cases[i].url, length, &parsed); + + int chars_before = parsed.CountCharactersBefore( + count_cases[i].component, count_cases[i].include_delimiter); + EXPECT_EQ(count_cases[i].expected_count, chars_before); + } +} + +// Standard -------------------------------------------------------------------- + +// Input Scheme Usrname Passwd Host Port Path Query Ref +// ------------------------------------ ------- ------- ---------- ------------ --- ---------- ------------ ----- +static URLParseCase cases[] = { + // Regular URL with all the parts +{"http://user:pass@foo:21/bar;par?b#c", "http", "user", "pass", "foo", 21, "/bar;par","b", "c"}, + + // Known schemes should lean towards authority identification +{"http:foo.com", "http", NULL, NULL, "foo.com", -1, NULL, NULL, NULL}, + + // Spaces! +{"\t :foo.com \n", "", NULL, NULL, "foo.com", -1, NULL, NULL, NULL}, +{" foo.com ", NULL, NULL, NULL, "foo.com", -1, NULL, NULL, NULL}, +{"a:\t foo.com", "a", NULL, NULL, "\t foo.com", -1, NULL, NULL, NULL}, +{"http://f:21/ b ? d # e ", "http", NULL, NULL, "f", 21, "/ b ", " d ", " e"}, + + // Invalid port numbers should be identified and turned into -2, empty port + // numbers should be -1. 
Spaces aren't allowed in port numbers +{"http://f:/c", "http", NULL, NULL, "f", -1, "/c", NULL, NULL}, +{"http://f:0/c", "http", NULL, NULL, "f", 0, "/c", NULL, NULL}, +{"http://f:00000000000000/c", "http", NULL, NULL, "f", 0, "/c", NULL, NULL}, +{"http://f:00000000000000000000080/c", "http", NULL, NULL, "f", 80, "/c", NULL, NULL}, +{"http://f:b/c", "http", NULL, NULL, "f", -2, "/c", NULL, NULL}, +{"http://f: /c", "http", NULL, NULL, "f", -2, "/c", NULL, NULL}, +{"http://f:\n/c", "http", NULL, NULL, "f", -2, "/c", NULL, NULL}, +{"http://f:fifty-two/c", "http", NULL, NULL, "f", -2, "/c", NULL, NULL}, +{"http://f:999999/c", "http", NULL, NULL, "f", -2, "/c", NULL, NULL}, +{"http://f: 21 / b ? d # e ", "http", NULL, NULL, "f", -2, "/ b ", " d ", " e"}, + + // Creative URLs missing key elements +{"", NULL, NULL, NULL, NULL, -1, NULL, NULL, NULL}, +{" \t", NULL, NULL, NULL, NULL, -1, NULL, NULL, NULL}, +{":foo.com/", "", NULL, NULL, "foo.com", -1, "/", NULL, NULL}, +{":foo.com\\", "", NULL, NULL, "foo.com", -1, "\\", NULL, NULL}, +{":", "", NULL, NULL, NULL, -1, NULL, NULL, NULL}, +{":a", "", NULL, NULL, "a", -1, NULL, NULL, NULL}, +{":/", "", NULL, NULL, NULL, -1, NULL, NULL, NULL}, +{":\\", "", NULL, NULL, NULL, -1, NULL, NULL, NULL}, +{":#", "", NULL, NULL, NULL, -1, NULL, NULL, ""}, +{"#", NULL, NULL, NULL, NULL, -1, NULL, NULL, ""}, +{"#/", NULL, NULL, NULL, NULL, -1, NULL, NULL, "/"}, +{"#\\", NULL, NULL, NULL, NULL, -1, NULL, NULL, "\\"}, +{"#;?", NULL, NULL, NULL, NULL, -1, NULL, NULL, ";?"}, +{"?", NULL, NULL, NULL, NULL, -1, NULL, "", NULL}, +{"/", NULL, NULL, NULL, NULL, -1, NULL, NULL, NULL}, +{":23", "", NULL, NULL, "23", -1, NULL, NULL, NULL}, +{"/:23", "/", NULL, NULL, "23", -1, NULL, NULL, NULL}, +{"//", NULL, NULL, NULL, NULL, -1, NULL, NULL, NULL}, +{"::", "", NULL, NULL, NULL, -1, NULL, NULL, NULL}, +{"::23", "", NULL, NULL, NULL, 23, NULL, NULL, NULL}, +{"foo://", "foo", NULL, NULL, NULL, -1, NULL, NULL, NULL}, + + // Username/passwords and things 
that look like them +{"http://a:b@c:29/d", "http", "a", "b", "c", 29, "/d", NULL, NULL}, +{"http::@c:29", "http", "", "", "c", 29, NULL, NULL, NULL}, + // ... "]" in the password field isn't allowed, but we tolerate it here... +{"http://&a:foo(b]c@d:2/", "http", "&a", "foo(b]c", "d", 2, "/", NULL, NULL}, +{"http://::@c@d:2", "http", "", ":@c", "d", 2, NULL, NULL, NULL}, +{"http://foo.com:b@d/", "http", "foo.com", "b", "d", -1, "/", NULL, NULL}, + +{"http://foo.com/\\@", "http", NULL, NULL, "foo.com", -1, "/\\@", NULL, NULL}, +{"http:\\\\foo.com\\", "http", NULL, NULL, "foo.com", -1, "\\", NULL, NULL}, +{"http:\\\\a\\b:c\\d@foo.com\\", "http", NULL, NULL, "a", -1, "\\b:c\\d@foo.com\\", NULL, NULL}, + + // Tolerate different numbers of slashes. +{"foo:/", "foo", NULL, NULL, NULL, -1, NULL, NULL, NULL}, +{"foo:/bar.com/", "foo", NULL, NULL, "bar.com", -1, "/", NULL, NULL}, +{"foo://///////", "foo", NULL, NULL, NULL, -1, NULL, NULL, NULL}, +{"foo://///////bar.com/", "foo", NULL, NULL, "bar.com", -1, "/", NULL, NULL}, +{"foo:////://///", "foo", NULL, NULL, NULL, -1, "/////", NULL, NULL}, + + // Raw file paths on Windows aren't handled by the parser. +{"c:/foo", "c", NULL, NULL, "foo", -1, NULL, NULL, NULL}, +{"//foo/bar", NULL, NULL, NULL, "foo", -1, "/bar", NULL, NULL}, + + // Use the first question mark for the query and the ref. +{"http://foo/path;a??e#f#g", "http", NULL, NULL, "foo", -1, "/path;a", "?e", "f#g"}, +{"http://foo/abcd?efgh?ijkl", "http", NULL, NULL, "foo", -1, "/abcd", "efgh?ijkl", NULL}, +{"http://foo/abcd#foo?bar", "http", NULL, NULL, "foo", -1, "/abcd", NULL, "foo?bar"}, + + // IPv6, check also interesting uses of colons. 
+{"[61:24:74]:98", "[61", NULL, NULL, "24:74]", 98, NULL, NULL, NULL}, +{"http://[61:27]:98", "http", NULL, NULL, "[61:27]", 98, NULL, NULL, NULL}, +{"http:[61:27]/:foo", "http", NULL, NULL, "[61:27]", -1, "/:foo", NULL, NULL}, +{"http://[1::2]:3:4", "http", NULL, NULL, "[1::2]:3", 4, NULL, NULL, NULL}, + + // Partially-complete IPv6 literals, and related cases. +{"http://2001::1", "http", NULL, NULL, "2001:", 1, NULL, NULL, NULL}, +{"http://[2001::1", "http", NULL, NULL, "[2001::1", -1, NULL, NULL, NULL}, +{"http://2001::1]", "http", NULL, NULL, "2001::1]", -1, NULL, NULL, NULL}, +{"http://2001::1]:80", "http", NULL, NULL, "2001::1]", 80, NULL, NULL, NULL}, +{"http://[2001::1]", "http", NULL, NULL, "[2001::1]", -1, NULL, NULL, NULL}, +{"http://[2001::1]:80", "http", NULL, NULL, "[2001::1]", 80, NULL, NULL, NULL}, +{"http://[[::]]", "http", NULL, NULL, "[[::]]", -1, NULL, NULL, NULL}, + +}; + +TEST(URLParser, Standard) { + // Declared outside for loop to try to catch cases in init() where we forget + // to reset something that is reset by the constructor. + Parsed parsed; + for (size_t i = 0; i < std::size(cases); i++) { + const char* url = cases[i].input; + ParseStandardURL(url, static_cast(strlen(url)), &parsed); + int port = ParsePort(url, parsed.port); + + EXPECT_TRUE(ComponentMatches(url, cases[i].scheme, parsed.scheme)); + EXPECT_TRUE(ComponentMatches(url, cases[i].username, parsed.username)); + EXPECT_TRUE(ComponentMatches(url, cases[i].password, parsed.password)); + EXPECT_TRUE(ComponentMatches(url, cases[i].host, parsed.host)); + EXPECT_EQ(cases[i].port, port); + EXPECT_TRUE(ComponentMatches(url, cases[i].path, parsed.path)); + EXPECT_TRUE(ComponentMatches(url, cases[i].query, parsed.query)); + EXPECT_TRUE(ComponentMatches(url, cases[i].ref, parsed.ref)); + } +} + +// PathURL -------------------------------------------------------------------- + +// Various incarnations of path URLs. 
+static PathURLParseCase path_cases[] = { +{"", NULL, NULL}, +{":", "", NULL}, +{":/", "", "/"}, +{"/", NULL, "/"}, +{" This is \\interesting// \t", NULL, "This is \\interesting// \t"}, +{"about:", "about", NULL}, +{"about:blank", "about", "blank"}, +{" about: blank ", "about", " blank "}, +{"javascript :alert(\"He:/l\\l#o?foo\"); ", "javascript ", "alert(\"He:/l\\l#o?foo\"); "}, +}; + +TEST(URLParser, PathURL) { + // Declared outside for loop to try to catch cases in init() where we forget + // to reset something that is reset by the constructor. + Parsed parsed; + for (size_t i = 0; i < std::size(path_cases); i++) { + const char* url = path_cases[i].input; + ParsePathURL(url, static_cast(strlen(url)), false, &parsed); + + EXPECT_TRUE(ComponentMatches(url, path_cases[i].scheme, parsed.scheme)) + << i; + EXPECT_TRUE(ComponentMatches(url, path_cases[i].path, parsed.GetContent())) + << i; + + // The remaining components are never used for path URLs. + ExpectInvalidComponent(parsed.username); + ExpectInvalidComponent(parsed.password); + ExpectInvalidComponent(parsed.host); + ExpectInvalidComponent(parsed.port); + } +} + +// Various incarnations of file URLs. +static URLParseCase file_cases[] = { +#ifdef WIN32 +{"file:server", "file", NULL, NULL, "server", -1, NULL, NULL, NULL}, +{" file: server \t", "file", NULL, NULL, " server",-1, NULL, NULL, NULL}, +{"FiLe:c|", "FiLe", NULL, NULL, NULL, -1, "c|", NULL, NULL}, +{"FILE:/\\\\/server/file", "FILE", NULL, NULL, "server", -1, "/file", NULL, NULL}, +{"file://server/", "file", NULL, NULL, "server", -1, "/", NULL, NULL}, +{"file://localhost/c:/", "file", NULL, NULL, "localhost", -1, "/c:/", NULL, NULL}, +{"file://127.0.0.1/c|\\", "file", NULL, NULL, "127.0.0.1", -1, "/c|\\", NULL, NULL}, +{"file:/", "file", NULL, NULL, NULL, -1, NULL, NULL, NULL}, +{"file:", "file", NULL, NULL, NULL, -1, NULL, NULL, NULL}, + // If there is a Windows drive letter, treat any number of slashes as the + // path part. 
+{"file:c:\\fo\\b", "file", NULL, NULL, NULL, -1, "c:\\fo\\b", NULL, NULL}, +{"file:/c:\\foo/bar", "file", NULL, NULL, NULL, -1, "/c:\\foo/bar",NULL, NULL}, +{"file://c:/f\\b", "file", NULL, NULL, NULL, -1, "/c:/f\\b", NULL, NULL}, +{"file:///C:/foo", "file", NULL, NULL, NULL, -1, "/C:/foo", NULL, NULL}, +{"file://///\\/\\/c:\\f\\b", "file", NULL, NULL, NULL, -1, "/c:\\f\\b", NULL, NULL}, + // If there is not a drive letter, we should treat it as UNC EXCEPT for + // three slashes, which we treat as a Unix style path. +{"file:server/file", "file", NULL, NULL, "server", -1, "/file", NULL, NULL}, +{"file:/server/file", "file", NULL, NULL, "server", -1, "/file", NULL, NULL}, +{"file://server/file", "file", NULL, NULL, "server", -1, "/file", NULL, NULL}, +{"file:///server/file", "file", NULL, NULL, NULL, -1, "/server/file",NULL, NULL}, +{"file://\\server/file", "file", NULL, NULL, NULL, -1, "\\server/file",NULL, NULL}, +{"file:////server/file", "file", NULL, NULL, "server", -1, "/file", NULL, NULL}, + // Queries and refs are valid for file URLs as well. +{"file:///C:/foo.html?#", "file", NULL, NULL, NULL, -1, "/C:/foo.html", "", ""}, +{"file:///C:/foo.html?query=yes#ref", "file", NULL, NULL, NULL, -1, "/C:/foo.html", "query=yes", "ref"}, +#else // WIN32 + // No slashes. + {"file:", "file", NULL, NULL, NULL, -1, NULL, NULL, NULL}, + {"file:path", "file", NULL, NULL, NULL, -1, "path", NULL, NULL}, + {"file:path/", "file", NULL, NULL, NULL, -1, "path/", NULL, NULL}, + {"file:path/f.txt", "file", NULL, NULL, NULL, -1, "path/f.txt", NULL, NULL}, + // One slash. + {"file:/", "file", NULL, NULL, NULL, -1, "/", NULL, NULL}, + {"file:/path", "file", NULL, NULL, NULL, -1, "/path", NULL, NULL}, + {"file:/path/", "file", NULL, NULL, NULL, -1, "/path/", NULL, NULL}, + {"file:/path/f.txt", "file", NULL, NULL, NULL, -1, "/path/f.txt", NULL, NULL}, + // Two slashes. 
+ {"file://", "file", NULL, NULL, NULL, -1, NULL, NULL, NULL}, + {"file://server", "file", NULL, NULL, "server", -1, NULL, NULL, NULL}, + {"file://server/", "file", NULL, NULL, "server", -1, "/", NULL, NULL}, + {"file://server/f.txt", "file", NULL, NULL, "server", -1, "/f.txt", NULL, NULL}, + // Three slashes. + {"file:///", "file", NULL, NULL, NULL, -1, "/", NULL, NULL}, + {"file:///path", "file", NULL, NULL, NULL, -1, "/path", NULL, NULL}, + {"file:///path/", "file", NULL, NULL, NULL, -1, "/path/", NULL, NULL}, + {"file:///path/f.txt", "file", NULL, NULL, NULL, -1, "/path/f.txt", NULL, NULL}, + // More than three slashes. + {"file:////", "file", NULL, NULL, NULL, -1, "/", NULL, NULL}, + {"file:////path", "file", NULL, NULL, NULL, -1, "/path", NULL, NULL}, + {"file:////path/", "file", NULL, NULL, NULL, -1, "/path/", NULL, NULL}, + {"file:////path/f.txt", "file", NULL, NULL, NULL, -1, "/path/f.txt", NULL, NULL}, + // Schemeless URLs + {"path/f.txt", NULL, NULL, NULL, NULL, -1, "path/f.txt", NULL, NULL}, + {"path:80/f.txt", "path", NULL, NULL, NULL, -1, "80/f.txt", NULL, NULL}, + {"path/f.txt:80", "path/f.txt",NULL, NULL, NULL, -1, "80", NULL, NULL}, // Wrong. 
+ {"/path/f.txt", NULL, NULL, NULL, NULL, -1, "/path/f.txt", NULL, NULL}, + {"/path:80/f.txt", NULL, NULL, NULL, NULL, -1, "/path:80/f.txt",NULL, NULL}, + {"/path/f.txt:80", NULL, NULL, NULL, NULL, -1, "/path/f.txt:80",NULL, NULL}, + {"//server/f.txt", NULL, NULL, NULL, "server", -1, "/f.txt", NULL, NULL}, + {"//server:80/f.txt", NULL, NULL, NULL, "server:80",-1, "/f.txt", NULL, NULL}, + {"//server/f.txt:80", NULL, NULL, NULL, "server", -1, "/f.txt:80", NULL, NULL}, + {"///path/f.txt", NULL, NULL, NULL, NULL, -1, "/path/f.txt", NULL, NULL}, + {"///path:80/f.txt", NULL, NULL, NULL, NULL, -1, "/path:80/f.txt",NULL, NULL}, + {"///path/f.txt:80", NULL, NULL, NULL, NULL, -1, "/path/f.txt:80",NULL, NULL}, + {"////path/f.txt", NULL, NULL, NULL, NULL, -1, "/path/f.txt", NULL, NULL}, + {"////path:80/f.txt", NULL, NULL, NULL, NULL, -1, "/path:80/f.txt",NULL, NULL}, + {"////path/f.txt:80", NULL, NULL, NULL, NULL, -1, "/path/f.txt:80",NULL, NULL}, + // Queries and refs are valid for file URLs as well. + {"file:///foo.html?#", "file", NULL, NULL, NULL, -1, "/foo.html", "", ""}, + {"file:///foo.html?q=y#ref", "file", NULL, NULL, NULL, -1, "/foo.html", "q=y", "ref"}, +#endif // WIN32 +}; + +TEST(URLParser, ParseFileURL) { + // Declared outside for loop to try to catch cases in init() where we forget + // to reset something that is reset by the constructor. 
+ Parsed parsed; + for (size_t i = 0; i < std::size(file_cases); i++) { + const char* url = file_cases[i].input; + ParseFileURL(url, static_cast<int>(strlen(url)), &parsed); + int port = ParsePort(url, parsed.port); + + EXPECT_TRUE(ComponentMatches(url, file_cases[i].scheme, parsed.scheme)) + << " for case #" << i << " [" << url << "] " + << parsed.scheme.begin << ", " << parsed.scheme.len; + + EXPECT_TRUE(ComponentMatches(url, file_cases[i].username, parsed.username)) + << " for case #" << i << " [" << url << "] " + << parsed.username.begin << ", " << parsed.username.len; + + EXPECT_TRUE(ComponentMatches(url, file_cases[i].password, parsed.password)) + << " for case #" << i << " [" << url << "] " + << parsed.password.begin << ", " << parsed.password.len; + + EXPECT_TRUE(ComponentMatches(url, file_cases[i].host, parsed.host)) + << " for case #" << i << " [" << url << "] " + << parsed.host.begin << ", " << parsed.host.len; + + EXPECT_EQ(file_cases[i].port, port) + << " for case #" << i << " [ " << url << "] " << port; + + EXPECT_TRUE(ComponentMatches(url, file_cases[i].path, parsed.path)) + << " for case #" << i << " [" << url << "] " + << parsed.path.begin << ", " << parsed.path.len; + + EXPECT_TRUE(ComponentMatches(url, file_cases[i].query, parsed.query)) + << " for case #" << i << " [" << url << "] " + << parsed.query.begin << ", " << parsed.query.len; + + EXPECT_TRUE(ComponentMatches(url, file_cases[i].ref, parsed.ref)) + << " for case #" << i << " [ "<< url << "] " + << parsed.ref.begin << ", " << parsed.ref.len; + } +} + + +TEST(URLParser, ExtractFileName) { + struct FileCase { + const char* input; + const char* expected; + } extract_cases[] = { + {"http://www.google.com", nullptr}, + {"http://www.google.com/", ""}, + {"http://www.google.com/search", "search"}, + {"http://www.google.com/search/", ""}, + {"http://www.google.com/foo/bar.html?baz=22", "bar.html"}, + {"http://www.google.com/foo/bar.html#ref", "bar.html"}, + {"http://www.google.com/search/;param", 
""}, + {"http://www.google.com/foo/bar.html;param#ref", "bar.html"}, + {"http://www.google.com/foo/bar.html;foo;param#ref", "bar.html"}, + {"http://www.google.com/foo/bar.html?query#ref", "bar.html"}, + {"http://www.google.com/foo;/bar.html", "bar.html"}, + {"http://www.google.com/foo;/", ""}, + {"http://www.google.com/foo;", "foo"}, + {"http://www.google.com/;", ""}, + {"http://www.google.com/foo;bar;html", "foo"}, + }; + + for (size_t i = 0; i < std::size(extract_cases); i++) { + const char* url = extract_cases[i].input; + int len = static_cast<int>(strlen(url)); + + Parsed parsed; + ParseStandardURL(url, len, &parsed); + + Component file_name; + ExtractFileName(url, parsed.path, &file_name); + + EXPECT_TRUE(ComponentMatches(url, extract_cases[i].expected, file_name)); + } +} + +// Returns true if parameter number |parameter| (1-based) of the given URL's +// query string has exactly |expected_key| and |expected_value|. Pass a NULL +// |expected_key| to assert that no parameter exists at that index. +static bool NthParameterIs(const char* url, + int parameter, + const char* expected_key, + const char* expected_value) { + Parsed parsed; + ParseStandardURL(url, static_cast<int>(strlen(url)), &parsed); + + Component query = parsed.query; + + for (int i = 1; i <= parameter; i++) { + Component key, value; + if (!ExtractQueryKeyValue(url, &query, &key, &value)) { + if (parameter >= i && !expected_key) + return true; // Caller expected no such parameter and there is none. + return false; // Not enough keys. + } + + if (i == parameter) { + if (!expected_key) + return false; + + if (strncmp(&url[key.begin], expected_key, key.len) != 0 || expected_key[key.len] != '\0') + return false; // Key differs or is only a prefix of the expectation. + if (strncmp(&url[value.begin], expected_value, value.len) != 0 || expected_value[value.len] != '\0') + return false; // Same exact-length check for the value. 
+ return true; + } + } + return expected_key == NULL; // We didn't find that many parameters. +} + +TEST(URLParser, ExtractQueryKeyValue) { + EXPECT_TRUE(NthParameterIs("http://www.google.com", 1, NULL, NULL)); + + // Basic case. 
+ char a[] = "http://www.google.com?arg1=1&arg2=2&bar"; + EXPECT_TRUE(NthParameterIs(a, 1, "arg1", "1")); + EXPECT_TRUE(NthParameterIs(a, 2, "arg2", "2")); + EXPECT_TRUE(NthParameterIs(a, 3, "bar", "")); + EXPECT_TRUE(NthParameterIs(a, 4, NULL, NULL)); + + // Empty param at the end. + char b[] = "http://www.google.com?foo=bar&"; + EXPECT_TRUE(NthParameterIs(b, 1, "foo", "bar")); + EXPECT_TRUE(NthParameterIs(b, 2, NULL, NULL)); + + // Empty param at the beginning. + char c[] = "http://www.google.com?&foo=bar"; + EXPECT_TRUE(NthParameterIs(c, 1, "", "")); + EXPECT_TRUE(NthParameterIs(c, 2, "foo", "bar")); + EXPECT_TRUE(NthParameterIs(c, 3, NULL, NULL)); + + // Empty key with value. + char d[] = "http://www.google.com?=foo"; + EXPECT_TRUE(NthParameterIs(d, 1, "", "foo")); + EXPECT_TRUE(NthParameterIs(d, 2, NULL, NULL)); + + // Empty value with key. + char e[] = "http://www.google.com?foo="; + EXPECT_TRUE(NthParameterIs(e, 1, "foo", "")); + EXPECT_TRUE(NthParameterIs(e, 2, NULL, NULL)); + + // Empty key and values. 
+ char f[] = "http://www.google.com?&&==&="; + EXPECT_TRUE(NthParameterIs(f, 1, "", "")); + EXPECT_TRUE(NthParameterIs(f, 2, "", "")); + EXPECT_TRUE(NthParameterIs(f, 3, "", "=")); + EXPECT_TRUE(NthParameterIs(f, 4, "", "")); + EXPECT_TRUE(NthParameterIs(f, 5, NULL, NULL)); +} + +// MailtoURL -------------------------------------------------------------------- + +static MailtoURLParseCase mailto_cases[] = { +//|input |scheme |path |query +{"mailto:foo@gmail.com", "mailto", "foo@gmail.com", NULL}, +{" mailto: to \t", "mailto", " to", NULL}, +{"mailto:addr1%2C%20addr2 ", "mailto", "addr1%2C%20addr2", NULL}, +{"Mailto:addr1, addr2 ", "Mailto", "addr1, addr2", NULL}, +{"mailto:addr1:addr2 ", "mailto", "addr1:addr2", NULL}, +{"mailto:?to=addr1,addr2", "mailto", NULL, "to=addr1,addr2"}, +{"mailto:?to=addr1%2C%20addr2", "mailto", NULL, "to=addr1%2C%20addr2"}, +{"mailto:addr1?to=addr2", "mailto", "addr1", "to=addr2"}, +{"mailto:?body=#foobar#", "mailto", NULL, "body=#foobar#",}, +{"mailto:#?body=#foobar#", "mailto", "#", "body=#foobar#"}, +}; + +TEST(URLParser, MailtoUrl) { + // Declared outside for loop to try to catch cases in init() where we forget + // to reset something that is reset by the constructor. + Parsed parsed; + for (size_t i = 0; i < std::size(mailto_cases); ++i) { + const char* url = mailto_cases[i].input; + ParseMailtoURL(url, static_cast(strlen(url)), &parsed); + int port = ParsePort(url, parsed.port); + + EXPECT_TRUE(ComponentMatches(url, mailto_cases[i].scheme, parsed.scheme)); + EXPECT_TRUE(ComponentMatches(url, mailto_cases[i].path, parsed.path)); + EXPECT_TRUE(ComponentMatches(url, mailto_cases[i].query, parsed.query)); + EXPECT_EQ(PORT_UNSPECIFIED, port); + + // The remaining components are never used for mailto URLs. + ExpectInvalidComponent(parsed.username); + ExpectInvalidComponent(parsed.password); + ExpectInvalidComponent(parsed.port); + ExpectInvalidComponent(parsed.ref); + } +} + +// Various incarnations of filesystem URLs. 
+static FileSystemURLParseCase filesystem_cases[] = { + // Regular URL with all the parts +{"filesystem:http://user:pass@foo:21/temporary/bar;par?b#c", "http", "user", "pass", "foo", 21, "/temporary", "/bar;par", "b", "c"}, +{"filesystem:https://foo/persistent/bar;par/", "https", NULL, NULL, "foo", -1, "/persistent", "/bar;par/", NULL, NULL}, +{"filesystem:file:///persistent/bar;par/", "file", NULL, NULL, NULL, -1, "/persistent", "/bar;par/", NULL, NULL}, +{"filesystem:file:///persistent/bar;par/?query#ref", "file", NULL, NULL, NULL, -1, "/persistent", "/bar;par/", "query", "ref"}, +{"filesystem:file:///persistent", "file", NULL, NULL, NULL, -1, "/persistent", "", NULL, NULL}, +}; + +TEST(URLParser, FileSystemURL) { + // Declared outside for loop to try to catch cases in init() where we forget + // to reset something that is reset by the constructor. + Parsed parsed; + for (size_t i = 0; i < std::size(filesystem_cases); i++) { + const FileSystemURLParseCase* parsecase = &filesystem_cases[i]; + const char* url = parsecase->input; + ParseFileSystemURL(url, static_cast(strlen(url)), &parsed); + + EXPECT_TRUE(ComponentMatches(url, "filesystem", parsed.scheme)); + EXPECT_EQ(!parsecase->inner_scheme, !parsed.inner_parsed()); + // Only check the inner_parsed if there is one. + if (parsed.inner_parsed()) { + EXPECT_TRUE(ComponentMatches(url, parsecase->inner_scheme, + parsed.inner_parsed()->scheme)); + EXPECT_TRUE(ComponentMatches(url, parsecase->inner_username, + parsed.inner_parsed()->username)); + EXPECT_TRUE(ComponentMatches(url, parsecase->inner_password, + parsed.inner_parsed()->password)); + EXPECT_TRUE(ComponentMatches(url, parsecase->inner_host, + parsed.inner_parsed()->host)); + int port = ParsePort(url, parsed.inner_parsed()->port); + EXPECT_EQ(parsecase->inner_port, port); + + // The remaining components are never used for filesystem URLs. 
+ ExpectInvalidComponent(parsed.inner_parsed()->query); + ExpectInvalidComponent(parsed.inner_parsed()->ref); + } + + EXPECT_TRUE(ComponentMatches(url, parsecase->path, parsed.path)); + EXPECT_TRUE(ComponentMatches(url, parsecase->query, parsed.query)); + EXPECT_TRUE(ComponentMatches(url, parsecase->ref, parsed.ref)); + + // The remaining components are never used for filesystem URLs. + ExpectInvalidComponent(parsed.username); + ExpectInvalidComponent(parsed.password); + ExpectInvalidComponent(parsed.host); + ExpectInvalidComponent(parsed.port); + } +} + +} // namespace +} // namespace url diff --git a/url_test_utils.h b/url_test_utils.h new file mode 100644 index 00000000000..e1be7fc5087 --- /dev/null +++ b/url_test_utils.h @@ -0,0 +1,39 @@ +// Copyright 2013 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef URL_URL_TEST_UTILS_H_ +#define URL_URL_TEST_UTILS_H_ + +// Convenience functions for string conversions. +// These are mostly intended for use in unit tests. + +#include + +#include "base/strings/utf_string_conversions.h" +#include "testing/gtest/include/gtest/gtest.h" +#include "url/url_canon_internal.h" + +namespace url { + +namespace test_utils { + +// Converts a UTF-16 string from native wchar_t format to char16 by +// truncating each element to 16 bits. This is different than the conversion function +// in base because it passes invalid UTF-16 characters which is important for +// test purposes. As a result, this is not meant to handle true UTF-32 encoded +// strings. 
+inline std::u16string TruncateWStringToUTF16(const wchar_t* src) { + std::u16string str; + int length = static_cast(wcslen(src)); + for (int i = 0; i < length; ++i) { + str.push_back(static_cast(src[i])); + } + return str; +} + +} // namespace test_utils + +} // namespace url + +#endif // URL_URL_TEST_UTILS_H_ diff --git a/url_util.cc b/url_util.cc new file mode 100644 index 00000000000..67913eb72f7 --- /dev/null +++ b/url_util.cc @@ -0,0 +1,933 @@ +// Copyright 2013 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "url/url_util.h" + +#include +#include + +#include +#include + +#include "base/check_op.h" +#include "base/compiler_specific.h" +#include "base/containers/contains.h" +#include "base/no_destructor.h" +#include "base/strings/string_util.h" +#include "url/url_canon_internal.h" +#include "url/url_constants.h" +#include "url/url_file.h" +#include "url/url_util_internal.h" + +namespace url { + +namespace { + +// A pair for representing a standard scheme name and the SchemeType for it. +struct SchemeWithType { + std::string scheme; + SchemeType type; +}; + +// A pair for representing a scheme and a custom protocol handler for it. +// +// This pair of strings must be normalized protocol handler parameters as +// described in the Custom Handler specification. +// https://html.spec.whatwg.org/multipage/system-state.html#normalize-protocol-handler-parameters +struct SchemeWithHandler { + std::string scheme; + std::string handler; +}; + +// List of currently registered schemes and associated properties. +struct SchemeRegistry { + // Standard format schemes (see header for details). + std::vector standard_schemes = { + {kHttpsScheme, SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION}, + {kHttpScheme, SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION}, + // Yes, file URLs can have a hostname, so file URLs should be handled as + // "standard". 
File URLs never have a port as specified by the SchemeType + // field. Unlike other SCHEME_WITH_HOST schemes, the 'host' in a file + // URL may be empty, a behavior which is special-cased during + // canonicalization. + {kFileScheme, SCHEME_WITH_HOST}, + {kFtpScheme, SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION}, + {kWssScheme, + SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION}, // WebSocket secure. + {kWsScheme, SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION}, // WebSocket. + {kFileSystemScheme, SCHEME_WITHOUT_AUTHORITY}, + }; + + // Schemes that are allowed for referrers. + // + // WARNING: Adding (1) a non-"standard" scheme or (2) a scheme whose URLs have + // opaque origins could lead to surprising behavior in some of the referrer + // generation logic. In order to avoid surprises, be sure to have adequate + // test coverage in each of the multiple code locations that compute + // referrers. + std::vector referrer_schemes = { + {kHttpsScheme, SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION}, + {kHttpScheme, SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION}, + }; + + // Schemes that do not trigger mixed content warning. + std::vector secure_schemes = { + kHttpsScheme, + kWssScheme, + kDataScheme, + kAboutScheme, + }; + + // Schemes that normal pages cannot link to or access (i.e., with the same + // security rules as those applied to "file" URLs). + std::vector local_schemes = { + kFileScheme, + }; + + // Schemes that cause pages loaded with them to not have access to pages + // loaded with any other URL scheme. + std::vector no_access_schemes = { + kAboutScheme, + kJavaScriptScheme, + kDataScheme, + }; + + // Schemes that can be sent CORS requests. + std::vector cors_enabled_schemes = { + kHttpsScheme, + kHttpScheme, + kDataScheme, + }; + + // Schemes that can be used by web to store data (local storage, etc). 
+ std::vector web_storage_schemes = { + kHttpsScheme, kHttpScheme, kFileScheme, kFtpScheme, kWssScheme, kWsScheme, + }; + + // Schemes that can bypass the Content-Security-Policy (CSP) checks. + std::vector csp_bypassing_schemes = {}; + + // Schemes that are strictly empty documents, allowing them to commit + // synchronously. + std::vector empty_document_schemes = { + kAboutScheme, + }; + + // Schemes with a predefined default custom handler. + std::vector predefined_handler_schemes; + + bool allow_non_standard_schemes = false; +}; + +// See the LockSchemeRegistries declaration in the header. +bool scheme_registries_locked = false; + +// Ensure that the schemes aren't modified after first use. +static std::atomic g_scheme_registries_used{false}; + +// Gets the scheme registry without locking the schemes. This should *only* be +// used for adding schemes to the registry. +SchemeRegistry* GetSchemeRegistryWithoutLocking() { + static base::NoDestructor registry; + return registry.get(); +} + +const SchemeRegistry& GetSchemeRegistry() { +#if DCHECK_IS_ON() + g_scheme_registries_used.store(true); +#endif + return *GetSchemeRegistryWithoutLocking(); +} + +// Pass this enum through for methods which would like to know if whitespace +// removal is necessary. +enum WhitespaceRemovalPolicy { + REMOVE_WHITESPACE, + DO_NOT_REMOVE_WHITESPACE, +}; + +// This template converts a given character type to the corresponding +// StringPiece type. +template struct CharToStringPiece { +}; +template<> struct CharToStringPiece { + typedef base::StringPiece Piece; +}; +template <> +struct CharToStringPiece { + typedef base::StringPiece16 Piece; +}; + +// Given a string and a range inside the string, compares it to the given +// lower-case |compare_to| buffer. +template +inline bool DoCompareSchemeComponent(const CHAR* spec, + const Component& component, + const char* compare_to) { + if (component.is_empty()) + return compare_to[0] == 0; // When component is empty, match empty scheme. 
+ return base::EqualsCaseInsensitiveASCII( + typename CharToStringPiece::Piece(&spec[component.begin], + component.len), + compare_to); +} + +// Returns true and sets |type| to the SchemeType of the given scheme +// identified by |scheme| within |spec| if in |schemes|. +template +bool DoIsInSchemes(const CHAR* spec, + const Component& scheme, + SchemeType* type, + const std::vector& schemes) { + if (scheme.is_empty()) + return false; // Empty or invalid schemes are non-standard. + + for (const SchemeWithType& scheme_with_type : schemes) { + if (base::EqualsCaseInsensitiveASCII( + typename CharToStringPiece::Piece(&spec[scheme.begin], + scheme.len), + scheme_with_type.scheme)) { + *type = scheme_with_type.type; + return true; + } + } + return false; +} + +template +bool DoIsStandard(const CHAR* spec, const Component& scheme, SchemeType* type) { + return DoIsInSchemes(spec, scheme, type, + GetSchemeRegistry().standard_schemes); +} + + +template +bool DoFindAndCompareScheme(const CHAR* str, + int str_len, + const char* compare, + Component* found_scheme) { + // Before extracting scheme, canonicalize the URL to remove any whitespace. + // This matches the canonicalization done in DoCanonicalize function. + STACK_UNINITIALIZED RawCanonOutputT whitespace_buffer; + int spec_len; + const CHAR* spec = + RemoveURLWhitespace(str, str_len, &whitespace_buffer, &spec_len, nullptr); + + Component our_scheme; + if (!ExtractScheme(spec, spec_len, &our_scheme)) { + // No scheme. + if (found_scheme) + *found_scheme = Component(); + return false; + } + if (found_scheme) + *found_scheme = our_scheme; + return DoCompareSchemeComponent(spec, our_scheme, compare); +} + +template +bool DoCanonicalize(const CHAR* spec, + int spec_len, + bool trim_path_end, + WhitespaceRemovalPolicy whitespace_policy, + CharsetConverter* charset_converter, + CanonOutput* output, + Parsed* output_parsed) { + // Trim leading C0 control characters and spaces. 
+ int begin = 0; + TrimURL(spec, &begin, &spec_len, trim_path_end); + DCHECK(0 <= begin && begin <= spec_len); + spec += begin; + spec_len -= begin; + + output->ReserveSizeIfNeeded(spec_len); + + // Remove any whitespace from the middle of the relative URL if necessary. + // Possibly this will result in copying to the new buffer. + STACK_UNINITIALIZED RawCanonOutputT whitespace_buffer; + if (whitespace_policy == REMOVE_WHITESPACE) { + spec = RemoveURLWhitespace(spec, spec_len, &whitespace_buffer, &spec_len, + &output_parsed->potentially_dangling_markup); + } + + Parsed parsed_input; +#ifdef WIN32 + // For Windows, we allow things that look like absolute Windows paths to be + // fixed up magically to file URLs. This is done for IE compatibility. For + // example, this will change "c:/foo" into a file URL rather than treating + // it as a URL with the protocol "c". It also works for UNC ("\\foo\bar.txt"). + // There is similar logic in url_canon_relative.cc. + // + // For Mac & Unix, we don't do this (the equivalent would be "/foo/bar" which + // has no meaning as an absolute path name). This is because browsers on Mac + // & Unix don't generally do this, so there is no compatibility reason for + // doing so. + if (DoesBeginUNCPath(spec, 0, spec_len, false) || + DoesBeginWindowsDriveSpec(spec, 0, spec_len)) { + ParseFileURL(spec, spec_len, &parsed_input); + return CanonicalizeFileURL(spec, spec_len, parsed_input, charset_converter, + output, output_parsed); + } +#endif + + Component scheme; + if (!ExtractScheme(spec, spec_len, &scheme)) + return false; + + // This is the parsed version of the input URL, we have to canonicalize it + // before storing it in our object. + bool success; + SchemeType scheme_type = SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION; + if (DoCompareSchemeComponent(spec, scheme, url::kFileScheme)) { + // File URLs are special. 
+ ParseFileURL(spec, spec_len, &parsed_input); + success = CanonicalizeFileURL(spec, spec_len, parsed_input, + charset_converter, output, output_parsed); + } else if (DoCompareSchemeComponent(spec, scheme, url::kFileSystemScheme)) { + // Filesystem URLs are special. + ParseFileSystemURL(spec, spec_len, &parsed_input); + success = CanonicalizeFileSystemURL(spec, spec_len, parsed_input, + charset_converter, output, + output_parsed); + + } else if (DoIsStandard(spec, scheme, &scheme_type)) { + // All "normal" URLs. + ParseStandardURL(spec, spec_len, &parsed_input); + success = CanonicalizeStandardURL(spec, spec_len, parsed_input, scheme_type, + charset_converter, output, output_parsed); + + } else if (DoCompareSchemeComponent(spec, scheme, url::kMailToScheme)) { + // Mailto URLs are treated like standard URLs, with only a scheme, path, + // and query. + ParseMailtoURL(spec, spec_len, &parsed_input); + success = CanonicalizeMailtoURL(spec, spec_len, parsed_input, output, + output_parsed); + + } else { + // "Weird" URLs like data: and javascript:. + ParsePathURL(spec, spec_len, trim_path_end, &parsed_input); + success = CanonicalizePathURL(spec, spec_len, parsed_input, output, + output_parsed); + } + return success; +} + +template +bool DoResolveRelative(const char* base_spec, + int base_spec_len, + const Parsed& base_parsed, + const CHAR* in_relative, + int in_relative_length, + CharsetConverter* charset_converter, + CanonOutput* output, + Parsed* output_parsed) { + // Remove any whitespace from the middle of the relative URL, possibly + // copying to the new buffer. 
+ STACK_UNINITIALIZED RawCanonOutputT whitespace_buffer; + int relative_length; + const CHAR* relative = RemoveURLWhitespace( + in_relative, in_relative_length, &whitespace_buffer, &relative_length, + &output_parsed->potentially_dangling_markup); + + bool base_is_authority_based = false; + bool base_is_hierarchical = false; + if (base_spec && + base_parsed.scheme.is_nonempty()) { + int after_scheme = base_parsed.scheme.end() + 1; // Skip past the colon. + int num_slashes = CountConsecutiveSlashes(base_spec, after_scheme, + base_spec_len); + base_is_authority_based = num_slashes > 1; + base_is_hierarchical = num_slashes > 0; + } + + SchemeType unused_scheme_type = SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION; + bool standard_base_scheme = + base_parsed.scheme.is_nonempty() && + DoIsStandard(base_spec, base_parsed.scheme, &unused_scheme_type); + + bool is_relative; + Component relative_component; + if (!IsRelativeURL(base_spec, base_parsed, relative, relative_length, + (base_is_hierarchical || standard_base_scheme), + &is_relative, &relative_component)) { + // Error resolving. + return false; + } + + // Don't reserve buffer space here. Instead, reserve in DoCanonicalize and + // ReserveRelativeURL, to enable more accurate buffer sizes. + + // Pretend for a moment that |base_spec| is a standard URL. Normally + // non-standard URLs are treated as PathURLs, but if the base has an + // authority we would like to preserve it. 
+ if (is_relative && base_is_authority_based && !standard_base_scheme) { + Parsed base_parsed_authority; + ParseStandardURL(base_spec, base_spec_len, &base_parsed_authority); + if (base_parsed_authority.host.is_nonempty()) { + STACK_UNINITIALIZED RawCanonOutputT temporary_output; + bool did_resolve_succeed = + ResolveRelativeURL(base_spec, base_parsed_authority, false, relative, + relative_component, charset_converter, + &temporary_output, output_parsed); + // The output_parsed is incorrect at this point (because it was built + // based on base_parsed_authority instead of base_parsed) and needs to be + // re-created. + DoCanonicalize(temporary_output.data(), temporary_output.length(), true, + REMOVE_WHITESPACE, charset_converter, output, + output_parsed); + return did_resolve_succeed; + } + } else if (is_relative) { + // Relative, resolve and canonicalize. + bool file_base_scheme = base_parsed.scheme.is_nonempty() && + DoCompareSchemeComponent(base_spec, base_parsed.scheme, kFileScheme); + return ResolveRelativeURL(base_spec, base_parsed, file_base_scheme, relative, + relative_component, charset_converter, output, + output_parsed); + } + + // Not relative, canonicalize the input. + return DoCanonicalize(relative, relative_length, true, + DO_NOT_REMOVE_WHITESPACE, charset_converter, output, + output_parsed); +} + +template +bool DoReplaceComponents(const char* spec, + int spec_len, + const Parsed& parsed, + const Replacements& replacements, + CharsetConverter* charset_converter, + CanonOutput* output, + Parsed* out_parsed) { + // If the scheme is overridden, just do a simple string substitution and + // re-parse the whole thing. There are lots of edge cases that we really don't + // want to deal with. Like what happens if I replace "http://e:8080/foo" + // with a file. Does it become "file:///E:/8080/foo" where the port number + // becomes part of the path? 
Parsing that string as a file URL says "yes" + // but almost no sane rule for dealing with the components individually would + // come up with that. + // + // Why allow these crazy cases at all? Programmatically, there is almost no + // case for replacing the scheme. The most common case for hitting this is + // in JS when building up a URL using the location object. In this case, the + // JS code expects the string substitution behavior: + // http://www.w3.org/TR/2008/WD-html5-20080610/structured.html#common3 + if (replacements.IsSchemeOverridden()) { + // Canonicalize the new scheme so it is 8-bit and can be concatenated with + // the existing spec. + STACK_UNINITIALIZED RawCanonOutput<128> scheme_replaced; + Component scheme_replaced_parsed; + CanonicalizeScheme(replacements.sources().scheme, + replacements.components().scheme, + &scheme_replaced, &scheme_replaced_parsed); + + // We can assume that the input is canonicalized, which means it always has + // a colon after the scheme (or where the scheme would be). + int spec_after_colon = parsed.scheme.is_valid() ? parsed.scheme.end() + 1 + : 1; + if (spec_len - spec_after_colon > 0) { + scheme_replaced.Append(&spec[spec_after_colon], + spec_len - spec_after_colon); + } + + // We now need to completely re-parse the resulting string since its meaning + // may have changed with the different scheme. + STACK_UNINITIALIZED RawCanonOutput<128> recanonicalized; + Parsed recanonicalized_parsed; + DoCanonicalize(scheme_replaced.data(), scheme_replaced.length(), true, + REMOVE_WHITESPACE, charset_converter, &recanonicalized, + &recanonicalized_parsed); + + // Recurse using the version with the scheme already replaced. This will now + // use the replacement rules for the new scheme. + // + // Warning: this code assumes that ReplaceComponents will re-check all + // components for validity. 
This is because we can't fail if DoCanonicalize + // failed above since theoretically the thing making it fail could be + // getting replaced here. If ReplaceComponents didn't re-check everything, + // we wouldn't know if something *not* getting replaced is a problem. + // If the scheme-specific replacers are made more intelligent so they don't + // re-check everything, we should instead re-canonicalize the whole thing + // after this call to check validity (this assumes replacing the scheme is + // much much less common than other types of replacements, like clearing the + // ref). + Replacements replacements_no_scheme = replacements; + replacements_no_scheme.SetScheme(NULL, Component()); + // If the input URL has potentially dangling markup, set the flag on the + // output too. Note that in some cases the replacement gets rid of the + // potentially dangling markup, but this is ok since the check will fail + // closed. + if (parsed.potentially_dangling_markup) { + out_parsed->potentially_dangling_markup = true; + } + return DoReplaceComponents(recanonicalized.data(), recanonicalized.length(), + recanonicalized_parsed, replacements_no_scheme, + charset_converter, output, out_parsed); + } + + // TODO(csharrison): We could be smarter about size to reserve if this is done + // in callers below, and the code checks to see which components are being + // replaced, and with what length. If this ends up being a hot spot it should + // be changed. + output->ReserveSizeIfNeeded(spec_len); + + // If we get here, then we know the scheme doesn't need to be replaced, so can + // just key off the scheme in the spec to know how to do the replacements. 
+ if (DoCompareSchemeComponent(spec, parsed.scheme, url::kFileScheme)) { + return ReplaceFileURL(spec, parsed, replacements, charset_converter, output, + out_parsed); + } + if (DoCompareSchemeComponent(spec, parsed.scheme, url::kFileSystemScheme)) { + return ReplaceFileSystemURL(spec, parsed, replacements, charset_converter, + output, out_parsed); + } + SchemeType scheme_type = SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION; + if (DoIsStandard(spec, parsed.scheme, &scheme_type)) { + return ReplaceStandardURL(spec, parsed, replacements, scheme_type, + charset_converter, output, out_parsed); + } + if (DoCompareSchemeComponent(spec, parsed.scheme, url::kMailToScheme)) { + return ReplaceMailtoURL(spec, parsed, replacements, output, out_parsed); + } + + // Default is a path URL. + return ReplacePathURL(spec, parsed, replacements, output, out_parsed); +} + +void DoSchemeModificationPreamble() { + // If this assert triggers, it means you've called Add*Scheme after + // the SchemeRegistry has been used. + // + // This normally means you're trying to set up a new scheme too late or using + // the SchemeRegistry too early in your application's init process. + DCHECK(!g_scheme_registries_used.load()) + << "Trying to add a scheme after the lists have been used. " + "Make sure that you haven't added any static GURL initializers in tests."; + + // If this assert triggers, it means you've called Add*Scheme after + // LockSchemeRegistries has been called (see the header file for + // LockSchemeRegistries for more). + // + // This normally means you're trying to set up a new scheme too late in your + // application's init process. Locate where your app does this initialization + // and calls LockSchemeRegistries, and add your new scheme there. 
+ DCHECK(!scheme_registries_locked) + << "Trying to add a scheme after the lists have been locked."; +} + +void DoAddSchemeWithHandler(const char* new_scheme, + const char* handler, + std::vector* schemes) { + DoSchemeModificationPreamble(); + DCHECK(schemes); + DCHECK(strlen(new_scheme) > 0); + DCHECK(strlen(handler) > 0); + DCHECK_EQ(base::ToLowerASCII(new_scheme), new_scheme); + DCHECK(!base::Contains(*schemes, new_scheme, &SchemeWithHandler::scheme)); + schemes->push_back({new_scheme, handler}); +} + +void DoAddScheme(const char* new_scheme, std::vector* schemes) { + DoSchemeModificationPreamble(); + DCHECK(schemes); + DCHECK(strlen(new_scheme) > 0); + DCHECK_EQ(base::ToLowerASCII(new_scheme), new_scheme); + DCHECK(!base::Contains(*schemes, new_scheme)); + schemes->push_back(new_scheme); +} + +void DoAddSchemeWithType(const char* new_scheme, + SchemeType type, + std::vector* schemes) { + DoSchemeModificationPreamble(); + DCHECK(schemes); + DCHECK(strlen(new_scheme) > 0); + DCHECK_EQ(base::ToLowerASCII(new_scheme), new_scheme); + DCHECK(!base::Contains(*schemes, new_scheme, &SchemeWithType::scheme)); + schemes->push_back({new_scheme, type}); +} + +} // namespace + +void ClearSchemesForTests() { + DCHECK(!g_scheme_registries_used.load()) + << "Schemes already used " + << "(use ScopedSchemeRegistryForTests to relax for tests)."; + DCHECK(!scheme_registries_locked) + << "Schemes already locked " + << "(use ScopedSchemeRegistryForTests to relax for tests)."; + *GetSchemeRegistryWithoutLocking() = SchemeRegistry(); +} + +class ScopedSchemeRegistryInternal { + public: + ScopedSchemeRegistryInternal() + : registry_(std::make_unique( + *GetSchemeRegistryWithoutLocking())) { + g_scheme_registries_used.store(false); + scheme_registries_locked = false; + } + ~ScopedSchemeRegistryInternal() { + *GetSchemeRegistryWithoutLocking() = *registry_; + g_scheme_registries_used.store(true); + scheme_registries_locked = true; + } + + private: + std::unique_ptr registry_; +}; + 
+ScopedSchemeRegistryForTests::ScopedSchemeRegistryForTests() + : internal_(std::make_unique()) {} + +ScopedSchemeRegistryForTests::~ScopedSchemeRegistryForTests() = default; + +void EnableNonStandardSchemesForAndroidWebView() { + DoSchemeModificationPreamble(); + GetSchemeRegistryWithoutLocking()->allow_non_standard_schemes = true; +} + +bool AllowNonStandardSchemesForAndroidWebView() { + return GetSchemeRegistry().allow_non_standard_schemes; +} + +void AddStandardScheme(const char* new_scheme, SchemeType type) { + DoAddSchemeWithType(new_scheme, type, + &GetSchemeRegistryWithoutLocking()->standard_schemes); +} + +std::vector GetStandardSchemes() { + std::vector result; + result.reserve(GetSchemeRegistry().standard_schemes.size()); + for (const auto& entry : GetSchemeRegistry().standard_schemes) { + result.push_back(entry.scheme); + } + return result; +} + +void AddReferrerScheme(const char* new_scheme, SchemeType type) { + DoAddSchemeWithType(new_scheme, type, + &GetSchemeRegistryWithoutLocking()->referrer_schemes); +} + +void AddSecureScheme(const char* new_scheme) { + DoAddScheme(new_scheme, &GetSchemeRegistryWithoutLocking()->secure_schemes); +} + +const std::vector& GetSecureSchemes() { + return GetSchemeRegistry().secure_schemes; +} + +void AddLocalScheme(const char* new_scheme) { + DoAddScheme(new_scheme, &GetSchemeRegistryWithoutLocking()->local_schemes); +} + +const std::vector& GetLocalSchemes() { + return GetSchemeRegistry().local_schemes; +} + +void AddNoAccessScheme(const char* new_scheme) { + DoAddScheme(new_scheme, + &GetSchemeRegistryWithoutLocking()->no_access_schemes); +} + +const std::vector& GetNoAccessSchemes() { + return GetSchemeRegistry().no_access_schemes; +} + +void AddCorsEnabledScheme(const char* new_scheme) { + DoAddScheme(new_scheme, + &GetSchemeRegistryWithoutLocking()->cors_enabled_schemes); +} + +const std::vector& GetCorsEnabledSchemes() { + return GetSchemeRegistry().cors_enabled_schemes; +} + +void AddWebStorageScheme(const 
char* new_scheme) { + DoAddScheme(new_scheme, + &GetSchemeRegistryWithoutLocking()->web_storage_schemes); +} + +const std::vector& GetWebStorageSchemes() { + return GetSchemeRegistry().web_storage_schemes; +} + +void AddCSPBypassingScheme(const char* new_scheme) { + DoAddScheme(new_scheme, + &GetSchemeRegistryWithoutLocking()->csp_bypassing_schemes); +} + +const std::vector& GetCSPBypassingSchemes() { + return GetSchemeRegistry().csp_bypassing_schemes; +} + +void AddEmptyDocumentScheme(const char* new_scheme) { + DoAddScheme(new_scheme, + &GetSchemeRegistryWithoutLocking()->empty_document_schemes); +} + +const std::vector& GetEmptyDocumentSchemes() { + return GetSchemeRegistry().empty_document_schemes; +} + +void AddPredefinedHandlerScheme(const char* new_scheme, const char* handler) { + DoAddSchemeWithHandler( + new_scheme, handler, + &GetSchemeRegistryWithoutLocking()->predefined_handler_schemes); +} + +std::vector> GetPredefinedHandlerSchemes() { + std::vector> result; + result.reserve(GetSchemeRegistry().predefined_handler_schemes.size()); + for (const SchemeWithHandler& entry : + GetSchemeRegistry().predefined_handler_schemes) { + result.emplace_back(entry.scheme, entry.handler); + } + return result; +} + +void LockSchemeRegistries() { + scheme_registries_locked = true; +} + +bool IsStandard(const char* spec, const Component& scheme) { + SchemeType unused_scheme_type; + return DoIsStandard(spec, scheme, &unused_scheme_type); +} + +bool GetStandardSchemeType(const char* spec, + const Component& scheme, + SchemeType* type) { + return DoIsStandard(spec, scheme, type); +} + +bool GetStandardSchemeType(const char16_t* spec, + const Component& scheme, + SchemeType* type) { + return DoIsStandard(spec, scheme, type); +} + +bool IsStandard(const char16_t* spec, const Component& scheme) { + SchemeType unused_scheme_type; + return DoIsStandard(spec, scheme, &unused_scheme_type); +} + +bool IsReferrerScheme(const char* spec, const Component& scheme) { + SchemeType 
unused_scheme_type; + return DoIsInSchemes(spec, scheme, &unused_scheme_type, + GetSchemeRegistry().referrer_schemes); +} + +bool FindAndCompareScheme(const char* str, + int str_len, + const char* compare, + Component* found_scheme) { + return DoFindAndCompareScheme(str, str_len, compare, found_scheme); +} + +bool FindAndCompareScheme(const char16_t* str, + int str_len, + const char* compare, + Component* found_scheme) { + return DoFindAndCompareScheme(str, str_len, compare, found_scheme); +} + +bool DomainIs(base::StringPiece canonical_host, + base::StringPiece canonical_domain) { + if (canonical_host.empty() || canonical_domain.empty()) + return false; + + // If the host name ends with a dot but the input domain doesn't, then we + // ignore the dot in the host name. + size_t host_len = canonical_host.length(); + if (canonical_host.back() == '.' && canonical_domain.back() != '.') + --host_len; + + if (host_len < canonical_domain.length()) + return false; + + // |host_first_pos| is the start of the compared part of the host name, not + // start of the whole host name. + const char* host_first_pos = + canonical_host.data() + host_len - canonical_domain.length(); + + if (base::StringPiece(host_first_pos, canonical_domain.length()) != + canonical_domain) { + return false; + } + + // Make sure there aren't extra characters in host before the compared part; + // if the host name is longer than the input domain name, then the character + // immediately before the compared part should be a dot. For example, + // www.google.com has domain "google.com", but www.iamnotgoogle.com does not. + if (canonical_domain[0] != '.' 
&& host_len > canonical_domain.length() && + *(host_first_pos - 1) != '.') { + return false; + } + + return true; +} + +bool HostIsIPAddress(base::StringPiece host) { + STACK_UNINITIALIZED url::RawCanonOutputT ignored_output; + url::CanonHostInfo host_info; + url::CanonicalizeIPAddress(host.data(), Component(0, host.length()), + &ignored_output, &host_info); + return host_info.IsIPAddress(); +} + +bool Canonicalize(const char* spec, + int spec_len, + bool trim_path_end, + CharsetConverter* charset_converter, + CanonOutput* output, + Parsed* output_parsed) { + return DoCanonicalize(spec, spec_len, trim_path_end, REMOVE_WHITESPACE, + charset_converter, output, output_parsed); +} + +bool Canonicalize(const char16_t* spec, + int spec_len, + bool trim_path_end, + CharsetConverter* charset_converter, + CanonOutput* output, + Parsed* output_parsed) { + return DoCanonicalize(spec, spec_len, trim_path_end, REMOVE_WHITESPACE, + charset_converter, output, output_parsed); +} + +bool ResolveRelative(const char* base_spec, + int base_spec_len, + const Parsed& base_parsed, + const char* relative, + int relative_length, + CharsetConverter* charset_converter, + CanonOutput* output, + Parsed* output_parsed) { + return DoResolveRelative(base_spec, base_spec_len, base_parsed, + relative, relative_length, + charset_converter, output, output_parsed); +} + +bool ResolveRelative(const char* base_spec, + int base_spec_len, + const Parsed& base_parsed, + const char16_t* relative, + int relative_length, + CharsetConverter* charset_converter, + CanonOutput* output, + Parsed* output_parsed) { + return DoResolveRelative(base_spec, base_spec_len, base_parsed, + relative, relative_length, + charset_converter, output, output_parsed); +} + +bool ReplaceComponents(const char* spec, + int spec_len, + const Parsed& parsed, + const Replacements& replacements, + CharsetConverter* charset_converter, + CanonOutput* output, + Parsed* out_parsed) { + return DoReplaceComponents(spec, spec_len, parsed, 
replacements, + charset_converter, output, out_parsed); +} + +bool ReplaceComponents(const char* spec, + int spec_len, + const Parsed& parsed, + const Replacements& replacements, + CharsetConverter* charset_converter, + CanonOutput* output, + Parsed* out_parsed) { + return DoReplaceComponents(spec, spec_len, parsed, replacements, + charset_converter, output, out_parsed); +} + +void DecodeURLEscapeSequences(const char* input, + int length, + DecodeURLMode mode, + CanonOutputW* output) { + if (length <= 0) + return; + + STACK_UNINITIALIZED RawCanonOutputT unescaped_chars; + size_t length_size_t = static_cast(length); + for (size_t i = 0; i < length_size_t; i++) { + if (input[i] == '%') { + unsigned char ch; + if (DecodeEscaped(input, &i, length_size_t, &ch)) { + unescaped_chars.push_back(ch); + } else { + // Invalid escape sequence, copy the percent literal. + unescaped_chars.push_back('%'); + } + } else { + // Regular non-escaped 8-bit character. + unescaped_chars.push_back(input[i]); + } + } + + int output_initial_length = output->length(); + // Convert that 8-bit string to UTF-16. It's not clear IE does this at all to + // JavaScript URLs, but Firefox and Safari do. + size_t unescaped_length = unescaped_chars.length(); + for (size_t i = 0; i < unescaped_length; i++) { + unsigned char uch = static_cast(unescaped_chars.at(i)); + if (uch < 0x80) { + // ASCII (single-byte) character, just append directly. + output->push_back(uch); + } else { + // next_character will point to the last byte of the decoded + // character. + size_t next_character = i; + base_icu::UChar32 code_point; + if (ReadUTFChar(unescaped_chars.data(), &next_character, unescaped_length, + &code_point)) { + // Valid UTF-8 character, convert to UTF-16.
+ AppendUTF16Value(code_point, output); + i = next_character; + } else if (mode == DecodeURLMode::kUTF8) { + DCHECK_EQ(code_point, 0xFFFD); + AppendUTF16Value(code_point, output); + i = next_character; + } else { + // If there are any sequences that are not valid UTF-8, we + // revert |output| changes, and promote any bytes to UTF-16. We + // copy all unescaped characters, from the first to the last, not + // just those up to the invalid sequence. + output->set_length(output_initial_length); + for (size_t j = 0; j < unescaped_chars.length(); ++j) + output->push_back(static_cast(unescaped_chars.at(j))); + break; + } + } + } +} + +void EncodeURIComponent(const char* input, int length, CanonOutput* output) { + for (int i = 0; i < length; ++i) { + unsigned char c = static_cast(input[i]); + if (IsComponentChar(c)) + output->push_back(c); + else + AppendEscapedChar(c, output); + } +} + +bool CompareSchemeComponent(const char* spec, + const Component& component, + const char* compare_to) { + return DoCompareSchemeComponent(spec, component, compare_to); +} + +bool CompareSchemeComponent(const char16_t* spec, + const Component& component, + const char* compare_to) { + return DoCompareSchemeComponent(spec, component, compare_to); +} + +} // namespace url diff --git a/url_util.h b/url_util.h new file mode 100644 index 00000000000..670552a8ce1 --- /dev/null +++ b/url_util.h @@ -0,0 +1,314 @@ +// Copyright 2013 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef URL_URL_UTIL_H_ +#define URL_URL_UTIL_H_ + +#include +#include +#include + +#include "base/component_export.h" +#include "base/strings/string_piece.h" +#include "url/third_party/mozilla/url_parse.h" +#include "url/url_canon.h" +#include "url/url_constants.h" + +namespace url { + +// Init ------------------------------------------------------------------------ + +// Used for tests that need to reset schemes.
Note that this can only be used +// in conjunction with ScopedSchemeRegistryForTests. +COMPONENT_EXPORT(URL) void ClearSchemesForTests(); + +class ScopedSchemeRegistryInternal; + +// Stores the SchemeRegistry upon creation, allowing tests to modify a copy of +// it, and restores the original SchemeRegistry when deleted. +class COMPONENT_EXPORT(URL) ScopedSchemeRegistryForTests { + public: + ScopedSchemeRegistryForTests(); + ~ScopedSchemeRegistryForTests(); + + private: + std::unique_ptr internal_; +}; + +// Schemes --------------------------------------------------------------------- + +// Changes the behavior of SchemeHostPort / Origin to allow non-standard schemes +// to be specified, instead of canonicalizing them to an invalid SchemeHostPort +// or opaque Origin, respectively. This is used for Android WebView backwards +// compatibility, which allows the use of custom schemes: content hosted in +// Android WebView assumes that one URL with a non-standard scheme will be +// same-origin to another URL with the same non-standard scheme. +// +// Not thread-safe. +COMPONENT_EXPORT(URL) void EnableNonStandardSchemesForAndroidWebView(); + +// Whether or not SchemeHostPort and Origin allow non-standard schemes. +COMPONENT_EXPORT(URL) bool AllowNonStandardSchemesForAndroidWebView(); + +// The following Add*Scheme methods are not threadsafe and cannot be called +// concurrently with any other url_util function. They will assert if the lists +// of schemes have been locked (see LockSchemeRegistries), or used. + +// Adds an application-defined scheme to the internal list of "standard-format" +// URL schemes. A standard-format scheme adheres to what RFC 3986 calls "generic +// URI syntax" (https://tools.ietf.org/html/rfc3986#section-3). + +COMPONENT_EXPORT(URL) +void AddStandardScheme(const char* new_scheme, SchemeType scheme_type); + +// Returns the list of schemes registered for "standard" URLs.
Note, this +// should not be used if you just need to check if your protocol is standard +// or not. Instead use the IsStandard() function below as it's much more +// efficient. This function should only be used where you need to perform +// other operations against the standard scheme list. +COMPONENT_EXPORT(URL) +std::vector GetStandardSchemes(); + +// Adds an application-defined scheme to the internal list of schemes allowed +// for referrers. +COMPONENT_EXPORT(URL) +void AddReferrerScheme(const char* new_scheme, SchemeType scheme_type); + +// Adds an application-defined scheme to the list of schemes that do not trigger +// mixed content warnings. +COMPONENT_EXPORT(URL) void AddSecureScheme(const char* new_scheme); +COMPONENT_EXPORT(URL) const std::vector& GetSecureSchemes(); + +// Adds an application-defined scheme to the list of schemes that normal pages +// cannot link to or access (i.e., with the same security rules as those applied +// to "file" URLs). +COMPONENT_EXPORT(URL) void AddLocalScheme(const char* new_scheme); +COMPONENT_EXPORT(URL) const std::vector& GetLocalSchemes(); + +// Adds an application-defined scheme to the list of schemes that cause pages +// loaded with them to not have access to pages loaded with any other URL +// scheme. +COMPONENT_EXPORT(URL) void AddNoAccessScheme(const char* new_scheme); +COMPONENT_EXPORT(URL) const std::vector& GetNoAccessSchemes(); + +// Adds an application-defined scheme to the list of schemes that can be sent +// CORS requests. +COMPONENT_EXPORT(URL) void AddCorsEnabledScheme(const char* new_scheme); +COMPONENT_EXPORT(URL) const std::vector& GetCorsEnabledSchemes(); + +// Adds an application-defined scheme to the list of web schemes that can be +// used by web to store data (e.g. cookies, local storage, ...). This is +// to differentiate them from schemes that can store data but are not used on +// web (e.g. application's internal schemes) or schemes that are used on web but +// cannot store data.
+COMPONENT_EXPORT(URL) void AddWebStorageScheme(const char* new_scheme); +COMPONENT_EXPORT(URL) const std::vector& GetWebStorageSchemes(); + +// Adds an application-defined scheme to the list of schemes that can bypass the +// Content-Security-Policy (CSP) checks. +COMPONENT_EXPORT(URL) void AddCSPBypassingScheme(const char* new_scheme); +COMPONENT_EXPORT(URL) const std::vector& GetCSPBypassingSchemes(); + +// Adds an application-defined scheme to the list of schemes that are strictly +// empty documents, allowing them to commit synchronously. +COMPONENT_EXPORT(URL) void AddEmptyDocumentScheme(const char* new_scheme); +COMPONENT_EXPORT(URL) const std::vector& GetEmptyDocumentSchemes(); + +// Adds a scheme with a predefined default handler. +// +// This pair of strings must be normalized protocol handler parameters as +// described in the Custom Handler specification. +// https://html.spec.whatwg.org/multipage/system-state.html#normalize-protocol-handler-parameters +COMPONENT_EXPORT(URL) +void AddPredefinedHandlerScheme(const char* new_scheme, const char* handler); +COMPONENT_EXPORT(URL) +std::vector> GetPredefinedHandlerSchemes(); + +// Sets a flag to prevent future calls to Add*Scheme from succeeding. +// +// This is designed to help prevent errors for multithreaded applications. +// Normal usage would be to call Add*Scheme for your custom schemes at +// the beginning of program initialization, and then LockSchemeRegistries. This +// prevents future callers from mistakenly calling Add*Scheme when the +// program is running with multiple threads, where such usage would be +// dangerous. +// +// We could have had Add*Scheme use a lock instead, but that would add +// some platform-specific dependencies we don't otherwise have now, and is +// overkill considering the normal usage is so simple. 
+COMPONENT_EXPORT(URL) void LockSchemeRegistries(); + +// Locates the scheme in the given string and places it into |found_scheme|, +// which may be NULL to indicate the caller does not care about the range. +// +// Returns whether the given |compare| scheme matches the scheme found in the +// input (if any). The |compare| scheme must be a valid canonical scheme or +// the result of the comparison is undefined. +COMPONENT_EXPORT(URL) +bool FindAndCompareScheme(const char* str, + int str_len, + const char* compare, + Component* found_scheme); +COMPONENT_EXPORT(URL) +bool FindAndCompareScheme(const char16_t* str, + int str_len, + const char* compare, + Component* found_scheme); +inline bool FindAndCompareScheme(const std::string& str, + const char* compare, + Component* found_scheme) { + return FindAndCompareScheme(str.data(), static_cast(str.size()), + compare, found_scheme); +} +inline bool FindAndCompareScheme(const std::u16string& str, + const char* compare, + Component* found_scheme) { + return FindAndCompareScheme(str.data(), static_cast(str.size()), + compare, found_scheme); +} + +// Returns true if the given scheme identified by |scheme| within |spec| is in +// the list of known standard-format schemes (see AddStandardScheme). +COMPONENT_EXPORT(URL) +bool IsStandard(const char* spec, const Component& scheme); +COMPONENT_EXPORT(URL) +bool IsStandard(const char16_t* spec, const Component& scheme); + +// Returns true if the given scheme identified by |scheme| within |spec| is in +// the list of allowed schemes for referrers (see AddReferrerScheme). +COMPONENT_EXPORT(URL) +bool IsReferrerScheme(const char* spec, const Component& scheme); + +// Returns true and sets |type| to the SchemeType of the given scheme +// identified by |scheme| within |spec| if the scheme is in the list of known +// standard-format schemes (see AddStandardScheme). 
+COMPONENT_EXPORT(URL) +bool GetStandardSchemeType(const char* spec, + const Component& scheme, + SchemeType* type); +COMPONENT_EXPORT(URL) +bool GetStandardSchemeType(const char16_t* spec, + const Component& scheme, + SchemeType* type); + +// Hosts ---------------------------------------------------------------------- + +// Returns true if the |canonical_host| matches or is in the same domain as the +// given |canonical_domain| string. For example, if the canonicalized hostname +// is "www.google.com", this will return true for "com", "google.com", and +// "www.google.com" domains. +// +// If either of the input StringPieces is empty, the return value is false. The +// input domain should match host canonicalization rules. i.e. it should be +// lowercase except for escape chars. +COMPONENT_EXPORT(URL) +bool DomainIs(base::StringPiece canonical_host, + base::StringPiece canonical_domain); + +// Returns true if the hostname is an IP address. Note: this function isn't very +// cheap, as it must re-parse the host to verify. +COMPONENT_EXPORT(URL) bool HostIsIPAddress(base::StringPiece host); + +// URL library wrappers -------------------------------------------------------- + +// Parses the given spec according to the extracted scheme type. Normal users +// should use the URL object, although this may be useful if performance is +// critical and you don't want to do the heap allocation for the std::string. +// +// As with the Canonicalize* functions, the charset converter can +// be NULL to use UTF-8 (it will be faster in this case). +// +// Returns true if a valid URL was produced, false if not. On failure, the +// output and parsed structures will still be filled and will be consistent, +// but they will not represent a loadable URL. 
+COMPONENT_EXPORT(URL) +bool Canonicalize(const char* spec, + int spec_len, + bool trim_path_end, + CharsetConverter* charset_converter, + CanonOutput* output, + Parsed* output_parsed); +COMPONENT_EXPORT(URL) +bool Canonicalize(const char16_t* spec, + int spec_len, + bool trim_path_end, + CharsetConverter* charset_converter, + CanonOutput* output, + Parsed* output_parsed); + +// Resolves a potentially relative URL relative to the given parsed base URL. +// The base MUST be valid. The resulting canonical URL and parsed information +// will be placed into the given out variables. +// +// The |relative| input need not actually be relative. If we discover that it's absolute, this +// will produce a canonical version of that URL. See Canonicalize() for more +// about the charset_converter. +// +// Returns true if the output is valid, false if the input could not produce +// a valid URL. +COMPONENT_EXPORT(URL) +bool ResolveRelative(const char* base_spec, + int base_spec_len, + const Parsed& base_parsed, + const char* relative, + int relative_length, + CharsetConverter* charset_converter, + CanonOutput* output, + Parsed* output_parsed); +COMPONENT_EXPORT(URL) +bool ResolveRelative(const char* base_spec, + int base_spec_len, + const Parsed& base_parsed, + const char16_t* relative, + int relative_length, + CharsetConverter* charset_converter, + CanonOutput* output, + Parsed* output_parsed); + +// Replaces components in the given VALID input URL. The new canonical URL info +// is written to output and out_parsed. +// +// Returns true if the resulting URL is valid.
+COMPONENT_EXPORT(URL) +bool ReplaceComponents(const char* spec, + int spec_len, + const Parsed& parsed, + const Replacements& replacements, + CharsetConverter* charset_converter, + CanonOutput* output, + Parsed* out_parsed); +COMPONENT_EXPORT(URL) +bool ReplaceComponents(const char* spec, + int spec_len, + const Parsed& parsed, + const Replacements& replacements, + CharsetConverter* charset_converter, + CanonOutput* output, + Parsed* out_parsed); + +// String helper functions ----------------------------------------------------- + +enum class DecodeURLMode { + // UTF-8 decode only. Invalid byte sequences are replaced with U+FFFD. + kUTF8, + // Try UTF-8 decoding. If the input contains byte sequences invalid + // for UTF-8, apply byte to Unicode mapping. + kUTF8OrIsomorphic, +}; + +// Unescapes the given string using URL escaping rules. +COMPONENT_EXPORT(URL) +void DecodeURLEscapeSequences(const char* input, + int length, + DecodeURLMode mode, + CanonOutputW* output); + +// Escapes the given string as defined by the JS method encodeURIComponent. See +// https://developer.mozilla.org/en/JavaScript/Reference/Global_Objects/encodeURIComponent +COMPONENT_EXPORT(URL) +void EncodeURIComponent(const char* input, int length, CanonOutput* output); + +} // namespace url + +#endif // URL_URL_UTIL_H_ diff --git a/url_util_internal.h b/url_util_internal.h new file mode 100644 index 00000000000..fe2a4d93bb3 --- /dev/null +++ b/url_util_internal.h @@ -0,0 +1,23 @@ +// Copyright 2013 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef URL_URL_UTIL_INTERNAL_H_ +#define URL_URL_UTIL_INTERNAL_H_ + +#include "url/third_party/mozilla/url_parse.h" + +namespace url { + +// Given a string and a range inside the string, compares it to the given +// lower-case |compare_to| buffer. 
+bool CompareSchemeComponent(const char* spec, + const Component& component, + const char* compare_to); +bool CompareSchemeComponent(const char16_t* spec, + const Component& component, + const char* compare_to); + +} // namespace url + +#endif // URL_URL_UTIL_INTERNAL_H_ diff --git a/url_util_unittest.cc b/url_util_unittest.cc new file mode 100644 index 00000000000..e1d7801b011 --- /dev/null +++ b/url_util_unittest.cc @@ -0,0 +1,631 @@ +// Copyright 2013 The Chromium Authors +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "url/url_util.h" + +#include + +#include "base/strings/string_piece.h" +#include "build/build_config.h" +#include "testing/gtest/include/gtest/gtest-message.h" +#include "testing/gtest/include/gtest/gtest.h" +#include "third_party/abseil-cpp/absl/types/optional.h" +#include "url/third_party/mozilla/url_parse.h" +#include "url/url_canon.h" +#include "url/url_canon_stdstring.h" +#include "url/url_test_utils.h" + +namespace url { + +class URLUtilTest : public testing::Test { + public: + URLUtilTest() = default; + + URLUtilTest(const URLUtilTest&) = delete; + URLUtilTest& operator=(const URLUtilTest&) = delete; + + ~URLUtilTest() override = default; + + private: + ScopedSchemeRegistryForTests scoped_registry_; +}; + +TEST_F(URLUtilTest, FindAndCompareScheme) { + Component found_scheme; + + // Simple case where the scheme is found and matches. + const char kStr1[] = "http://www.com/"; + EXPECT_TRUE(FindAndCompareScheme( + kStr1, static_cast(strlen(kStr1)), "http", NULL)); + EXPECT_TRUE(FindAndCompareScheme( + kStr1, static_cast(strlen(kStr1)), "http", &found_scheme)); + EXPECT_TRUE(found_scheme == Component(0, 4)); + + // A case where the scheme is found and doesn't match. + EXPECT_FALSE(FindAndCompareScheme( + kStr1, static_cast(strlen(kStr1)), "https", &found_scheme)); + EXPECT_TRUE(found_scheme == Component(0, 4)); + + // A case where there is no scheme. 
+ const char kStr2[] = "httpfoobar"; + EXPECT_FALSE(FindAndCompareScheme( + kStr2, static_cast(strlen(kStr2)), "http", &found_scheme)); + EXPECT_TRUE(found_scheme == Component()); + + // When there is an empty scheme, it should match the empty scheme. + const char kStr3[] = ":foo.com/"; + EXPECT_TRUE(FindAndCompareScheme( + kStr3, static_cast(strlen(kStr3)), "", &found_scheme)); + EXPECT_TRUE(found_scheme == Component(0, 0)); + + // But when there is no scheme, it should fail. + EXPECT_FALSE(FindAndCompareScheme("", 0, "", &found_scheme)); + EXPECT_TRUE(found_scheme == Component()); + + // When there is a whitespace char in scheme, it should canonicalize the URL + // before comparison. + const char whtspc_str[] = " \r\n\tjav\ra\nscri\tpt:alert(1)"; + EXPECT_TRUE(FindAndCompareScheme(whtspc_str, + static_cast(strlen(whtspc_str)), + "javascript", &found_scheme)); + EXPECT_TRUE(found_scheme == Component(1, 10)); + + // Control characters should be stripped out on the ends, and kept in the + // middle. 
+ const char ctrl_str[] = "\02jav\02scr\03ipt:alert(1)"; + EXPECT_FALSE(FindAndCompareScheme(ctrl_str, + static_cast(strlen(ctrl_str)), + "javascript", &found_scheme)); + EXPECT_TRUE(found_scheme == Component(1, 11)); +} + +TEST_F(URLUtilTest, IsStandard) { + const char kHTTPScheme[] = "http"; + EXPECT_TRUE(IsStandard(kHTTPScheme, Component(0, strlen(kHTTPScheme)))); + + const char kFooScheme[] = "foo"; + EXPECT_FALSE(IsStandard(kFooScheme, Component(0, strlen(kFooScheme)))); +} + +TEST_F(URLUtilTest, IsReferrerScheme) { + const char kHTTPScheme[] = "http"; + EXPECT_TRUE(IsReferrerScheme(kHTTPScheme, Component(0, strlen(kHTTPScheme)))); + + const char kFooScheme[] = "foo"; + EXPECT_FALSE(IsReferrerScheme(kFooScheme, Component(0, strlen(kFooScheme)))); +} + +TEST_F(URLUtilTest, AddReferrerScheme) { + static const char kFooScheme[] = "foo"; + EXPECT_FALSE(IsReferrerScheme(kFooScheme, Component(0, strlen(kFooScheme)))); + + url::ScopedSchemeRegistryForTests scoped_registry; + AddReferrerScheme(kFooScheme, url::SCHEME_WITH_HOST); + EXPECT_TRUE(IsReferrerScheme(kFooScheme, Component(0, strlen(kFooScheme)))); +} + +TEST_F(URLUtilTest, ShutdownCleansUpSchemes) { + static const char kFooScheme[] = "foo"; + EXPECT_FALSE(IsReferrerScheme(kFooScheme, Component(0, strlen(kFooScheme)))); + + { + url::ScopedSchemeRegistryForTests scoped_registry; + AddReferrerScheme(kFooScheme, url::SCHEME_WITH_HOST); + EXPECT_TRUE(IsReferrerScheme(kFooScheme, Component(0, strlen(kFooScheme)))); + } + + EXPECT_FALSE(IsReferrerScheme(kFooScheme, Component(0, strlen(kFooScheme)))); +} + +TEST_F(URLUtilTest, GetStandardSchemeType) { + url::SchemeType scheme_type; + + const char kHTTPScheme[] = "http"; + scheme_type = url::SCHEME_WITHOUT_AUTHORITY; + EXPECT_TRUE(GetStandardSchemeType(kHTTPScheme, + Component(0, strlen(kHTTPScheme)), + &scheme_type)); + EXPECT_EQ(url::SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION, scheme_type); + + const char kFilesystemScheme[] = "filesystem"; + scheme_type = 
url::SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION; + EXPECT_TRUE(GetStandardSchemeType(kFilesystemScheme, + Component(0, strlen(kFilesystemScheme)), + &scheme_type)); + EXPECT_EQ(url::SCHEME_WITHOUT_AUTHORITY, scheme_type); + + const char kFooScheme[] = "foo"; + scheme_type = url::SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION; + EXPECT_FALSE(GetStandardSchemeType(kFooScheme, + Component(0, strlen(kFooScheme)), + &scheme_type)); +} + +TEST_F(URLUtilTest, GetStandardSchemes) { + std::vector expected = { + kHttpsScheme, kHttpScheme, kFileScheme, kFtpScheme, + kWssScheme, kWsScheme, kFileSystemScheme, "foo", + }; + AddStandardScheme("foo", url::SCHEME_WITHOUT_AUTHORITY); + EXPECT_EQ(expected, GetStandardSchemes()); +} + +TEST_F(URLUtilTest, ReplaceComponents) { + Parsed parsed; + RawCanonOutputT output; + Parsed new_parsed; + + // Check that the following calls do not cause crash + Replacements replacements; + replacements.SetRef("test", Component(0, 4)); + ReplaceComponents(NULL, 0, parsed, replacements, NULL, &output, &new_parsed); + ReplaceComponents("", 0, parsed, replacements, NULL, &output, &new_parsed); + replacements.ClearRef(); + replacements.SetHost("test", Component(0, 4)); + ReplaceComponents(NULL, 0, parsed, replacements, NULL, &output, &new_parsed); + ReplaceComponents("", 0, parsed, replacements, NULL, &output, &new_parsed); + + replacements.ClearHost(); + ReplaceComponents(NULL, 0, parsed, replacements, NULL, &output, &new_parsed); + ReplaceComponents("", 0, parsed, replacements, NULL, &output, &new_parsed); + ReplaceComponents(NULL, 0, parsed, replacements, NULL, &output, &new_parsed); + ReplaceComponents("", 0, parsed, replacements, NULL, &output, &new_parsed); +} + +static std::string CheckReplaceScheme(const char* base_url, + const char* scheme) { + // Make sure the input is canonicalized. 
+ RawCanonOutput<32> original; + Parsed original_parsed; + Canonicalize(base_url, strlen(base_url), true, NULL, &original, + &original_parsed); + + Replacements replacements; + replacements.SetScheme(scheme, Component(0, strlen(scheme))); + + std::string output_string; + StdStringCanonOutput output(&output_string); + Parsed output_parsed; + ReplaceComponents(original.data(), original.length(), original_parsed, + replacements, NULL, &output, &output_parsed); + + output.Complete(); + return output_string; +} + +TEST_F(URLUtilTest, ReplaceScheme) { + EXPECT_EQ("https://google.com/", + CheckReplaceScheme("http://google.com/", "https")); + EXPECT_EQ("file://google.com/", + CheckReplaceScheme("http://google.com/", "file")); + EXPECT_EQ("http://home/Build", + CheckReplaceScheme("file:///Home/Build", "http")); + EXPECT_EQ("javascript:foo", + CheckReplaceScheme("about:foo", "javascript")); + EXPECT_EQ("://google.com/", + CheckReplaceScheme("http://google.com/", "")); + EXPECT_EQ("http://google.com/", + CheckReplaceScheme("about:google.com", "http")); + EXPECT_EQ("http:", CheckReplaceScheme("", "http")); + +#ifdef WIN32 + // Magic Windows drive letter behavior when converting to a file URL. + EXPECT_EQ("file:///E:/foo/", + CheckReplaceScheme("http://localhost/e:foo/", "file")); +#endif + + // This will probably change to "about://google.com/" when we fix + // http://crbug.com/160 which should also be an acceptable result. 
+ EXPECT_EQ("about://google.com/", + CheckReplaceScheme("http://google.com/", "about")); + + EXPECT_EQ("http://example.com/%20hello%20#%20world", + CheckReplaceScheme("myscheme:example.com/ hello # world ", "http")); +} + +TEST_F(URLUtilTest, DecodeURLEscapeSequences) { + struct DecodeCase { + const char* input; + const char* output; + } decode_cases[] = { + {"hello, world", "hello, world"}, + {"%01%02%03%04%05%06%07%08%09%0a%0B%0C%0D%0e%0f/", + "\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0B\x0C\x0D\x0e\x0f/"}, + {"%10%11%12%13%14%15%16%17%18%19%1a%1B%1C%1D%1e%1f/", + "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1B\x1C\x1D\x1e\x1f/"}, + {"%20%21%22%23%24%25%26%27%28%29%2a%2B%2C%2D%2e%2f/", + " !\"#$%&'()*+,-.//"}, + {"%30%31%32%33%34%35%36%37%38%39%3a%3B%3C%3D%3e%3f/", + "0123456789:;<=>?/"}, + {"%40%41%42%43%44%45%46%47%48%49%4a%4B%4C%4D%4e%4f/", + "@ABCDEFGHIJKLMNO/"}, + {"%50%51%52%53%54%55%56%57%58%59%5a%5B%5C%5D%5e%5f/", + "PQRSTUVWXYZ[\\]^_/"}, + {"%60%61%62%63%64%65%66%67%68%69%6a%6B%6C%6D%6e%6f/", + "`abcdefghijklmno/"}, + {"%70%71%72%73%74%75%76%77%78%79%7a%7B%7C%7D%7e%7f/", + "pqrstuvwxyz{|}~\x7f/"}, + {"%e4%bd%a0%e5%a5%bd", "\xe4\xbd\xa0\xe5\xa5\xbd"}, + }; + + for (size_t i = 0; i < std::size(decode_cases); i++) { + const char* input = decode_cases[i].input; + RawCanonOutputT output; + DecodeURLEscapeSequences(input, strlen(input), + DecodeURLMode::kUTF8OrIsomorphic, &output); + EXPECT_EQ(decode_cases[i].output, base::UTF16ToUTF8(std::u16string( + output.data(), output.length()))); + + RawCanonOutputT output_utf8; + DecodeURLEscapeSequences(input, strlen(input), DecodeURLMode::kUTF8, + &output_utf8); + EXPECT_EQ(decode_cases[i].output, + base::UTF16ToUTF8( + std::u16string(output_utf8.data(), output_utf8.length()))); + } + + // Our decode should decode %00 + const char zero_input[] = "%00"; + RawCanonOutputT zero_output; + DecodeURLEscapeSequences(zero_input, strlen(zero_input), DecodeURLMode::kUTF8, + &zero_output); + EXPECT_NE("%00", 
base::UTF16ToUTF8(std::u16string(zero_output.data(), + zero_output.length()))); + + // Test the error behavior for invalid UTF-8. + struct Utf8DecodeCase { + const char* input; + std::vector expected_iso; + std::vector expected_utf8; + } utf8_decode_cases[] = { + // %e5%a5%bd is a valid UTF-8 sequence. U+597D + {"%e4%a0%e5%a5%bd", + {0x00e4, 0x00a0, 0x00e5, 0x00a5, 0x00bd, 0}, + {0xfffd, 0x597d, 0}}, + {"%e5%a5%bd%e4%a0", + {0x00e5, 0x00a5, 0x00bd, 0x00e4, 0x00a0, 0}, + {0x597d, 0xfffd, 0}}, + {"%e4%a0%e5%bd", + {0x00e4, 0x00a0, 0x00e5, 0x00bd, 0}, + {0xfffd, 0xfffd, 0}}, + }; + + for (const auto& test : utf8_decode_cases) { + const char* input = test.input; + RawCanonOutputT output_iso; + DecodeURLEscapeSequences(input, strlen(input), + DecodeURLMode::kUTF8OrIsomorphic, &output_iso); + EXPECT_EQ(std::u16string(test.expected_iso.data()), + std::u16string(output_iso.data(), output_iso.length())); + + RawCanonOutputT output_utf8; + DecodeURLEscapeSequences(input, strlen(input), DecodeURLMode::kUTF8, + &output_utf8); + EXPECT_EQ(std::u16string(test.expected_utf8.data()), + std::u16string(output_utf8.data(), output_utf8.length())); + } +} + +TEST_F(URLUtilTest, TestEncodeURIComponent) { + struct EncodeCase { + const char* input; + const char* output; + } encode_cases[] = { + {"hello, world", "hello%2C%20world"}, + {"\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F", + "%01%02%03%04%05%06%07%08%09%0A%0B%0C%0D%0E%0F"}, + {"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F", + "%10%11%12%13%14%15%16%17%18%19%1A%1B%1C%1D%1E%1F"}, + {" !\"#$%&'()*+,-./", + "%20!%22%23%24%25%26%27()*%2B%2C-.%2F"}, + {"0123456789:;<=>?", + "0123456789%3A%3B%3C%3D%3E%3F"}, + {"@ABCDEFGHIJKLMNO", + "%40ABCDEFGHIJKLMNO"}, + {"PQRSTUVWXYZ[\\]^_", + "PQRSTUVWXYZ%5B%5C%5D%5E_"}, + {"`abcdefghijklmno", + "%60abcdefghijklmno"}, + {"pqrstuvwxyz{|}~\x7f", + "pqrstuvwxyz%7B%7C%7D~%7F"}, + }; + + for (size_t i = 0; i < std::size(encode_cases); i++) { + const char* input = 
encode_cases[i].input; + RawCanonOutputT buffer; + EncodeURIComponent(input, strlen(input), &buffer); + std::string output(buffer.data(), buffer.length()); + EXPECT_EQ(encode_cases[i].output, output); + } +} + +TEST_F(URLUtilTest, TestResolveRelativeWithNonStandardBase) { + // This tests non-standard (in the sense that IsStandard() == false) + // hierarchical schemes. + struct ResolveRelativeCase { + const char* base; + const char* rel; + bool is_valid; + const char* out; + } resolve_non_standard_cases[] = { + // Resolving a relative path against a non-hierarchical URL should fail. + {"scheme:opaque_data", "/path", false, ""}, + // Resolving a relative path against a non-standard authority-based base + // URL doesn't alter the authority section. + {"scheme://Authority/", "../path", true, "scheme://Authority/path"}, + // A non-standard hierarchical base is resolved with path URL + // canonicalization rules. + {"data:/Blah:Blah/", "file.html", true, "data:/Blah:Blah/file.html"}, + {"data:/Path/../part/part2", "file.html", true, + "data:/Path/../part/file.html"}, + {"data://text/html,payload", "//user:pass@host:33////payload22", true, + "data://user:pass@host:33////payload22"}, + // Path URL canonicalization rules also apply to non-standard authority- + // based URLs. + {"custom://Authority/", "file.html", true, + "custom://Authority/file.html"}, + {"custom://Authority/", "other://Auth/", true, "other://Auth/"}, + {"custom://Authority/", "../../file.html", true, + "custom://Authority/file.html"}, + {"custom://Authority/path/", "file.html", true, + "custom://Authority/path/file.html"}, + {"custom://Authority:NoCanon/path/", "file.html", true, + "custom://Authority:NoCanon/path/file.html"}, + // It's still possible to get an invalid path URL. + {"custom://Invalid:!#Auth/", "file.html", false, ""}, + // A path with an authority section gets canonicalized under standard URL + // rules, even though the base was non-standard. 
+ {"content://content.Provider/", "//other.Provider", true, + "content://other.provider/"}, + + // Resolving an absolute URL doesn't cause canonicalization of the + // result. + {"about:blank", "custom://Authority", true, "custom://Authority"}, + // Fragment URLs can be resolved against a non-standard base. + {"scheme://Authority/path", "#fragment", true, + "scheme://Authority/path#fragment"}, + {"scheme://Authority/", "#fragment", true, + "scheme://Authority/#fragment"}, + // Resolving should fail if the base URL is authority-based but is + // missing a path component (the '/' at the end). + {"scheme://Authority", "path", false, ""}, + // Test resolving a fragment (only) against any kind of base-URL. + {"about:blank", "#id42", true, "about:blank#id42"}, + {"about:blank", " #id42", true, "about:blank#id42"}, + {"about:blank#oldfrag", "#newfrag", true, "about:blank#newfrag"}, + {"about:blank", " #id:42", true, "about:blank#id:42"}, + // A surprising side effect of allowing fragments to resolve against + // any URL scheme is we might break javascript: URLs by doing so... + {"javascript:alert('foo#bar')", "#badfrag", true, + "javascript:alert('foo#badfrag"}, + // In this case, the backslashes will not be canonicalized because it's a + // non-standard URL, but they will be treated as a path separators, + // giving the base URL here a path of "\". + // + // The result here is somewhat arbitrary. One could argue it should be + // either "aaa://a\" or "aaa://a/" since the path is being replaced with + // the "current directory". But in the context of resolving on data URLs, + // adding the requested dot doesn't seem wrong either. 
+ {"aaa://a\\", "aaa:.", true, "aaa://a\\."}}; + + for (size_t i = 0; i < std::size(resolve_non_standard_cases); i++) { + const ResolveRelativeCase& test_data = resolve_non_standard_cases[i]; + Parsed base_parsed; + ParsePathURL(test_data.base, strlen(test_data.base), false, &base_parsed); + + std::string resolved; + StdStringCanonOutput output(&resolved); + Parsed resolved_parsed; + bool valid = ResolveRelative(test_data.base, strlen(test_data.base), + base_parsed, test_data.rel, + strlen(test_data.rel), NULL, &output, + &resolved_parsed); + output.Complete(); + + EXPECT_EQ(test_data.is_valid, valid) << i; + if (test_data.is_valid && valid) + EXPECT_EQ(test_data.out, resolved) << i; + } +} + +TEST_F(URLUtilTest, TestNoRefComponent) { + // The hash-mark must be ignored when mailto: scheme is parsed, + // even if the URL has a base and relative part. + const char* base = "mailto://to/"; + const char* rel = "any#body"; + + Parsed base_parsed; + ParsePathURL(base, strlen(base), false, &base_parsed); + + std::string resolved; + StdStringCanonOutput output(&resolved); + Parsed resolved_parsed; + + bool valid = ResolveRelative(base, strlen(base), + base_parsed, rel, + strlen(rel), NULL, &output, + &resolved_parsed); + EXPECT_TRUE(valid); + EXPECT_FALSE(resolved_parsed.ref.is_valid()); +} + +TEST_F(URLUtilTest, PotentiallyDanglingMarkup) { + struct ResolveRelativeCase { + const char* base; + const char* rel; + bool potentially_dangling_markup; + const char* out; + } cases[] = { + {"https://example.com/", "/path<", false, "https://example.com/path%3C"}, + {"https://example.com/", "\n/path<", true, "https://example.com/path%3C"}, + {"https://example.com/", "\r/path<", true, "https://example.com/path%3C"}, + {"https://example.com/", "\t/path<", true, "https://example.com/path%3C"}, + {"https://example.com/", "/pa\nth<", true, "https://example.com/path%3C"}, + {"https://example.com/", "/pa\rth<", true, "https://example.com/path%3C"}, + {"https://example.com/", "/pa\tth<", 
true, "https://example.com/path%3C"}, + {"https://example.com/", "/path\n<", true, "https://example.com/path%3C"}, + {"https://example.com/", "/path\r<", true, "https://example.com/path%3C"}, + {"https://example.com/", "/path\r<", true, "https://example.com/path%3C"}, + {"https://example.com/", "\n/ original; + const char* url = "htt\nps://example.com/ replacements; + replacements.ClearRef(); + Parsed replaced_parsed; + RawCanonOutput<32> replaced; + ReplaceComponents(original.data(), original.length(), original_parsed, + replacements, nullptr, &replaced, &replaced_parsed); + EXPECT_TRUE(replaced_parsed.potentially_dangling_markup); +} + +TEST_F(URLUtilTest, PotentiallyDanglingMarkupAfterSchemeOnlyReplacement) { + // Parse a URL with potentially dangling markup. + Parsed original_parsed; + RawCanonOutput<32> original; + const char* url = "http://example.com/\n/ replacements; + const char* new_scheme = "https"; + replacements.SetScheme(new_scheme, Component(0, strlen(new_scheme))); + Parsed replaced_parsed; + RawCanonOutput<32> replaced; + ReplaceComponents(original.data(), original.length(), original_parsed, + replacements, nullptr, &replaced, &replaced_parsed); + EXPECT_TRUE(replaced_parsed.potentially_dangling_markup); +} + +TEST_F(URLUtilTest, TestDomainIs) { + const struct { + const char* canonicalized_host; + const char* lower_ascii_domain; + bool expected_domain_is; + } kTestCases[] = { + {"google.com", "google.com", true}, + {"www.google.com", "google.com", true}, // Subdomain is ignored. + {"www.google.com.cn", "google.com", false}, // Different TLD. + {"www.google.comm", "google.com", false}, + {"www.iamnotgoogle.com", "google.com", false}, // Different hostname. + {"www.google.com", "Google.com", false}, // The input is not lower-cased. + + // If the host ends with a dot, it matches domains with or without a dot. 
+ {"www.google.com.", "google.com", true}, + {"www.google.com.", "google.com.", true}, + {"www.google.com.", ".com", true}, + {"www.google.com.", ".com.", true}, + + // But, if the host doesn't end with a dot and the input domain does, then + // it's considered to not match. + {"www.google.com", "google.com.", false}, + + // If the host ends with two dots, it doesn't match. + {"www.google.com..", "google.com", false}, + + // Empty parameters. + {"www.google.com", "", false}, + {"", "www.google.com", false}, + {"", "", false}, + }; + + for (const auto& test_case : kTestCases) { + SCOPED_TRACE(testing::Message() << "(host, domain): (" + << test_case.canonicalized_host << ", " + << test_case.lower_ascii_domain << ")"); + + EXPECT_EQ( + test_case.expected_domain_is, + DomainIs(test_case.canonicalized_host, test_case.lower_ascii_domain)); + } +} + +namespace { +absl::optional CanonicalizeSpec(base::StringPiece spec, + bool trim_path_end) { + std::string canonicalized; + StdStringCanonOutput output(&canonicalized); + Parsed parsed; + if (!Canonicalize(spec.data(), spec.size(), trim_path_end, + /*charset_converter=*/nullptr, &output, &parsed)) { + return {}; + } + output.Complete(); // Must be called before string is used. + return canonicalized; +} +} // namespace + +#if BUILDFLAG(IS_WIN) +// Regression test for https://crbug.com/1252658. +TEST_F(URLUtilTest, TestCanonicalizeWindowsPathWithLeadingNUL) { + auto PrefixWithNUL = [](std::string&& s) -> std::string { return '\0' + s; }; + EXPECT_EQ(CanonicalizeSpec(PrefixWithNUL("w:"), /*trim_path_end=*/false), + absl::make_optional("file:///W:")); + EXPECT_EQ(CanonicalizeSpec(PrefixWithNUL("\\\\server\\share"), + /*trim_path_end=*/false), + absl::make_optional("file://server/share")); +} +#endif + +TEST_F(URLUtilTest, TestCanonicalizeIdempotencyWithLeadingControlCharacters) { + std::string spec = "_w:"; + // Loop over all C0 control characters and the space character. 
+ for (char c = '\0'; c <= ' '; c++) { + SCOPED_TRACE(testing::Message() << "c: " << c); + + // Overwrite the first character of `spec`. Note that replacing the first + // character with NUL will not change the length! + spec[0] = c; + + for (bool trim_path_end : {false, true}) { + SCOPED_TRACE(testing::Message() << "trim_path_end: " << trim_path_end); + + absl::optional canonicalized = + CanonicalizeSpec(spec, trim_path_end); + ASSERT_TRUE(canonicalized); + EXPECT_EQ(canonicalized, CanonicalizeSpec(*canonicalized, trim_path_end)); + } + } +} + +} // namespace url