Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Strip URLs of leading and trailing non-breaking space (and space, but we already did) #126

Merged
merged 10 commits into from
Feb 5, 2019
32 changes: 28 additions & 4 deletions lib/twingly/url.rb
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,24 @@ class URL
Addressable::URI::InvalidURIError,
PublicSuffix::DomainInvalid,
].freeze
# Characters removed from both ends of an input URL.
NBSP = "\u00A0"
SPACE = "\u0020"
WHITESPACE_CHARS = "#{NBSP}#{SPACE}".freeze

# Matches a run of WHITESPACE_CHARS anchored at the very start or the very end
# of the string (\A / \z, not per-line anchors).
LEADING_AND_TRAILING_WHITESPACE = /\A[#{WHITESPACE_CHARS}]+|[#{WHITESPACE_CHARS}]+\z/.freeze

# Implementation details: these constants are not reachable from outside the
# enclosing class.
private_constant(
  :ACCEPTED_SCHEMES,
  :CUSTOM_PSL,
  :STARTS_WITH_WWW,
  :ENDS_WITH_SLASH,
  :ERRORS_TO_EXTEND,
  :NBSP,
  :SPACE,
  :WHITESPACE_CHARS,
  :LEADING_AND_TRAILING_WHITESPACE
)

class << self
def parse(potential_url)
Expand All @@ -39,7 +51,8 @@ def parse(potential_url)
raise
end

def internal_parse(potential_url)
def internal_parse(input)
potential_url = clean_input(input)
addressable_uri = to_addressable_uri(potential_url)
raise Twingly::URL::Error::ParseError if addressable_uri.nil?

Expand All @@ -62,13 +75,22 @@ def internal_parse(potential_url)
raise
end

# Normalizes raw input before it is parsed as a URL.
#
# The value is coerced with Kernel#String, invalid byte sequences are replaced
# via String#scrub, and the result is passed through strip_whitespace.
#
# @param input [Object] anything Kernel#String can convert
# @return [String] the cleaned string
def clean_input(input)
  strip_whitespace(String(input).scrub)
end

# Removes leading and trailing whitespace, as defined by
# LEADING_AND_TRAILING_WHITESPACE, from a UTF-8 string.
#
# Strings in any other encoding are returned untouched (the very same object).
# NOTE(review): presumably this avoids encoding-compatibility errors when the
# pattern is applied to non-UTF-8 data; confirm.
#
# @param input [String]
# @return [String]
def strip_whitespace(input)
  if input.encoding == Encoding::UTF_8
    input.gsub(LEADING_AND_TRAILING_WHITESPACE, "")
  else
    input
  end
end

# Converts the input into an Addressable::URI.
#
# An Addressable::URI instance is returned as-is. Anything else is coerced
# with Kernel#String, scrubbed of invalid byte sequences, and handed to
# Addressable::URI.heuristic_parse.
#
# @param potential_url [Addressable::URI, Object]
# @return whatever Addressable::URI.heuristic_parse returns, or the given URI
def to_addressable_uri(potential_url)
  return potential_url if potential_url.is_a?(Addressable::URI)

  Addressable::URI.heuristic_parse(String(potential_url).scrub)
end
Expand All @@ -87,6 +109,8 @@ def try_addressable_normalize(addressable_uri)

# `new` and the parsing helpers are private to the enclosing scope.
private(
  :new,
  :internal_parse,
  :clean_input,
  :strip_whitespace,
  :to_addressable_uri,
  :try_addressable_normalize
)
end
Expand Down
23 changes: 21 additions & 2 deletions spec/lib/twingly/url_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,16 @@ def valid_urls
]
end

# Whitespace sequences used to pad URLs in the specs, keyed by a
# human-readable description of the sequence.
#
# @return [Hash{String => String}]
def leading_and_trailing_whitespace
  nbsp = "\u00A0"
  space = "\u0020"

  {
    "non-breaking space and space" => nbsp + space,
    "non-breaking space" => nbsp,
    "non-breaking space, space, non-breaking space" => nbsp + space + nbsp,
    "space and non-breaking space" => space + nbsp,
    "space, non-breaking space and space" => space + nbsp + space,
  }
end

describe Twingly::URL do
let(:unicode_idn_test_url) do
"http://räksmörgås.макдональдс.рф/foo"
Expand Down Expand Up @@ -145,14 +155,14 @@ def valid_urls
end
end

context "with url containing starting and trailing new lines" do
context "with url containing leading and trailing new lines" do
let(:test_url) { "\nhttp://www.twingly.com/blog-data/\r\n" }
let(:expected) { "http://www.twingly.com/blog-data/" }

it { is_expected.to eq(expected) }
end

context "with url containing starting and trailing whitespaces" do
context "with url containing leading and trailing whitespaces" do
let(:test_url) { " http://www.twingly.com/blog-data/ " }
let(:expected) { "http://www.twingly.com/blog-data/" }

Expand All @@ -165,6 +175,15 @@ def valid_urls

it { is_expected.to eq(expected) }
end

# One example group per whitespace combination: the URL is padded with the
# sequence on both ends, and the subject (defined outside this view) is
# expected to equal the unpadded URL.
leading_and_trailing_whitespace.each_pair do |description, padding|
  context "with url containing leading and trailing: #{description}" do
    let(:expected) { "https://www.example.com/" }
    let(:test_url) { "#{padding}https://www.example.com/#{padding}" }

    it { is_expected.to eq(expected) }
  end
end
end

describe ".internal_parse" do
Expand Down