Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Sync known behaviour with .NET #37

Merged
merged 11 commits into from
Sep 9, 2015
1 change: 1 addition & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
language: ruby
sudo: false

rvm:
- 2.2
Expand Down
15 changes: 11 additions & 4 deletions lib/twingly/url.rb
Original file line number Diff line number Diff line change
Expand Up @@ -21,16 +21,23 @@ def parse(potential_url)
end

# Splits +potential_url+ into its parsed URI and its public-suffix domain.
#
# The host handed to PublicSuffix is taken from the URI's +display_uri+ and
# not from the raw host (NOTE(review): presumably so the lookup sees the
# Unicode form of an IDN host - confirm against the Addressable docs).
#
# Returns [addressable_uri, domain] on success, or +invalid_url+ when the
# input does not parse into a URI or one of the rescued errors is raised.
def extract_url_and_domain(potential_url)
  addressable_uri = Addressable::URI.heuristic_parse(potential_url)

  # Nothing was parsed (e.g. nil input), so there is no host to look up.
  return invalid_url unless addressable_uri

  domain = PublicSuffix.parse(addressable_uri.display_uri.host)

  [addressable_uri, domain]
rescue PublicSuffix::DomainInvalid, Addressable::URI::InvalidURIError
  invalid_url
end

# Tells whether +potential_url+ parses into an object that reports itself
# as valid.
def validate(potential_url)
  parsed = parse(potential_url)
  parsed.valid?
end

# The "nothing could be extracted" result: a fresh two-slot Array whose
# URL and domain entries are both nil.
def invalid_url
  Array.new(2)
end
end
end
50 changes: 42 additions & 8 deletions lib/twingly/url/normalizer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ module URL
module Normalizer
module_function

# Matches the run of one or more slashes that terminates the string.
# Anchored with \z rather than $ so that, for multi-line input, a slash at
# the end of an earlier line is not mistaken for the trailing slash.
ENDS_WITH_SLASH = %r{/+\z}

def normalize(potential_urls)
extract_urls(potential_urls).map do |potential_url|
normalize_url(potential_url)
Expand All @@ -16,19 +18,51 @@ def extract_urls(potential_urls)
end

# Normalizes a single URL string.
#
# Parses +potential_url+ once with Twingly::URL.parse and returns nil when
# the parsed object is not valid. Otherwise the scheme, host and path of the
# parsed URL are replaced in place with the values produced by the
# extract_normalized_* helpers, and the URL is returned as a String.
def normalize_url(potential_url)
  url_object = Twingly::URL.parse(potential_url)

  return nil unless url_object.valid?

  url_object.url.scheme = extract_normalized_scheme(url_object)
  url_object.url.host   = extract_normalized_host(url_object)
  url_object.url.path   = extract_normalized_path(url_object)

  url_object.url.to_s
end

if result.url.path.empty?
result.url.path = "/"
# Returns the scheme of the URL held by +url_object+ in lower case.
def extract_normalized_scheme(url_object)
  scheme = url_object.url.scheme
  scheme.downcase
end

# Builds the normalized host for +url_object+.
#
# Starts from the URL's +normalized_host+, prefixes "www." when the domain
# reports no subdomain, passes the result through normalize_blogspot and
# returns it in lower case.
def extract_normalized_host(url_object)
  host = url_object.url.normalized_host
  domain = url_object.domain

  host = "www.#{host}" unless domain.subdomain?

  normalize_blogspot(host, domain).downcase
end

# Returns the URL path of +url_object+ after strip_trailing_slashes, falling
# back to "/" when nothing is left.
def extract_normalized_path(url_object)
  stripped = strip_trailing_slashes(url_object.url.path)
  return "/" if stripped.empty?

  stripped
end

# Returns a copy of +path+ with the slash run matched by ENDS_WITH_SLASH
# removed; the argument itself is left unmodified.
def strip_trailing_slashes(path)
  trimmed = path.sub(ENDS_WITH_SLASH, "")
  trimmed
end

# Canonicalizes blogspot hosts; any other host is returned unchanged.
#
# When the domain's second-level label is "blogspot" (compared
# case-insensitively), a leading "www." is removed and the trailing public
# suffix is replaced by "com", e.g. "www.foo.blogspot.co.uk" becomes
# "foo.blogspot.com".
def normalize_blogspot(host, domain)
  return host unless domain.sld.downcase == "blogspot"

  # Escape the suffix: the dot in "co.uk" must match a literal dot only.
  tld_suffix = /#{Regexp.escape(domain.tld)}\z/i

  host.sub(/\Awww\./i, "").sub(tld_suffix, "com")
end
end
end
Expand Down
178 changes: 142 additions & 36 deletions spec/lib/twingly/url/normalization_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -12,55 +12,43 @@
expect { normalizer.normalize([]) }.not_to raise_error
end

it "handles URL with ] in it" do
url = "http://www.iwaseki.co.jp/cgi/yybbs/yybbs.cgi/%DEuropean]buy"
expect { normalizer.normalize(url) }.not_to raise_error
it "does not create URLs for normal words" do
url = "This is, just, some words. Yay!"
expect(normalizer.normalize(url)).to eq([])
end

it "handles URL with reference to another URL in it" do
url = "http://news.google.com/news/url?sa=t&fd=R&usg=AFQjCNGc4A_sfGS6fMMqggiK_8h6yk2miw&url=http:%20%20%20//fansided.com/2013/08/02/nike-decides-to-drop-milwaukee-brewers-ryan-braun"
expect { normalizer.normalize(url) }.not_to raise_error
end
it "invokes .normalize_url for each url in an Array" do
urls = %w(http://blog.twingly.com/ http://twingly.com/)

it "handles URL with umlauts in host" do
url = "http://www.åäö.se/"
expect(normalizer.normalize(url)).to eq([url])
end
expect(normalizer).to receive(:normalize_url).with(urls.first)
expect(normalizer).to receive(:normalize_url).with(urls.last)

it "handles URL with umlauts in path" do
url = "http://www.aoo.se/öö"
expect(normalizer.normalize(url)).to eq([url])
normalizer.normalize(urls)
end

it "does not blow up when there's only protocol in the text" do
url = "http://"
expect { normalizer.normalize(url) }.not_to raise_error
end
it "invokes .normalize_url for each url in a String" do
urls = %w(http://blog.twingly.com/ http://twingly.com/)

it "does not blow up when there's no URL in the text" do
url = "Just some text"
expect { normalizer.normalize(url) }.not_to raise_error
end
expect(normalizer).to receive(:normalize_url).with(urls.first)
expect(normalizer).to receive(:normalize_url).with(urls.last)

it "does not create URLs for normal words" do
url = "This is, just, some words. Yay!"
expect(normalizer.normalize(url)).to eq([])
normalizer.normalize(urls.join(" "))
end
end

describe ".extract_urls" do
let(:urls) { %w(http://blog.twingly.com/ http://twingly.com/) }

it "detects two urls in a String" do
urls = "http://blog.twingly.com/ http://twingly.com/"
response = normalizer.extract_urls(urls)
response = normalizer.extract_urls(urls.join(" "))

expect(response.size).to eq(2)
expect(response.size).to eq(urls.size)
end

it "detects two urls in an Array" do
urls = %w(http://blog.twingly.com/ http://twingly.com/)
response = normalizer.extract_urls(urls)

expect(response.size).to eq(2)
expect(response.size).to eq(urls.size)
end

it "always returns an Array" do
Expand All @@ -83,19 +71,46 @@
expect(normalizer.normalize_url(url)).to eq(url)
end

it "does not remove www if the host has a subdomain" do
url = "http://www.blog.twingly.com/"

expect(normalizer.normalize_url(url)).to eq(url)
end

it "keeps www if the host already has it" do
url = "http://www.twingly.com/"

expect(normalizer.normalize_url(url)).to eq(url)
end

it "adds a trailing slash if missing" do
it "adds a trailing slash if missing in origin" do
url = "http://www.twingly.com"
expected = "http://www.twingly.com/"

expect(normalizer.normalize_url(url)).to eq(expected)
end

it "ensures single trailing slash in origin" do
url = "http://www.twingly.com//"
expected = "http://www.twingly.com/"

expect(normalizer.normalize_url(url)).to eq(expected)
end

it "removes trailing slash from path" do
url = "http://www.twingly.com/blog-data/"
expected = "http://www.twingly.com/blog-data"

expect(normalizer.normalize_url(url)).to eq(expected)
end

it "is able to normalize a url with double slash in path" do
url = "www.twingly.com/path//"
expected = "http://www.twingly.com/path"

expect(normalizer.normalize_url(url)).to eq(expected)
end

it "is able to normalize a url without protocol" do
url = "www.twingly.com/"
expected = "http://www.twingly.com/"
Expand All @@ -106,20 +121,111 @@
it "does not return broken URLs" do
url = "http://www.twingly."

expect(normalizer.normalize_url(url)).to eq(nil)
expect(normalizer.normalize_url(url)).to be_nil
end

it "oddly enough, does not alter URLs with consecutive dots" do
url = "http://www..twingly..com/"

expect(normalizer.normalize_url(url)).to eq(url)
end

it "does not add www. to blogspot blogs" do
it "does not add www. to blogspot URLs" do
url = "http://jlchen1026.blogspot.com/"

expect(normalizer.normalize_url(url)).to eq(url)
end

it "downcases the URL" do
url = "http://www.Twingly.com/"
expected = url.downcase
it "removes www. from blogspot URLs" do
url = "http://www.jlchen1026.blogspot.com/"
expected = "http://jlchen1026.blogspot.com/"

expect(normalizer.normalize_url(url)).to eq(expected)
end

it "rewrites blogspot TLDs to .com" do
url = "http://WWW.jlchen1026.blogspot.CO.UK/"
expected = "http://jlchen1026.blogspot.com/"

expect(normalizer.normalize_url(url)).to eq(expected)
end

it "downcases the protocol" do
url = "HTTPS://www.twingly.com/"
expected = "https://www.twingly.com/"

expect(normalizer.normalize_url(url)).to eq(expected)
end

it "downcases the domain" do
url = "http://WWW.TWINGLY.COM/"
expected = "http://www.twingly.com/"

expect(normalizer.normalize_url(url)).to eq(expected)
end

it "does not downcase the path" do
url = "http://www.twingly.com/PaTH"

expect(normalizer.normalize_url(url)).to eq(url)
end

it "does not downcase fragment" do
url = "http://www.twingly.com/#FRAGment"

expect(normalizer.normalize_url(url)).to eq(url)
end

it "handles URL with ] in it" do
url = "http://www.iwaseki.co.jp/cgi/yybbs/yybbs.cgi/%DEuropean]buy"
expect { normalizer.normalize_url(url) }.not_to raise_error
end

it "handles URL with reference to another URL in it" do
url = "http://news.google.com/news/url?sa=t&fd=R&usg=AFQjCNGc4A_sfGS6fMMqggiK_8h6yk2miw&url=http:%20%20%20//fansided.com/2013/08/02/nike-decides-to-drop-milwaukee-brewers-ryan-braun"
expect { normalizer.normalize_url(url) }.not_to raise_error
end

it "handles URL with umlauts in host" do
url = "http://www.åäö.se/"
expected = "http://www.xn--4cab6c.se/"

expect(normalizer.normalize_url(url)).to eq(expected)
end

it "handles URL with umlauts in path" do
url = "http://www.aoo.se/öö"
expect(normalizer.normalize_url(url)).to eq(url)
end

it "handles URL with punycoded SLD" do
url = "http://www.xn--4cab6c.se/"

expect(normalizer.normalize_url(url)).to eq(url)
end

it "handles URL with punycoded TLD" do
url = "http://example.xn--p1ai/"
expected = "http://www.example.xn--p1ai/"

expect(normalizer.normalize_url(url)).to eq(expected)
end

it "converts to a punycoded URL" do
url = "скраповыймир.рф"
expected = "http://www.xn--80aesdcplhhhb0k.xn--p1ai/"

expect(normalizer.normalize_url(url)).to eq(expected)
end

it "does not blow up when there's only protocol in the text" do
url = "http://"
expect { normalizer.normalize_url(url) }.not_to raise_error
end

it "does not blow up when there's no URL in the text" do
url = "Just some text"
expect(normalizer.normalize_url(url)).to be_nil
end
end
end
6 changes: 6 additions & 0 deletions spec/lib/twingly/url/url_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,12 @@
expect(described_class.parse(valid_url).valid?).to be true
end
end

it "handles nil input" do
actual = described_class.parse(nil)
expect(actual.url).to be_nil
expect(actual.domain).to be_nil
end
end
end

Expand Down