twingly · roback · Sep 9, 2015 · Sep 8, 2015 · Sep 8, 2015 · Sep 8, 2015
diff --git a/.travis.yml b/.travis.yml
@@ -1,4 +1,5 @@
 language: ruby
+sudo: false
 
 rvm:
   - 2.2

diff --git a/lib/twingly/url.rb b/lib/twingly/url.rb
@@ -21,16 +21,23 @@ def parse(potential_url)
     end
 
     def extract_url_and_domain(potential_url)
-      url    = Addressable::URI.heuristic_parse(potential_url)
-      domain = PublicSuffix.parse(url.host) if url
+      addressable_uri = Addressable::URI.heuristic_parse(potential_url)
 
-      [url, domain]
+      return invalid_url unless addressable_uri
+
+      domain = PublicSuffix.parse(addressable_uri.display_uri.host)
+
+      [addressable_uri, domain]
     rescue PublicSuffix::DomainInvalid, Addressable::URI::InvalidURIError
-      []
+      invalid_url
     end
 
     def validate(potential_url)
       parse(potential_url).valid?
     end
+
+    def invalid_url
+      [nil, nil]
+    end
   end
 end
diff --git a/lib/twingly/url/normalizer.rb b/lib/twingly/url/normalizer.rb
@@ -5,6 +5,8 @@ module URL
     module Normalizer
       module_function
 
+      ENDS_WITH_SLASH = /\/+$/
+
       def normalize(potential_urls)
         extract_urls(potential_urls).map do |potential_url|
           normalize_url(potential_url)
@@ -16,19 +18,51 @@ def extract_urls(potential_urls)
       end
 
       def normalize_url(potential_url)
-        result = Twingly::URL.parse(potential_url)
+        url_object = Twingly::URL.parse(potential_url)
 
-        return nil unless result.valid?
+        return nil unless url_object.valid?
 
-        unless result.domain.subdomain?
-          result.url.host = "www.#{result.domain}"
-        end
+        url_object.url.scheme = extract_normalized_scheme(url_object)
+        url_object.url.host   = extract_normalized_host(url_object)
+        url_object.url.path   = extract_normalized_path(url_object)
+
+        url_object.url.to_s
+      end
 
-        if result.url.path.empty?
-          result.url.path = "/"
+      def extract_normalized_scheme(url_object)
+        url_object.url.scheme.downcase
+      end
+
+      def extract_normalized_host(url_object)
+        host   = url_object.url.normalized_host
+        domain = url_object.domain
+
+        unless domain.subdomain?
+          host = "www.#{host}"
         end
 
-        result.url.to_s.downcase
+        host = normalize_blogspot(host, domain)
+        host = host.downcase
+
+        host
+      end
+
+      def extract_normalized_path(url_object)
+        path = strip_trailing_slashes(url_object.url.path)
+
+        (path.empty?) ? "/" : path
+      end
+
+      def strip_trailing_slashes(path)
+        path.sub(ENDS_WITH_SLASH, "")
+      end
+
+      def normalize_blogspot(host, domain)
+        if domain.sld.downcase == "blogspot"
+          host.sub(/\Awww\./i, "").sub(/#{domain.tld}\z/i, "com")
+        else
+          host
+        end
       end
     end
   end

diff --git a/spec/lib/twingly/url/normalization_spec.rb b/spec/lib/twingly/url/normalization_spec.rb
@@ -12,55 +12,43 @@
       expect { normalizer.normalize([]) }.not_to raise_error
     end
 
-    it "handles URL with ] in it" do
-      url = "http://www.iwaseki.co.jp/cgi/yybbs/yybbs.cgi/%DEuropean]buy"
-      expect { normalizer.normalize(url) }.not_to raise_error
+    it "does not create URLs for normal words" do
+      url = "This is, just, some words. Yay!"
+      expect(normalizer.normalize(url)).to eq([])
     end
 
-    it "handles URL with reference to another URL in it" do
-      url = "http://news.google.com/news/url?sa=t&fd=R&usg=AFQjCNGc4A_sfGS6fMMqggiK_8h6yk2miw&url=http:%20%20%20//fansided.com/2013/08/02/nike-decides-to-drop-milwaukee-brewers-ryan-braun"
-      expect { normalizer.normalize(url) }.not_to raise_error
-    end
+    it "invokes .normalize_url for each url in an Array" do
+      urls = %w(http://blog.twingly.com/ http://twingly.com/)
 
-    it "handles URL with umlauts in host" do
-      url = "http://www.åäö.se/"
-      expect(normalizer.normalize(url)).to eq([url])
-    end
+      expect(normalizer).to receive(:normalize_url).with(urls.first)
+      expect(normalizer).to receive(:normalize_url).with(urls.last)
 
-    it "handles URL with umlauts in path" do
-      url = "http://www.aoo.se/öö"
-      expect(normalizer.normalize(url)).to eq([url])
+      normalizer.normalize(urls)
     end
 
-    it "does not blow up when there's only protocol in the text" do
-      url = "http://"
-      expect { normalizer.normalize(url) }.not_to raise_error
-    end
+    it "invokes .normalize_url for each url in a String" do
+      urls = %w(http://blog.twingly.com/ http://twingly.com/)
 
-    it "does not blow up when there's no URL in the text" do
-      url = "Just some text"
-      expect { normalizer.normalize(url) }.not_to raise_error
-    end
+      expect(normalizer).to receive(:normalize_url).with(urls.first)
+      expect(normalizer).to receive(:normalize_url).with(urls.last)
 
-    it "does not create URLs for normal words" do
-      url = "This is, just, some words. Yay!"
-      expect(normalizer.normalize(url)).to eq([])
+      normalizer.normalize(urls.join(" "))
     end
   end
 
   describe ".extract_urls" do
+    let(:urls) { %w(http://blog.twingly.com/ http://twingly.com/) }
+
     it "detects two urls in a String" do
-      urls = "http://blog.twingly.com/ http://twingly.com/"
-      response = normalizer.extract_urls(urls)
+      response = normalizer.extract_urls(urls.join(" "))
 
-      expect(response.size).to eq(2)
+      expect(response.size).to eq(urls.size)
     end
 
     it "detects two urls in an Array" do
-      urls = %w(http://blog.twingly.com/ http://twingly.com/)
       response = normalizer.extract_urls(urls)
 
-      expect(response.size).to eq(2)
+      expect(response.size).to eq(urls.size)
     end
 
     it "always returns an Array" do
@@ -83,19 +71,46 @@
       expect(normalizer.normalize_url(url)).to eq(url)
     end
 
+    it "does not remove www if the host has a subdomain" do
+      url = "http://www.blog.twingly.com/"
+
+      expect(normalizer.normalize_url(url)).to eq(url)
+    end
+
     it "keeps www if the host already has it" do
       url = "http://www.twingly.com/"
 
       expect(normalizer.normalize_url(url)).to eq(url)
     end
 
-    it "adds a trailing slash if missing" do
+    it "adds a trailing slash if missing in origin" do
       url = "http://www.twingly.com"
       expected = "http://www.twingly.com/"
 
       expect(normalizer.normalize_url(url)).to eq(expected)
     end
 
+    it "ensures single trailing slash in origin" do
+      url = "http://www.twingly.com//"
+      expected = "http://www.twingly.com/"
+
+      expect(normalizer.normalize_url(url)).to eq(expected)
+    end
+
+    it "removes trailing slash from path" do
+      url = "http://www.twingly.com/blog-data/"
+      expected = "http://www.twingly.com/blog-data"
+
+      expect(normalizer.normalize_url(url)).to eq(expected)
+    end
+
+    it "is able to normalize a url with double slash in path" do
+      url = "www.twingly.com/path//"
+      expected = "http://www.twingly.com/path"
+
+      expect(normalizer.normalize_url(url)).to eq(expected)
+    end
+
     it "is able to normalize a url without protocol" do
       url = "www.twingly.com/"
       expected = "http://www.twingly.com/"
@@ -106,20 +121,111 @@
     it "does not return broken URLs" do
       url = "http://www.twingly."
 
-      expect(normalizer.normalize_url(url)).to eq(nil)
+      expect(normalizer.normalize_url(url)).to be_nil
+    end
+
+    it "oddly enough, does not alter URLs with consecutive dots" do
+      url = "http://www..twingly..com/"
+
+      expect(normalizer.normalize_url(url)).to eq(url)
     end
 
-    it "does not add www. to blogspot blogs" do
+    it "does not add www. to blogspot URLs" do
       url = "http://jlchen1026.blogspot.com/"
 
       expect(normalizer.normalize_url(url)).to eq(url)
     end
 
-    it "downcases the URL" do
-      url = "http://www.Twingly.com/"
-      expected = url.downcase
+    it "removes www. from blogspot URLs" do
+      url = "http://www.jlchen1026.blogspot.com/"
+      expected = "http://jlchen1026.blogspot.com/"
+
+      expect(normalizer.normalize_url(url)).to eq(expected)
+    end
+
+    it "rewrites blogspot TLDs to .com" do
+      url = "http://WWW.jlchen1026.blogspot.CO.UK/"
+      expected = "http://jlchen1026.blogspot.com/"
+
+      expect(normalizer.normalize_url(url)).to eq(expected)
+    end
+
+    it "downcases the protocol" do
+      url = "HTTPS://www.twingly.com/"
+      expected = "https://www.twingly.com/"
+
+      expect(normalizer.normalize_url(url)).to eq(expected)
+    end
+
+    it "downcases the domain" do
+      url = "http://WWW.TWINGLY.COM/"
+      expected = "http://www.twingly.com/"
+
+      expect(normalizer.normalize_url(url)).to eq(expected)
+    end
+
+    it "does not downcase the path" do
+      url = "http://www.twingly.com/PaTH"
+
+      expect(normalizer.normalize_url(url)).to eq(url)
+    end
+
+    it "does not downcase fragment" do
+      url = "http://www.twingly.com/#FRAGment"
+
+      expect(normalizer.normalize_url(url)).to eq(url)
+    end
+
+    it "handles URL with ] in it" do
+      url = "http://www.iwaseki.co.jp/cgi/yybbs/yybbs.cgi/%DEuropean]buy"
+      expect { normalizer.normalize_url(url) }.not_to raise_error
+    end
+
+    it "handles URL with reference to another URL in it" do
+      url = "http://news.google.com/news/url?sa=t&fd=R&usg=AFQjCNGc4A_sfGS6fMMqggiK_8h6yk2miw&url=http:%20%20%20//fansided.com/2013/08/02/nike-decides-to-drop-milwaukee-brewers-ryan-braun"
+      expect { normalizer.normalize_url(url) }.not_to raise_error
+    end
+
+    it "handles URL with umlauts in host" do
+      url = "http://www.åäö.se/"
+      expected = "http://www.xn--4cab6c.se/"
+
+      expect(normalizer.normalize_url(url)).to eq(expected)
+    end
+
+    it "handles URL with umlauts in path" do
+      url = "http://www.aoo.se/öö"
+      expect(normalizer.normalize_url(url)).to eq(url)
+    end
+
+    it "handles URL with punycoded SLD" do
+      url = "http://www.xn--4cab6c.se/"
+
+      expect(normalizer.normalize_url(url)).to eq(url)
+    end
+
+    it "handles URL with punycoded TLD" do
+      url = "http://example.xn--p1ai/"
+      expected = "http://www.example.xn--p1ai/"
 
       expect(normalizer.normalize_url(url)).to eq(expected)
     end
+
+    it "converts to a punycoded URL" do
+      url = "скраповыймир.рф"
+      expected = "http://www.xn--80aesdcplhhhb0k.xn--p1ai/"
+
+      expect(normalizer.normalize_url(url)).to eq(expected)
+    end
+
+    it "does not blow up when there's only protocol in the text" do
+      url = "http://"
+      expect { normalizer.normalize_url(url) }.not_to raise_error
+    end
+
+    it "does not blow up when there's no URL in the text" do
+      url = "Just some text"
+      expect(normalizer.normalize_url(url)).to be_nil
+    end
   end
 end
diff --git a/spec/lib/twingly/url/url_spec.rb b/spec/lib/twingly/url/url_spec.rb
@@ -20,6 +20,12 @@
           expect(described_class.parse(valid_url).valid?).to be true
         end
       end
+
+      it "handles nil input" do
+        actual = described_class.parse(nil)
+        expect(actual.url).to be_nil
+        expect(actual.domain).to be_nil
+      end
     end
   end