From 3f1deac7116c57507cafb286343faa4f9ddc43b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johan=20Eckerstr=C3=B6m?= Date: Thu, 20 Feb 2014 14:26:46 +0100 Subject: [PATCH 01/21] Add initial tests Using minitest with some extras: * Turn for more informative run output * Shoulda for context and matchers Turn: https://github.com/turn-project/turn Shoulda: https://github.com/thoughtbot/shoulda --- Gemfile | 3 ++ Gemfile.lock | 51 +++++++++++++++++++++++++++++++++ Rakefile | 19 ++++++++++++ test/test_helper.rb | 5 ++++ test/unit/normalization_test.rb | 29 +++++++++++++++++++ twingly-url-normalizer.gemspec | 4 +++ 6 files changed, 111 insertions(+) create mode 100644 Gemfile create mode 100644 Gemfile.lock create mode 100644 Rakefile create mode 100644 test/test_helper.rb create mode 100644 test/unit/normalization_test.rb diff --git a/Gemfile b/Gemfile new file mode 100644 index 0000000..e088013 --- /dev/null +++ b/Gemfile @@ -0,0 +1,3 @@ +source 'https://rubygems.org/' + +gemspec diff --git a/Gemfile.lock b/Gemfile.lock new file mode 100644 index 0000000..bb3002e --- /dev/null +++ b/Gemfile.lock @@ -0,0 +1,51 @@ +PATH + remote: . 
+ specs: + twingly-url-normalizer (0.0.1) + domainatrix + postrank-uri + +GEM + remote: https://rubygems.org/ + specs: + activesupport (4.0.3) + i18n (~> 0.6, >= 0.6.4) + minitest (~> 4.2) + multi_json (~> 1.3) + thread_safe (~> 0.1) + tzinfo (~> 0.3.37) + addressable (2.3.5) + ansi (1.4.3) + atomic (1.1.14) + domainatrix (0.0.11) + addressable + i18n (0.6.9) + minitest (4.7.5) + multi_json (1.8.4) + nokogiri (1.5.11) + postrank-uri (1.0.17) + addressable (~> 2.3.0) + nokogiri (~> 1.5.5) + public_suffix (~> 1.1.3) + public_suffix (1.1.3) + rake (10.1.1) + shoulda (3.5.0) + shoulda-context (~> 1.0, >= 1.0.1) + shoulda-matchers (>= 1.4.1, < 3.0) + shoulda-context (1.1.6) + shoulda-matchers (2.5.0) + activesupport (>= 3.0.0) + thread_safe (0.1.3) + atomic + turn (0.9.6) + ansi + tzinfo (0.3.38) + +PLATFORMS + ruby + +DEPENDENCIES + rake + shoulda + turn + twingly-url-normalizer! diff --git a/Rakefile b/Rakefile new file mode 100644 index 0000000..d07c820 --- /dev/null +++ b/Rakefile @@ -0,0 +1,19 @@ +require 'bundler/setup' + +task default: 'test:unit' +task test: 'test:unit' + +require 'rake/testtask' +namespace :test do + Rake::TestTask.new(:unit) do |test| + test.pattern = "test/unit/*_test.rb" + test.libs << 'lib' + test.libs << 'test' + end + + Rake::TestTask.new(:profile) do |test| + test.pattern = "test/profile/*_test.rb" + test.libs << 'lib' + test.libs << 'test' + end +end diff --git a/test/test_helper.rb b/test/test_helper.rb new file mode 100644 index 0000000..d66b967 --- /dev/null +++ b/test/test_helper.rb @@ -0,0 +1,5 @@ +require 'bundler/setup' +require 'turn/autorun' +require 'shoulda' + +require 'twingly-url-normalizer' diff --git a/test/unit/normalization_test.rb b/test/unit/normalization_test.rb new file mode 100644 index 0000000..c4da085 --- /dev/null +++ b/test/unit/normalization_test.rb @@ -0,0 +1,29 @@ +require 'test_helper' + +class NormalizerTest < Test::Unit::TestCase + context "normalize" do + setup do + @normalizer = Twingly::URL::Normalizer 
+ end + + should "detect two urls in a String" do + urls = "http://blog.twingly.com/ http://twingly.com/" + response = @normalizer.normalize(urls) + + response.size.must_equal 2 + end + + should "detect two urls in an Array" do + urls = %w(http://blog.twingly.com/ http://twingly.com/) + response = @normalizer.normalize(urls) + + response.size.must_equal 2 + end + + should "return an Array" do + response = @normalizer.normalize(nil) + + response.must_be_instance_of Array + end + end +end diff --git a/twingly-url-normalizer.gemspec b/twingly-url-normalizer.gemspec index ac309f4..55a7f77 100644 --- a/twingly-url-normalizer.gemspec +++ b/twingly-url-normalizer.gemspec @@ -15,6 +15,10 @@ Gem::Specification.new do |s| s.add_dependency "postrank-uri" s.add_dependency "domainatrix" + s.add_development_dependency "turn" + s.add_development_dependency "rake" + s.add_development_dependency "shoulda" + s.files = Dir.glob("{lib}/**/*") + %w(README.md) s.require_path = 'lib' end From f9a607234acad92c2480d501a8d1f7f8a9ebd9c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johan=20Eckerstr=C3=B6m?= Date: Thu, 20 Feb 2014 14:31:44 +0100 Subject: [PATCH 02/21] Test on Travis CI --- .travis.yml | 14 ++++++++++++++ README.md | 2 ++ 2 files changed, 16 insertions(+) create mode 100644 .travis.yml diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..5457554 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,14 @@ +language: ruby + +rvm: + - 2.0.0 + +cache: bundler + +notifications: + email: false + hipchat: + rooms: 0715dd54b78b69f7dc310969a35036@208408 + on_success: never + on_failure: change + template: '%{repository}#%{build_number} (%{branch} - %{commit} : %{author}): %{message}' diff --git a/README.md b/README.md index 91585c5..b2b0017 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,7 @@ # twingly-url-normalizer +[![Build 
Status](https://magnum.travis-ci.com/twingly/twingly-url-normalizer.png?token=ADz8fWxRD3uP4KZPPZQS&branch=master)](https://magnum.travis-ci.com/twingly/twingly-url-normalizer) + Ruby gem for URL normalization ## Example From cdbab00e705a6c689349b905bac7e9afee18fd52 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johan=20Eckerstr=C3=B6m?= Date: Thu, 20 Feb 2014 14:54:30 +0100 Subject: [PATCH 03/21] Refactor .normalize --- lib/twingly-url-normalizer.rb | 24 ++++++++++------ test/unit/normalization_test.rb | 49 ++++++++++++++++++++++++++++++--- 2 files changed, 61 insertions(+), 12 deletions(-) diff --git a/lib/twingly-url-normalizer.rb b/lib/twingly-url-normalizer.rb index 584d40b..bd0055b 100644 --- a/lib/twingly-url-normalizer.rb +++ b/lib/twingly-url-normalizer.rb @@ -8,16 +8,24 @@ module Twingly module URL class Normalizer - def self.normalize(potential_url) - PostRank::URI.extract(potential_url).map do |url| - subdomain = Domainatrix.parse(url).subdomain - uri = URI.parse(url) - if subdomain.empty? - uri.host = "www.#{uri.host}" - end - uri.to_s + def self.normalize(potential_urls) + extract_urls(potential_urls).map do |url| + normalize_url(url) end end + + def self.extract_urls(potential_urls) + PostRank::URI.extract(potential_urls) + end + + def self.normalize_url(url) + subdomain = Domainatrix.parse(url).subdomain + uri = URI.parse(url) + if subdomain.empty? 
+ uri.host = "www.#{uri.host}" + end + uri.to_s + end end end end diff --git a/test/unit/normalization_test.rb b/test/unit/normalization_test.rb index c4da085..bb0faf0 100644 --- a/test/unit/normalization_test.rb +++ b/test/unit/normalization_test.rb @@ -1,29 +1,70 @@ require 'test_helper' class NormalizerTest < Test::Unit::TestCase - context "normalize" do + context ".normalize" do + setup do + @normalizer = Twingly::URL::Normalizer + end + + should "accept a String" do + assert @normalizer.normalize("") + end + + should "accept an Array" do + assert @normalizer.normalize([]) + end + end + + context ".extract_urls" do setup do @normalizer = Twingly::URL::Normalizer end should "detect two urls in a String" do urls = "http://blog.twingly.com/ http://twingly.com/" - response = @normalizer.normalize(urls) + response = @normalizer.extract_urls(urls) response.size.must_equal 2 end should "detect two urls in an Array" do urls = %w(http://blog.twingly.com/ http://twingly.com/) - response = @normalizer.normalize(urls) + response = @normalizer.extract_urls(urls) response.size.must_equal 2 end should "return an Array" do - response = @normalizer.normalize(nil) + response = @normalizer.extract_urls(nil) response.must_be_instance_of Array end end + + context ".normalize_url" do + setup do + @normalizer = Twingly::URL::Normalizer + end + + should "add www if host is missing a subdomain" do + url = "http://twingly.com/" + result = @normalizer.normalize_url(url) + + assert_equal "http://www.twingly.com/", result + end + + should "not add www if the host has a subdomain" do + url = "http://blog.twingly.com/" + result = @normalizer.normalize_url(url) + + assert_equal "http://blog.twingly.com/", result + end + + should "keep www if the host already has it" do + url = "http://www.twingly.com/" + result = @normalizer.normalize_url(url) + + assert_equal "http://www.twingly.com/", result + end + end end From f6bd680856407b7c36e208bc7dc2ee9226603a08 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Johan=20Eckerstr=C3=B6m?= Date: Thu, 20 Feb 2014 14:55:53 +0100 Subject: [PATCH 04/21] Remove TODO, not sure it should handle that --- lib/twingly-url-normalizer.rb | 3 --- 1 file changed, 3 deletions(-) diff --git a/lib/twingly-url-normalizer.rb b/lib/twingly-url-normalizer.rb index bd0055b..6f455fb 100644 --- a/lib/twingly-url-normalizer.rb +++ b/lib/twingly-url-normalizer.rb @@ -2,9 +2,6 @@ require 'domainatrix' require 'uri' -# TODO -# * Handle blogspot.se -> blogspot.com - module Twingly module URL class Normalizer From 39c6dcab88adf8aa0c924881620fe0b2ee651945 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johan=20Eckerstr=C3=B6m?= Date: Thu, 20 Feb 2014 14:56:40 +0100 Subject: [PATCH 05/21] Add test note in README --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index b2b0017..5a4391e 100644 --- a/README.md +++ b/README.md @@ -12,3 +12,9 @@ Ruby gem for URL normalization [6] pry(main)> Twingly::URL::Normalizer.normalize('duh.se') => ["http://www.duh.se/"] ``` + +## Tests + +Run tests with + + bundle exec rake From 13411ef7d6b7ad50116f5f380b6fb16a3bfecc84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johan=20Eckerstr=C3=B6m?= Date: Thu, 20 Feb 2014 15:06:16 +0100 Subject: [PATCH 06/21] Add failing tests Broken URLs found during work with Zambezi --- test/unit/normalization_test.rb | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/test/unit/normalization_test.rb b/test/unit/normalization_test.rb index bb0faf0..9b7a330 100644 --- a/test/unit/normalization_test.rb +++ b/test/unit/normalization_test.rb @@ -13,6 +13,16 @@ class NormalizerTest < Test::Unit::TestCase should "accept an Array" do assert @normalizer.normalize([]) end + + should "handle URL with ] in it" do + url = "http://www.iwaseki.co.jp/cgi/yybbs/yybbs.cgi/%DEuropean]buy" + assert @normalizer.normalize(url) + end + + should "handle URL with reference to another URL in it" do + url = 
"http://news.google.com/news/url?sa=t&fd=R&usg=AFQjCNGc4A_sfGS6fMMqggiK_8h6yk2miw&url=http:%20%20%20//fansided.com/2013/08/02/nike-decides-to-drop-milwaukee-brewers-ryan-braun" + assert @normalizer.normalize(url) + end end context ".extract_urls" do From d230e8bc6838a1d63a1108e36f7388a52ee1cef8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johan=20Eckerstr=C3=B6m?= Date: Thu, 20 Feb 2014 15:15:05 +0100 Subject: [PATCH 07/21] Parse URI's with Addressable --- Gemfile.lock | 1 + lib/twingly-url-normalizer.rb | 4 ++-- twingly-url-normalizer.gemspec | 1 + 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/Gemfile.lock b/Gemfile.lock index bb3002e..4db3613 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -2,6 +2,7 @@ PATH remote: . specs: twingly-url-normalizer (0.0.1) + addressable domainatrix postrank-uri diff --git a/lib/twingly-url-normalizer.rb b/lib/twingly-url-normalizer.rb index 6f455fb..90933bd 100644 --- a/lib/twingly-url-normalizer.rb +++ b/lib/twingly-url-normalizer.rb @@ -1,6 +1,6 @@ require 'postrank-uri' require 'domainatrix' -require 'uri' +require 'addressable/uri' module Twingly module URL @@ -17,7 +17,7 @@ def self.extract_urls(potential_urls) def self.normalize_url(url) subdomain = Domainatrix.parse(url).subdomain - uri = URI.parse(url) + uri = Addressable::URI.parse(url) if subdomain.empty? 
uri.host = "www.#{uri.host}" end diff --git a/twingly-url-normalizer.gemspec b/twingly-url-normalizer.gemspec index 55a7f77..f1cc592 100644 --- a/twingly-url-normalizer.gemspec +++ b/twingly-url-normalizer.gemspec @@ -14,6 +14,7 @@ Gem::Specification.new do |s| s.add_dependency "postrank-uri" s.add_dependency "domainatrix" + s.add_dependency "addressable" s.add_development_dependency "turn" s.add_development_dependency "rake" From d2bbab6eb4b7e2ef43c461e12bb13af590f7dee7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johan=20Eckerstr=C3=B6m?= Date: Thu, 20 Feb 2014 15:16:46 +0100 Subject: [PATCH 08/21] Remove shoulda, just use shoulda-context MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit shoulda includes shoulda-context and shoulda-matchers, we’re not using the matchers at this moment, so no need to pull it in (since it introduces lots of development dependencies). --- Gemfile.lock | 20 +------------------- test/test_helper.rb | 2 +- twingly-url-normalizer.gemspec | 2 +- 3 files changed, 3 insertions(+), 21 deletions(-) diff --git a/Gemfile.lock b/Gemfile.lock index 4db3613..4c63007 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -9,20 +9,10 @@ PATH GEM remote: https://rubygems.org/ specs: - activesupport (4.0.3) - i18n (~> 0.6, >= 0.6.4) - minitest (~> 4.2) - multi_json (~> 1.3) - thread_safe (~> 0.1) - tzinfo (~> 0.3.37) addressable (2.3.5) ansi (1.4.3) - atomic (1.1.14) domainatrix (0.0.11) addressable - i18n (0.6.9) - minitest (4.7.5) - multi_json (1.8.4) nokogiri (1.5.11) postrank-uri (1.0.17) addressable (~> 2.3.0) @@ -30,23 +20,15 @@ GEM public_suffix (~> 1.1.3) public_suffix (1.1.3) rake (10.1.1) - shoulda (3.5.0) - shoulda-context (~> 1.0, >= 1.0.1) - shoulda-matchers (>= 1.4.1, < 3.0) shoulda-context (1.1.6) - shoulda-matchers (2.5.0) - activesupport (>= 3.0.0) - thread_safe (0.1.3) - atomic turn (0.9.6) ansi - tzinfo (0.3.38) PLATFORMS ruby DEPENDENCIES rake - shoulda + shoulda-context turn twingly-url-normalizer! 
diff --git a/test/test_helper.rb b/test/test_helper.rb index d66b967..a04efec 100644 --- a/test/test_helper.rb +++ b/test/test_helper.rb @@ -1,5 +1,5 @@ require 'bundler/setup' require 'turn/autorun' -require 'shoulda' +require 'shoulda-context' require 'twingly-url-normalizer' diff --git a/twingly-url-normalizer.gemspec b/twingly-url-normalizer.gemspec index f1cc592..de5f6a4 100644 --- a/twingly-url-normalizer.gemspec +++ b/twingly-url-normalizer.gemspec @@ -18,7 +18,7 @@ Gem::Specification.new do |s| s.add_development_dependency "turn" s.add_development_dependency "rake" - s.add_development_dependency "shoulda" + s.add_development_dependency "shoulda-context" s.files = Dir.glob("{lib}/**/*") + %w(README.md) s.require_path = 'lib' From a01ac749c6fb9201f64c6e183544d68aeeeced7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johan=20Eckerstr=C3=B6m?= Date: Thu, 20 Feb 2014 15:20:51 +0100 Subject: [PATCH 09/21] Add failing umlaut tests From #2 --- test/unit/normalization_test.rb | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/test/unit/normalization_test.rb b/test/unit/normalization_test.rb index 9b7a330..e46f7e2 100644 --- a/test/unit/normalization_test.rb +++ b/test/unit/normalization_test.rb @@ -23,6 +23,16 @@ class NormalizerTest < Test::Unit::TestCase url = "http://news.google.com/news/url?sa=t&fd=R&usg=AFQjCNGc4A_sfGS6fMMqggiK_8h6yk2miw&url=http:%20%20%20//fansided.com/2013/08/02/nike-decides-to-drop-milwaukee-brewers-ryan-braun" assert @normalizer.normalize(url) end + + should "handle URL with umlauts in host" do + url = "http://www.åäö.se/" + assert_equal [url], @normalizer.normalize(url) + end + + should "handle URL with umlauts in path" do + url = "http://www.aoo.se/öö" + assert_equal [url], @normalizer.normalize(url) + end end context ".extract_urls" do From 19d28c628f45e40a58685f0932e28229d4566a09 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johan=20Eckerstr=C3=B6m?= Date: Thu, 20 Feb 2014 15:52:13 +0100 Subject: [PATCH 10/21] Remove Postrank::URI 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PostRank::URI couldn’t handle umlauts. We will lose the feature to detect urls without protocol “twingly.com”, but we don’t see the need for this feature. On the plus side, lots of runtime dependencies are removed (nokogiri!). --- Gemfile.lock | 7 ------- lib/twingly-url-normalizer.rb | 3 +-- test/unit/normalization_test.rb | 5 +++++ twingly-url-normalizer.gemspec | 1 - 4 files changed, 6 insertions(+), 10 deletions(-) diff --git a/Gemfile.lock b/Gemfile.lock index 4c63007..eca3f76 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -4,7 +4,6 @@ PATH twingly-url-normalizer (0.0.1) addressable domainatrix - postrank-uri GEM remote: https://rubygems.org/ @@ -13,12 +12,6 @@ GEM ansi (1.4.3) domainatrix (0.0.11) addressable - nokogiri (1.5.11) - postrank-uri (1.0.17) - addressable (~> 2.3.0) - nokogiri (~> 1.5.5) - public_suffix (~> 1.1.3) - public_suffix (1.1.3) rake (10.1.1) shoulda-context (1.1.6) turn (0.9.6) diff --git a/lib/twingly-url-normalizer.rb b/lib/twingly-url-normalizer.rb index 90933bd..d9e8b0c 100644 --- a/lib/twingly-url-normalizer.rb +++ b/lib/twingly-url-normalizer.rb @@ -1,4 +1,3 @@ -require 'postrank-uri' require 'domainatrix' require 'addressable/uri' @@ -12,7 +11,7 @@ def self.normalize(potential_urls) end def self.extract_urls(potential_urls) - PostRank::URI.extract(potential_urls) + Array(potential_urls).map(&:split).flatten end def self.normalize_url(url) diff --git a/test/unit/normalization_test.rb b/test/unit/normalization_test.rb index e46f7e2..b7935b7 100644 --- a/test/unit/normalization_test.rb +++ b/test/unit/normalization_test.rb @@ -86,5 +86,10 @@ class NormalizerTest < Test::Unit::TestCase assert_equal "http://www.twingly.com/", result end + + should "not be able to normalize url without protocol" do + url = "twingly.com/" + assert_raises(Addressable::URI::InvalidURIError) { @normalizer.normalize_url(url) } + end end end diff --git 
a/twingly-url-normalizer.gemspec b/twingly-url-normalizer.gemspec index de5f6a4..02f9347 100644 --- a/twingly-url-normalizer.gemspec +++ b/twingly-url-normalizer.gemspec @@ -12,7 +12,6 @@ Gem::Specification.new do |s| s.summary = "Ruby library for URL normalization" s.required_ruby_version = ">= 1.9.3" - s.add_dependency "postrank-uri" s.add_dependency "domainatrix" s.add_dependency "addressable" From 0225448055e6012144479895efdbda2162596531 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johan=20Eckerstr=C3=B6m?= Date: Thu, 20 Feb 2014 16:34:07 +0100 Subject: [PATCH 11/21] Add profiling Inspiration from elasticsearch-transport tests: https://github.com/elasticsearch/elasticsearch-ruby/blob/6f83143b8e6409a 2eaf451a4dabf2c64f25ade31/elasticsearch-transport/test/profile/client_be nchmark_test.rb --- Gemfile.lock | 2 ++ README.md | 10 ++++++++++ Rakefile | 1 + test/lib/test_profile.rb | 15 +++++++++++++++ test/profile/normalize_performance_test.rb | 10 ++++++++++ twingly-url-normalizer.gemspec | 1 + 6 files changed, 39 insertions(+) create mode 100644 test/lib/test_profile.rb create mode 100644 test/profile/normalize_performance_test.rb diff --git a/Gemfile.lock b/Gemfile.lock index eca3f76..fd5d5c3 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -13,6 +13,7 @@ GEM domainatrix (0.0.11) addressable rake (10.1.1) + ruby-prof (0.14.2) shoulda-context (1.1.6) turn (0.9.6) ansi @@ -22,6 +23,7 @@ PLATFORMS DEPENDENCIES rake + ruby-prof shoulda-context turn twingly-url-normalizer! diff --git a/README.md b/README.md index 5a4391e..063772e 100644 --- a/README.md +++ b/README.md @@ -18,3 +18,13 @@ Ruby gem for URL normalization Run tests with bundle exec rake + +### Profiling + +You can get some profiling by running + + bundle exec rake test:profile + +Note that this isn't a benchmark, we're using [ruby-prof] which will slow things down. 
+ +[ruby-prof]: http://ruby-prof.rubyforge.org/ diff --git a/Rakefile b/Rakefile index d07c820..a9b8975 100644 --- a/Rakefile +++ b/Rakefile @@ -15,5 +15,6 @@ namespace :test do test.pattern = "test/profile/*_test.rb" test.libs << 'lib' test.libs << 'test' + test.libs << 'test/lib' end end diff --git a/test/lib/test_profile.rb b/test/lib/test_profile.rb new file mode 100644 index 0000000..c705a3e --- /dev/null +++ b/test/lib/test_profile.rb @@ -0,0 +1,15 @@ +require 'ruby-prof' + +def measure(name, count, &block) + should "#{name} (#{count}x)" do + RubyProf.start + + count.times do + block.call + end + + result = RubyProf.stop + printer = RubyProf::FlatPrinter.new(result) + printer.print(STDOUT) + end +end diff --git a/test/profile/normalize_performance_test.rb b/test/profile/normalize_performance_test.rb new file mode 100644 index 0000000..dd29783 --- /dev/null +++ b/test/profile/normalize_performance_test.rb @@ -0,0 +1,10 @@ +require 'test_helper' +require 'test_profile' + +class NormalizerPerformanceTest < Test::Unit::TestCase + context ".normalize_url" do + measure "normalizing a short URL", 1000 do + Twingly::URL::Normalizer.normalize('http://www.duh.se/') + end + end +end diff --git a/twingly-url-normalizer.gemspec b/twingly-url-normalizer.gemspec index 02f9347..0d91557 100644 --- a/twingly-url-normalizer.gemspec +++ b/twingly-url-normalizer.gemspec @@ -18,6 +18,7 @@ Gem::Specification.new do |s| s.add_development_dependency "turn" s.add_development_dependency "rake" s.add_development_dependency "shoulda-context" + s.add_development_dependency "ruby-prof" s.files = Dir.glob("{lib}/**/*") + %w(README.md) s.require_path = 'lib' From 6d210a9f0b9e78cefc4bc622b17c07ef44376024 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johan=20Eckerstr=C3=B6m?= Date: Thu, 20 Feb 2014 18:00:30 +0100 Subject: [PATCH 12/21] Update README with a working example --- README.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 
063772e..ef37815 100644 --- a/README.md +++ b/README.md @@ -7,10 +7,11 @@ Ruby gem for URL normalization ## Example ``` -[5] pry(main)> Twingly::URL::Normalizer.normalize('aoeu') -=> [] -[6] pry(main)> Twingly::URL::Normalizer.normalize('duh.se') -=> ["http://www.duh.se/"] +irb(main):001:0> Twingly::URL::Normalizer.normalize('http://duh.se') +=> ["http://www.duh.se"] +irb(main):002:0> Twingly::URL::Normalizer.normalize('http://duh.se http://blog.twingly.com/') +=> ["http://www.duh.se", "http://blog.twingly.com/"] +irb(main):003:0> ``` ## Tests From dfe29e1b4e864851be99eb75073d15a186ccef2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johan=20Eckerstr=C3=B6m?= Date: Thu, 20 Feb 2014 18:02:16 +0100 Subject: [PATCH 13/21] Add test that fails when given text without any URLs in it --- test/unit/normalization_test.rb | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/test/unit/normalization_test.rb b/test/unit/normalization_test.rb index b7935b7..bd0dc92 100644 --- a/test/unit/normalization_test.rb +++ b/test/unit/normalization_test.rb @@ -33,6 +33,11 @@ class NormalizerTest < Test::Unit::TestCase url = "http://www.aoo.se/öö" assert_equal [url], @normalizer.normalize(url) end + + should "should not blow up when there's no URL in the text" do + url = "Just some text" + assert @normalizer.normalize(url) + end end context ".extract_urls" do From 0e8a3cd263f51a4b3535ef886042e0ad75390efe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johan=20Eckerstr=C3=B6m?= Date: Thu, 20 Feb 2014 19:22:24 +0100 Subject: [PATCH 14/21] Remove Gemfile.lock Should not exist in gems --- .gitignore | 2 ++ Gemfile.lock | 29 ----------------------------- 2 files changed, 2 insertions(+), 29 deletions(-) delete mode 100644 Gemfile.lock diff --git a/.gitignore b/.gitignore index 560d1a6..5dcc5c7 100644 --- a/.gitignore +++ b/.gitignore @@ -16,3 +16,5 @@ tmp .yardoc _yardoc doc/ + +Gemfile.lock diff --git a/Gemfile.lock b/Gemfile.lock deleted file mode 100644 index fd5d5c3..0000000 --- a/Gemfile.lock +++ 
/dev/null @@ -1,29 +0,0 @@ -PATH - remote: . - specs: - twingly-url-normalizer (0.0.1) - addressable - domainatrix - -GEM - remote: https://rubygems.org/ - specs: - addressable (2.3.5) - ansi (1.4.3) - domainatrix (0.0.11) - addressable - rake (10.1.1) - ruby-prof (0.14.2) - shoulda-context (1.1.6) - turn (0.9.6) - ansi - -PLATFORMS - ruby - -DEPENDENCIES - rake - ruby-prof - shoulda-context - turn - twingly-url-normalizer! From 826da7dd0dd59eeda4d29df9d1ccad5f8120545e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johan=20Eckerstr=C3=B6m?= Date: Thu, 20 Feb 2014 19:25:10 +0100 Subject: [PATCH 15/21] Add failing test for old URL behavior In 19d28c628f45e40a58685f0932e28229d4566a09 when I removed Postrank::URI, I removed the feature that detected URLs without protocol. This commit enables tests for it again. --- test/unit/normalization_test.rb | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/test/unit/normalization_test.rb b/test/unit/normalization_test.rb index bd0dc92..f5063f1 100644 --- a/test/unit/normalization_test.rb +++ b/test/unit/normalization_test.rb @@ -38,6 +38,11 @@ class NormalizerTest < Test::Unit::TestCase url = "Just some text" assert @normalizer.normalize(url) end + + should "should not create URLs for normal words" do + url = "This is, just, some words. Yay!" + assert_equal [], @normalizer.normalize(url) + end end context ".extract_urls" do @@ -92,9 +97,18 @@ class NormalizerTest < Test::Unit::TestCase assert_equal "http://www.twingly.com/", result end - should "not be able to normalize url without protocol" do - url = "twingly.com/" - assert_raises(Addressable::URI::InvalidURIError) { @normalizer.normalize_url(url) } + should "be able to normalize url without protocol" do + url = "www.twingly.com/" + result = @normalizer.normalize_url(url) + + assert_equal "http://www.twingly.com/", result + end + + should "not return broken URLs" do + url = "http://www.twingly." 
+ result = @normalizer.normalize_url(url) + + assert_equal nil, result end end end From e276dd4ee833cadc5c591e906b43108a2f533cad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johan=20Eckerstr=C3=B6m?= Date: Thu, 20 Feb 2014 19:26:36 +0100 Subject: [PATCH 16/21] Detect URLs without protocol Enabled the behavior removed in 19d28c628f45e40a58685f0932e28229d4566a09 This uses PublicSuffix and Addressable instead of Postrank::URI though. Why? Postrank::URI was very slow, this is also slow, but not quite as slow. --- lib/twingly-url-normalizer.rb | 18 ++++++++++-------- twingly-url-normalizer.gemspec | 2 +- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/lib/twingly-url-normalizer.rb b/lib/twingly-url-normalizer.rb index d9e8b0c..bf6fac0 100644 --- a/lib/twingly-url-normalizer.rb +++ b/lib/twingly-url-normalizer.rb @@ -1,5 +1,5 @@ -require 'domainatrix' require 'addressable/uri' +require 'public_suffix' module Twingly module URL @@ -7,20 +7,22 @@ class Normalizer def self.normalize(potential_urls) extract_urls(potential_urls).map do |url| normalize_url(url) - end + end.compact end - def self.extract_urls(potential_urls) + def self.extract_urls (potential_urls) Array(potential_urls).map(&:split).flatten end - def self.normalize_url(url) - subdomain = Domainatrix.parse(url).subdomain - uri = Addressable::URI.parse(url) - if subdomain.empty? - uri.host = "www.#{uri.host}" + def self.normalize_url(potential_url) + uri = Addressable::URI.heuristic_parse(potential_url) + domain = PublicSuffix.parse(uri.host) + + unless domain.subdomain? 
+ uri.host = "www.#{domain}" end uri.to_s + rescue PublicSuffix::DomainInvalid end end end diff --git a/twingly-url-normalizer.gemspec b/twingly-url-normalizer.gemspec index 0d91557..5089214 100644 --- a/twingly-url-normalizer.gemspec +++ b/twingly-url-normalizer.gemspec @@ -12,8 +12,8 @@ Gem::Specification.new do |s| s.summary = "Ruby library for URL normalization" s.required_ruby_version = ">= 1.9.3" - s.add_dependency "domainatrix" s.add_dependency "addressable" + s.add_dependency "public_suffix", "~> 1.4.0" s.add_development_dependency "turn" s.add_development_dependency "rake" From 36ded276d3c39e182171f5ac51c2c72df9fc7877 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johan=20Eckerstr=C3=B6m?= Date: Thu, 20 Feb 2014 19:35:46 +0100 Subject: [PATCH 17/21] Refactor --- lib/twingly-url-normalizer.rb | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/twingly-url-normalizer.rb b/lib/twingly-url-normalizer.rb index bf6fac0..102956a 100644 --- a/lib/twingly-url-normalizer.rb +++ b/lib/twingly-url-normalizer.rb @@ -5,12 +5,12 @@ module Twingly module URL class Normalizer def self.normalize(potential_urls) - extract_urls(potential_urls).map do |url| - normalize_url(url) + extract_urls(potential_urls).map do |potential_url| + normalize_url(potential_url) end.compact end - def self.extract_urls (potential_urls) + def self.extract_urls(potential_urls) Array(potential_urls).map(&:split).flatten end From 0e0137e0a0d78631f2de532718e8e8bf7890a6fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johan=20Eckerstr=C3=B6m?= Date: Thu, 20 Feb 2014 19:44:35 +0100 Subject: [PATCH 18/21] Fix performance test, should be testing normalize_url --- test/profile/normalize_performance_test.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/profile/normalize_performance_test.rb b/test/profile/normalize_performance_test.rb index dd29783..dc474a3 100644 --- a/test/profile/normalize_performance_test.rb +++ b/test/profile/normalize_performance_test.rb @@ -4,7 
+4,7 @@ class NormalizerPerformanceTest < Test::Unit::TestCase context ".normalize_url" do measure "normalizing a short URL", 1000 do - Twingly::URL::Normalizer.normalize('http://www.duh.se/') + Twingly::URL::Normalizer.normalize_url('http://www.duh.se/') end end end From 077cc17b78d8bdd9625e49bafaf01712ff792335 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johan=20Eckerstr=C3=B6m?= Date: Thu, 20 Feb 2014 20:02:15 +0100 Subject: [PATCH 19/21] Use RubyProf::MultiPrinter, create profile files in tmp --- test/lib/test_profile.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/lib/test_profile.rb b/test/lib/test_profile.rb index c705a3e..fe8ee0d 100644 --- a/test/lib/test_profile.rb +++ b/test/lib/test_profile.rb @@ -9,7 +9,7 @@ def measure(name, count, &block) end result = RubyProf.stop - printer = RubyProf::FlatPrinter.new(result) - printer.print(STDOUT) + printer = RubyProf::MultiPrinter.new(result) + printer.print(path: 'tmp') end end From 3682905786f50e740a093c665cb6a34eb20c0fd2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johan=20Eckerstr=C3=B6m?= Date: Fri, 21 Feb 2014 13:55:28 +0100 Subject: [PATCH 20/21] Make sure we always have a path Insert / if no path exist. --- lib/twingly-url-normalizer.rb | 5 +++++ test/unit/normalization_test.rb | 7 +++++++ 2 files changed, 12 insertions(+) diff --git a/lib/twingly-url-normalizer.rb b/lib/twingly-url-normalizer.rb index 102956a..ed44501 100644 --- a/lib/twingly-url-normalizer.rb +++ b/lib/twingly-url-normalizer.rb @@ -21,6 +21,11 @@ def self.normalize_url(potential_url) unless domain.subdomain? uri.host = "www.#{domain}" end + + if uri.path.empty? 
+ uri.path = "/" + end + uri.to_s rescue PublicSuffix::DomainInvalid end diff --git a/test/unit/normalization_test.rb b/test/unit/normalization_test.rb index f5063f1..fb997a9 100644 --- a/test/unit/normalization_test.rb +++ b/test/unit/normalization_test.rb @@ -97,6 +97,13 @@ class NormalizerTest < Test::Unit::TestCase assert_equal "http://www.twingly.com/", result end + should "add an ending slash if missing" do + url = "http://www.twingly.com" + result = @normalizer.normalize_url(url) + + assert_equal "http://www.twingly.com/", result + end + should "be able to normalize url without protocol" do url = "www.twingly.com/" result = @normalizer.normalize_url(url) From 72d052157e9202220db8e80d997a4188c8838f14 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johan=20Eckerstr=C3=B6m?= Date: Fri, 21 Feb 2014 17:41:44 +0100 Subject: [PATCH 21/21] Bump to 1.0.0 since behavior has been changed --- lib/version.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/version.rb b/lib/version.rb index bbb1ba2..fb17869 100644 --- a/lib/version.rb +++ b/lib/version.rb @@ -1,7 +1,7 @@ module Twingly module URL class Normalizer - VERSION = '0.0.1' + VERSION = '1.0.0' end end end