From 4e69dfc486ad4e9633d2fac7b6bbb62734ad4855 Mon Sep 17 00:00:00 2001
From: Tomasz Babel
Date: Mon, 20 Feb 2012 18:57:08 +0100
Subject: [PATCH] Improved check of link uniqueness.

---
 spec.txt | 50 +++++++++++++++++++++++++++++++---
 yawc.rb  | 82 +++++++++++++++++++++++++++++++++++++++++++++++---------
 2 files changed, 116 insertions(+), 16 deletions(-)

diff --git a/spec.txt b/spec.txt
index 38a7440..1751df1 100644
--- a/spec.txt
+++ b/spec.txt
@@ -1,10 +1,52 @@
 v0.1
+====
 1. Load the page.
+
 2. Take the page address from the command line (an argument passed at invocation).
+
 3. Print every link found on the page in the format: link text => link address.
    If a link has neither text nor alternative text, print the string NO_TEXT_LINK.
+
 4. Print a summary at the end of the list:
-   how many links in total
-   how many NO_TEXT_LINK links
-   how many links lead to other pages,
-   how many links stay within the analysed page.
+   how many links were found in total
+   how many links lead to other pages
+   how many links stay within the analysed page
+
+1 - completed
+2 - completed
+3 - modified & completed
+4 - to be done
+
+
+v0.2
+====
+1. The crawler should only visit links it has not visited yet.
+   At the moment there are still cases where the message "The page has been visited. Skipping" shows up.
+   This should no longer happen, because the link verification code is already in place. There is a bug somewhere.
+
+2. Add an option - not read from the command line yet - that sets the page scanning depth.
+   0 - no limits
+   1 - scanning within the given domain only
+   2 - the crawler scans the given domain plus the whole first external domain it encounters
+   3 - the crawler scans the given domain plus the whole first two external domains it encounters
+   4 - as above
+   5 - as above
+   Decide on a default value. 1?
+
+3. Add a clean way to quit - yawc should recognise that the user has asked it to stop,
+   print an appropriate message and exit without reporting errors.
+   (Ctrl+c currently produces an error message)
+
+
+More ideas to be implemented:
+1. Start using English only
+2. Add an option to scan strictly within a given host
+3. Accept multiple addresses to be scanned as CLI input
+4. Write unique links to an output file given by the user
+5. Read links to be scanned from a file given by the user
+6. Start using a documentation tool.
+7. On user request, respect the robots.txt file when visiting a site.
+
+
+
+
diff --git a/yawc.rb b/yawc.rb
index 9e8e198..fbf9561 100755
--- a/yawc.rb
+++ b/yawc.rb
@@ -2,17 +2,75 @@
 require 'mechanize'
 require 'colorize'
 
+# Sets the depth of scanning:
+# 0 - no limits
+# 1 - the given domain only
+# 2 - the given domain plus the whole first external domain encountered while scanning
+# 3 - etc.
 depth = 1
 
+# In-place href de-duplication as an Array method; mirrors leave_only_unique_links below.
+class Array
+  def unique_links()
+    # Clone the array so we can iterate over a stable copy while deleting from the original.
+    links_clone = self.clone
+
+    # Compare the href of every link in the clone with all elements of the original array.
+    # Links in the original that share the same href are collected into a separate, newly created array.
+    links_clone.each do |link_clone|
+      links_to_delete = Array.new
+      self.each {|link| links_to_delete << link if link.href == link_clone.href}
+      unless links_to_delete.empty?
+        # Pop one of the collected links (otherwise every link with this href would be deleted).
+        # If pop returns something other than nil, delete the remaining collected links from the original array.
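+        # For example, if links a, b and c all share one href, links_to_delete is [a, b, c];
+        # pop takes c out of the deletion pass, so only a and b are removed and one copy survives.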
+        links_to_delete.each {|link_to_delete| self.delete(link_to_delete)} if links_to_delete.pop != nil
+      end
+    end
+    self
+  end
+end
+
+# Removes duplicated hrefs from the given links array in place, keeping one link per href.
+def leave_only_unique_links(links)
+  # Clone the array so we can iterate over a stable copy while deleting from the original.
+  links_clone = links.clone
+
+  # Compare the href of every link in the clone with all elements of the original array.
+  # Links in the original that share the same href are collected into a separate, newly created array.
+  links_clone.each do |link_clone|
+    links_to_delete = Array.new
+    links.each {|link| links_to_delete << link if link.href == link_clone.href}
+    unless links_to_delete.empty?
+      # Pop one of the collected links (otherwise every link with this href would be deleted).
+      # If pop returns something other than nil, delete the remaining collected links from the original array.
+      links_to_delete.each {|link_to_delete| links.delete(link_to_delete)} if links_to_delete.pop != nil
+    end
+  end
+  links
+end
+
+# Returns only those links from newlinks whose href does not already appear in links.
+def check_uniqueness_between_arrays(links, newlinks)
+  new_links = newlinks.clone
+  new_links_to_delete = Array.new
+  # Collect every new link whose href is already present in the known links array...
+  new_links.each {|new_link|
+    links.each {|link| new_links_to_delete << new_link if new_link.href == link.href}
+  }
+  # ...and drop it from the cloned array.
+  new_links_to_delete.each {|link_to_delete| new_links.delete(link_to_delete)} unless new_links_to_delete.empty?
+
+  new_links
+end
+
 if ARGV.empty? == false
   agent = Mechanize.new
   agent.user_agent_alias = 'Linux Mozilla'
   page = agent.get(URI.parse(ARGV[0]))
   links = page.links
-  uri_links = links.map {|link| link.href}
-  puts "Found #{links.length} links on #{ARGV[0]}.".green
-  puts "I will try visiting every link now...\n".green
+  puts "Found #{links.count} links on #{ARGV[0]}".green
+  print 'Removing multiple links... '.green
+
+  links = leave_only_unique_links(links)
+
+  puts "#{links.count} links left.".green
+  puts "Start visiting the links...\n".green
 
   counter = 1
   links_visited = 0
@@ -20,21 +78,21 @@
   links.each do |link|
     begin
-      if agent.visited?(link.uri) == nil && link.uri.host == URI.parse(ARGV[0]).host
+      if agent.visited?(link.uri) == nil
         print "#{counter}/#{links.count})".blue + " Visiting site: #{link.href}"
         begin
           new_page = link.click
           print '. '
          new_links = new_page.links
-          print "This site includes #{new_links.count.to_s} links."
+          print "This site includes #{new_links.count} links."
+
+          new_unique_links = leave_only_unique_links(new_links)
+          verified_new_links = check_uniqueness_between_arrays(links, new_unique_links)
+          verified_new_links.compact!
+          color = verified_new_links.count == new_links.count ? :yellow : :green
+          puts " Adding #{verified_new_links.count} new links to the links array.".colorize(color)
-          new_unique_links = []
-          new_links.each do |new_link|
-            new_unique_links.push(new_link) unless uri_links.include?(new_link.href)
-          end
-          color = new_unique_links.length == new_links.length ? :yellow : :green
-          puts " Adding #{new_unique_links.length} new links to the links array.".colorize(color)
-          new_unique_links.each { |new_unique_link| links << new_unique_link; uri_links << new_unique_link.href} if new_unique_links.length > 0
+          verified_new_links.each {|new_link| links << new_link} if verified_new_links.count > 0
         rescue Mechanize::ResponseCodeError => e
           print "#{link.uri} - The page does not respond. Skipping...\n".red
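
Side note on the de-duplication approach (not part of the patch itself): leave_only_unique_links keeps
one link per href by collecting the duplicates, popping one of them and deleting the rest from the array
in place. On Ruby 1.9.2 or newer the same idea can be sketched with Array#uniq and a block, assuming the
elements respond to #href the way Mechanize::Page::Link objects do. Unlike the patch version, this sketch
returns a new array and keeps the first link seen for each href, so it illustrates the intent rather than
being a drop-in replacement:

    # Sketch: keep the first link for each href; the input array is left untouched.
    def leave_only_unique_links(links)
      links.uniq {|link| link.href}
    end

    # Sketch: keep only the new links whose href is not already in the known list.
    def check_uniqueness_between_arrays(links, newlinks)
      known_hrefs = links.map {|link| link.href}
      newlinks.reject {|new_link| known_hrefs.include?(new_link.href)}
    end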