From 4e69dfc486ad4e9633d2fac7b6bbb62734ad4855 Mon Sep 17 00:00:00 2001
From: Tomasz Babel
Date: Mon, 20 Feb 2012 18:57:08 +0100
Subject: [PATCH] Improved check of link uniqueness.

---
 spec.txt | 50 +++++++++++++++++++++++++++++++---
 yawc.rb  | 82 +++++++++++++++++++++++++++++++++++++++++++++++---------
 2 files changed, 116 insertions(+), 16 deletions(-)

diff --git a/spec.txt b/spec.txt
index 38a7440..1751df1 100644
--- a/spec.txt
+++ b/spec.txt
@@ -1,10 +1,52 @@
 v0.1
+====
 1. Load the page.
+
 2. Take the page address from the command line (an argument passed at invocation).
+
 3. Print every link found on the page in the format: link text => link address.
    If a link has neither text nor alternative text, print the string NO_TEXT_LINK.
+
 4. Print a summary at the end of the list:
-   how many links in total
-   how many NO_TEXT_LINK links
-   how many links lead to other pages,
-   how many links stay within the analysed page.
+   how many links were found in total
+   how many links lead to other pages
+   how many links stay within the analysed page
+
+1 - completed
+2 - completed
+3 - modified & completed
+4 - to be done
+
+
+v0.2
+====
+1. The crawler should only visit links it has not visited yet.
+   At the moment there are still cases where the message "The page has been visited. Skipping" shows up.
+   This should no longer happen, because the link verification code is already in place. There is a bug somewhere.
+
+2. Add an option - not read from the command line yet - that sets the page scanning depth.
+   0 - no limits
+   1 - scanning within the given domain only
+   2 - the crawler scans the given domain plus the whole first external domain it encounters
+   3 - the crawler scans the given domain plus the whole first two external domains it encounters
+   4 - as above
+   5 - as above
+   Decide on a default value. 1?
+
+3. Add a clean way to quit - yawc should recognise that the user has asked it to stop,
+   print an appropriate message and exit without reporting errors.
+   (Ctrl+c currently produces an error message)
+
+
+More ideas to be implemented:
+1. Start using English only
+2. Add an option to scan strictly within a given host
+3. Accept multiple addresses to be scanned as CLI input
+4. Write unique links to an output file given by the user
+5. Read links to be scanned from a file given by the user
+6. Start using a documentation tool.
+7. On user request, respect the robots.txt file when visiting a site.
+
+
+
+
diff --git a/yawc.rb b/yawc.rb
index 9e8e198..fbf9561 100755
--- a/yawc.rb
+++ b/yawc.rb
@@ -2,17 +2,75 @@
 require 'mechanize'
 require 'colorize'
 
+# Sets the depth of scanning:
+# 0 - no limits
+# 1 - the given domain only
+# 2 - the given domain plus the whole first external domain encountered while scanning
+# 3 - etc.
 depth = 1
 
+# In-place href de-duplication as an Array method; mirrors leave_only_unique_links below.
+class Array
+  def unique_links()
+    # Clone the array so we can iterate over a stable copy while deleting from the original.
+    links_clone = self.clone
+
+    # Compare the href of every link in the clone with all elements of the original array.
+    # Links in the original that share the same href are collected into a separate, newly created array.
+    links_clone.each do |link_clone|
+      links_to_delete = Array.new
+      self.each {|link| links_to_delete << link if link.href == link_clone.href}
+      unless links_to_delete.empty?
+        # Pop one of the collected links (otherwise every link with this href would be deleted).
+        # If pop returns something other than nil, delete the remaining collected links from the original array.
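+        # For example, if links a, b and c all share one href, links_to_delete is [a, b, c];
+        # pop takes c out of the deletion pass, so only a and b are removed and one copy survives.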
+        links_to_delete.each {|link_to_delete| self.delete(link_to_delete)} if links_to_delete.pop != nil
+      end
+    end
+    self
+  end
+end
+
+# Removes duplicated hrefs from the given links array in place, keeping one link per href.
+def leave_only_unique_links(links)
+  # Clone the array so we can iterate over a stable copy while deleting from the original.
+  links_clone = links.clone
+
+  # Compare the href of every link in the clone with all elements of the original array.
+  # Links in the original that share the same href are collected into a separate, newly created array.
+  links_clone.each do |link_clone|
+    links_to_delete = Array.new
+    links.each {|link| links_to_delete << link if link.href == link_clone.href}
+    unless links_to_delete.empty?
+      # Pop one of the collected links (otherwise every link with this href would be deleted).
+      # If pop returns something other than nil, delete the remaining collected links from the original array.
+      links_to_delete.each {|link_to_delete| links.delete(link_to_delete)} if links_to_delete.pop != nil
+    end
+  end
+  links
+end
+
+# Returns only those links from newlinks whose href does not already appear in links.
+def check_uniqueness_between_arrays(links, newlinks)
+  new_links = newlinks.clone
+  new_links_to_delete = Array.new
+  # Collect every new link whose href is already present in the known links array...
+  new_links.each {|new_link|
+    links.each {|link| new_links_to_delete << new_link if new_link.href == link.href}
+  }
+  # ...and drop it from the cloned array.
+  new_links_to_delete.each {|link_to_delete| new_links.delete(link_to_delete)} unless new_links_to_delete.empty?
+
+  new_links
+end
+
 if ARGV.empty? == false
   agent = Mechanize.new
   agent.user_agent_alias = 'Linux Mozilla'
   page = agent.get(URI.parse(ARGV[0]))
   links = page.links
-  uri_links = links.map {|link| link.href}
-  puts "Found #{links.length} links on #{ARGV[0]}.".green
-  puts "I will try visiting every link now...\n".green
+  puts "Found #{links.count} links on #{ARGV[0]}".green
+  print 'Removing multiple links... '.green
+
+  links = leave_only_unique_links(links)
+
+  puts "#{links.count} links left.".green
+  puts "Start visiting the links...\n".green
 
   counter = 1
   links_visited = 0
@@ -20,21 +78,21 @@
   links.each do |link|
     begin
-      if agent.visited?(link.uri) == nil && link.uri.host == URI.parse(ARGV[0]).host
+      if agent.visited?(link.uri) == nil
         print "#{counter}/#{links.count})".blue + " Visiting site: #{link.href}"
         begin
           new_page = link.click
           print '. '
          new_links = new_page.links
-          print "This site includes #{new_links.count.to_s} links."
+          print "This site includes #{new_links.count} links."
+
+          new_unique_links = leave_only_unique_links(new_links)
+          verified_new_links = check_uniqueness_between_arrays(links, new_unique_links)
+          verified_new_links.compact!
+          color = verified_new_links.count == new_links.count ? :yellow : :green
+          puts " Adding #{verified_new_links.count} new links to the links array.".colorize(color)
-          new_unique_links = []
-          new_links.each do |new_link|
-            new_unique_links.push(new_link) unless uri_links.include?(new_link.href)
-          end
-          color = new_unique_links.length == new_links.length ? :yellow : :green
-          puts " Adding #{new_unique_links.length} new links to the links array.".colorize(color)
-          new_unique_links.each { |new_unique_link| links << new_unique_link; uri_links << new_unique_link.href} if new_unique_links.length > 0
+          verified_new_links.each {|new_link| links << new_link} if verified_new_links.count > 0
         rescue Mechanize::ResponseCodeError => e
           print "#{link.uri} - The page does not respond. Skipping...\n".red
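
Side note on the de-duplication approach (not part of the patch itself): leave_only_unique_links keeps
one link per href by collecting the duplicates, popping one of them and deleting the rest from the array
in place. On Ruby 1.9.2 or newer the same idea can be sketched with Array#uniq and a block, assuming the
elements respond to #href the way Mechanize::Page::Link objects do. Unlike the patch version, this sketch
returns a new array and keeps the first link seen for each href, so it illustrates the intent rather than
being a drop-in replacement:

    # Sketch: keep the first link for each href; the input array is left untouched.
    def leave_only_unique_links(links)
      links.uniq {|link| link.href}
    end

    # Sketch: keep only the new links whose href is not already in the known list.
    def check_uniqueness_between_arrays(links, newlinks)
      known_hrefs = links.map {|link| link.href}
      newlinks.reject {|new_link| known_hrefs.include?(new_link.href)}
    end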