Permalink
Browse files

added LICENSE, README. fixed an issue with comma that wasn't loading …

…me output to a CSV file. now parses a single page of entries
  • Loading branch information...
1 parent d3edc6a commit a9b85b098459edb6d7afb3dbe365bb2b162df4e9 @szTheory committed Mar 18, 2011
Showing with 92 additions and 20 deletions.
  1. +1 −1 .gitignore
  2. +6 −0 Gemfile
  3. +14 −0 Gemfile.lock
  4. +19 −0 LICENSE
  5. +1 −0 README.rdoc
  6. +0 −3 TODO
  7. +3 −0 TODO.rdoc
  8. +48 −16 scrape.rb
View
@@ -1,2 +1,2 @@
OUTPUT
-TEST
+OUTPUT.csv
View
@@ -1,6 +1,12 @@
source "http://rubygems.org"
+#automated WWW interaction + parsing
gem "mechanize"
+gem "i18n"
+gem "activesupport", :require => "active_support"
+
+#CSV generator DSL
+gem "comma", :git => "git@github.com:jonathanb/comma.git"
group :test do
gem "rspec"
View
@@ -1,7 +1,18 @@
+GIT
+ remote: git@github.com:jonathanb/comma.git
+ revision: bfa2d38d4ef4b046071a3196dbed8bfa7466b04c
+ specs:
+ comma (0.4.1)
+ activesupport (>= 2.2.2)
+ comma
+ i18n
+
GEM
remote: http://rubygems.org/
specs:
+ activesupport (3.0.5)
diff-lcs (1.1.2)
+ i18n (0.5.0)
mechanize (1.0.0)
nokogiri (>= 1.2.1)
nokogiri (1.4.4)
@@ -18,5 +29,8 @@ PLATFORMS
ruby
DEPENDENCIES
+ activesupport
+ comma!
+ i18n
mechanize
rspec
View
19 LICENSE
@@ -0,0 +1,19 @@
+Copyright (c) 2011 jonathanb
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
View
@@ -0,0 +1 @@
+MIT license, see the LICENSE file.
View
3 TODO
@@ -1,3 +0,0 @@
-read in entries for one state (PA/NJ/etc)
-divide entries for state into type (installer, developer, etc)
-split up the addresses into state, zipcode, etc
View
@@ -0,0 +1,3 @@
+* read in entries for one state (PA/NJ/etc)
+* divide entries for state into type (installer, developer, etc)
+* split up the addresses into state, zipcode, etc
View
@@ -2,9 +2,31 @@
require "bundler/setup"
Bundler.require(:default)
+# plain data class for one scraped entry (replaces the old hash); the "comma" block lists the fields written to the CSV file
+class Entry
+ attr_accessor :company, :website, :name, :addr1, :addr2, :addr_rest, :phone, :description
+
+ comma do
+ company
+ website
+ name
+ addr1
+ addr2
+ addr_rest
+ phone
+ description
+ end
+end
+
class Scraper
- BUS_TYPES = ["Contractor/Installer","Distributor","Financial Company or Financial Consultant","Law Firm","Research Laboratory","Manufacturer/Supplier","Utility","Project Developer (Architects, planners, consultants, and builders of solar projects)","Other (non-financial)","Commercial System User","Solar Winery/Brewery/Distillery"]
- STATES = ["AL","AK","AZ","AR","CA","CO","CT","DE","DC","FL","GA","HI","ID","IL","IN","IA","KS","KY","LA","ME","MD","MA","MI","MN","MS","MO","MT","NE","NV","NH","NJ","NM","NY","NC","ND","OH","OK","OR","PA","RI","SC","SD","TN","TX","UT","VT","VA","WA","WV","WI","WY"]
+ # form field names
+ FIELDNAME_NAME = "filter_equal.member_type.name"
+ FIELDNAME_STATE = "filter_equal.dm_seia_organization.address.state"
+ FIELDNAME_KEYWORDS = "filter_like.dm_seia_organization.description"
+
+ # form field vals
+ FIELDVALS_BUS_TYPES = ["Contractor/Installer","Distributor","Financial Company or Financial Consultant","Law Firm","Research Laboratory","Manufacturer/Supplier","Utility","Project Developer (Architects, planners, consultants, and builders of solar projects)","Other (non-financial)","Commercial System User","Solar Winery/Brewery/Distillery"]
+ FIELDVALS_STATES = ["AL","AK","AZ","AR","CA","CO","CT","DE","DC","FL","GA","HI","ID","IL","IN","IA","KS","KY","LA","ME","MD","MA","MI","MN","MS","MO","MT","NE","NV","NH","NJ","NM","NY","NC","ND","OH","OK","OR","PA","RI","SC","SD","TN","TX","UT","VT","VA","WA","WV","WI","WY"]
SEIA_URL = "http://www.seia.org/cs/membership/member_directory"
@@ -18,36 +40,45 @@ def self.scrape_page
# set form values
form = page.form("search")
- form["filter_equal.member_type.name"] = 'Contractor/Installer'
- form["filter_equal.dm_seia_organization.address.state"] = 'PA'
- form["filter_like.dm_seia_organization.description"] = ''
+ form[FIELDNAME_NAME] = 'Contractor/Installer'
+ form[FIELDNAME_STATE] = 'PA'
+ form[FIELDNAME_KEYWORDS] = ''
# submit form, get results page
result_page = agent.submit(form, form.buttons.first)
# scrape entries from results page
entries = []
- e = {}
+ e = Entry.new
result_page.search('.results p').each do |p|
className = p.attr('class')
+
+ break if className.nil?
+
+ #replace hyphens with underscores so the class name can be used as a Ruby setter method name
+ className.gsub!('-', '_')
# 1st/last in sequence for this entry?
is_first = className == 'company'
is_last = className == 'description'
- # puts "#{p.attr('class')} => #{p.text}"
+ #new entry if we just started
+ e = Entry.new if is_first
+
+ #get next tag in sequence
+ clean_txt = Scraper.sanitize_str(p.text)
+ e.send "#{className}=", clean_txt
- e = {} if is_first #new entry if we just started
- e[className] = p.text.chomp #get next tag in sequence
- entries << e if is_last #push the entry if we're done
+ #push the entry if we're done
+ entries << e if is_last
end
- # output each entry
- entries.each_with_index do |e, i|
- e.each_pair do |k, v|
- puts "#{k} => #{v}"
- end
- end
+ # output to CSV file
+ entries.to_comma(:filename => "OUTPUT.csv")
+ end
+
+ def self.sanitize_str(str)
+ str.gsub("\r", '').gsub("\n", '').squeeze(' ').strip
end
# phone number regex (src: http://blog.stevenlevithan.com/archives/validate-phone-number#r4-2-v-inline)
@@ -56,6 +87,7 @@ def self.phone_number?(txt)
end
end
+# RSpec tests
if __FILE__ == $PROGRAM_NAME
Scraper.scrape_page

0 comments on commit a9b85b0

Please sign in to comment.