Permalink
Browse files

scrapes every state/business type now and outputs to CSV (seems to be done)
  • Loading branch information...
1 parent d1f5eb7 commit a4e8ef3177aba30376e4169700f474e60439e520 @szTheory committed Mar 19, 2011
Showing with 52 additions and 17 deletions.
  1. +2 −1 .gitignore
  2. +50 −16 scrape.rb
View
@@ -1,2 +1,3 @@
OUTPUT
-OUTPUT.csv
+OUTPUT.csv
+OUTPUT.csv.BAK
View
@@ -5,15 +5,15 @@
class Scraper
# form field names
- FIELDNAME_NAME = "filter_equal.member_type.name"
+ FIELDNAME_BUS_TYPE = "filter_equal.member_type.name"
FIELDNAME_STATE = "filter_equal.dm_seia_organization.address.state"
FIELDNAME_KEYWORDS = "filter_like.dm_seia_organization.description"
# form field vals
FIELDVALS_BUS_TYPES = ["Contractor/Installer","Distributor","Financial Company or Financial Consultant","Law Firm","Research Laboratory","Manufacturer/Supplier","Utility","Project Developer (Architects, planners, consultants, and builders of solar projects)","Other (non-financial)","Commercial System User","Solar Winery/Brewery/Distillery"]
FIELDVALS_STATES = ["AL","AK","AZ","AR","CA","CO","CT","DE","DC","FL","GA","HI","ID","IL","IN","IA","KS","KY","LA","ME","MD","MA","MI","MN","MS","MO","MT","NE","NV","NH","NJ","NM","NY","NC","ND","OH","OK","OR","PA","RI","SC","SD","TN","TX","UT","VT","VA","WA","WV","WI","WY"]
- CLASS_NAMES = ['company', 'website', 'name', 'addr1', 'addr2', 'addr_rest', 'phone', 'description']
+ CLASS_NAMES = ['company', 'website', 'name', 'addr1', 'addr2', 'addr_rest', 'phone', 'description', 'state', 'bus_type']
SEIA_URL = "http://www.seia.org/cs/membership/member_directory"
@@ -23,23 +23,59 @@ def self.scrape_page
# setup user agent
agent = Mechanize.new
agent.user_agent_alias = 'Mac Safari'
+
+ entries = []
+
+ # every state and business type
+ FIELDVALS_STATES.each do |state|
+ FIELDVALS_BUS_TYPES.each do |bus_type|
+
+ # get the results page
+ puts "+++++++++++++++++++++++++++++++++++++++++"
+ puts "GETTING RESULTS FOR #{state}, #{bus_type}"
+ results_page = form_page_results(agent, state, bus_type)
+
+ # parse entries from page
+ puts "PARSING RESULTS..."
+ page_entries = Scraper.parse_results_page(results_page, state, bus_type)
+
+ # concat result entries
+ entries.concat(page_entries)
+ end
+ end
+ Scraper.write_entries_to_csv(entries)
+ end
+
+ def self.form_page_results(agent, state, bus_type)
# get the page
page = agent.get(SEIA_URL)
-
+
# set form values
form = page.form("search")
- form[FIELDNAME_NAME] = 'Contractor/Installer'
- form[FIELDNAME_STATE] = 'PA'
+ form[FIELDNAME_BUS_TYPE] = bus_type
+ form[FIELDNAME_STATE] = state
form[FIELDNAME_KEYWORDS] = ''
# submit form, get results page
- result_page = agent.submit(form, form.buttons.first)
-
+ results_page = agent.submit(form, form.buttons.first)
+ end
+
+ def self.write_entries_to_csv(entries)
+ # output to CSV file
+ CSV.open(OUTPUT_FILENAME, "wb") do |csv|
+ csv << CLASS_NAMES
+ entries.each do |e|
+ csv << CLASS_NAMES.map {|c| e[c]}
+ end
+ end
+ end
+
+ def self.parse_results_page(page, state, bus_type)
# scrape entries from results page
entries = []
e = {}
- result_page.search('.results p').each do |p|
+ page.search('.results p').each do |p|
className = p.attr('class')
next if className.nil?
@@ -58,16 +94,14 @@ def self.scrape_page
e[className] = Scraper.sanitize_str(p.text)
#push the entry if we're done
- entries << e if is_last
- end
-
- # output to CSV file
- CSV.open(OUTPUT_FILENAME, "wb") do |csv|
- csv << CLASS_NAMES
- entries.each do |e|
- csv << CLASS_NAMES.map {|c| e[c]}
+ if is_last
+ e['state'] = state
+ e['bus_type'] = bus_type
+ entries << e
end
end
+
+ entries
end
def self.sanitize_str(str)

0 comments on commit a4e8ef3

Please sign in to comment.