Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.
Download ZIP
Browse files

Switching to Mechanize. Get an individual state's results.

  • Loading branch information...
commit aeb804b5b7883d7d1a46244dbc49c69b9a09a6c7 1 parent b234a00
@szTheory authored
Showing with 31 additions and 75 deletions.
  1. +2 −0  .gitignore
  2. +0 −1  Gemfile.lock
  3. +29 −74 scrape.rb
View
2  .gitignore
@@ -0,0 +1,2 @@
+OUTPUT
+TEST
View
1  Gemfile.lock
@@ -19,5 +19,4 @@ PLATFORMS
DEPENDENCIES
mechanize
- nokogiri
rspec
View
103 scrape.rb
@@ -6,54 +6,25 @@ class Scraper
BUS_TYPES = ["Contractor/Installer","Distributor","Financial Company or Financial Consultant","Law Firm","Research Laboratory","Manufacturer/Supplier","Utility","Project Developer (Architects, planners, consultants, and builders of solar projects)","Other (non-financial)","Commercial System User","Solar Winery/Brewery/Distillery"]
STATES = ["AL","AK","AZ","AR","CA","CO","CT","DE","DC","FL","GA","HI","ID","IL","IN","IA","KS","KY","LA","ME","MD","MA","MI","MN","MS","MO","MT","NE","NV","NH","NJ","NM","NY","NC","ND","OH","OK","OR","PA","RI","SC","SD","TN","TX","UT","VT","VA","WA","WV","WI","WY"]
+ SEIA_URL = "http://www.seia.org/cs/membership/member_directory"
-
- LINE_IDX_NAME = 0
- LINE_IDX_URL = 1
- LINE_IDX_CONTACT = 2
- LINE_IDX_ADR_1 = 3
- LINE_IDX_ADR_2 = 4
- LINE_IDX_PHONE = 5
- LINE_IDX_DESC = 6
-
- def self.parse_file(filename)
- entries = []
+ def self.scrape_page
+ # setup user agent
+ agent = Mechanize.new
+ agent.user_agent_alias = 'Mac Safari'
- File.open(filename) do |f|
- entry = Scraper.parse_entry(file)
- break if entry.empty?
-
- entries << entry
- end
- end
-
- def self.parse_entry(file)
- line_idx = 0
- done = false
- entry = {}
+ # get the page
+ page = agent.get(SEIA_URL)
- begin
- if line = file.gets.chomp
-
- #classify line and add to the hash based on line index
- case line_idx
- when LINE_IDX_NAME; entry[:name] = line
- when LINE_IDX_URL; entry[:url] = line
- when LINE_IDX_CONTACT; entry[:contact] = line
- when LINE_IDX_ADR_1; entry[:adr_1] = line
- when LINE_IDX_ADR_2; entry[:adr_2] = line
- when LINE_IDX_PHONE; entry[:phone] = line
- raise "#{line} was not a valid phone number" unless Scraper.phone_number?(line)
- when LINE_IDX_DESC; entry[:desc] = line
- done = true
- end
-
- line_idx += 1
- end
-
- end while !done
+ # set form values
+ form = page.form("search")
+ form["filter_equal.member_type.name"] = 'Distributor'
+ form["filter_equal.dm_seia_organization.address.state"] = 'PA'
+ form["filter_like.dm_seia_organization.description"] = ''
- entry
+ # submit form, get results page
+ result_page = agent.submit(form, form.buttons.first)
+ puts result_page.body
end
# phone number regex (src: http://blog.stevenlevithan.com/archives/validate-phone-number#r4-2-v-inline)
@@ -63,34 +34,18 @@ def self.phone_number?(txt)
end
if __FILE__ == $PROGRAM_NAME
- Bundler.require(:test)
-
- describe Scraper, '#parse_entry' do
- before(:each) do
- file = File.open("sample_data/entry")
- @entry = Scraper.parse_entry(file)
- end
-
- it "reads in a sample entry's company name" do
- @entry[:name].should == "Advanced Solar Industries, LLC"
- end
- it "reads in a sample entry's website" do
- @entry[:url].should == "www.advancedsolarindustries.com"
- end
- it "reads in a sample entry's contact" do
- @entry[:contact].should == "Elam Beiler"
- end
- it "reads in a sample entry's address line 1" do
- @entry[:adr_1].should == "3530 W. Newport Road"
- end
- it "reads in a sample entry's address line 2" do
- @entry[:adr_2].should == "Ronks, PA 17542"
- end
- it "reads in a sample entry's phone number" do
- @entry[:phone].should == "717-768-8500"
- end
- it "reads in a sample entry's description" do
- @entry[:desc].should == "Advanced Solar Industries is an installer of grid-tie systems primarily utilizing SunPower brand solar panels. We also have over 15 years of experience installing off-grid solar systems."
- end
- end
+ Scraper.scrape_page
+
+ # Bundler.require(:test)
+ #
+ # describe Scraper, '#scrape_page' do
+ # before(:each) do
+ # file = File.open("sample_data/entry")
+ # @entry = Scraper.parse_entry(file)
+ # end
+ #
+ # it "reads in a sample entry's company name" do
+ # @entry[:name].should == "Advanced Solar Industries, LLC"
+ # end
+ # end
end
Please sign in to comment.
Something went wrong with that request. Please try again.