Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.
Download ZIP
Browse files

removed unused dependencies. now outputs a csv properly (but still ne…

…ed to escape commas within the values themselves)
  • Loading branch information...
commit e70142dd823d8058b97f43c34e030eeca8706ce7 1 parent a9b85b0
@szTheory authored
Showing with 13 additions and 42 deletions.
  1. +0 −5 Gemfile
  2. +0 −14 Gemfile.lock
  3. +13 −23 scrape.rb
View
5 Gemfile
@@ -2,11 +2,6 @@ source "http://rubygems.org"
#automated WWW interaction + parsing
gem "mechanize"
-gem "i18n"
-gem "activesupport", :require => "active_support"
-
-#CSV generator DSL
-gem "comma", :git => "git@github.com:jonathanb/comma.git"
group :test do
gem "rspec"
View
14 Gemfile.lock
@@ -1,18 +1,7 @@
-GIT
- remote: git@github.com:jonathanb/comma.git
- revision: bfa2d38d4ef4b046071a3196dbed8bfa7466b04c
- specs:
- comma (0.4.1)
- activesupport (>= 2.2.2)
- comma
- i18n
-
GEM
remote: http://rubygems.org/
specs:
- activesupport (3.0.5)
diff-lcs (1.1.2)
- i18n (0.5.0)
mechanize (1.0.0)
nokogiri (>= 1.2.1)
nokogiri (1.4.4)
@@ -29,8 +18,5 @@ PLATFORMS
ruby
DEPENDENCIES
- activesupport
- comma!
- i18n
mechanize
rspec
View
36 scrape.rb
@@ -2,22 +2,6 @@
require "bundler/setup"
Bundler.require(:default)
-# hash wrapper class for the CSV file DSL called "comma"
-class Entry
- attr_accessor :company, :website, :name, :addr1, :addr2, :addr_rest, :phone, :description
-
- comma do
- company
- website
- name
- addr1
- addr2
- addr_rest
- phone
- description
- end
-end
-
class Scraper
# form field names
FIELDNAME_NAME = "filter_equal.member_type.name"
@@ -28,8 +12,12 @@ class Scraper
FIELDVALS_BUS_TYPES = ["Contractor/Installer","Distributor","Financial Company or Financial Consultant","Law Firm","Research Laboratory","Manufacturer/Supplier","Utility","Project Developer (Architects, planners, consultants, and builders of solar projects)","Other (non-financial)","Commercial System User","Solar Winery/Brewery/Distillery"]
FIELDVALS_STATES = ["AL","AK","AZ","AR","CA","CO","CT","DE","DC","FL","GA","HI","ID","IL","IN","IA","KS","KY","LA","ME","MD","MA","MI","MN","MS","MO","MT","NE","NV","NH","NJ","NM","NY","NC","ND","OH","OK","OR","PA","RI","SC","SD","TN","TX","UT","VT","VA","WA","WV","WI","WY"]
+ CLASS_NAMES = ['company', 'website', 'name', 'addr1', 'addr2', 'addr_rest', 'phone', 'description']
+
SEIA_URL = "http://www.seia.org/cs/membership/member_directory"
+ OUTPUT_FILENAME = "OUTPUT.csv"
+
def self.scrape_page
# setup user agent
agent = Mechanize.new
@@ -49,11 +37,11 @@ def self.scrape_page
# scrape entries from results page
entries = []
- e = Entry.new
+ e = {}
result_page.search('.results p').each do |p|
className = p.attr('class')
- break if className.nil?
+ next if className.nil?
#make friendly for Ruby method names
className.gsub!('-', '_')
@@ -63,18 +51,20 @@ def self.scrape_page
is_last = className == 'description'
#new entry if we just started
- e = Entry.new if is_first
+ e = {} if is_first
#get next tag in sequence
- clean_txt = Scraper.sanitize_str(p.text)
- e.send "#{className}=", clean_txt
+ e[className] = Scraper.sanitize_str(p.text)
#push the entry if we're done
- entries << e if is_last
+ entries << e if is_last
end
# output to CSV file
- entries.to_comma(:filename => "OUTPUT.csv")
+ File.open(OUTPUT_FILENAME, 'w') do |f|
+ f.puts CLASS_NAMES.join(',')
+ entries.each {|e| f.puts e.values.join(',')}
+ end
end
def self.sanitize_str(str)
Please sign in to comment.
Something went wrong with that request. Please try again.