first commit

commit 95f6adee8b78254ed0c1a01d56acd5b0b6d6b447 1 parent f316b93
@groovecoder authored
Showing with 100 additions and 1 deletion.
  1. +4 −0 .gitignore
  2. +24 −1 README.md
  3. +2 −0  requirements.txt
  4. +70 −0 scrape.py
4 .gitignore
@@ -0,0 +1,4 @@
+*.pyc
+*.swp
+cache
+README.html
25 README.md
@@ -1,4 +1,27 @@
tccl-tos-scraper
================
-Scraper for Tulsa City-County Library's Tulsa Organizations & Services data
+Scraper for [Tulsa City-County Library's Tulsa Organizations & Services][tccl-tos]
+data
+
+With the objective to load it into [TulsaWiki][] via API
+
+[tccl-tos]: http://opac.tulsalibrary.org:82/
+[TulsaWiki]: http://tulsawiki.org/
+
+Getting Started
+---------------
+
+Make a virtualenv
+
+Install the requirements
+
+ pip install -r requirements.txt
+
+Create a cache/ dir
+
+ mkdir cache
+
+Run it
+
+ python scrape.py
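
The README's stated objective is to load the scraped data into TulsaWiki via its API. A minimal sketch of what that load step could look like, assuming a hypothetical endpoint and payload shape (neither is defined in this commit):

    import requests

    TULSAWIKI_API = 'http://tulsawiki.org/api/page/'  # assumed endpoint, not from this commit

    def load_into_tulsawiki(name, content):
        # Payload fields and auth are placeholders; the real TulsaWiki API
        # would dictate both.
        resp = requests.post(TULSAWIKI_API, data={'name': name, 'content': content})
        resp.raise_for_status()
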
2 requirements.txt
@@ -0,0 +1,2 @@
+beautifulsoup4==4.1.3
+requests==1.1.0
70 scrape.py
@@ -0,0 +1,70 @@
+import hashlib
+import os
+import time
+
+from bs4 import BeautifulSoup
+import requests
+
+
+TCCL_TOS_HOST = 'http://opac.tulsalibrary.org:82'
+TCCL_TOS_PATH = 'search~S0'
+TCCL_TOS_PARAMS = {
+ 'searchtype': 't',
+ 'searcharg': '',
+ 'SORT': 'D',
+ 'extended': 0,
+ 'SUBMIT': 'Search',
+ 'searchlimits': '',
+ 'searchorigarg': 'ta'
+}
+
+
+def search_letter(letter):
+ search_params = TCCL_TOS_PARAMS.copy()
+ search_params['searcharg'] = letter
+ print "POST"
+ resp = requests.post('%s/%s' % (TCCL_TOS_HOST, TCCL_TOS_PATH),
+ data=search_params)
+ soup = BeautifulSoup(resp.content)
+ scrape_entries(soup)
+
+
+def scrape_entries(soup):
+ entries = soup.find_all('td', 'browseEntryData')
+ for entry in entries:
+ entry_link = entry.find_all('a')[1]
+ copy_entry(entry_link['href'])
+ next_link = soup.find('a', text='Next')
+ if next_link:
+ next_content = cached_get_content(TCCL_TOS_HOST + next_link['href'])
+ next_soup = BeautifulSoup(next_content)
+ scrape_entries(next_soup)
+
+
+def copy_entry(entry_href):
+ pass
+
+
+def cached_get_content(url, cache_timeout=3600):
+ path_hash = hashlib.md5(url).hexdigest()
+ cache_file = 'cache/%s' % path_hash
+
+ content = None
+ if os.path.exists(cache_file) and file_age(cache_file) < cache_timeout:
+ content = open(cache_file, 'r').read()
+
+ if not content:
+ print "GET"
+ resp = requests.get(url)
+ content = resp.content
+ open(cache_file, 'w').write(content)
+
+ return content
+
+
+def file_age(fn):
+ return time.time() - os.stat(fn).st_mtime
+
+
+for letter in 'ab':
+ search_letter(letter)
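
copy_entry is left as a stub in this commit. A minimal sketch of one way it might be filled in, reusing cached_get_content and TCCL_TOS_HOST; the row class and field parsing below are assumptions about the catalog's detail-page markup, not part of the commit:

    def copy_entry(entry_href):
        # Fetch the entry's detail page through the same on-disk cache.
        content = cached_get_content(TCCL_TOS_HOST + entry_href)
        soup = BeautifulSoup(content)
        fields = {}
        # 'bibInfoEntry' is an assumed class name for the label/value rows.
        for row in soup.find_all('tr', 'bibInfoEntry'):
            cells = row.find_all('td')
            if len(cells) >= 2:
                fields[cells[0].get_text().strip()] = cells[1].get_text().strip()
        return fields

Whatever copy_entry ends up returning would then feed the TulsaWiki load step sketched under the README above.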