Skip to content
Browse files

Added APSRTC Scraper for Hyderabad bus routes

  • Loading branch information...
1 parent e281eaa commit ceebbc88ffe6c1c6d0e693ba9e579da2f7377609 @justjkk justjkk committed Jun 10, 2011
View
13,972 apsrtc_scraper/TIME TABLE-HCZ_debugged.htm
13,972 additions, 0 deletions not shown because the diff is too large. Please use a local Git client to view these changes.
View
10,120 apsrtc_scraper/parsed_routes.json
10,120 additions, 0 deletions not shown because the diff is too large. Please use a local Git client to view these changes.
View
56 apsrtc_scraper/parser.py
@@ -0,0 +1,56 @@
+from BeautifulSoup import BeautifulSoup, NavigableString
+import urllib2
+import simplejson as json
+import sys
def strip_tags(html, invalid_tags):
    """Parse *html* and flatten every tag named in *invalid_tags*.

    Each unwanted tag is replaced by the concatenated text of its children;
    nested markup inside an unwanted tag is flattened recursively first.
    Returns the resulting BeautifulSoup document.
    """
    soup = BeautifulSoup(html)
    for tag in soup.findAll(True):
        if tag.name not in invalid_tags:
            continue
        pieces = []
        for child in tag.contents:
            if not isinstance(child, NavigableString):
                # Child is itself a tag: flatten its own markup first.
                child = strip_tags(unicode(child), invalid_tags)
            pieces.append(unicode(child))
        tag.replaceWith(u"".join(pieces))
    return soup
+
def getHtml(url):
    """Download *url* and return the raw response body as a byte string.

    Sends a browser-like User-Agent header — presumably because the target
    site rejects the default urllib2 agent (NOTE(review): assumption, confirm).
    The response is closed even if read() raises, fixing the handle leak in
    the original version.
    """
    user_agent = 'Mozilla/5 (Ubuntu 10.04) Gecko'
    headers = {'User-Agent': user_agent}
    request = urllib2.Request(url, None, headers)
    response = urllib2.urlopen(request)
    try:
        return response.read()
    finally:
        response.close()
+
def _cell_text(cell):
    """Return the stripped text of a route-number table cell.

    Some cells wrap their text in an extra tag, in which case the text is the
    tag's first child; plain cells are bare strings.  The type-name check
    mirrors the original code because the Tag class is not imported here.
    """
    if type(cell).__name__ == 'Tag':
        return cell.contents[0].strip()
    return cell.strip()


def parseHtml(html):
    """Extract bus routes from the APSRTC timetable HTML.

    Strips <span> tags first (the page sprinkles them inside cell text),
    then reads every data row of the timetable.  Returns a list of dicts
    with keys: route_id, from_stage, to_stage, via, frequency_peak,
    frequency_slack.
    """
    soup = strip_tags(html, ['span'])
    # Rows 0-4 are headers and the last row is a footer — skip them.
    # NOTE(review): offsets assume this exact page layout; verify on refetch.
    trs = soup.findAll('tr')[5:-1]
    routes = []
    for tr in trs:
        tds = [x.contents[0] for x in tr.findAll('td')[:12]]
        route = {
            "from_stage": tds[3].strip(),
            "to_stage": tds[4].strip(),
            "via": [x.strip() for x in tds[5].split(',')],
            "frequency_peak": tds[10],
            "frequency_slack": tds[11],
        }
        # The route number is split across two cells; join their text.
        route["route_id"] = _cell_text(tds[1]) + _cell_text(tds[2])
        routes.append(route)
    return routes
+
if __name__ == "__main__":
    # Live source of the timetable (kept for provenance; the saved local
    # copy below is used instead because the live page needed hand-fixing):
    #url = 'http://apsrtc.gov.in/About%20Us/Route-Network/TIME%20TABLE-HCZ.htm'
    #html = getHtml(url)
    # 'with' ensures the file handle is closed (the original leaked it).
    with open('TIME TABLE-HCZ_debugged.htm', 'r') as timetable_file:
        html = timetable_file.read()
    routes = parseHtml(html)
    json.dump(routes, sys.stdout, indent=2)
View
1,504 apsrtc_scraper/unique_stops.csv
1,504 additions, 0 deletions not shown because the diff is too large. Please use a local Git client to view these changes.
View
24 apsrtc_scraper/unique_stops.py
@@ -0,0 +1,24 @@
+import simplejson as json
+import csv
+import sys
+
# Load the routes produced by parser.py; 'with' closes the file handle
# (the original passed a bare open() and leaked it).
with open('parsed_routes.json', 'r') as routes_file:
    parsed_routes = json.load(routes_file)
+
def add_stop(unprocessed_stop, stops=None):
    """Normalize a raw stop name and record it as a (raw, cleaned) pair.

    Cleaning removes newlines, carriage returns and literal "&nbsp" residue,
    then collapses runs of spaces to a single space.  The pair keeps the raw
    text so the cleaned name can be traced back to the scraped source.

    stops: target set; defaults to the module-level unique_stops set,
    keeping the original call sites working unchanged.
    """
    target = unique_stops if stops is None else stops
    stop = unprocessed_stop.replace("\n", "")
    stop = stop.replace("\r", "")
    # NOTE(review): "&nbsp" without the trailing ';' mirrors the original —
    # presumably matching how the entity appears in the scraped data; confirm.
    stop = stop.replace("&nbsp", "")
    # Loop until fixed point: a single replace("  ", " ") pass turns four
    # spaces into two, not one — the original left longer runs uncollapsed.
    while "  " in stop:
        stop = stop.replace("  ", " ")
    target.add((unprocessed_stop, stop))
+
# Collect every distinct stop name mentioned anywhere in the parsed routes.
unique_stops = set()
for route in parsed_routes:
    add_stop(route["from_stage"])
    add_stop(route["to_stage"])
    # Plain loop instead of the original side-effect-only list comprehension.
    for via_stop in route["via"]:
        add_stop(via_stop)

# Sort case-insensitively on the raw (unprocessed) name for stable output.
stops = sorted(unique_stops, key=lambda pair: pair[0].lower())

writer = csv.writer(sys.stdout)
writer.writerows(stops)

0 comments on commit ceebbc8

Please sign in to comment.
Something went wrong with that request. Please try again.