Merge pull request #5 from vinitkumar/feature/asyncio
The following things are done in this PR:
vinitkumar committed Jul 3, 2016
2 parents b8b55ec + 2f8a7d1 commit 3171414
Showing 3 changed files with 66 additions and 51 deletions.
3 changes: 2 additions & 1 deletion __init__.py
@@ -1 +1,2 @@
__version__ = "0.0.1"
__version__ = "2.0.0"

45 changes: 26 additions & 19 deletions crawler.py
@@ -1,10 +1,18 @@
#!/usr/bin/python
from __future__ import absolute_import, print_function
import asyncio
import time
import optparse
from linkfetcher import Linkfetcher
from webcrawler import Webcrawler
import logging

logger = logging.getLogger()
handler = logging.StreamHandler()
formatter = logging.Formatter(
'%(name)-12s %(levelname)-8s %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)
logger.setLevel(logging.DEBUG)

def option_parser():
"""Option Parser to give various options."""
@@ -13,7 +21,7 @@ def option_parser():
Here in this case it goes till depth of 5 and url is target URL to
start crawling.
'''
version = '0.0.1'
version = "2.0.0"

parser = optparse.OptionParser(usage=usage, version=version)

@@ -31,15 +39,15 @@ def option_parser():
return opts, args


def getlinks(url):
async def getlinks(url):
"""Get Links from the Linkfetcher class."""
page = Linkfetcher(url)
page.linkfetch()
await page.linkfetch()
for i, url in enumerate(page):
print("%d ==> %s" % (i, url))
return (i, url)


def main():
async def main():
""" Main class."""
opts, args = option_parser()
url = args[0]
@@ -51,23 +59,22 @@ def main():
depth = opts.depth

sTime = time.time()

print("CRAWLER STARTED:")
print("%s, will crawl upto depth %d" % (url, depth))
print("===============================================================")
webcrawler = Webcrawler(url, depth)
webcrawler.crawl()
print("\n".join(webcrawler.urls))

eTime = time.time()
tTime = eTime - sTime
print("\n")
print("Crawler Statistics")
print("==================")
print("No of links Found: %d" % webcrawler.links)
print("No of follwed: %d" % webcrawler.followed)
print("Time Stats : Found all links after %0.2fs" % tTime)
logger.info("CRAWLER STARTED:")
logger.info("%s, will crawl upto depth %d" % (url, depth))
logger.info("*****RESULTS")
logger.info("\n".join(webcrawler.urls))
logger.info("=" * 100)
logger.info("Crawler Statistics")
logger.info("=" * 100)
logger.info("No of links Found: %d" % webcrawler.links)
logger.info("No of followed: %d" % webcrawler.followed)
logger.info("Time Stats : Found all links after %0.2fs" % tTime)


if __name__ == "__main__":
main()
loop = asyncio.get_event_loop()
loop.run_until_complete(main())
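
For context, here is a minimal, self-contained sketch of the entrypoint pattern crawler.py moves to in this diff: module-level logging setup and an async main() coroutine driven by an asyncio event loop. The fetch_links() coroutine and the example URL are placeholders for illustration, not code from this repository.

```python
import asyncio
import logging
import time

# Module-level logging setup, mirroring the handler/formatter wiring added in crawler.py.
logger = logging.getLogger()
handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter('%(name)-12s %(levelname)-8s %(message)s'))
logger.addHandler(handler)
logger.setLevel(logging.DEBUG)


async def fetch_links(url):
    """Placeholder coroutine standing in for the awaited crawl step."""
    await asyncio.sleep(0)   # yield to the event loop
    return [url]             # pretend a single link was found


async def main():
    start = time.time()
    urls = await fetch_links("http://example.com")
    logger.info("\n".join(urls))
    logger.info("Time Stats : Found all links after %0.2fs" % (time.time() - start))


if __name__ == "__main__":
    # Same driver pattern as the new crawler.py: build a loop and run main() to completion.
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())
```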
69 changes: 38 additions & 31 deletions linkfetcher.py
@@ -1,28 +1,32 @@
#!/usr/bin/python
"""Linkfetcher Class."""
from __future__ import absolute_import
from __future__ import print_function
from bs4 import BeautifulSoup
from cgi import escape
from html import escape
import sys
import asyncio
import urllib.request
import urllib.parse
import six
import logging

logger = logging.getLogger()
handler = logging.StreamHandler()
formatter = logging.Formatter(
'%(name)-12s %(levelname)-8s %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)
logger.setLevel(logging.DEBUG)

class Linkfetcher(object):
"""Link Fetcher class to abstract the link fetching."""

def __init__(self, url):
""" init function to intiate url and urls array."""
self.url = url
self.urls = []
self.__version__ = "0.0.1"
self.__version__ = "2.0.0"
self.agent = "%s/%s" % (__name__, self.__version__)


def _addHeaders(self, request):
""" Add headers for the crawler"""
def _add_headers(self, request):
request.add_header("User-Agent", self.agent)

def __getitem__(self, x):
@@ -39,30 +43,33 @@ def open(self):
return None
return (request, handle)

def get_crawled_urls(self, handle, request):
try:
content = six.text_type(handle.open(request).read(), "utf-8",
errors="replace")
soup = BeautifulSoup(content, "html.parser")
tags = soup('a')
except urllib.request.HTTPError as error:
if error.code == 404:
logger.warning("ERROR: %s -> %s for %s" % (error, error.url, self.url))
else:
logger.warning("ERROR: %s for %s" % (error, self.url))

except urllib.request.URLError as error:
logger.warning("ERROR: %s for %s" % (error, self.url))
raise urllib.request.URLError("URL entered is Incorrect")

for tag in tags:
href = tag.get("href")
if href is not None:
url = urllib.parse.urljoin(self.url, escape(href))
if url not in self:
self.urls.append(url)

def linkfetch(self):
"""Linkfetch function to actually fetch links."""
request, handle = self.open()
self._addHeaders(request)
self._add_headers(request)
if handle:
try:
content = six.text_type(handle.open(request).read(), "utf-8",
errors="replace")
soup = BeautifulSoup(content)
tags = soup('a')
except urllib.request.HTTPError as error:
self.get_crawled_urls(handle, request)

if error.code == 404:
print("ERROR: %s -> %s" % (error, error.url), file=sys.stderr)
else:
print("ERROR: %s" % error, file=sys.stderr)
tags = []

except urllib.request.URLError as error:
print("ERROR: %s" % error, file=sys.stderr)
tags = []
for tag in tags:
href = tag.get("href")
if href is not None:
url = urllib.parse.urljoin(self.url, escape(href))
if url not in self:
self.urls.append(url)

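As a rough, standalone illustration of the extraction flow that linkfetcher.py now centralises in get_crawled_urls() (decode the response, parse anchor tags with an explicit "html.parser", resolve each href against the base URL, and log HTTP errors instead of printing to stderr), here is a sketch. fetch_anchor_urls() is a hypothetical free function, not the class method itself, and it uses the urllib.error exception types.

```python
import logging
import urllib.error
import urllib.parse
import urllib.request
from html import escape

from bs4 import BeautifulSoup

logger = logging.getLogger(__name__)


def fetch_anchor_urls(base_url):
    """Return de-duplicated absolute URLs found in <a href="..."> tags at base_url."""
    urls = []
    try:
        with urllib.request.urlopen(base_url) as handle:
            content = handle.read().decode("utf-8", errors="replace")
        tags = BeautifulSoup(content, "html.parser")("a")
    except urllib.error.HTTPError as error:
        # 404s and other HTTP failures are logged as warnings, as in the diff.
        logger.warning("ERROR: HTTP %s for %s", error.code, base_url)
        return urls
    except urllib.error.URLError as error:
        logger.warning("ERROR: %s for %s", error, base_url)
        return urls

    for tag in tags:
        href = tag.get("href")
        if href is not None:
            url = urllib.parse.urljoin(base_url, escape(href))
            if url not in urls:
                urls.append(url)
    return urls
```

Calling fetch_anchor_urls("http://example.com") would return the absolute URLs linked from that page, with fetch failures surfacing as logged warnings rather than uncaught exceptions.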