Merge pull request #5 from vinitkumar/feature/asyncio
The following things are done in this PR:
vinitkumar committed Jul 3, 2016
2 parents b8b55ec + 2f8a7d1 commit 3171414
Showing 3 changed files with 66 additions and 51 deletions.
3 changes: 2 additions & 1 deletion __init__.py
@@ -1 +1,2 @@
__version__ = "0.0.1"
__version__ = "2.0.0"

45 changes: 26 additions & 19 deletions crawler.py
@@ -1,10 +1,18 @@
#!/usr/bin/python
from __future__ import absolute_import, print_function
import asyncio
import time
import optparse
from linkfetcher import Linkfetcher
from webcrawler import Webcrawler
import logging

logger = logging.getLogger()
handler = logging.StreamHandler()
formatter = logging.Formatter(
'%(name)-12s %(levelname)-8s %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)
logger.setLevel(logging.DEBUG)

def option_parser():
"""Option Parser to give various options."""
@@ -13,7 +21,7 @@ def option_parser():
Here in this case it goes till depth of 5 and url is target URL to
start crawling.
'''
version = '0.0.1'
version = "2.0.0"

parser = optparse.OptionParser(usage=usage, version=version)

@@ -31,15 +39,15 @@ def option_parser():
return opts, args


def getlinks(url):
async def getlinks(url):
"""Get Links from the Linkfetcher class."""
page = Linkfetcher(url)
page.linkfetch()
await page.linkfetch()
for i, url in enumerate(page):
print("%d ==> %s" % (i, url))
return (i, url)


def main():
async def main():
""" Main class."""
opts, args = option_parser()
url = args[0]
@@ -51,23 +59,22 @@ def main():
depth = opts.depth

sTime = time.time()

print("CRAWLER STARTED:")
print("%s, will crawl upto depth %d" % (url, depth))
print("===============================================================")
webcrawler = Webcrawler(url, depth)
webcrawler.crawl()
print("\n".join(webcrawler.urls))

eTime = time.time()
tTime = eTime - sTime
print("\n")
print("Crawler Statistics")
print("==================")
print("No of links Found: %d" % webcrawler.links)
print("No of follwed: %d" % webcrawler.followed)
print("Time Stats : Found all links after %0.2fs" % tTime)
logger.info("CRAWLER STARTED:")
logger.info("%s, will crawl upto depth %d" % (url, depth))
logger.info("*****RESULTS")
logger.info("\n".join(webcrawler.urls))
logger.info("=" * 100)
logger.info("Crawler Statistics")
logger.info("=" * 100)
logger.info("No of links Found: %d" % webcrawler.links)
logger.info("No of followed: %d" % webcrawler.followed)
logger.info("Time Stats : Found all links after %0.2fs" % tTime)


if __name__ == "__main__":
main()
loop = asyncio.get_event_loop()
loop.run_until_complete(main())
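
For context, here is a minimal, self-contained sketch of the entrypoint pattern crawler.py moves to in this diff: module-level logging setup and an async main() coroutine driven by an asyncio event loop. The fetch_links() coroutine and the example URL are placeholders for illustration, not code from this repository.

```python
import asyncio
import logging
import time

# Module-level logging setup, mirroring the handler/formatter wiring added in crawler.py.
logger = logging.getLogger()
handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter('%(name)-12s %(levelname)-8s %(message)s'))
logger.addHandler(handler)
logger.setLevel(logging.DEBUG)


async def fetch_links(url):
    """Placeholder coroutine standing in for the awaited crawl step."""
    await asyncio.sleep(0)   # yield to the event loop
    return [url]             # pretend a single link was found


async def main():
    start = time.time()
    urls = await fetch_links("http://example.com")
    logger.info("\n".join(urls))
    logger.info("Time Stats : Found all links after %0.2fs" % (time.time() - start))


if __name__ == "__main__":
    # Same driver pattern as the new crawler.py: build a loop and run main() to completion.
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())
```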
69 changes: 38 additions & 31 deletions linkfetcher.py
@@ -1,28 +1,32 @@
#!/usr/bin/python
"""Linkfetcher Class."""
from __future__ import absolute_import
from __future__ import print_function
from bs4 import BeautifulSoup
from cgi import escape
from html import escape
import sys
import asyncio
import urllib.request
import urllib.parse
import six
import logging

logger = logging.getLogger()
handler = logging.StreamHandler()
formatter = logging.Formatter(
'%(name)-12s %(levelname)-8s %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)
logger.setLevel(logging.DEBUG)

class Linkfetcher(object):
"""Link Fetcher class to abstract the link fetching."""

def __init__(self, url):
""" init function to intiate url and urls array."""
self.url = url
self.urls = []
self.__version__ = "0.0.1"
self.__version__ = "2.0.0"
self.agent = "%s/%s" % (__name__, self.__version__)


def _addHeaders(self, request):
""" Add headers for the crawler"""
def _add_headers(self, request):
request.add_header("User-Agent", self.agent)

def __getitem__(self, x):
@@ -39,30 +43,33 @@ def open(self):
return None
return (request, handle)

def get_crawled_urls(self, handle, request):
try:
content = six.text_type(handle.open(request).read(), "utf-8",
errors="replace")
soup = BeautifulSoup(content, "html.parser")
tags = soup('a')
except urllib.request.HTTPError as error:
if error.code == 404:
logger.warning("ERROR: %s -> %s for %s" % (error, error.url, self.url))
else:
logger.warning("ERROR: %s for %s" % (error, self.url))

except urllib.request.URLError as error:
logger.warning("ERROR: %s for %s" % (error, self.url))
raise urllib.request.URLError("URL entered is Incorrect")

for tag in tags:
href = tag.get("href")
if href is not None:
url = urllib.parse.urljoin(self.url, escape(href))
if url not in self:
self.urls.append(url)

def linkfetch(self):
"""Linkfetch function to actually fetch links."""
request, handle = self.open()
self._addHeaders(request)
self._add_headers(request)
if handle:
try:
content = six.text_type(handle.open(request).read(), "utf-8",
errors="replace")
soup = BeautifulSoup(content)
tags = soup('a')
except urllib.request.HTTPError as error:
self.get_crawled_urls(handle, request)

if error.code == 404:
print("ERROR: %s -> %s" % (error, error.url), file=sys.stderr)
else:
print("ERROR: %s" % error, file=sys.stderr)
tags = []

except urllib.request.URLError as error:
print("ERROR: %s" % error, file=sys.stderr)
tags = []
for tag in tags:
href = tag.get("href")
if href is not None:
url = urllib.parse.urljoin(self.url, escape(href))
if url not in self:
self.urls.append(url)

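As a rough, standalone illustration of the extraction flow that linkfetcher.py now centralises in get_crawled_urls() (decode the response, parse anchor tags with an explicit "html.parser", resolve each href against the base URL, and log HTTP errors instead of printing to stderr), here is a sketch. fetch_anchor_urls() is a hypothetical free function, not the class method itself, and it uses the urllib.error exception types.

```python
import logging
import urllib.error
import urllib.parse
import urllib.request
from html import escape

from bs4 import BeautifulSoup

logger = logging.getLogger(__name__)


def fetch_anchor_urls(base_url):
    """Return de-duplicated absolute URLs found in <a href="..."> tags at base_url."""
    urls = []
    try:
        with urllib.request.urlopen(base_url) as handle:
            content = handle.read().decode("utf-8", errors="replace")
        tags = BeautifulSoup(content, "html.parser")("a")
    except urllib.error.HTTPError as error:
        # 404s and other HTTP failures are logged as warnings, as in the diff.
        logger.warning("ERROR: HTTP %s for %s", error.code, base_url)
        return urls
    except urllib.error.URLError as error:
        logger.warning("ERROR: %s for %s", error, base_url)
        return urls

    for tag in tags:
        href = tag.get("href")
        if href is not None:
            url = urllib.parse.urljoin(base_url, escape(href))
            if url not in urls:
                urls.append(url)
    return urls
```

Calling fetch_anchor_urls("http://example.com") would return the absolute URLs linked from that page, with fetch failures surfacing as logged warnings rather than uncaught exceptions.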