Hard reset, release v1.0.0
tasooshi committed Dec 9, 2022
1 parent 4c16082 commit b02ada4
Showing 25 changed files with 883 additions and 675 deletions.
10 changes: 10 additions & 0 deletions .gitignore
@@ -0,0 +1,10 @@
.pytest_cache
__pycache__
.DS_Store
*.sublime-project
*.sublime-workspace
MANIFEST
dist/
build/
.venv
digslash.egg*
695 changes: 21 additions & 674 deletions LICENSE

Large diffs are not rendered by default.

46 changes: 45 additions & 1 deletion README.md
@@ -1,2 +1,46 @@
# digslash
A crawler / sitemap builder and enumeration tool for Web applications

> A site mapping and enumeration tool for Web application analysis

Usage:

>>> import asyncio
>>> from digslash import sites
>>> website = sites.Site('https://example.com', limit=50, deduplicate=False)
>>> asyncio.run(website.crawl())

Output example:

>>> dict(website.results)
{
    'https://example.com/': {
        'checksum': '17913e89fe23ba03081f5f5b650c29a0',
        'encoding': 'utf-8',
        'source': ''
    },
    'https://example.com/js/script.js': {
        'checksum': '4cdad7e5affe29e1347343e126beea09',
        'encoding': 'ascii',
        'source': 'https://example.com/'
    },
    'https://example.com/pages/about.html': {
        'checksum': 'fad033d51adc628b17268ce2669543fd',
        'encoding': 'utf-8',
        'source': 'https://example.com/'
    },
    'https://example.com/pages/contact.html': {
        'checksum': 'b12c6a2fde381552564eea6a477030f0',
        'encoding': 'utf-8',
        'source': 'https://example.com/'
    },
    'https://example.com/pages/feedback.html': {
        'checksum': '9b0482107470956f7b64c833f1ef5e59',
        'encoding': 'utf-8',
        'source': 'https://example.com/'
    },
    'https://example.com/scripts/feedback.html': {
        'checksum': 'b92c8a06f3d4a9c22e8c11606bcbd2f7',
        'encoding': 'utf-8',
        'source': 'https://example.com/pages/feedback.html'
    }
}
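
For reference, a minimal sketch (placeholder target URL, arbitrary limit) combining the optional store_content, store_headers and verify_ssl flags defined in digslash/sites.py below:

>>> import asyncio
>>> from digslash import sites
>>> website = sites.Site(
...     'https://example.com',
...     limit=20,
...     store_content=True,   # keep the (truncated) response body under 'body'
...     store_headers=True,   # keep the response headers under 'headers'
...     verify_ssl=False,     # disable certificate verification
... )
>>> asyncio.run(website.crawl())
>>> for url, entry in website.results.items():
...     print(url, entry['checksum'], len(entry['body']))
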
9 changes: 9 additions & 0 deletions digslash/__init__.py
@@ -0,0 +1,9 @@
import logging


NAME = 'digslash'
VERSION = (1, 0, 0)
__version__ = '.'.join([str(i) for i in VERSION])


logger = logging.getLogger(NAME)
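
A short sketch of how a caller might surface the crawler's log output; the package only creates the 'digslash' logger here and attaches no handlers, so standard logging configuration applies (the level shown is just an example):

import logging

# All digslash messages go through logging.getLogger('digslash'); a basic
# root-logger configuration is enough to see them.
logging.basicConfig(level=logging.DEBUG)
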
84 changes: 84 additions & 0 deletions digslash/nodes.py
@@ -0,0 +1,84 @@
import hashlib
import urllib.parse

import bs4

from digslash import logger


class Node:

    FOLLOWED_ELEMENTS_ATTRS = (
        ('a', 'href'),
        ('script', 'src'),
        ('iframe', 'src'),
        ('form', 'action'),
    )

    def __init__(self, site, content, encoding='ascii', current=None):
        self.site = site
        self.encoding = encoding
        self.results = set()
        self._checksum = None
        if current:
            self.current = urllib.parse.urlsplit(current)
        else:
            self.current = self.site.urlsplit
        try:
            self.content = content.decode(self.encoding)
        except Exception as exc:
            logger.debug(f'Cannot decode {current} due to {exc}')
            self.content = None
        else:
            self.parser = bs4.BeautifulSoup(content, 'html.parser')

    def extract_links(self):
        for elem, attr in self.FOLLOWED_ELEMENTS_ATTRS:
            for ele in self.parser.find_all(elem):
                try:
                    self.results.add(ele[attr])
                except KeyError:
                    pass

    def links_filter(self):
        refined = set()
        for link in self.results:
            url = urllib.parse.urlparse(link)
            discard = False
            if url.scheme not in ['http', 'https'] and url.scheme != '':
                discard = True
            for sep in self.site.paths_ignored:
                if sep in url.path or (not url.netloc and sep in link):
                    discard = True
                    break
            if not discard and (url.netloc == self.current.netloc or not url.netloc):
                refined.add(link)
        self.results = refined

    def links_rebase(self):
        refined = set()
        for link in self.results:
            split_url = urllib.parse.urlsplit(link)
            rebased_url = urllib.parse.urljoin(
                self.site.base, urllib.parse.SplitResult('', '', split_url.path, split_url.query, split_url.fragment).geturl()
            )
            refined.add(rebased_url)
        self.results = refined

    @property
    def checksum(self):
        if self._checksum is None:
            self._checksum = hashlib.md5(self.content.encode(self.encoding)).hexdigest()
        return self._checksum

    def process(self):
        if self.content is None:
            return
        self.extract_links()
        self.links_filter()
        self.links_rebase()
        return {
            'encoding': self.encoding,
            'checksum': self.checksum,
            'links': self.results,
        }
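
A rough, hedged illustration of the Node pipeline above (extract links, filter them, rebase them against the site base); the Site instance and the HTML snippet are made up for the example:

from digslash import nodes, sites

site = sites.Site('https://example.com')
html = b'<a href="/pages/about.html">About</a><script src="js/script.js"></script>'

node = nodes.Node(site, html, encoding='utf-8')
result = node.process()
# result['links'] should contain the absolute, same-host URLs, e.g.
# {'https://example.com/pages/about.html', 'https://example.com/js/script.js'}
print(result['checksum'], result['links'])
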
164 changes: 164 additions & 0 deletions digslash/sites.py
@@ -0,0 +1,164 @@
import asyncio
import urllib.parse
import ssl
from collections import defaultdict

import aiohttp

from digslash import (
    logger,
    nodes,
)


class Site:

    ACCEPTED_CONTENT_TYPES = (
        'text/html',
        'text/xml',
        'text/plain',
        'text/javascript',
        'application/javascript',
        'application/json',
        'application/xml',
        'application/xhtml+xml',
        'application/octet-stream',
    )

    PATHS_IGNORED = (
        ' + ',
        '\'+',
        '+\'',
        '\\\'',
        '#',
    )

    def __init__(self,
                 base,
                 deduplicate=True,
                 workers_no=16,
                 limit=100,
                 store_content=False,
                 store_headers=False,
                 accepted_content_types=ACCEPTED_CONTENT_TYPES,
                 ignored_status_codes=None,
                 verify_ssl=True,
                 body_limit=1000000,
                 paths_ignored=PATHS_IGNORED
                 ):
        self.base = base
        self.urlsplit = urllib.parse.urlsplit(base)
        self.deduplicate = deduplicate
        self.workers_no = workers_no
        self.limit = limit
        self.queue = None
        self.results = defaultdict(dict)
        self.workers = list()
        self.checksums_index = dict()
        self.store_content = store_content
        self.accepted_content_types = accepted_content_types
        self.ignored_status_codes = tuple() if not ignored_status_codes else ignored_status_codes
        self.verify_ssl = verify_ssl
        self.body_limit = body_limit
        self.paths_ignored = paths_ignored
        self.store_headers = store_headers
        self.ssl_context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        self.ssl_context.check_hostname = False
        if not verify_ssl:
            self.ssl_context.verify_mode = ssl.CERT_NONE

    async def worker(self):
        while self.is_populated():
            if self.limit and len(self.results) >= self.limit:
                self.queue_flush()
                break
            url, source = await self.queue.get()
            logger.info('Processing {}'.format(url))
            response = await self.fetch(url)
            if response:
                body, encoding, headers = response
                node = nodes.Node(self, body, encoding, current=url)
                results = node.process()
                if results:
                    self.done(url, source, results, body, headers)
                    self.queue_add(results['links'], url)
            self.worker_done()

    def worker_done(self):
        self.queue.task_done()

    def done(self, url, source, results, body, headers):
        entry = {
            'source': source,
            'encoding': results['encoding'],
            'checksum': results['checksum'],
        }
        if self.store_content:
            entry['body'] = body
        if self.store_headers:
            entry['headers'] = headers
        if self.deduplicate:
            if results['checksum'] not in self.checksums_index:
                self.results[url] = entry
                self.checksums_index[results['checksum']] = url
        else:
            self.results[url] = entry

    def queue_flush(self):
        for _ in range(self.queue.qsize()):
            self.queue.get_nowait()
            self.queue.task_done()

    def queue_add(self, url_list, source=''):
        for url in url_list:
            if (
                url not in self.results and
                url not in dict(self.queue._queue) and  # FIXME: Optimize
                url != source
            ):
                logger.debug('Added to queue: ' + url)
                self.queue.put_nowait((url, source))

    def is_populated(self):
        return not self.queue.empty()

    def handle_worker_result(self, worker):
        try:
            worker.result()
        except asyncio.CancelledError:
            pass
        except Exception:
            logger.exception('Exception raised by {}'.format(worker))

    async def crawl(self):
        self.queue = asyncio.Queue()
        for i in range(self.workers_no):
            worker = asyncio.create_task(self.worker())
            worker.add_done_callback(self.handle_worker_result)
            self.workers.append(worker)
        # Initialize queue with the base URL
        self.queue_add([self.base])
        await self.queue.join()

    async def stop(self):
        for worker in self.workers:
            worker.cancel()
        await asyncio.gather(*self.workers)
        self.workers = None

    async def fetch(self, url):
        async with aiohttp.ClientSession() as session:
            try:
                async with session.get(url, ssl_context=self.ssl_context) as response:
                    body = await response.read()
                    if response.status not in self.ignored_status_codes:
                        content_type = response.content_type.split(';')[0]
                        if content_type in self.accepted_content_types:
                            logger.debug('Received response with Content-Type {} for {}'.format(content_type, url))
                            return body[:self.body_limit], response.get_encoding(), dict(response.headers)
                        else:
                            logger.debug('Unsupported Content-Type {}'.format(content_type))
                    else:
                        logger.debug('Status {}, skip processing'.format(response.status))
            except Exception as exc:
                logger.debug('Exception {}, skip processing'.format(exc))
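
To complement the README example, a hedged end-to-end sketch (placeholder URL, arbitrary worker and limit values) showing how the deduplication index could be inspected after a bounded crawl:

import asyncio

from digslash import sites


async def main():
    website = sites.Site('https://example.com', workers_no=4, limit=25)
    await website.crawl()
    # With deduplicate=True (the default) each checksum maps to the first URL
    # that produced it, so identical documents are reported only once.
    for checksum, url in website.checksums_index.items():
        print(checksum, url)


asyncio.run(main())
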
29 changes: 29 additions & 0 deletions setup.py
@@ -0,0 +1,29 @@
#!/usr/bin/env python

from distutils.core import setup

import digslash


setup(
    name=digslash.NAME,
    version=digslash.__version__,
    description='A site mapping and enumeration tool for Web application analysis',
    author='tasooshi',
    author_email='tasooshi@pm.me',
    license='MIT License',
    url='https://github.com/tasooshi/digslash',
    packages=['digslash'],
    install_requires=(
        'aiohttp==3.8.3',
        'beautifulsoup4==4.11.1',
    ),
    classifiers=[
        'Development Status :: 5 - Production/Stable',
        'Topic :: Utilities',
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python :: 3.9',
        'Programming Language :: Python :: 3.10',
        'Programming Language :: Python :: 3.11',
    ]
)
3 changes: 3 additions & 0 deletions tests/pytest.ini
@@ -0,0 +1,3 @@
[pytest]
log_cli = true
log_cli_level = DEBUG