Hard reset, release v1.0.0
tasooshi committed Dec 9, 2022
1 parent 4c16082 commit b02ada4
Showing 25 changed files with 883 additions and 675 deletions.
10 changes: 10 additions & 0 deletions .gitignore
@@ -0,0 +1,10 @@
.pytest_cache
__pycache__
.DS_Store
*.sublime-project
*.sublime-workspace
MANIFEST
dist/
build/
.venv
digslash.egg*
695 changes: 21 additions & 674 deletions LICENSE

Large diffs are not rendered by default.

46 changes: 45 additions & 1 deletion README.md
@@ -1,2 +1,46 @@
# digslash
A crawler / sitemap builder and enumeration tool for Web applications

> A site mapping and enumeration tool for Web application analysis

Usage:

>>> import asyncio
>>> from digslash import sites
>>> website = sites.Site('https://example.com', limit=50, deduplicate=False)
>>> asyncio.run(website.crawl())

Output example:

>>> dict(website.results)
{
    'https://example.com/': {
        'checksum': '17913e89fe23ba03081f5f5b650c29a0',
        'encoding': 'utf-8',
        'source': ''
    },
    'https://example.com/js/script.js': {
        'checksum': '4cdad7e5affe29e1347343e126beea09',
        'encoding': 'ascii',
        'source': 'https://example.com/'
    },
    'https://example.com/pages/about.html': {
        'checksum': 'fad033d51adc628b17268ce2669543fd',
        'encoding': 'utf-8',
        'source': 'https://example.com/'
    },
    'https://example.com/pages/contact.html': {
        'checksum': 'b12c6a2fde381552564eea6a477030f0',
        'encoding': 'utf-8',
        'source': 'https://example.com/'
    },
    'https://example.com/pages/feedback.html': {
        'checksum': '9b0482107470956f7b64c833f1ef5e59',
        'encoding': 'utf-8',
        'source': 'https://example.com/'
    },
    'https://example.com/scripts/feedback.html': {
        'checksum': 'b92c8a06f3d4a9c22e8c11606bcbd2f7',
        'encoding': 'utf-8',
        'source': 'https://example.com/pages/feedback.html'
    }
}
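
For reference, a minimal sketch (placeholder target URL, arbitrary limit) combining the optional store_content, store_headers and verify_ssl flags defined in digslash/sites.py below:

>>> import asyncio
>>> from digslash import sites
>>> website = sites.Site(
...     'https://example.com',
...     limit=20,
...     store_content=True,   # keep the (truncated) response body under 'body'
...     store_headers=True,   # keep the response headers under 'headers'
...     verify_ssl=False,     # disable certificate verification
... )
>>> asyncio.run(website.crawl())
>>> for url, entry in website.results.items():
...     print(url, entry['checksum'], len(entry['body']))
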
9 changes: 9 additions & 0 deletions digslash/__init__.py
@@ -0,0 +1,9 @@
import logging


NAME = 'digslash'
VERSION = (1, 0, 0)
__version__ = '.'.join([str(i) for i in VERSION])


logger = logging.getLogger(NAME)
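
A short sketch of how a caller might surface the crawler's log output; the package only creates the 'digslash' logger here and attaches no handlers, so standard logging configuration applies (the level shown is just an example):

import logging

# All digslash messages go through logging.getLogger('digslash'); a basic
# root-logger configuration is enough to see them.
logging.basicConfig(level=logging.DEBUG)
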
84 changes: 84 additions & 0 deletions digslash/nodes.py
@@ -0,0 +1,84 @@
import hashlib
import urllib.parse

import bs4

from digslash import logger


class Node:

    FOLLOWED_ELEMENTS_ATTRS = (
        ('a', 'href'),
        ('script', 'src'),
        ('iframe', 'src'),
        ('form', 'action'),
    )

    def __init__(self, site, content, encoding='ascii', current=None):
        self.site = site
        self.encoding = encoding
        self.results = set()
        self._checksum = None
        if current:
            self.current = urllib.parse.urlsplit(current)
        else:
            self.current = self.site.urlsplit
        try:
            self.content = content.decode(self.encoding)
        except Exception as exc:
            logger.debug(f'Cannot decode {current} due to {exc}')
            self.content = None
        else:
            self.parser = bs4.BeautifulSoup(content, 'html.parser')

    def extract_links(self):
        for elem, attr in self.FOLLOWED_ELEMENTS_ATTRS:
            for ele in self.parser.find_all(elem):
                try:
                    self.results.add(ele[attr])
                except KeyError:
                    pass

    def links_filter(self):
        refined = set()
        for link in self.results:
            url = urllib.parse.urlparse(link)
            discard = False
            if url.scheme not in ['http', 'https'] and url.scheme != '':
                discard = True
            for sep in self.site.paths_ignored:
                if sep in url.path or (not url.netloc and sep in link):
                    discard = True
                    break
            if not discard and (url.netloc == self.current.netloc or not url.netloc):
                refined.add(link)
        self.results = refined

    def links_rebase(self):
        refined = set()
        for link in self.results:
            split_url = urllib.parse.urlsplit(link)
            rebased_url = urllib.parse.urljoin(
                self.site.base, urllib.parse.SplitResult('', '', split_url.path, split_url.query, split_url.fragment).geturl()
            )
            refined.add(rebased_url)
        self.results = refined

    @property
    def checksum(self):
        if self._checksum is None:
            self._checksum = hashlib.md5(self.content.encode(self.encoding)).hexdigest()
        return self._checksum

    def process(self):
        if self.content is None:
            return
        self.extract_links()
        self.links_filter()
        self.links_rebase()
        return {
            'encoding': self.encoding,
            'checksum': self.checksum,
            'links': self.results,
        }
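
A rough, hedged illustration of the Node pipeline above (extract links, filter them, rebase them against the site base); the Site instance and the HTML snippet are made up for the example:

from digslash import nodes, sites

site = sites.Site('https://example.com')
html = b'<a href="/pages/about.html">About</a><script src="js/script.js"></script>'

node = nodes.Node(site, html, encoding='utf-8')
result = node.process()
# result['links'] should contain the absolute, same-host URLs, e.g.
# {'https://example.com/pages/about.html', 'https://example.com/js/script.js'}
print(result['checksum'], result['links'])
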
164 changes: 164 additions & 0 deletions digslash/sites.py
@@ -0,0 +1,164 @@
import asyncio
import urllib.parse
import ssl
from collections import defaultdict

import aiohttp

from digslash import (
    logger,
    nodes,
)


class Site:

    ACCEPTED_CONTENT_TYPES = (
        'text/html',
        'text/xml',
        'text/plain',
        'text/javascript',
        'application/javascript',
        'application/json',
        'application/xml',
        'application/xhtml+xml',
        'application/octet-stream',
    )

    PATHS_IGNORED = (
        ' + ',
        '\'+',
        '+\'',
        '\\\'',
        '#',
    )

    def __init__(self,
                 base,
                 deduplicate=True,
                 workers_no=16,
                 limit=100,
                 store_content=False,
                 store_headers=False,
                 accepted_content_types=ACCEPTED_CONTENT_TYPES,
                 ignored_status_codes=None,
                 verify_ssl=True,
                 body_limit=1000000,
                 paths_ignored=PATHS_IGNORED
                 ):
        self.base = base
        self.urlsplit = urllib.parse.urlsplit(base)
        self.deduplicate = deduplicate
        self.workers_no = workers_no
        self.limit = limit
        self.queue = None
        self.results = defaultdict(dict)
        self.workers = list()
        self.checksums_index = dict()
        self.store_content = store_content
        self.accepted_content_types = accepted_content_types
        self.ignored_status_codes = tuple() if not ignored_status_codes else ignored_status_codes
        self.verify_ssl = verify_ssl
        self.body_limit = body_limit
        self.paths_ignored = paths_ignored
        self.store_headers = store_headers
        self.ssl_context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        self.ssl_context.check_hostname = False
        if not verify_ssl:
            self.ssl_context.verify_mode = ssl.CERT_NONE

    async def worker(self):
        while self.is_populated():
            if self.limit and len(self.results) >= self.limit:
                self.queue_flush()
                break
            url, source = await self.queue.get()
            logger.info('Processing {}'.format(url))
            response = await self.fetch(url)
            if response:
                body, encoding, headers = response
                node = nodes.Node(self, body, encoding, current=url)
                results = node.process()
                if results:
                    self.done(url, source, results, body, headers)
                    self.queue_add(results['links'], url)
            self.worker_done()

    def worker_done(self):
        self.queue.task_done()

    def done(self, url, source, results, body, headers):
        entry = {
            'source': source,
            'encoding': results['encoding'],
            'checksum': results['checksum'],
        }
        if self.store_content:
            entry['body'] = body
        if self.store_headers:
            entry['headers'] = headers
        if self.deduplicate:
            if results['checksum'] not in self.checksums_index:
                self.results[url] = entry
                self.checksums_index[results['checksum']] = url
        else:
            self.results[url] = entry

    def queue_flush(self):
        for _ in range(self.queue.qsize()):
            self.queue.get_nowait()
            self.queue.task_done()

    def queue_add(self, url_list, source=''):
        for url in url_list:
            if (
                url not in self.results and
                url not in dict(self.queue._queue) and  # FIXME: Optimize
                url != source
            ):
                logger.debug('Added to queue: ' + url)
                self.queue.put_nowait((url, source))

    def is_populated(self):
        return not self.queue.empty()

    def handle_worker_result(self, worker):
        try:
            worker.result()
        except asyncio.CancelledError:
            pass
        except Exception:
            logger.exception('Exception raised by {}'.format(worker))

    async def crawl(self):
        self.queue = asyncio.Queue()
        for i in range(self.workers_no):
            worker = asyncio.create_task(self.worker())
            worker.add_done_callback(self.handle_worker_result)
            self.workers.append(worker)
        # Initialize queue with the base URL
        self.queue_add([self.base])
        await self.queue.join()

    async def stop(self):
        for worker in self.workers:
            worker.cancel()
        await asyncio.gather(*self.workers)
        self.workers = None

    async def fetch(self, url):
        async with aiohttp.ClientSession() as session:
            try:
                async with session.get(url, ssl_context=self.ssl_context) as response:
                    body = await response.read()
                    if response.status not in self.ignored_status_codes:
                        content_type = response.content_type.split(';')[0]
                        if content_type in self.accepted_content_types:
                            logger.debug('Received response with Content-Type {} for {}'.format(content_type, url))
                            return body[:self.body_limit], response.get_encoding(), dict(response.headers)
                        else:
                            logger.debug('Unsupported Content-Type {}'.format(content_type))
                    else:
                        logger.debug('Status {}, skip processing'.format(response.status))
            except Exception as exc:
                logger.debug('Exception {}, skip processing'.format(exc))
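
To complement the README example, a hedged end-to-end sketch (placeholder URL, arbitrary worker and limit values) showing how the deduplication index could be inspected after a bounded crawl:

import asyncio

from digslash import sites


async def main():
    website = sites.Site('https://example.com', workers_no=4, limit=25)
    await website.crawl()
    # With deduplicate=True (the default) each checksum maps to the first URL
    # that produced it, so identical documents are reported only once.
    for checksum, url in website.checksums_index.items():
        print(checksum, url)


asyncio.run(main())
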
29 changes: 29 additions & 0 deletions setup.py
@@ -0,0 +1,29 @@
#!/usr/bin/env python

from distutils.core import setup

import digslash


setup(
    name=digslash.NAME,
    version=digslash.__version__,
    description='A site mapping and enumeration tool for Web application analysis',
    author='tasooshi',
    author_email='tasooshi@pm.me',
    license='MIT License',
    url='https://github.com/tasooshi/digslash',
    packages=['digslash'],
    install_requires=(
        'aiohttp==3.8.3',
        'beautifulsoup4==4.11.1',
    ),
    classifiers=[
        'Development Status :: 5 - Production/Stable',
        'Topic :: Utilities',
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python :: 3.9',
        'Programming Language :: Python :: 3.10',
        'Programming Language :: Python :: 3.11',
    ]
)
3 changes: 3 additions & 0 deletions tests/pytest.ini
@@ -0,0 +1,3 @@
[pytest]
log_cli = true
log_cli_level = DEBUG