-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
25 changed files
with
883 additions
and
675 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
.pytest_cache | ||
__pycache__ | ||
.DS_Store | ||
*.sublime-project | ||
*.sublime-workspace | ||
MANIFEST | ||
dist/ | ||
build/ | ||
.venv | ||
digslash.egg* |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,46 @@ | ||
# digslash | ||
A crawler / sitemap builder and enumeration tool for Web applications | ||
|
||
> A site mapping and enumeration tool for Web applications analysis | ||
Usage: | ||
|
||
>>> import asyncio | ||
>>> from digslash import sites | ||
>>> website = sites.Site('https://example.com', limit=50, deduplicate=False) | ||
>>> asyncio.run(website.crawl()) | ||
|
||
Output example: | ||
|
||
>>> dict(website.results) | ||
{ | ||
'https://example.com/': { | ||
'checksum': '17913e89fe23ba03081f5f5b650c29a0', | ||
'encoding': 'utf-8', | ||
'source': '' | ||
}, | ||
'https://example.com/js/script.js': { | ||
'checksum': '4cdad7e5affe29e1347343e126beea09', | ||
'encoding': 'ascii', | ||
'source': 'https://example.com/' | ||
}, | ||
'https://example.com/pages/about.html': { | ||
'checksum': 'fad033d51adc628b17268ce2669543fd', | ||
'encoding': 'utf-8', | ||
'source': 'https://example.com/' | ||
}, | ||
'https://example.com/pages/contact.html': { | ||
'checksum': 'b12c6a2fde381552564eea6a477030f0', | ||
'encoding': 'utf-8', | ||
'source': 'https://example.com/' | ||
}, | ||
'https://example.com/pages/feedback.html': { | ||
'checksum': '9b0482107470956f7b64c833f1ef5e59', | ||
'encoding': 'utf-8', | ||
'source': 'https://example.com/' | ||
}, | ||
'https://example.com/scripts/feedback.html': { | ||
'checksum': 'b92c8a06f3d4a9c22e8c11606bcbd2f7', | ||
'encoding': 'utf-8', | ||
'source': 'https://example.com/pages/feedback.html' | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
import logging | ||
|
||
|
||
NAME = 'digslash' | ||
VERSION = (1, 0, 0) | ||
__version__ = '.'.join([str(i) for i in VERSION]) | ||
|
||
|
||
logger = logging.getLogger(NAME) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
import hashlib | ||
import urllib.parse | ||
|
||
import bs4 | ||
|
||
from digslash import logger | ||
|
||
|
||
class Node:
    """A single fetched document.

    Decodes the raw response body, parses it as HTML and extracts, filters
    and rebases the outgoing links so the owning ``Site`` can queue them.
    """

    # (element, attribute) pairs whose attribute values are followed as links.
    FOLLOWED_ELEMENTS_ATTRS = (
        ('a', 'href'),
        ('script', 'src'),
        ('iframe', 'src'),
        ('form', 'action'),
    )

    def __init__(self, site, content, encoding='ascii', current=None):
        """Initialize the node.

        Args:
            site: Owning ``Site`` instance (provides base URL and filters).
            content: Raw response body as bytes.
            encoding: Encoding used to decode ``content``.
            current: URL this document was fetched from; defaults to the
                site's base URL when not given.
        """
        self.site = site
        self.encoding = encoding
        self.results = set()
        self._checksum = None  # Lazily computed by the `checksum` property.
        if current:
            self.current = urllib.parse.urlsplit(current)
        else:
            self.current = self.site.urlsplit
        try:
            self.content = content.decode(self.encoding)
        except Exception as exc:
            # Fixed garbled message (was: "Cannot decode due {url} to {exc}").
            logger.debug(f'Cannot decode {current} due to {exc}')
            # Undecodable bodies are skipped entirely by `process`.
            self.content = None
        else:
            self.parser = bs4.BeautifulSoup(content, 'html.parser')

    def extract_links(self):
        """Collect raw link values from the followed element/attribute pairs."""
        for elem, attr in self.FOLLOWED_ELEMENTS_ATTRS:
            for ele in self.parser.find_all(elem):
                try:
                    self.results.add(ele[attr])
                except KeyError:
                    # Element lacks the attribute (e.g. <a> without href).
                    pass

    def links_filter(self):
        """Drop non-HTTP(S), foreign-host and ignored-pattern links."""
        refined = set()
        for link in self.results:
            url = urllib.parse.urlparse(link)
            discard = False
            # Keep only http(s) links and scheme-less (relative) references.
            if url.scheme not in ['http', 'https'] and url.scheme != '':
                discard = True
            for sep in self.site.paths_ignored:
                if sep in url.path or (not url.netloc and sep in link):
                    discard = True
                    break
            # Same-host or relative links only.
            if not discard and (url.netloc == self.current.netloc or not url.netloc):
                refined.add(link)
        self.results = refined

    def links_rebase(self):
        """Rewrite every collected link as absolute against the site base."""
        refined = set()
        for link in self.results:
            split_url = urllib.parse.urlsplit(link)
            # Strip scheme/netloc so the remainder resolves against the base.
            rebased_url = urllib.parse.urljoin(
                self.site.base, urllib.parse.SplitResult('', '', split_url.path, split_url.query, split_url.fragment).geturl()
            )
            refined.add(rebased_url)
        self.results = refined

    @property
    def checksum(self):
        """MD5 hex digest of the decoded content, computed once and cached."""
        if self._checksum is None:
            self._checksum = hashlib.md5(self.content.encode(self.encoding)).hexdigest()
        return self._checksum

    def process(self):
        """Extract, filter and rebase links from the document.

        Returns:
            dict with ``encoding``, ``checksum`` and ``links`` keys, or
            ``None`` when the body could not be decoded.
        """
        if self.content is None:
            return
        self.extract_links()
        self.links_filter()
        self.links_rebase()
        return {
            'encoding': self.encoding,
            'checksum': self.checksum,
            'links': self.results,
        }
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,164 @@ | ||
import asyncio | ||
import urllib.parse | ||
import ssl | ||
from collections import defaultdict | ||
|
||
import aiohttp | ||
|
||
from digslash import ( | ||
logger, | ||
nodes, | ||
) | ||
|
||
|
||
class Site:
    """Asynchronous crawler that maps a single website from a base URL.

    Spawns a pool of worker tasks that pull URLs from a queue, fetch them,
    hand the bodies to ``nodes.Node`` for link extraction and feed newly
    discovered links back into the queue until it drains or ``limit``
    results have been collected.
    """

    # Content types whose responses are accepted for link processing.
    ACCEPTED_CONTENT_TYPES = (
        'text/html',
        'text/xml',
        'text/plain',
        'text/javascript',
        'application/javascript',
        'application/json',
        'application/xml',
        'application/xhtml+xml',
        'application/octet-stream',
    )

    # Substrings that mark links assembled in JavaScript source or
    # fragment-only references; links containing them are discarded.
    PATHS_IGNORED = (
        ' + ',
        '\'+',
        '+\'',
        '\\\'',
        '#',
    )

    def __init__(self,
                 base,
                 deduplicate=True,
                 workers_no=16,
                 limit=100,
                 store_content=False,
                 store_headers=False,
                 accepted_content_types=ACCEPTED_CONTENT_TYPES,
                 ignored_status_codes=None,
                 verify_ssl=True,
                 body_limit=1000000,
                 paths_ignored=PATHS_IGNORED
                 ):
        """Configure the crawl.

        Args:
            base: Starting (and scoping) URL for the crawl.
            deduplicate: Skip pages whose body checksum was already seen.
            workers_no: Number of concurrent worker tasks.
            limit: Maximum number of results to collect (falsy = unlimited).
            store_content: Keep each response body in the results.
            store_headers: Keep each response's headers in the results.
            accepted_content_types: Content types accepted for processing.
            ignored_status_codes: HTTP statuses whose responses are skipped.
            verify_ssl: Validate server certificates and hostnames.
            body_limit: Maximum number of body bytes retained per response.
            paths_ignored: Substrings that disqualify a link (see PATHS_IGNORED).
        """
        self.base = base
        self.urlsplit = urllib.parse.urlsplit(base)
        self.deduplicate = deduplicate
        self.workers_no = workers_no
        self.limit = limit
        self.queue = None  # Created in crawl() so it binds to the running loop.
        self.results = defaultdict(dict)
        self.workers = list()
        self.checksums_index = dict()  # checksum -> first URL seen with it
        self.store_content = store_content
        self.accepted_content_types = accepted_content_types
        self.ignored_status_codes = tuple() if not ignored_status_codes else ignored_status_codes
        self.verify_ssl = verify_ssl
        self.body_limit = body_limit
        self.paths_ignored = paths_ignored
        self.store_headers = store_headers
        if verify_ssl:
            # Bug fix: the previous code built a bare SSLContext (which
            # defaults to CERT_NONE) and disabled hostname checks, so
            # verify_ssl=True never actually validated certificates.
            self.ssl_context = ssl.create_default_context()
        else:
            self.ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
            # check_hostname must be disabled before verify_mode is relaxed.
            self.ssl_context.check_hostname = False
            self.ssl_context.verify_mode = ssl.CERT_NONE

    async def worker(self):
        """Consume queued URLs until the queue empties or the limit is hit."""
        while self.is_populated():
            if self.limit and len(self.results) >= self.limit:
                self.queue_flush()
                break
            url, source = await self.queue.get()
            logger.info('Processing {}'.format(url))
            response = await self.fetch(url)
            if response:
                body, encoding, headers = response
                node = nodes.Node(self, body, encoding, current=url)
                results = node.process()
                if results:
                    self.done(url, source, results, body, headers)
                    self.queue_add(results['links'], url)
            self.worker_done()

    def worker_done(self):
        """Mark the current queue item as processed."""
        self.queue.task_done()

    def done(self, url, source, results, body, headers):
        """Record a processed page, honoring deduplication and storage flags."""
        entry = {
            'source': source,
            'encoding': results['encoding'],
            'checksum': results['checksum'],
        }
        if self.store_content:
            entry['body'] = body
        if self.store_headers:
            entry['headers'] = headers
        if self.deduplicate:
            # Only the first URL carrying a given body checksum is kept.
            if results['checksum'] not in self.checksums_index:
                self.results[url] = entry
                self.checksums_index[results['checksum']] = url
        else:
            self.results[url] = entry

    def queue_flush(self):
        """Drain the queue so `queue.join()` can complete after hitting the limit."""
        for _ in range(self.queue.qsize()):
            self.queue.get_nowait()
            self.queue.task_done()

    def queue_add(self, url_list, source=''):
        """Enqueue URLs not already processed, queued or equal to their source."""
        for url in url_list:
            if (
                url not in self.results and
                url not in dict(self.queue._queue) and  # FIXME: Optimize
                url != source
            ):
                logger.debug('Added to queue: ' + url)
                self.queue.put_nowait((url, source))

    def is_populated(self):
        """Return True while there are URLs waiting in the queue."""
        return not self.queue.empty()

    def handle_worker_result(self, worker):
        """Done-callback: surface worker exceptions without crashing the crawl."""
        try:
            worker.result()
        except asyncio.CancelledError:
            pass
        except Exception:
            logger.exception('Exception raised by {}'.format(worker))

    async def crawl(self):
        """Run the crawl: start workers, seed the queue and wait for it to drain."""
        self.queue = asyncio.Queue()
        for i in range(self.workers_no):
            worker = asyncio.create_task(self.worker())
            worker.add_done_callback(self.handle_worker_result)
            self.workers.append(worker)
        # Initialize queue with the base URL
        self.queue_add([self.base])
        await self.queue.join()

    async def stop(self):
        """Cancel all workers and wait for them to finish."""
        for worker in self.workers:
            worker.cancel()
        await asyncio.gather(*self.workers)
        self.workers = None

    async def fetch(self, url):
        """GET *url*; return (body, encoding, headers) or None when skipped.

        Responses with an ignored status code, an unaccepted content type,
        or any transport error are logged and yield None.
        """
        async with aiohttp.ClientSession() as session:
            try:
                # `ssl=` replaces the aiohttp 3.x-deprecated `ssl_context=`.
                async with session.get(url, ssl=self.ssl_context) as response:
                    body = await response.read()
                    if response.status in self.ignored_status_codes:
                        logger.debug('Status {}, skip processing'.format(response.status))
                        return
                    # Strip any parameters (e.g. "; charset=...") before the
                    # comparison; previously this value was computed but the
                    # raw header was compared instead.
                    content_type = response.content_type.split(';')[0]
                    if content_type in self.accepted_content_types:
                        logger.debug('Received response with Content-Type {} for {}'.format(content_type, url))
                        return body[:self.body_limit], response.get_encoding(), dict(response.headers)
                    logger.debug('Unsupported Content-Type {}'.format(content_type))
            except Exception as exc:
                logger.debug('Exception {}, skip processing'.format(exc))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
#!/usr/bin/env python
"""Packaging script for digslash."""

# distutils was deprecated by PEP 632 and removed in Python 3.12;
# setuptools provides a drop-in `setup` and is the standard build backend
# for the Python versions advertised in the classifiers below.
from setuptools import setup

import digslash


setup(
    name=digslash.NAME,
    version=digslash.__version__,
    description='A site mapping and enumeration tool for Web applications analysis',
    author='tasooshi',
    author_email='tasooshi@pm.me',
    license='MIT License',
    url='https://github.com/tasooshi/digslash',
    packages=['digslash'],
    install_requires=(
        'aiohttp==3.8.3',
        'beautifulsoup4==4.11.1',
    ),
    classifiers=[
        'Development Status :: 5 - Production/Stable',
        'Topic :: Utilities',
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python :: 3.9',
        'Programming Language :: Python :: 3.10',
        'Programming Language :: Python :: 3.11',
    ]
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
[pytest] | ||
log_cli = true | ||
log_cli_level = DEBUG |
Oops, something went wrong.