Commit

Initial import
tasooshi committed May 27, 2022
1 parent dcb2131 commit 21fca0f
Showing 5 changed files with 295 additions and 1 deletion.
8 changes: 8 additions & 0 deletions .gitignore
@@ -0,0 +1,8 @@
.venv
__pycache__
*.pyc
downloads/
workers/
*.egg-info
.DS_Store
build/
51 changes: 50 additions & 1 deletion README.md
@@ -1,2 +1,51 @@
# torboost
Download utility for Tor

> Download utility for Tor
## About

This tool was designed specifically for downloading big files from onion services for analysis. It does so by retrieving chunks over multiple Tor circuits, so the server must support byte ranges (`Accept-Ranges`; most servers do). If a given part fails (connection issues, a chunk smaller than expected), it is put back into the queue. Once all pieces are downloaded, they are combined and the final file is saved in the `./downloads` directory.
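
For a rough idea of what a single chunk request boils down to, here is a minimal sketch (it assumes a Tor SOCKS proxy already listening on `localhost:9050`; torboost launches and manages several of its own, and the URL and byte range below are placeholders):

```python
# Minimal sketch: fetch one byte range of a file over a single Tor circuit.
# torboost spreads such requests across several Tor processes and re-queues
# chunks that fail or come back smaller than expected.
import requests

proxies = {
    'http': 'socks5h://localhost:9050',
    'https': 'socks5h://localhost:9050',
}
headers = {'Range': 'bytes=0-49999999'}  # first ~50 MB chunk
response = requests.get('http://example.onion/data.zip',
                        headers=headers, proxies=proxies, stream=True)
with open('0-49999999.chunk', 'wb') as chunk_file:
    chunk_file.write(response.raw.read())
```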

## Warning

**This way of utilizing the Tor network reduces your anonymity!**

## Installation

`$ pip install torboost`

## Usage

You may need to wait a while until all circuits are established:

`$ torboost -u 'http://example.onion/data.zip'`

If you want to combine the files before the download is finished:

`$ torboost --combine -u 'http://example.onion/data.zip'`

### Arguments

```
usage: torboost [-h] -u URL [-p TOR_PROCESSES] [--control-port-start CONTROL_PORT_START] [--socks-port-start SOCKS_PORT_START] [--timeout TIMEOUT] [--chunk-size CHUNK_SIZE] [--user-agent USER_AGENT] [--debug]
                [--combine]

Utility for downloading files from onion services using multiple Tor circuits

options:
  -h, --help            show this help message and exit
  -u URL, --url URL     Download URL (default: None)
  -p TOR_PROCESSES, --tor-processes TOR_PROCESSES
                        Number of Tor processes (default: 5)
  --control-port-start CONTROL_PORT_START
                        Start port for Tor control (default: 10080)
  --socks-port-start SOCKS_PORT_START
                        Start port for SOCKS (default: 9080)
  --timeout TIMEOUT     Timeout for Tor relay connection (default: 300)
  --chunk-size CHUNK_SIZE
                        Size of a single download block (in bytes) (default: 50000000)
  --user-agent USER_AGENT
                        User-Agent header (default: python-requests/2.27.1)
  --debug               Enable debugging mode (verbose output) (default: INFO)
  --combine             Combine all chunks downloaded so far (default: False)
```
47 changes: 47 additions & 0 deletions setup.py
@@ -0,0 +1,47 @@
# -*- coding: utf-8 -*-
#######################################################################
# License: MIT License #
# Homepage: https://github.com/tasooshi/torboost/ #
# Version: 0.9.0 #
#######################################################################

import setuptools


with open('README.md') as f:
    long_description = f.read()


setuptools.setup(
    name='torboost',
    version='0.9.0',
    author='tasooshi',
    author_email='tasooshi@pm.me',
    description='Download utility for Tor',
    license='MIT License',
    keywords=[
        'Tor',
        'onion',
        'download',
    ],
    long_description=long_description,
    long_description_content_type='text/markdown',
    url='https://github.com/tasooshi/torboost/',
    packages=setuptools.find_packages(),
    install_requires=(
        'requests[socks]==2.27.1',
        'stem==1.8.0',
    ),
    entry_points={
        'console_scripts': (
            'torboost=torboost.torboost:entry_point',
        ),
    },
    classifiers=[
        'Development Status :: 4 - Beta',
        'Topic :: Utilities',
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python :: 3.9',
        'Programming Language :: Python :: 3.10',
    ],
)
Empty file added torboost/__init__.py
Empty file.
190 changes: 190 additions & 0 deletions torboost/torboost.py
@@ -0,0 +1,190 @@
from urllib import parse
import argparse
import hashlib
import logging
import pathlib
import queue
import requests
import shutil
import stem.process
import threading
import urllib3.exceptions


__version__ = '0.9.0'


logging.basicConfig(format='%(name)s %(levelname)s [%(asctime)s] %(message)s', level=logging.INFO)
logger = logging.getLogger('torboost')


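# Orchestrates a pool of Tor processes and worker threads that download a file
# in byte-range chunks over separate circuits and reassemble it afterwards.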
class TorBoost:

    WORKERS_DIR = 'workers'
    WORKER_PREFIX = 'torboost-'
    DOWNLOADS_DIR = 'downloads'

    def __init__(self, args):
        self.args = args
        self.procs = dict()
        self.content_size = None
        self.queue = queue.Queue()
        self.workers = list()
        self.url_hash = hashlib.sha256(self.args.url.encode('ascii')).hexdigest()
        self.output_dir = pathlib.Path(self.DOWNLOADS_DIR, self.url_hash)
        pathlib.Path(self.WORKERS_DIR).mkdir(parents=True, exist_ok=True)

    def print_bootstrap(self, line):
        if 'Bootstrapped ' in line:
            logger.info(line)

    def request(self, headers, socks_port):
        headers.update({
            'User-Agent': self.args.user_agent
        })
        proxies = {
            'http': f'socks5h://localhost:{socks_port}',
            'https': f'socks5h://localhost:{socks_port}',
        }
        return requests.get(self.args.url, headers=headers, proxies=proxies, stream=True)

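    # Worker loop: pulls (chunk, proc_no) pairs from the queue, fetches the
    # byte range through the assigned SOCKS port, and re-queues the chunk if
    # the connection drops or the saved file is smaller than expected.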
    def worker(self):
        name = threading.current_thread().name
        while True:
            chunk, proc_no = self.queue.get()
            output = self.output_dir / f'{chunk[0]}-{chunk[1]}.chunk'
            expected_size = chunk[1] - chunk[0] + 1
            if output.is_file() and output.stat().st_size == expected_size:
                output_size = output.stat().st_size
                logger.debug(f'Chunk {chunk} already exists (size: {output_size}), skipping')
                self.queue.task_done()
                continue
            proc_config = self.procs[proc_no]
            socks_port = proc_config['SocksPort']
            headers = {'Range': f'bytes={chunk[0]}-{chunk[1]}'}
            logger.debug(f'Worker [{name}] is requesting chunk {chunk}')
            failed = False
            try:
                response = self.request(headers, socks_port)
                logger.debug(f'Response headers for {chunk}: {response.headers}')
            except requests.exceptions.ConnectionError:
                logger.debug(f'Worker [{name}] failed on connect, putting chunk {chunk} back to the queue')
                failed = True
            else:
                logger.info(f'Worker [{name}] is downloading chunk {chunk}')
                try:
                    with open(str(output), 'wb') as fil:
                        fil.write(response.raw.read())
                    output_size = output.stat().st_size
                    if output_size != expected_size:
                        logger.debug(f'Invalid chunk size ({output_size}, expected: {expected_size}) for {chunk}, putting it back to the queue')
                        failed = True
                except urllib3.exceptions.ProtocolError:
                    logger.debug(f'Worker [{name}] failed on read, putting chunk {chunk} back to the queue')
                    failed = True
                else:
                    if not failed:
                        logger.info(f'Worker [{name}] saved chunk {chunk}')
            if failed:
                self.queue.put((chunk, int(name)))
            self.queue.task_done()

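    # Launches one Tor process with its own SocksPort, ControlPort and
    # DataDirectory, and records its configuration in self.procs.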
    def tor_proc(self, proc_no):
        data_dir = pathlib.Path(self.WORKERS_DIR, self.WORKER_PREFIX + str(proc_no))
        socks_port = self.args.socks_port_start + proc_no
        control_port = self.args.control_port_start + proc_no
        config = {
            'SocksPort': str(socks_port),
            'ControlPort': str(control_port),
            'DataDirectory': str(data_dir),
        }
        logger.info(f'Bootstrapping Tor process {proc_no}')
        logger.debug(f'with config: {config}')
        proc = stem.process.launch_tor_with_config(
            take_ownership=True,
            config=config,
            timeout=self.args.timeout,
            init_msg_handler=self.print_bootstrap,
        )
        self.procs[proc_no] = config
        self.procs[proc_no]['process'] = proc

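    # Concatenates the downloaded *.chunk files in byte-offset order into a
    # single file named after the last component of the URL path.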
    def combine(self):
        logger.info('Combining...')
        files = sorted([
            fil for fil in pathlib.os.listdir(self.output_dir) if fil.endswith('.chunk')
        ], key=lambda x: int(x.split('-')[0]))
        orig_name = pathlib.posixpath.basename(
            parse.unquote(parse.urlparse(self.args.url).path)
        )
        output_path = pathlib.Path(self.DOWNLOADS_DIR, orig_name)
        if output_path.exists():
            output_path.unlink()
        with open(output_path, 'ab') as dest_file:
            for fil in files:
                with open(self.output_dir / fil, 'rb') as inp:
                    shutil.copyfileobj(inp, dest_file)
        logger.info(f'Saved: {orig_name} to {self.DOWNLOADS_DIR}')

    def connect(self):
        for proc_no in range(self.args.tor_processes):
            self.tor_proc(proc_no)

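    # Splits the download into fixed-size byte ranges, queues them, starts one
    # worker thread per Tor process, and combines the chunks once the queue is
    # drained.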
    def start(self):
        if not self.content_size:
            raise RuntimeError('TorBoost.content_size must be set first!')
        self.output_dir.mkdir(parents=True, exist_ok=True)
        logger.info(f'Chunks are being saved in {self.output_dir}')

        chunks = list()
        chunk_no = int(self.content_size / self.args.chunk_size)
        for i in range(0, chunk_no):
            chunk_start = i * self.args.chunk_size
            chunk_end = chunk_start + self.args.chunk_size
            chunks.append((chunk_start, chunk_end - 1))
        chunks.append((chunk_end, self.content_size - 1))

        for idx, chunk in enumerate(chunks, start=1):
            self.queue.put((chunk, (self.args.tor_processes - 1) % idx))

        # NOTE: Same number of workers as there are Tor sockets
        for worker in range(self.args.tor_processes):
            thread = threading.Thread(name=str(worker), target=self.worker, daemon=True)
            thread.start()
            self.workers.append(thread)
        self.queue.join()
        self.combine()


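# Command-line entry point: either combines the chunks downloaded so far and
# exits, or bootstraps the Tor processes, reads the Content-Length and starts
# the download.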
def entry_point():
    default_headers = requests.utils.default_headers()
    parser = argparse.ArgumentParser(
        description='Utility for downloading files from onion services using multiple Tor circuits',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument('-u', '--url', required=True, help='Download URL')
    parser.add_argument('-p', '--tor-processes', default=5, type=int, help='Number of Tor processes')
    parser.add_argument('--control-port-start', default=10080, type=int, help='First port for Tor control')
    parser.add_argument('--socks-port-start', default=9080, type=int, help='First port for SOCKS')
    parser.add_argument('--timeout', default=300, type=int, help='Timeout for Tor relay connection')
    parser.add_argument('--chunk-size', default=50000000, type=int, help='Size of a single download block (in bytes)')
    parser.add_argument('--user-agent', default=default_headers['User-Agent'], help='User-Agent header')
    parser.add_argument('--debug', action='store_const', dest='loglevel', const=logging.DEBUG, default=logging.INFO, help='Enable debugging mode (verbose output)')
    parser.add_argument('--combine', action='store_true', help='Combine all chunks downloaded so far')
    args = parser.parse_args()
    logger.setLevel(args.loglevel)
    boost = TorBoost(args)
    if args.combine:
        boost.combine()
        exit()
    boost.connect()
    # NOTE: Do NOT use HEAD here, leads to inconsistent results
    response = boost.request({'Accept-Encoding': 'identity'}, boost.procs[0]['SocksPort'])
    logger.debug(f'Initial response headers: {response.headers}')
    boost.content_size = int(response.headers['Content-Length'])
    logger.info(f'Download size: {boost.content_size}')
    boost.start()


if __name__ == '__main__':
    entry_point()
