add nixos-images #29

Merged (5 commits, Mar 27, 2019)
nixos-images.py (248 additions, 0 deletions)

#!/usr/bin/env python3
import logging
import os
import re
import subprocess

from datetime import datetime
from pathlib import Path

import requests
from pyquery import PyQuery as pq
from urllib3.exceptions import MaxRetryError
from urllib3.util.retry import Retry

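# Mirror configuration. The TUNASYNC_* variables are supplied by tunasync;
# the defaults make the script runnable standalone. Channels whose index
# mtime predates CLONE_SINCE are skipped entirely, and TIMEOUT is the
# per-request timeout in seconds.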
UPSTREAM_URL = os.getenv('TUNASYNC_UPSTREAM_URL', 'https://nixos.org/channels')
WORKING_DIR = os.getenv('TUNASYNC_WORKING_DIR', 'working-images')
CLONE_SINCE = datetime(2018, 12, 1)
TIMEOUT = 60

working_dir = Path(WORKING_DIR)

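# One shared HTTP session: urllib3 transparently retries requests that fail
# with 502/503/504, and the same Retry policy is reused manually in
# download() for connection errors and truncated transfers.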
session = requests.Session()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
retry_adapter = requests.adapters.HTTPAdapter(max_retries=retries)
session.mount('http://', retry_adapter)
session.mount('https://', retry_adapter)

logging.basicConfig(
    level=logging.INFO,
    format='[%(asctime)s] %(levelname)-8s %(message)s'
)

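# Thin wrappers so every HTTP request carries the same timeout.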
def http_head(*args, **kwargs):
    return session.head(*args, timeout=TIMEOUT, **kwargs)

def http_get(*args, **kwargs):
    return session.get(*args, timeout=TIMEOUT, **kwargs)

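# Delegate hashing to the sha256sum binary, which streams the file from
# disk instead of loading a multi-gigabyte image into Python memory.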
def file_sha256(dest):
    sha = subprocess.check_output(
        ['sha256sum', str(dest)],
        universal_newlines=True
    )
    return sha.split(' ')[0]

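# Write to a hidden temp file, then rename over the destination; rename()
# is atomic on POSIX, so readers never observe a half-written file.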
def atomic_write_file(dest, contents):
    tmp_dest = dest.parent / f'.{dest.name}.tmp'
    with tmp_dest.open('w') as f:
        f.write(contents)
    tmp_dest.rename(dest)

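# Raised when the downloaded size disagrees with the Content-Length header.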
class WrongSize(RuntimeError):
    def __init__(self, expected, actual):
        super().__init__(f'Wrong file size: expected {expected}, actual {actual}')
        self.actual = actual
        self.expected = expected

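# Download into .<name>.tmp, verify the size against Content-Length, and
# only rename to the final name on success, retrying transient failures.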
def download(url, dest):
    dest.parent.mkdir(parents=True, exist_ok=True)
    download_dest = dest.parent / f'.{dest.name}.tmp'

    retry = retries

    while True:
        with http_get(url, stream=True) as res:
            res.raise_for_status()
            try:
                with download_dest.open('wb') as f:
                    for chunk in res.iter_content(chunk_size=64 * 1024 * 1024):
                        if chunk:
                            f.write(chunk)
                actual_size = download_dest.stat().st_size
                if 'Content-Length' in res.headers:
                    expected_size = int(res.headers['Content-Length'])
                    if actual_size != expected_size:
                        raise WrongSize(expected=expected_size, actual=actual_size)

                break
            except (requests.exceptions.ConnectionError, WrongSize) as e:
                logging.warning(e)
                # Retry.increment() raises MaxRetryError once the retry
                # budget is exhausted; it never returns None.
                try:
                    retry = retry.increment(method='GET', url=url, error=e)
                except MaxRetryError:
                    raise e
                logging.warning(f'Retrying download: {retry}')

    download_dest.rename(dest)

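# Parse a directory-index page: each data row has five cells, with the
# link target in the second cell and the last-modified time in the third.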
def get_links(url):
    r = http_get(url)
    r.raise_for_status()

    node = pq(r.content)

    links = []
    for row in node('tr'):
        td = pq(row)('td')
        if len(td) != 5:
            continue

        link_target = td[1].find('a').get('href')
        if link_target.startswith('/'):
            # Link to parent directory
            continue

        last_updated = td[2].text.strip()

        links.append((link_target, last_updated))

    return links

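# Scrape a channel release page: the tagline paragraph carries the release
# date ("Released on ... from ..."), and the three-column table lists each
# file's name, size, and SHA-256 hash.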
def get_channel(chan_location):
    release_res = http_get(chan_location)
    release_res.raise_for_status()

    node = pq(release_res.text)

    tagline = node('p').text()

    tagline_res = re.match(r'^Released on (.+) from', tagline)

    assert tagline_res is not None

    released_time = tagline_res[1]

    files = []

    for row in node('tr'):
        td = pq(row)('td')
        if len(td) != 3:
            continue
        file_name, file_size, file_hash = (pq(x).text() for x in td)
        files.append((file_name, file_size, file_hash))

    return {
        'released_time': released_time,
        'files': files
    }

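# Main sync loop. For each dated stable channel: follow the channel
# redirect to the current release, skip it if .last-url shows we already
# have it, download the .iso/.ova images under temporary names, verify
# their hashes, then atomically rename the images, prune stale files, and
# write SHA256SUMS. .last-url is updated last, so an interrupted or failed
# run is simply retried on the next pass.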
def clone_images():
    for channel, chan_updated in get_links(f'{UPSTREAM_URL}/'):
        if not channel.startswith('nixos-') \
                or channel.endswith('-small') \
                or channel == 'nixos-unstable':
            continue

        if datetime.strptime(chan_updated, '%Y-%m-%d %H:%M') < CLONE_SINCE:
            continue

        chan_path = working_dir / channel
        chan_path.mkdir(parents=True, exist_ok=True)

        res = http_head(f'{UPSTREAM_URL}/{channel}', allow_redirects=False)
        res.raise_for_status()

        chan_location = res.headers['Location']
        chan_release_basename = chan_location.split('/')[-1]

        try:
            last_url = (chan_path / '.last-url').read_text()
        except (IOError, OSError):
            last_url = 'not available'

        if chan_location == last_url:
            continue

        logging.info(f'- {channel} -> {chan_release_basename}')

        # Matches nixos-19.03 -> nixos-19.03beta171840.23fd1394dc6
        #                        ^-------------^
        if chan_release_basename.startswith(channel + 'beta'):
            logging.info('  - Beta channel, not updating')
            continue

        chan_info = get_channel(chan_location)

        atomic_write_file(chan_path / '.released-time', chan_info['released_time'])

        has_hash_fail = False

        keep_files = {'.last-url', '.released-time'}
        rename_files = []

        logging.info('  - Downloading new files')

        chan_version = channel.split('-', 1)[1]
        chan_release_version = chan_release_basename.split('-', 1)[1]

        simplify_name = lambda fname: fname.replace(f'-{chan_release_version}-', f'-{chan_version}-')

        image_files = [
            (simplify_name(file_name), file_name, file_hash)
            for file_name, _file_size, file_hash in chan_info['files']
            if file_name.endswith('.iso') or file_name.endswith('.ova')
        ]

        for mirror_file_name, upstream_file_name, file_hash in image_files:
            keep_files.add(mirror_file_name)
            logging.info(f'    - {upstream_file_name} -> {mirror_file_name}')
            tmp_dest = f'.update.{upstream_file_name}'
            rename_files.append((tmp_dest, mirror_file_name))

            download(f'{chan_location}/{upstream_file_name}', chan_path / tmp_dest)
            actual_hash = file_sha256(chan_path / tmp_dest)

            if file_hash != actual_hash:
                has_hash_fail = True
                logging.error('      - Incorrect hash')
                logging.error(f'        actual   {actual_hash}')
                logging.error(f'        expected {file_hash}')
                logging.info(f'      - File saved as {tmp_dest}')

        if has_hash_fail:
            logging.warning('  - Found bad files. Will retry next time.')
        else:
            logging.info('  - Renaming files')

            for tmp_dest, mirror_file_name in rename_files:
                (chan_path / tmp_dest).rename(chan_path / mirror_file_name)

            logging.info('  - Removing stale files')

            for file_path in chan_path.iterdir():
                file_name = file_path.name

                if file_name not in keep_files:
                    logging.info(f'    - {file_name}')
                    file_path.unlink()

            logging.info('  - Writing SHA256SUMS')

            with (chan_path / 'SHA256SUMS').open('w') as f:
                for mirror_file_name, _upstream_file_name, file_hash in image_files:
                    f.write(f'{file_hash} *{mirror_file_name}\n')

            logging.info('  - Update finished')
            atomic_write_file(chan_path / '.last-url', chan_location)

if __name__ == "__main__":
    clone_images()
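
# Example invocation (assumed deployment; the working directory path below
# is illustrative, not part of this PR). tunasync normally supplies the
# environment, but the script also runs standalone:
#
#   TUNASYNC_UPSTREAM_URL=https://nixos.org/channels \
#   TUNASYNC_WORKING_DIR=/data/nixos-images \
#   ./nixos-images.py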