add nixos-images #29

Merged (5 commits, Mar 27, 2019)
nixos-images.py (248 additions, 0 deletions)

#!/usr/bin/env python3
import logging
import os
import re
import subprocess

from datetime import datetime
from pathlib import Path

import requests
from pyquery import PyQuery as pq
from urllib3.exceptions import MaxRetryError
from urllib3.util.retry import Retry

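# Mirror configuration. The TUNASYNC_* variables are supplied by tunasync;
# the defaults make the script runnable standalone. Channels whose index
# mtime predates CLONE_SINCE are skipped entirely, and TIMEOUT is the
# per-request timeout in seconds.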
UPSTREAM_URL = os.getenv('TUNASYNC_UPSTREAM_URL', 'https://nixos.org/channels')
WORKING_DIR = os.getenv('TUNASYNC_WORKING_DIR', 'working-images')
CLONE_SINCE = datetime(2018, 12, 1)
TIMEOUT = 60

working_dir = Path(WORKING_DIR)

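# One shared HTTP session: urllib3 transparently retries requests that fail
# with 502/503/504, and the same Retry policy is reused manually in
# download() for connection errors and truncated transfers.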
session = requests.Session()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
retry_adapter = requests.adapters.HTTPAdapter(max_retries=retries)
session.mount('http://', retry_adapter)
session.mount('https://', retry_adapter)

logging.basicConfig(
    level=logging.INFO,
    format='[%(asctime)s] %(levelname)-8s %(message)s'
)

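# Thin wrappers so every HTTP request carries the same timeout.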
def http_head(*args, **kwargs):
    return session.head(*args, timeout=TIMEOUT, **kwargs)

def http_get(*args, **kwargs):
    return session.get(*args, timeout=TIMEOUT, **kwargs)

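# Delegate hashing to the sha256sum binary, which streams the file from
# disk instead of loading a multi-gigabyte image into Python memory.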
def file_sha256(dest):
    sha = subprocess.check_output(
        ['sha256sum', str(dest)],
        universal_newlines=True
    )
    return sha.split(' ')[0]

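# Write to a hidden temp file, then rename over the destination; rename()
# is atomic on POSIX, so readers never observe a half-written file.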
def atomic_write_file(dest, contents):
    tmp_dest = dest.parent / f'.{dest.name}.tmp'
    with tmp_dest.open('w') as f:
        f.write(contents)
    tmp_dest.rename(dest)

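# Raised when the downloaded size disagrees with the Content-Length header.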
class WrongSize(RuntimeError):
    def __init__(self, expected, actual):
        super().__init__(f'Wrong file size: expected {expected}, actual {actual}')
        self.actual = actual
        self.expected = expected

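# Download into .<name>.tmp, verify the size against Content-Length, and
# only rename to the final name on success, retrying transient failures.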
def download(url, dest):
    dest.parent.mkdir(parents=True, exist_ok=True)
    download_dest = dest.parent / f'.{dest.name}.tmp'

    retry = retries

    while True:
        with http_get(url, stream=True) as res:
            res.raise_for_status()
            try:
                with download_dest.open('wb') as f:
                    for chunk in res.iter_content(chunk_size=64 * 1024 * 1024):
                        if chunk:
                            f.write(chunk)
                actual_size = download_dest.stat().st_size
                if 'Content-Length' in res.headers:
                    expected_size = int(res.headers['Content-Length'])
                    if actual_size != expected_size:
                        raise WrongSize(expected=expected_size, actual=actual_size)

                break
            except (requests.exceptions.ConnectionError, WrongSize) as e:
                logging.warning(e)
                # Retry.increment() raises MaxRetryError once the retry
                # budget is exhausted; it never returns None.
                try:
                    retry = retry.increment(method='GET', url=url, error=e)
                except MaxRetryError:
                    raise e
                logging.warning(f'Retrying download: {retry}')

    download_dest.rename(dest)

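# Parse a directory-index page: each data row has five cells, with the
# link target in the second cell and the last-modified time in the third.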
def get_links(url):
    r = http_get(url)
    r.raise_for_status()

    node = pq(r.content)

    links = []
    for row in node('tr'):
        td = pq(row)('td')
        if len(td) != 5:
            continue

        link_target = td[1].find('a').get('href')
        if link_target.startswith('/'):
            # Link to parent directory
            continue

        last_updated = td[2].text.strip()

        links.append((link_target, last_updated))

    return links

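# Scrape a channel release page: the tagline paragraph carries the release
# date ("Released on ... from ..."), and the three-column table lists each
# file's name, size, and SHA-256 hash.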
def get_channel(chan_location):
    release_res = http_get(chan_location)
    release_res.raise_for_status()

    node = pq(release_res.text)

    tagline = node('p').text()

    tagline_res = re.match(r'^Released on (.+) from', tagline)

    assert tagline_res is not None

    released_time = tagline_res[1]

    files = []

    for row in node('tr'):
        td = pq(row)('td')
        if len(td) != 3:
            continue
        file_name, file_size, file_hash = (pq(x).text() for x in td)
        files.append((file_name, file_size, file_hash))

    return {
        'released_time': released_time,
        'files': files
    }

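# Main sync loop. For each dated stable channel: follow the channel
# redirect to the current release, skip it if .last-url shows we already
# have it, download the .iso/.ova images under temporary names, verify
# their hashes, then atomically rename the images, prune stale files, and
# write SHA256SUMS. .last-url is updated last, so an interrupted or failed
# run is simply retried on the next pass.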
def clone_images():
    for channel, chan_updated in get_links(f'{UPSTREAM_URL}/'):
        if not channel.startswith('nixos-') \
                or channel.endswith('-small') \
                or channel == 'nixos-unstable':
            continue

        if datetime.strptime(chan_updated, '%Y-%m-%d %H:%M') < CLONE_SINCE:
            continue

        chan_path = working_dir / channel
        chan_path.mkdir(parents=True, exist_ok=True)

        res = http_head(f'{UPSTREAM_URL}/{channel}', allow_redirects=False)
        res.raise_for_status()

        chan_location = res.headers['Location']
        chan_release_basename = chan_location.split('/')[-1]

        try:
            last_url = (chan_path / '.last-url').read_text()
        except (IOError, OSError):
            last_url = 'not available'

        if chan_location == last_url:
            continue

        logging.info(f'- {channel} -> {chan_release_basename}')

        # Matches nixos-19.03 -> nixos-19.03beta171840.23fd1394dc6
        #                        ^-------------^
        if chan_release_basename.startswith(channel + 'beta'):
            logging.info('  - Beta channel, not updating')
            continue

        chan_info = get_channel(chan_location)

        atomic_write_file(chan_path / '.released-time', chan_info['released_time'])

        has_hash_fail = False

        keep_files = {'.last-url', '.released-time'}
        rename_files = []

        logging.info('  - Downloading new files')

        chan_version = channel.split('-', 1)[1]
        chan_release_version = chan_release_basename.split('-', 1)[1]

        simplify_name = lambda fname: fname.replace(f'-{chan_release_version}-', f'-{chan_version}-')

        image_files = [
            (simplify_name(file_name), file_name, file_hash)
            for file_name, _file_size, file_hash in chan_info['files']
            if file_name.endswith('.iso') or file_name.endswith('.ova')
        ]

        for mirror_file_name, upstream_file_name, file_hash in image_files:
            keep_files.add(mirror_file_name)
            logging.info(f'    - {upstream_file_name} -> {mirror_file_name}')
            tmp_dest = f'.update.{upstream_file_name}'
            rename_files.append((tmp_dest, mirror_file_name))

            download(f'{chan_location}/{upstream_file_name}', chan_path / tmp_dest)
            actual_hash = file_sha256(chan_path / tmp_dest)

            if file_hash != actual_hash:
                has_hash_fail = True
                logging.error('      - Incorrect hash')
                logging.error(f'        actual   {actual_hash}')
                logging.error(f'        expected {file_hash}')
                logging.info(f'      - File saved as {tmp_dest}')

        if has_hash_fail:
            logging.warning('  - Found bad files. Will retry next time.')
        else:
            logging.info('  - Renaming files')

            for tmp_dest, mirror_file_name in rename_files:
                (chan_path / tmp_dest).rename(chan_path / mirror_file_name)

            logging.info('  - Removing stale files')

            for file_path in chan_path.iterdir():
                file_name = file_path.name

                if file_name not in keep_files:
                    logging.info(f'    - {file_name}')
                    file_path.unlink()

            logging.info('  - Writing SHA256SUMS')

            with (chan_path / 'SHA256SUMS').open('w') as f:
                for mirror_file_name, _upstream_file_name, file_hash in image_files:
                    f.write(f'{file_hash} *{mirror_file_name}\n')

            logging.info('  - Update finished')
            atomic_write_file(chan_path / '.last-url', chan_location)

if __name__ == "__main__":
    clone_images()
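
# Example invocation (assumed deployment; the working directory path below
# is illustrative, not part of this PR). tunasync normally supplies the
# environment, but the script also runs standalone:
#
#   TUNASYNC_UPSTREAM_URL=https://nixos.org/channels \
#   TUNASYNC_WORKING_DIR=/data/nixos-images \
#   ./nixos-images.py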